##  Step 1: Business Understanding (What is the goal?)

In [1]:
#Gathering facts and requirements
#Who will be using the model you build?
    #House sellers/buyers
#How will they be using it?
    #See how much to sell there house for, if they are selling for a good value or too low
    #See how much to buy a house for, if they spending too much money on a house or getting a deal
#How will this help the goals of the business or organization overall?
    #A seller can avoid selling their house for too much or realize when they are getting a fair or good offer
    #A buyer can avaoid spending too much on a property and realize if they are spending a fair or value offer
#Who are the stakeholders in this project? Who will be directly affected by the creation of this project?
    #People who flip houses
    #Real estate agents
    #House sellers
    #House buyers
#What business problem(s) will this Data Science project solve for the organization?
    #This will help save money where too much is being spent
#What problems are inside the scope of this project?
#What problems are outside the scope of this project?
#What data sources are available to us?
#What is the expected timeline for this project? Are there hard deadlines (e.g. "must be live before holiday season shopping") or is this an ongoing project?
#Do stakeholders from different parts of the company or organization all have the exact same understanding about what this project is and isn't?

Goal as stated by Flatiron:


You'll clean, explore, and model this dataset with a multivariate linear regression to predict the sale price of houses as accurately as possible.

## Step 2: Data Understanding

In [2]:
#What data is available to us? Where does it live? Do we have the data, or can we scrape/buy/source the data from somewhere else?
    #'kc_house_data.csv'
#Who controls the data sources, and what steps are needed to get access to the data?
#What is our target?
    #House price
#What predictors are available to us?
    #'id', 'date', 'bedrooms', 'bathrooms', 'sqft_living','sqft_lot', 'floors', 'waterfront', 'view', 
    #'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
    #'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_old', 'year_sold',
    #'since_sold', 'price_log'
#What data types are the predictors we'll be working with?
    #All are numeric except date
#What is the distribution of our data?
#How many observations does our dataset contain? Do we have a lot of data? Only a little?
    #21600 observations
#Do we have enough data to build a model? Will we need to use resampling methods?
    #We have enough data
#How do we know the data is correct? How is the data collected? Is there a chance the data could be wrong?



### Import Libraries

In [3]:
# Import Pandads
import pandas as pd

# Import Numpy
import numpy as np

# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Import StandardScaler from sklearn
from sklearn.preprocessing import StandardScaler

# Import Seaborn
import seaborn as sns
sns.set(style="whitegrid")

### Import Dataset

In [4]:
# Import dataset, 'kc_house_data.csv', called 'kc_housing_data_for_feat_engineering_lab.csv' in this file

df = pd.read_csv('kc_housing_data_for_feat_engineering_lab.csv')

### Preview Dataset

In [5]:
# Preview dataset

df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,yr_old,year_sold,since_sold,price_log
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340,5650,62,2014,3,12.309982
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690,7639,66,2014,3,13.195614
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720,8062,84,2015,2,12.100712
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360,5000,52,2014,3,13.311329
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800,7503,30,2015,2,13.142166


In [6]:
#Preview all column names

df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_old', 'year_sold',
       'since_sold', 'price_log'],
      dtype='object')

In [7]:
# Preview datatypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21600 entries, 0 to 21599
Data columns (total 25 columns):
id               21600 non-null int64
date             21600 non-null object
price            21600 non-null float64
bedrooms         21600 non-null int64
bathrooms        21600 non-null float64
sqft_living      21600 non-null int64
sqft_lot         21600 non-null int64
floors           21600 non-null float64
waterfront       21600 non-null int64
view             21600 non-null int64
condition        21600 non-null int64
grade            21600 non-null int64
sqft_above       21600 non-null int64
sqft_basement    21600 non-null int64
yr_built         21600 non-null int64
yr_renovated     21600 non-null int64
zipcode          21600 non-null int64
lat              21600 non-null float64
long             21600 non-null float64
sqft_living15    21600 non-null int64
sqft_lot15       21600 non-null int64
yr_old           21600 non-null int64
year_sold        21600 non-null int64
since_

In [8]:
# Shape of dataset

df.shape

(21600, 25)

## Step 3: Data Preperation

In [9]:
#Detecting and dealing with missing values - done
#Data type conversions (e.g. numeric data mistakenly encoded as strings) - done
#Checking for and removing multicollinearity (correlated predictors)
#Normalizing our numeric data
#Converting categorical data to numeric format through one-hot encoding

In [10]:
# Check for null values in the dataset

pd.isnull(df).sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
yr_old           0
year_sold        0
since_sold       0
price_log        0
dtype: int64

In [None]:
interactions = []
data = df.copy()
for combo in combinations:
    data['interaction'] = data[combo[0]] * data[combo[1]]
    score = np.mean(cross_val_score(regression, data, y, scoring="r2", cv=cross_validation))
    if score > baseline: interactions.append((combo[0], combo[1], round(score,3)))
        
print("Top 3 interactions: %s" %sorted(interactions, key=lambda inter: inter[2], reverse=True)[:5])

In [11]:
#Convert 'date' column from string object to datetime

df['date'] = pd.to_datetime(df['date'])

In [27]:
# Create a 'season' column

seasons = ['Winter', 'Winter', 'Spring', 'Spring', 'Spring',\
           'Summer', 'Summer', 'Summer','Fall', 'Fall', 'Fall', 'Winter']

df['season'] = df['date'].map(lambda x: seasons[x.month-1] )

#Create dummy variables for season

season_dummies = pd.get_dummies(df['season'], prefix="season_", drop_first=False)
#season_dummies.head()



#Add the season dummy variables to the main dataframe

df = pd.concat([df, season_dummies], axis=1)
df.head()

NameError: name 'KC' is not defined

In [None]:
# Create 'sq_living_x_lot' colimun

df['sq_living_x_lot'] = df['sqft_living'] * df['sqft_lot']

In [14]:
# Create column 'yard_size'

df['sqft_yard_size'] = df['sqft_lot'] - (df['sqft_above']/df['floors'])

In [15]:
#Create dummy variables for zipcode

zipcode_dummies = pd.get_dummies(df['zipcode'], prefix="zipcode_", drop_first=False)
zipcode_dummies.head()

#Add the zipcode dummy variables to the main dataframe

df = pd.concat([df, zipcode_dummies], axis=1)
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,zipcode__98146,zipcode__98148,zipcode__98155,zipcode__98166,zipcode__98168,zipcode__98177,zipcode__98178,zipcode__98188,zipcode__98198,zipcode__98199
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,zipcode__98146,zipcode__98148,zipcode__98155,zipcode__98166,zipcode__98168,zipcode__98177,zipcode__98178,zipcode__98188,zipcode__98198,zipcode__98199
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df.skew()

id                  0.243593
price               2.968485
bedrooms            0.517933
bathrooms           0.447546
sqft_living         1.312390
sqft_lot           13.058313
floors              0.617284
waterfront         11.564376
view                3.402318
condition           1.032412
grade               0.753757
sqft_above          1.370752
sqft_basement       1.520061
yr_built           -0.469408
yr_renovated        4.556468
zipcode             0.405309
lat                -0.484599
long                0.884620
sqft_living15       1.105448
sqft_lot15          9.505946
yr_old              0.469408
year_sold           0.757028
since_sold         -0.757028
price_log           0.386622
season__Fall        1.255262
season__Spring      0.864053
season__Summer      0.910610
season__Winter      1.745416
sq_living_x_lot    16.445204
sqft_yard_size     13.116823
                     ...    
zipcode__98092      7.652654
zipcode__98102     14.308235
zipcode__98103      5.742238
zipcode__98105

In [18]:
# Add columns where skeness is > 3

df['sqft_lot_log'] = np.log(df['sqft_lot'])
#df['waterfront_log'] = np.log(df['waterfront'])
#df['view_log'] = np.log(df['view'])
#df['sqft_lot15_log'] = np.log(df['sqft_lot15'])
#df['yr_renovated_log'] = np.log(df['yr_renovated'])
df['sq_living_x_lot_log'] = np.log(df['sq_living_x_lot'])
df['sqft_yard_size_log'] = np.log(df['sqft_yard_size'])

  if __name__ == '__main__':


In [19]:
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode__98166,zipcode__98168,zipcode__98177,zipcode__98178,zipcode__98188,zipcode__98198,zipcode__98199,sqft_lot_log,sq_living_x_lot_log,sqft_yard_size_log
count,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,...,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21596.0
mean,4579615000.0,537381.6,3.368519,2.112755,2076.363102,15099.84,1.494074,0.007315,0.233102,3.409491,...,0.011759,0.012454,0.011806,0.01213,0.006296,0.012963,0.014676,8.989293,16.538814,8.785338
std,2876428000.0,347816.1,0.907209,0.764996,905.418691,41431.12,0.539977,0.085215,0.763639,0.650764,...,0.107803,0.110902,0.108013,0.109467,0.079101,0.113117,0.120255,0.9022,1.113546,1.01492
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.253829,13.105156,3.075775
25%,2123049000.0,321381.8,3.0,1.75,1420.0,5040.0,1.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.525161,15.943482,8.291547
50%,3904926000.0,450000.0,3.0,2.25,1910.0,7615.5,1.5,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.937941,16.466712,8.759355
75%,7308675000.0,645000.0,4.0,2.5,2550.0,10666.75,2.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.274887,17.019859,9.13777
max,9900000000.0,3850000.0,11.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,14.317109,22.461789,14.316322


In [20]:
df.corr()['price'].sort_values(ascending = False)

price                  1.000000
price_log              0.910283
sqft_living            0.694332
grade                  0.677461
sqft_above             0.598753
sqft_living15          0.597792
bathrooms              0.520003
view                   0.397511
sq_living_x_lot_log    0.365247
lat                    0.320394
bedrooms               0.317871
sqft_basement          0.312569
zipcode__98004         0.271566
floors                 0.264089
waterfront             0.248897
zipcode__98040         0.207507
zipcode__98039         0.189449
sq_living_x_lot        0.183798
zipcode__98112         0.180201
sqft_lot_log           0.159662
sqft_yard_size_log     0.140781
zipcode__98006         0.139238
yr_renovated           0.123536
zipcode__98033         0.105165
zipcode__98105         0.096859
zipcode__98075         0.094640
sqft_lot               0.091994
zipcode__98199         0.089281
sqft_yard_size         0.087540
sqft_lot15             0.084420
                         ...   
zipcode_

### Remove Outliers & Unusual Values

In [21]:
# Remove observations where bathroom count is 0
df.drop(df[df['bathrooms'] == 0.0].index, inplace = True) 

In [31]:
from itertools import combinations
combinations = list(combinations(df, 2))

interactions = []
data = df.copy()
for combo in combinations:
    data['interaction'] = data[combo[0]] * data[combo[1]]
    score = np.mean(cross_val_score(regression, data, y, scoring="r2", cv=cross_validation))
    if score > baseline: interactions.append((combo[0], combo[1], round(score,3)))
        
print("Top 3 interactions: %s" %sorted(interactions, key=lambda inter: inter[2], reverse=True)[:5])

TypeError: ufunc multiply cannot use operands with types dtype('int64') and dtype('<M8[ns]')

## Split Dataset for Testing and Training

In [22]:
target = df['price_log']
features = df.drop(['price_log', 'price', 'date', 'season'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=34,test_size=0.2)
features.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode__98166,zipcode__98168,zipcode__98177,zipcode__98178,zipcode__98188,zipcode__98198,zipcode__98199,sqft_lot_log,sq_living_x_lot_log,sqft_yard_size_log
0,7129300520,3,1.0,1180,5650,1.0,0,0,3,7,...,0,0,0,1,0,0,0,8.639411,15.712681,8.405144
1,6414100192,3,2.25,2570,7242,2.0,0,0,3,7,...,0,0,0,0,0,0,0,8.887653,16.739314,8.725345
2,5631500400,2,1.0,770,10000,1.0,0,0,3,6,...,0,0,0,0,0,0,0,9.21034,15.856731,9.130214
3,2487200875,4,3.0,1960,5000,1.0,0,0,5,7,...,0,0,0,0,0,0,0,8.517193,16.097893,8.281471
4,1954400510,3,2.0,1680,8080,1.0,0,0,3,8,...,0,0,0,0,0,0,0,8.997147,16.423696,8.764053


In [23]:
X_train.describe()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode__98166,zipcode__98168,zipcode__98177,zipcode__98178,zipcode__98188,zipcode__98198,zipcode__98199,sqft_lot_log,sq_living_x_lot_log,sqft_yard_size_log
count,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,...,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17272.0,17268.0
mean,4558232000.0,3.368573,2.115548,2075.382932,15163.58,1.495484,0.007121,0.23292,3.408001,7.655049,...,0.011753,0.012332,0.011637,0.011753,0.006369,0.012448,0.014764,8.988725,16.538631,8.785016
std,2880084000.0,0.901997,0.763548,897.95687,41873.16,0.539969,0.08409,0.763429,0.650214,1.16818,...,0.107776,0.110366,0.10725,0.107776,0.079552,0.110877,0.12061,0.904162,1.11275,1.015825
min,1000102.0,0.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.253829,13.105156,4.812184
25%,2111011000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.525161,15.944761,8.288911
50%,3886903000.0,3.0,2.25,1910.0,7609.0,1.5,0.0,0.0,3.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.937087,16.47023,8.757784
75%,7304301000.0,4.0,2.5,2550.0,10621.25,2.0,0.0,0.0,4.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.270612,17.011026,9.136075
max,9900000000.0,11.0,6.75,8020.0,1651359.0,3.5,1.0,4.0,5.0,13.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,14.317109,22.461789,14.316322


In [24]:
X_train.shape

(17272, 101)

In [25]:
scaler = StandardScaler()
scaler.fit(X_train)
#X_train =pd.DataFrame(data=scaler.transform(X_train), columns=feature_columns)
#X_test =pd.DataFrame(data=scaler.transform(X_test), columns=feature_columns)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:

sns.set(style="white")


# Compute the correlation matrix
corr = X_train.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

NameError: name 'plt' is not defined

In [44]:

def findDisappearedNumbers(self, nums):
    s1 = set(nums)
    s2 = set(range(1,len(nums)+1))
    out = s2 - s1
    return list(out)

In [45]:
findDisappearedNumbers(1,[4,3,2,7,8,2,3,1])

[5, 6]