In [5]:
import pandas as pd
pd.set_option('display.max_columns', 300)
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set(style="whitegrid")

In [6]:
# Import dataset, 'kc_house_data.csv', called 'kc_housing_data_for_feat_engineering_lab.csv' in this file

df = pd.read_csv('kc_housing_data_for_feat_engineering_lab.csv')

In [7]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'yr_old', 'year_sold',
       'since_sold', 'price_log'],
      dtype='object')

In [9]:
df['date'] = pd.to_datetime(df['date'])

# Create a 'season' column

seasons = ['Winter', 'Winter', 'Spring', 'Spring', 'Spring',\
           'Summer', 'Summer', 'Summer','Fall', 'Fall', 'Fall', 'Winter']

df['season'] = df['date'].map(lambda x: seasons[x.month-1] )

#Create dummy variables for season

season_dummies = pd.get_dummies(df['season'], prefix="season_", drop_first=False)
season_dummies.head()

#Add the season dummy variables to the main dataframe

df = pd.concat([df, season_dummies], axis=1)
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,yr_old,year_sold,since_sold,price_log,season,season__Fall,season__Spring,season__Summer,season__Winter
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,62,2014,3,12.309982,Fall,1,0,0,0
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,66,2014,3,13.195614,Winter,0,0,0,1
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,84,2015,2,12.100712,Winter,0,0,0,1
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,52,2014,3,13.311329,Winter,0,0,0,1
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,30,2015,2,13.142166,Winter,0,0,0,1


In [10]:
# Create 'sq_living_x_lot' colimun

df['sq_living_x_lot'] = df['sqft_living'] * df['sqft_lot']

In [11]:
# Create column 'yard_size'

df['sqft_yard_size'] = df['sqft_lot'] - (df['sqft_above']/df['floors'])

In [12]:
#Create dummy variables for zipcode

zipcode_dummies = pd.get_dummies(df['zipcode'], prefix="zipcode_", drop_first=False)
zipcode_dummies.head()

#Add the zipcode dummy variables to the main dataframe

df = pd.concat([df, zipcode_dummies], axis=1)
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,yr_old,year_sold,since_sold,price_log,season,season__Fall,season__Spring,season__Summer,season__Winter,sq_living_x_lot,sqft_yard_size,zipcode__98001,zipcode__98002,zipcode__98003,zipcode__98004,zipcode__98005,zipcode__98006,zipcode__98007,zipcode__98008,zipcode__98010,zipcode__98011,zipcode__98014,zipcode__98019,zipcode__98022,zipcode__98023,zipcode__98024,zipcode__98027,zipcode__98028,zipcode__98029,zipcode__98030,zipcode__98031,zipcode__98032,zipcode__98033,zipcode__98034,zipcode__98038,zipcode__98039,zipcode__98040,zipcode__98042,zipcode__98045,zipcode__98052,zipcode__98053,zipcode__98055,zipcode__98056,zipcode__98058,zipcode__98059,zipcode__98065,zipcode__98070,zipcode__98072,zipcode__98074,zipcode__98075,zipcode__98077,zipcode__98092,zipcode__98102,zipcode__98103,zipcode__98105,zipcode__98106,zipcode__98107,zipcode__98108,zipcode__98109,zipcode__98112,zipcode__98115,zipcode__98116,zipcode__98117,zipcode__98118,zipcode__98119,zipcode__98122,zipcode__98125,zipcode__98126,zipcode__98133,zipcode__98136,zipcode__98144,zipcode__98146,zipcode__98148,zipcode__98155,zipcode__98166,zipcode__98168,zipcode__98177,zipcode__98178,zipcode__98188,zipcode__98198,zipcode__98199
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,62,2014,3,12.309982,Fall,1,0,0,0,6667000,4470.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,66,2014,3,13.195614,Winter,0,0,0,1,18611940,6157.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,84,2015,2,12.100712,Winter,0,0,0,1,7700000,9230.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,52,2014,3,13.311329,Winter,0,0,0,1,9800000,3950.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,30,2015,2,13.142166,Winter,0,0,0,1,13574400,6400.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Add columns where skeness is > 3

df['sqft_lot_log'] = np.log(df['sqft_lot'])
#df['waterfront_log'] = np.log(df['waterfront'])
#df['view_log'] = np.log(df['view'])
#df['sqft_lot15_log'] = np.log(df['sqft_lot15'])
#df['yr_renovated_log'] = np.log(df['yr_renovated'])
df['sq_living_x_lot_log'] = np.log(df['sq_living_x_lot'])
df['sqft_yard_size_log'] = np.log(df['sqft_yard_size'])

  if __name__ == '__main__':


In [14]:
# Remove observations where bathroom count is 0
df.drop(df[df['bathrooms'] == 0.0].index, inplace = True) 

In [15]:
target = df['price_log']

In [19]:
df.drop(df['price_log'], df['price'])

TypeError: 'Series' objects are mutable, thus they cannot be hashed