In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import pickle

## Step 1: Read in hold out data, scalers, and best model

In [2]:
df = pd.read_csv('Resources/kc_house_data_test_features.csv', index_col=0)

final_scaler = pickle.load(open('scaler.pickle','rb'))
final_model = pickle.load(open('model.pickle','rb'))

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
from statsmodels.formula.api import ols

from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

## Step 2: Feature Engineering for holdout set

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values, and scaling) that we performed on the original data.

In [4]:
##Only run this cell once!

# #Create new column 'age' for age of house
# df['age'] = 2020 - df.yr_built

# #Create new column 'last_renovted' for how long ago since last renovation
# df['last_ren'] = np.where(df['yr_renovated'] > 0, (2020 - df['yr_renovated']), 0)

# #Create new column 'last_ren2' for how long (years) since last renovation
# df['last_ren2'] = np.where(df['yr_renovated'] > 0, (2020 - df['yr_renovated']), (2020 - df['yr_built']))

# #Create 0/1 new column 'renovated'
# df['renovated'] = np.where(df['yr_renovated']>0 , 1, 0)

# #Create dummy columns using 'renovated'
# df = pd.get_dummies(df, columns=['renovated'])

# #Create dummy variable 'basement' y/n - 6806 with basement, 10484 without
# df['basement'] = np.where(df['sqft_basement']>0 , 1, 0)

#Create a new column 'neighbors_compared' with sqft_living - sqft_living15
df['neighbors_compared'] = df.sqft_living - df.sqft_living15


#Fixing outliers
bathroom_median = df.bathrooms.median()
df.loc[df.bathrooms == 0, 'bathrooms'] = np.nan
df.fillna(bathroom_median,inplace=True)

# #drop 33 bedrooms outlier
# df = df[df.bedrooms != 33]

#Changing large bedroom values to '7'     
df['bedrooms'] = df['bedrooms'].apply(lambda x : 7 if x > 6 else x)

# #Create dummy columns using 'waterfront'
# df = pd.get_dummies(df, columns=['waterfront'])

#Create dummy columns for 'condition2'
df['condition2'] = np.where(df['condition']<=2, 0, 1)
df = pd.get_dummies(df, columns=['condition2'])

In [5]:
df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,neighbors_compared,condition2_0,condition2_1
0,1974300020,20140827T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,250,0,1
1,1974300020,20150218T000000,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,98034,47.7089,-122.241,2020,10918,250,0,1
2,3630020380,20141107T000000,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,98029,47.5472,-121.998,1470,1576,0,0,1
3,1771000290,20141203T000000,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,98077,47.7427,-122.071,1160,10565,120,0,1
4,5126310470,20150115T000000,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,98059,47.4863,-122.14,2830,7916,0,0,1


In [6]:
 #drop columns
 df = df.drop(['date', 'zipcode'], axis=1)

In [7]:
# Binning bathrooms, 
cut_labels = [1, 2, 3, 4, 5]
cut_bins = [0, 1.25, 3, 4, 5, 8]
df['cut_bathrooms'] = pd.cut(df['bathrooms'], bins=cut_bins, labels=cut_labels)
df['cut_bathrooms'] = pd.to_numeric(df['cut_bathrooms'],errors='coerce') 

##Create Bins for Grade 1-4, 5-7, 8-13
cut_labels2 = [1, 2, 3]
cut_bins2 = [0, 4, 7, 13]
df['cut_grade'] = pd.cut(df['grade'], bins=cut_bins2, labels=cut_labels2)
df['cut_grade'] = pd.to_numeric(df['cut_grade'],errors='coerce') 

#Create dummy variable 'neighbors_compared_dummy' y/n
df['neighbors_compared_dummy'] = np.where((df['sqft_living'] > df['sqft_living15']), 1, 0)

#Create dummy columns using 'neighbors_compared_dummy'
df2 = pd.get_dummies(df, columns=['neighbors_compared_dummy'])


In [8]:
df.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,neighbors_compared,condition2_0,condition2_1,cut_bathrooms,cut_grade,neighbors_compared_dummy
0,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,47.7089,-122.241,2020,10918,250,0,1,2,3,1
1,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,47.7089,-122.241,2020,10918,250,0,1,2,3,1
2,3630020380,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,47.5472,-121.998,1470,1576,0,0,1,2,3,0
3,1771000290,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,47.7427,-122.071,1160,10565,120,0,1,2,3,1
4,5126310470,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,47.4863,-122.14,2830,7916,0,0,1,2,3,0


In [8]:
# # transformed_holdout = final_scaler(holdout)
# #Fixing outliers
# bathroom_median = df.bathrooms.median()
# df.loc[df.bathrooms == 0, 'bathrooms'] = np.nan
# df.fillna(bathroom_median,inplace=True)

# #drop 33 bedrooms outlier
# df = df[df.bedrooms != 33]

In [15]:
transformed_holdout =pd.DataFrame(
    data=final_scaler.transform(df
                               ),columns=df.columns)


ValueError: operands could not be broadcast together with shapes (4323,24) (77,) (4323,24) 

In [13]:
final_scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

## Step 3: Predict the holdout set

In [10]:
columns = ['bathrooms',
 'sqft_living',
 'condition',
 'grade',
 'sqft_above',
 'sqft_living15',
 'bedrooms',
 'lat',
 'neighbors_compared',
 'condition2_1',
 'waterfront_1',
 'bathrooms^2',
 'bathrooms sqft_living',
 'bathrooms condition',
 'bathrooms grade',
 'bathrooms sqft_above',
 'bathrooms sqft_living15',
 'bathrooms bedrooms',
 'bathrooms lat',
 'bathrooms neighbors_compared',
 'bathrooms condition2_1',
 'bathrooms waterfront_1',
 'sqft_living^2',
 'sqft_living condition',
 'sqft_living grade',
 'sqft_living sqft_above',
 'sqft_living sqft_living15',
 'sqft_living bedrooms',
 'sqft_living lat',
 'sqft_living neighbors_compared',
 'sqft_living condition2_1',
 'sqft_living waterfront_1',
 'condition^2',
 'condition grade',
 'condition sqft_above',
 'condition sqft_living15',
 'condition bedrooms',
 'condition lat',
 'condition neighbors_compared',
 'condition condition2_1',
 'condition waterfront_1',
 'grade^2',
 'grade sqft_above',
 'grade sqft_living15',
 'grade bedrooms',
 'grade lat',
 'grade neighbors_compared',
 'grade condition2_1',
 'grade waterfront_1',
 'sqft_above^2',
 'sqft_above sqft_living15',
 'sqft_above bedrooms',
 'sqft_above lat',
 'sqft_above neighbors_compared',
 'sqft_above condition2_1',
 'sqft_above waterfront_1',
 'sqft_living15^2',
 'sqft_living15 bedrooms',
 'sqft_living15 lat',
 'sqft_living15 neighbors_compared',
 'sqft_living15 condition2_1',
 'sqft_living15 waterfront_1',
 'bedrooms^2',
 'bedrooms lat',
 'bedrooms neighbors_compared',
 'bedrooms condition2_1',
 'bedrooms waterfront_1',
 'lat^2',
 'lat neighbors_compared',
 'lat condition2_1',
 'lat waterfront_1',
 'neighbors_compared^2',
 'neighbors_compared condition2_1',
 'neighbors_compared waterfront_1',
 'condition2_1^2',
 'condition2_1 waterfront_1',
 'waterfront_1^2']

In [11]:
transformed_holdout = pd.DataFrame(data=final_scaler.transform(df),columns=df.columns)

ValueError: operands could not be broadcast together with shapes (4323,24) (77,) (4323,24) 

In [12]:
final_pred = final_model.predict(df[columns])


KeyError: "['sqft_living^2', 'grade neighbors_compared', 'lat^2', 'bathrooms neighbors_compared', 'sqft_above neighbors_compared', 'bedrooms lat', 'sqft_above waterfront_1', 'sqft_living sqft_living15', 'grade waterfront_1', 'sqft_above^2', 'sqft_above bedrooms', 'bathrooms condition', 'bedrooms waterfront_1', 'sqft_living lat', 'grade sqft_living15', 'sqft_living waterfront_1', 'bedrooms^2', 'condition lat', 'condition^2', 'grade condition2_1', 'condition sqft_living15', 'lat condition2_1', 'condition bedrooms', 'sqft_above condition2_1', 'sqft_living15^2', 'waterfront_1', 'sqft_living15 neighbors_compared', 'bathrooms bedrooms', 'bathrooms sqft_living15', 'neighbors_compared waterfront_1', 'condition sqft_above', 'bathrooms grade', 'sqft_living15 condition2_1', 'sqft_living15 waterfront_1', 'sqft_living bedrooms', 'sqft_living neighbors_compared', 'sqft_above sqft_living15', 'condition waterfront_1', 'sqft_above lat', 'lat waterfront_1', 'grade bedrooms', 'grade lat', 'bedrooms condition2_1', 'sqft_living15 bedrooms', 'neighbors_compared^2', 'sqft_living15 lat', 'sqft_living condition2_1', 'sqft_living condition', 'bathrooms^2', 'condition grade', 'bedrooms neighbors_compared', 'grade^2', 'bathrooms condition2_1', 'condition neighbors_compared', 'bathrooms sqft_above', 'sqft_living sqft_above', 'grade sqft_above', 'condition2_1 waterfront_1', 'waterfront_1^2', 'bathrooms lat', 'condition2_1^2', 'bathrooms sqft_living', 'bathrooms waterfront_1', 'condition condition2_1', 'neighbors_compared condition2_1', 'sqft_living grade', 'lat neighbors_compared'] not in index"

In [None]:

poly= PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(df_features)
poly_columns = poly.get_feature_names(df_features.columns)

df_poly = pd.DataFrame(poly_data, columns=columns)

In [17]:
x = pd.concat([df,])

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,neighbors_compared,condition2_0,condition2_1,cut_bathrooms,cut_grade,neighbors_compared_dummy
0,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,47.7089,-122.241,2020,10918,250,0,1,2,3,1
1,1974300020,4,2.5,2270,11500,1.0,0,0,3,8,1540,730,1967,0,47.7089,-122.241,2020,10918,250,0,1,2,3,1
2,3630020380,3,2.5,1470,1779,2.0,0,0,3,8,1160,310,2005,0,47.5472,-121.998,1470,1576,0,0,1,2,3,0
3,1771000290,3,1.75,1280,16200,1.0,0,0,3,8,1030,250,1976,0,47.7427,-122.071,1160,10565,120,0,1,2,3,1
4,5126310470,4,2.75,2830,8126,2.0,0,0,3,8,2830,0,2005,0,47.4863,-122.14,2830,7916,0,0,1,2,3,0


## Step 4: Export your predictions

In [16]:
final_answer = pd.DataFrame(final_pred)
final_answer.to_csv('housing_preds_cierra_andaur.csv')

NameError: name 'final_predictions' is not defined