In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import pickle

## Step 1: Read in the holdout data, scaler, and best model

In [2]:
# Hold-out feature set; the first CSV column becomes the DataFrame index.
df = pd.read_csv('Resources/kc_house_data_test_features.csv', index_col=0)

# NOTE(review): the scaler load is disabled, so no scaling is applied anywhere
# in this notebook. Confirm the pickled model was fit on unscaled features.
#final_scaler = pickle.load(open('scaler.pickle','rb'))

# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle as soon as the model is read.
with open('model.pickle', 'rb') as model_file:
    final_model = pickle.load(model_file)

## Step 2: Feature Engineering for holdout set

Remember that we must apply to the holdout data the same transformations (feature engineering, handling of extreme values, and scaling) that we applied to the original data.

In [3]:
##RUN CELL ONLY ONCE!
# The get_dummies calls below replace their source columns, so re-running this
# cell on the already-transformed frame fails ('waterfront' no longer exists).

# Create a new column 'neighbors_compared' with sqft_living - sqft_living15
df['neighbors_compared'] = df.sqft_living - df.sqft_living15

# Fixing outliers: treat zero-bathroom rows as missing and impute the median.
# The median is computed before the zeros are masked, so zeros still count
# toward it (kept as in the original cell).
bathroom_median = df.bathrooms.median()
df.loc[df.bathrooms == 0, 'bathrooms'] = np.nan
# Fill ONLY the bathrooms column. A frame-wide fillna would also overwrite
# missing values in every other column with the bathroom median.
df['bathrooms'] = df['bathrooms'].fillna(bathroom_median)

# Changing large bedroom values to '7' (anything above 6 is capped at 7).
# mask() leaves NaN untouched, matching the element-wise comparison it replaces.
df['bedrooms'] = df['bedrooms'].mask(df['bedrooms'] > 6, 7)

# Create dummy columns for 'condition2' (0 when condition <= 2, otherwise 1).
# get_dummies replaces 'condition2' with 'condition2_<value>' indicator columns.
df['condition2'] = np.where(df['condition'] <= 2, 0, 1)
df = pd.get_dummies(df, columns=['condition2'])

# Create dummy columns using 'waterfront'; the 'waterfront' column itself is
# replaced by 'waterfront_<value>' indicator columns.
df = pd.get_dummies(df, columns=['waterfront'])

# Create dummy variable 'neighbors_compared_dummy' y/n:
# 1 when sqft_living exceeds sqft_living15, else 0.
df['neighbors_compared_dummy'] = np.where(df['sqft_living'] > df['sqft_living15'], 1, 0)

# Create dummy columns using 'neighbors_compared_dummy'.
# NOTE(review): the result is bound to df2, not df, and df2 is not referenced
# by any cell visible here; df keeps the plain 0/1 column.
df2 = pd.get_dummies(df, columns=['neighbors_compared_dummy'])

In [4]:
# Bathrooms -> five ordinal bands with right-closed edges:
# (0, 1.25], (1.25, 3], (3, 4], (4, 5], (5, 8]. Values outside (0, 8] become NaN.
cut_bins = [0, 1.25, 3, 4, 5, 8]
cut_labels = [1, 2, 3, 4, 5]
df['cut_bathrooms'] = pd.to_numeric(
    pd.cut(df['bathrooms'], bins=cut_bins, labels=cut_labels),
    errors='coerce',
)

# Grade -> three ordinal bands: 1-4, 5-7, 8-13 (edges (0, 4], (4, 7], (7, 13]).
cut_bins2 = [0, 4, 7, 13]
cut_labels2 = [1, 2, 3]
df['cut_grade'] = pd.to_numeric(
    pd.cut(df['grade'], bins=cut_bins2, labels=cut_labels2),
    errors='coerce',
)

In [6]:
# Chosen Features for Final Model
features = ['bathrooms', 'sqft_living', 'condition', 'grade', 'sqft_above', 'sqft_living15', 'bedrooms', 'lat', 'neighbors_compared', 'condition2_1', 'waterfront_1']
df_features = df[features]

# Expand the base features into every degree-2 term (originals, squares and
# pairwise products); include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_data = poly.fit_transform(df_features)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# yields the same labels (e.g. 'bathrooms^2', 'bathrooms sqft_living').
# Fall back to the old method for scikit-learn releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(df_features.columns)
else:
    poly_columns = poly.get_feature_names(df_features.columns)

# Note: the resulting frame uses a fresh 0..n-1 index, not df's index.
df_poly = pd.DataFrame(poly_data, columns=poly_columns)

In [7]:
# Column labels, in order, pulled from df_poly and passed to final_model.predict.
# Each entry is a base feature, a square ('x^2') or a pairwise product ('x y')
# in the naming produced by PolynomialFeatures.
# NOTE(review): presumably the terms kept by feature selection during training;
# confirm the set and order match what the pickled model was fit on.
selected_columns = ['bathrooms',
 'sqft_living',
 'grade',
 'sqft_above',
 'sqft_living15',
 'bedrooms',
 'lat',
 'bathrooms^2',
 'bathrooms sqft_living',
 'bathrooms condition',
 'bathrooms grade',
 'bathrooms sqft_above',
 'bathrooms sqft_living15',
 'bathrooms bedrooms',
 'bathrooms lat',
 'bathrooms neighbors_compared',
 'bathrooms condition2_1',
 'sqft_living^2',
 'sqft_living condition',
 'sqft_living grade',
 'sqft_living sqft_above',
 'sqft_living sqft_living15',
 'sqft_living bedrooms',
 'sqft_living lat',
 'sqft_living neighbors_compared',
 'sqft_living condition2_1',
 'condition grade',
 'condition sqft_above',
 'condition sqft_living15',
 'grade^2',
 'grade sqft_above',
 'grade sqft_living15',
 'grade bedrooms',
 'grade lat',
 'grade neighbors_compared',
 'grade condition2_1',
 'sqft_above^2',
 'sqft_above sqft_living15',
 'sqft_above bedrooms',
 'sqft_above lat',
 'sqft_above neighbors_compared',
 'sqft_above condition2_1',
 'sqft_living15^2',
 'sqft_living15 bedrooms',
 'sqft_living15 lat',
 'sqft_living15 condition2_1',
 'bedrooms^2',
 'bedrooms lat',
 'bedrooms condition2_1',
 'lat^2']

## Step 3: Predict the holdout set

In [8]:
# Score the hold-out rows using only the polynomial terms in selected_columns.
# NOTE(review): the recorded output of this notebook shows predictions around
# 4.2e9, which looks implausibly large; confirm the hold-out preprocessing
# (including the disabled scaler load) matches what the model was trained on.
final_pred = final_model.predict(df_poly.loc[:, selected_columns])

## Step 4: Export your predictions

In [9]:
# Wrap the prediction array in a DataFrame and write it to CSV; the default
# integer index and header row are included in the file.
final_prediction = pd.DataFrame(data=final_pred)
final_prediction.to_csv(path_or_buf='housing_preds_cierra_andaur.csv')

In [10]:
# Display the first five predictions as a quick sanity check of their scale.
final_pred[:5]

array([4.28071032e+09, 4.28071032e+09, 4.25567923e+09, 4.29058731e+09,
       4.24119527e+09])