In [16]:
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Allow up to 300 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 300

## Step 1: Read in holdout data, scaler, and best model

In [17]:
# Load the holdout features; the first CSV column is used as the row index.
df = pd.read_csv(
    'kc_house_data_test_features.csv',
    index_col=0,
)

In [18]:
# Load the persisted scaler and model.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
# The context managers make sure each file handle is closed once it is read.
with open('scaler.pickle', 'rb') as scaler_file:
    final_scaler = pickle.load(scaler_file)

with open('model.pickle', 'rb') as model_file:
    final_model = pickle.load(model_file)

## Step 2: Feature Engineering for holdout set

Remember that we have to perform the same transformations on our holdout data (feature engineering, extreme-value handling, and scaling) that we performed on the original data.

### Generating Features

In [19]:
# Year that build/renovation years are subtracted from to get an "age".
# The data stops at 2015, but the original notebook uses 2020 for every row.
# NOTE(review): presumably this must match the value used when the model was
# trained -- confirm against the training notebook before changing it.
REFERENCE_YEAR = 2020

# FEATURE: MONTH SOLD
# convert the raw date column to real datetimes, then keep just the month
df['date'] = pd.to_datetime(df['date'])
df['month_sold'] = df['date'].dt.month

# FEATURE: YEARS SINCE BUILD/RENO
# a yr_renovated of 0 is treated as "not renovated": those rows fall back to
# the year the house was built
was_renovated = df['yr_renovated'] != 0
df['yr_since_build'] = np.where(
    was_renovated,
    REFERENCE_YEAR - df['yr_renovated'],
    REFERENCE_YEAR - df['yr_built'],
)

# FEATURE: IS MULTIPLE FLOORS
# 1 when the house has more than one floor, else 0 (vectorized comparison)
df['is_multi_floor'] = (df['floors'] > 1).astype('int64')

# FEATURE: HAS BASEMENT
# 1 when the basement has a positive square footage, else 0
df['has_basement'] = (df['sqft_basement'] > 0).astype('int64')

In [20]:
# Base columns fed into the polynomial expansion. The order matters: it
# determines the order of the generated polynomial column names.
features = [
    # columns read from the CSV
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'sqft_living15', 'sqft_lot15',
    # columns created in the feature-engineering cell
    'month_sold', 'yr_since_build', 'is_multi_floor', 'has_basement',
]
df_features = df.loc[:, features]

### Generating Polynomials

In [21]:
# Expand the base features into degree-2 polynomial terms;
# include_bias=False leaves out the constant column.
poly2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly2.fit_transform(df_features)

# Column names for the expanded matrix. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# only fall back to the old method on installs that predate it.
if hasattr(poly2, 'get_feature_names_out'):
    poly2_cols = list(poly2.get_feature_names_out(df_features.columns))
else:
    poly2_cols = poly2.get_feature_names(df_features.columns)

# Wrap the expanded array in a DataFrame with readable column names.
df_poly2 = pd.DataFrame(poly2_data, columns=poly2_cols)

### Feature Selection

In [29]:
# Columns of the polynomial frame that are kept for scaling and prediction.
# The order is significant: the scaler and model receive columns in this order.
selected_features = [
    # --- first-order terms ---
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'floors',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'sqft_living15',
    'sqft_lot15',
    'yr_since_build',
    'is_multi_floor',
    # --- bedrooms terms ---
    'bedrooms^2',
    'bedrooms bathrooms',
    'bedrooms sqft_living',
    'bedrooms sqft_lot',
    'bedrooms floors',
    'bedrooms view',
    'bedrooms condition',
    'bedrooms grade',
    'bedrooms sqft_above',
    'bedrooms sqft_basement',
    'bedrooms yr_built',
    'bedrooms sqft_living15',
    'bedrooms sqft_lot15',
    'bedrooms yr_since_build',
    'bedrooms is_multi_floor',
    # --- bathrooms terms ---
    'bathrooms^2',
    'bathrooms sqft_living',
    'bathrooms sqft_lot',
    'bathrooms floors',
    'bathrooms view',
    'bathrooms condition',
    'bathrooms grade',
    'bathrooms sqft_above',
    'bathrooms sqft_basement',
    'bathrooms yr_built',
    'bathrooms sqft_living15',
    'bathrooms sqft_lot15',
    'bathrooms month_sold',
    'bathrooms yr_since_build',
    'bathrooms is_multi_floor',
    'bathrooms has_basement',
    # --- sqft_living terms ---
    'sqft_living^2',
    'sqft_living sqft_lot',
    'sqft_living floors',
    'sqft_living view',
    'sqft_living condition',
    'sqft_living grade',
    'sqft_living sqft_above',
    'sqft_living sqft_basement',
    'sqft_living yr_built',
    'sqft_living sqft_living15',
    'sqft_living sqft_lot15',
    'sqft_living month_sold',
    'sqft_living yr_since_build',
    'sqft_living is_multi_floor',
    'sqft_living has_basement',
    # --- sqft_lot terms ---
    'sqft_lot floors',
    'sqft_lot grade',
    'sqft_lot sqft_above',
    'sqft_lot yr_built',
    'sqft_lot sqft_living15',
    'sqft_lot sqft_lot15',
    'sqft_lot is_multi_floor',
    # --- floors terms ---
    'floors^2',
    'floors view',
    'floors condition',
    'floors grade',
    'floors sqft_above',
    'floors sqft_basement',
    'floors yr_built',
    'floors sqft_living15',
    'floors sqft_lot15',
    'floors yr_since_build',
    'floors is_multi_floor',
    'floors has_basement',
    # --- view terms ---
    'view^2',
    'view grade',
    'view sqft_above',
    'view sqft_living15',
    'view sqft_lot15',
    # --- condition terms ---
    'condition grade',
    'condition sqft_above',
    'condition sqft_basement',
    'condition yr_built',
    'condition sqft_living15',
    'condition sqft_lot15',
    'condition yr_since_build',
    'condition is_multi_floor',
    # --- grade terms ---
    'grade^2',
    'grade sqft_above',
    'grade sqft_basement',
    'grade yr_built',
    'grade sqft_living15',
    'grade sqft_lot15',
    'grade month_sold',
    'grade yr_since_build',
    'grade is_multi_floor',
    'grade has_basement',
    # --- sqft_above terms ---
    'sqft_above^2',
    'sqft_above sqft_basement',
    'sqft_above yr_built',
    'sqft_above sqft_living15',
    'sqft_above sqft_lot15',
    'sqft_above month_sold',
    'sqft_above yr_since_build',
    'sqft_above is_multi_floor',
    'sqft_above has_basement',
    # --- sqft_basement terms ---
    'sqft_basement^2',
    'sqft_basement yr_built',
    'sqft_basement sqft_living15',
    'sqft_basement is_multi_floor',
    'sqft_basement has_basement',
    # --- yr_built terms ---
    'yr_built^2',
    'yr_built sqft_living15',
    'yr_built sqft_lot15',
    'yr_built yr_since_build',
    'yr_built is_multi_floor',
    # --- sqft_living15 terms ---
    'sqft_living15^2',
    'sqft_living15 sqft_lot15',
    'sqft_living15 month_sold',
    'sqft_living15 yr_since_build',
    'sqft_living15 is_multi_floor',
    'sqft_living15 has_basement',
    # --- sqft_lot15 terms ---
    'sqft_lot15^2',
    'sqft_lot15 is_multi_floor',
    'sqft_lot15 has_basement',
    # --- yr_since_build terms ---
    'yr_since_build^2',
    'yr_since_build is_multi_floor',
    # --- is_multi_floor terms ---
    'is_multi_floor^2',
]

In [30]:
# Disabled: this would keep every polynomial column instead of a selected subset.
#df = df_poly2

In [34]:
# Keep only the selected polynomial columns, in the listed order.
# NOTE: this rebinds `df` from the raw holdout frame to the selected features.
df = df_poly2.loc[:, selected_features]

In [35]:
# The scaling call that used to live here was an exact duplicate of the one in
# the "Scaling" section below, and nothing in between reads its result, so it
# is performed only once there.

In [33]:
# Disabled: selecting columns after scaling is not needed, because the subset
# is already taken before the scaler is applied.
#transformed_holdout = transformed_holdout[selected_features]

### Scaling

In [36]:
# Shape check: the column count should equal the number of selected features.
df.shape

(4323, 131)

In [37]:

# Apply the loaded scaler to the selected holdout columns (transform only),
# then wrap the result in a DataFrame that keeps the column names.
scaled_values = final_scaler.transform(df)
transformed_holdout = pd.DataFrame(scaled_values, columns=df.columns)


## Step 3: Predict the holdout set

In [38]:
# Predict the target for every holdout row.
final_answers = final_model.predict(transformed_holdout)

# Sanity check before the predictions are exported. The output recorded in
# this notebook shows values around 1e21-1e22, which looks implausible and
# suggests the holdout preprocessing may not match what the model was trained
# on (column set/order, scaler, extreme-value handling) -- TODO confirm
# against the training notebook.
# NOTE(review): the bound assumes the target is a sale price in dollars.
MAX_PLAUSIBLE_PREDICTION = 1e9

pred_values = np.asarray(final_answers, dtype=float)
all_finite = bool(np.isfinite(pred_values).all())
# Guard against an empty holdout set, where max() would raise.
largest_abs = float(np.abs(pred_values).max()) if pred_values.size else 0.0

if not all_finite or largest_abs > MAX_PLAUSIBLE_PREDICTION:
    # Warn rather than raise so the rest of the notebook still runs.
    warnings.warn(
        'Holdout predictions look implausible (max |prediction| = %.3e); '
        'check that the holdout features were prepared exactly like the '
        'training features.' % largest_abs,
        RuntimeWarning,
    )

In [39]:
# Wrap the raw predictions in a DataFrame (single column, default label 0).
final_answers = pd.DataFrame(data=final_answers)

In [40]:
# Preview the first rows of the predictions.
final_answers.head()

Unnamed: 0,0
0,2.002938e+22
1,2.002938e+22
2,6.176219e+21
3,4.235072e+21
4,6.627107e+21


## Step 4: Export your predictions

In [41]:
# Export the predictions to CSV (file name pattern: housing_preds_<your_name>.csv).
final_answers.to_csv(path_or_buf='housing_preds_justin_fernandez.csv')