# Project

### Goal: Improve our original estimate of the log error by using clustering methodologies.

## Modeling

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Modeling
import scipy.stats as stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display settings: floats are rendered with '{:20,.2f}' (width 20,
# thousands separators, two decimals) and neither columns nor rows are truncated.
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import wrangle_zillow
import model

# Load the feature-engineered data set; the CSV column named 'Unnamed: 0' is used as the index.
zillow = pd.read_csv('zillow_fe.csv', index_col='Unnamed: 0')

Split your data into training and test sets.

In [2]:
# Split each county into train/test, then split the train part again into
# train/validation.
# NOTE(review): 0.8 is presumably the train fraction -- confirm against
# wrangle_zillow.split_my_data.
train_LA, test_LA = wrangle_zillow.split_my_data(zillow[zillow.county == 'Los_Angeles'], 0.8)
train_OC, test_OC = wrangle_zillow.split_my_data(zillow[zillow.county == 'Orange'], 0.8)
train_VC, test_VC = wrangle_zillow.split_my_data(zillow[zillow.county == 'Ventura'], 0.8)

train_LA, valid_LA = wrangle_zillow.split_my_data(train_LA, 0.8)
train_OC, valid_OC = wrangle_zillow.split_my_data(train_OC, 0.8)
train_VC, valid_VC = wrangle_zillow.split_my_data(train_VC, 0.8)

# Pull the target out of every split; the double brackets keep each y as a
# one-column DataFrame rather than a Series.
y_train_LA = train_LA[['logerror']]
y_train_OC = train_OC[['logerror']]
y_train_VC = train_VC[['logerror']]

y_valid_LA = valid_LA[['logerror']]
y_valid_OC = valid_OC[['logerror']]
y_valid_VC = valid_VC[['logerror']]

y_test_LA = test_LA[['logerror']]
y_test_OC = test_OC[['logerror']]
y_test_VC = test_VC[['logerror']]

# Remove the target from the feature frames so it cannot be used as a predictor.
train_LA = train_LA.drop(columns=['logerror'])
train_OC = train_OC.drop(columns=['logerror'])
train_VC = train_VC.drop(columns=['logerror'])

valid_LA = valid_LA.drop(columns=['logerror'])
valid_OC = valid_OC.drop(columns=['logerror'])
valid_VC = valid_VC.drop(columns=['logerror'])

test_LA = test_LA.drop(columns=['logerror'])
test_OC = test_OC.drop(columns=['logerror'])
test_VC = test_VC.drop(columns=['logerror'])

# Scale each county's splits. One scaler is kept per county: previously all three
# calls bound their scaler to the same name `scaler`, so the Los Angeles and
# Orange County scalers were overwritten and lost.
scaler_LA, train_LA, valid_LA, test_LA = wrangle_zillow.min_max_scaler(train_LA, valid_LA, test_LA)
scaler_OC, train_OC, valid_OC, test_OC = wrangle_zillow.min_max_scaler(train_OC, valid_OC, test_OC)
scaler_VC, train_VC, valid_VC, test_VC = wrangle_zillow.min_max_scaler(train_VC, valid_VC, test_VC)

# Backward compatibility: `scaler` still refers to the Ventura scaler, as it did before.
scaler = scaler_VC

## Model Selection

Train at least 3 different models (a model is different if there are changes in one or more of the following: features, hyper-parameters, algorithm). Create object, fit, predict & evaluate. Use mean absolute error or mean squared error to evaluate. Also, try regression algorithms you have not used before.

### Los Angeles County

In [3]:
# Los Angeles training frame: the actual logerror plus a constant baseline
# equal to the mean training logerror.
predictions_LA = pd.DataFrame({'actual': y_train_LA['logerror']})
predictions_LA['baseline'] = y_train_LA['logerror'].mean()

In [4]:
# Column lists for the hand-picked models (shared by all three counties).
MVP = [
    'bathroomcnt',
    'bedroomcnt',
    'finishedsquarefeet12',
]
# Feature-engineered set 1: bathroom count plus cluster-centroid columns.
FE_1 = [
    'bathroomcnt',
    'centroid_finishedsquarefeet12',
    'centroid_buildingqualitytypeid',
    'centroid_roomcnt',
    'centroid_is_extra',
]
# Feature-engineered set 2: bathroom count plus a different group of centroid columns.
FE_2 = [
    'bathroomcnt',
    'centroid_finishedsquarefeet12',
    'centroid_lotsizesquarefeet',
    'centroid_landtaxvaluedollarcnt',
    'centroid_new_zip',
]

In [5]:
# MVP model from before (Los Angeles): degree-3 polynomial regression on the MVP features.

X = train_LA[MVP]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['MVP'] = lm_poly.predict(X_poly)

In [6]:
# Cluster & binned features, set FE_1 (Los Angeles): degree-5 polynomial regression.

X = train_LA[FE_1]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['FE_1'] = lm_poly.predict(X_poly)

In [7]:
# Cluster & binned features, set FE_2 (Los Angeles): degree-5 polynomial regression.

X = train_LA[FE_2]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['FE_2'] = lm_poly.predict(X_poly)

In [8]:
# Use RFE to select the top 2 - 7 features (Los Angeles).
# First drop the columns that are excluded from selection
# (original note: "remove string columns").
X = train_LA.drop(
    [
        'transactiondate', 'median_income', 'county',
        'cluster_fancy', 'cluster_lot', 'census_tractandblock',
        'latitude', 'longitude', 'taxdelinquencyyear', 'taxamount',
        'age_bin', 'sf_bin', 'tax_bin', 'cluster_bins',
    ],
    axis='columns',
)

# One select_rfe call per feature count, 2 through 7 in increasing order; the
# tuple of return values is the cell's displayed result.
tuple(model.select_rfe(X, y_train_LA, n_features) for n_features in range(2, 8))

['tax_rate', 'centroid_buildingqualitytypeid']
['tax_rate', 'centroid_buildingqualitytypeid', 'centroid_roomcnt']
['finishedsquarefeet12', 'tax_rate', 'centroid_buildingqualitytypeid', 'centroid_roomcnt']
['finishedsquarefeet12', 'tax_rate', 'has_garage', 'centroid_buildingqualitytypeid', 'centroid_roomcnt']
['finishedsquarefeet12', 'age', 'tax_rate', 'has_garage', 'centroid_buildingqualitytypeid', 'centroid_roomcnt']
['finishedsquarefeet12', 'taxvaluedollarcnt', 'age', 'tax_rate', 'has_garage', 'centroid_buildingqualitytypeid', 'centroid_roomcnt']


(None, None, None, None, None, None)

In [9]:
# Feature lists copied from the RFE output for Los Angeles (top 2 through top 7).
LA_RFE_2 = [
    'tax_rate',
    'centroid_buildingqualitytypeid',
]
LA_RFE_3 = [
    'tax_rate',
    'centroid_buildingqualitytypeid',
    'centroid_roomcnt',
]
LA_RFE_4 = [
    'finishedsquarefeet12',
    'tax_rate',
    'centroid_buildingqualitytypeid',
    'centroid_roomcnt',
]
LA_RFE_5 = [
    'finishedsquarefeet12',
    'tax_rate',
    'has_garage',
    'centroid_buildingqualitytypeid',
    'centroid_roomcnt',
]
LA_RFE_6 = [
    'finishedsquarefeet12',
    'age',
    'tax_rate',
    'has_garage',
    'centroid_buildingqualitytypeid',
    'centroid_roomcnt',
]
LA_RFE_7 = [
    'finishedsquarefeet12',
    'taxvaluedollarcnt',
    'age',
    'tax_rate',
    'has_garage',
    'centroid_buildingqualitytypeid',
    'centroid_roomcnt',
]

In [10]:
# RFE top-2 features (Los Angeles): degree-2 polynomial regression.
X = train_LA[LA_RFE_2]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['RFE_2'] = lm_poly.predict(X_poly)

In [11]:
# RFE top-3 features (Los Angeles): degree-3 polynomial regression.
X = train_LA[LA_RFE_3]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['RFE_3'] = lm_poly.predict(X_poly)

In [12]:
# RFE top-4 features (Los Angeles): degree-4 polynomial regression.
X = train_LA[LA_RFE_4]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['RFE_4'] = lm_poly.predict(X_poly)

In [13]:
# RFE top-5 features (Los Angeles): degree-5 polynomial regression.
X = train_LA[LA_RFE_5]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['RFE_5'] = lm_poly.predict(X_poly)

In [14]:
# RFE top-6 features (Los Angeles): degree-6 polynomial regression.
X = train_LA[LA_RFE_6]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['RFE_6'] = lm_poly.predict(X_poly)

In [15]:
# RFE top-7 features (Los Angeles): degree-7 polynomial regression.
X = train_LA[LA_RFE_7]

poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

# In-sample predictions on the training split.
predictions_LA['RFE_7'] = lm_poly.predict(X_poly)

In [16]:
# Fitting a model with 8 features takes too long, so I won't model more than 7 features.

In [17]:
# Compute the training RMSE of the baseline and of every model column
# (Los Angeles). The duplicated RMSE_rfe6 assignment has been removed.
(RMSE_bl, RMSE_MVP, RMSE_fe1, RMSE_fe2, RMSE_rfe2,
 RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, RMSE_rfe6, RMSE_rfe7) = (
    model.RMSE(predictions_LA['actual'], predictions_LA[col])
    for col in ('baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7')
)

In [18]:
# Evaluation table for Los Angeles: one row per model with its training RMSE.
LA_eval_df = pd.DataFrame({
    'models': np.array(['baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                        'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7']),
    'model_errors': np.array([RMSE_bl, RMSE_MVP, RMSE_fe1, RMSE_fe2, RMSE_rfe2,
                              RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, RMSE_rfe6, RMSE_rfe7]),
})

LA_eval_df

Unnamed: 0,models,model_errors
0,baseline,0.153
1,MVP,0.153
2,FE_1,0.153
3,FE_2,0.152
4,RFE_2,0.153
5,RFE_3,0.153
6,RFE_4,0.153
7,RFE_5,0.152
8,RFE_6,0.152
9,RFE_7,0.15


### Orange County

In [19]:
# Orange County training frame: the actual logerror plus a constant baseline
# equal to the mean training logerror.
predictions_OC = pd.DataFrame({'actual': y_train_OC['logerror']})
predictions_OC['baseline'] = y_train_OC['logerror'].mean()

In [20]:
# MVP model from before (Orange County): degree-3 polynomial regression on the MVP features.
X = train_OC[MVP]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['MVP'] = lm_poly.predict(X_poly)

In [21]:
# Cluster & binned features, set FE_1 (Orange County): degree-5 polynomial regression.
X = train_OC[FE_1]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['FE_1'] = lm_poly.predict(X_poly)

In [22]:
# Cluster & binned features, set FE_2 (Orange County): degree-5 polynomial regression.
X = train_OC[FE_2]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['FE_2'] = lm_poly.predict(X_poly)

In [23]:
# Use RFE to select the top 2 - 7 features (Orange County).
# First drop the columns that are excluded from selection
# (original note: "remove string columns").
X = train_OC.drop(
    [
        'transactiondate', 'median_income', 'county',
        'cluster_fancy', 'cluster_lot', 'census_tractandblock',
        'latitude', 'longitude', 'taxdelinquencyyear', 'taxamount',
        'age_bin', 'sf_bin', 'tax_bin', 'cluster_bins',
    ],
    axis='columns',
)

# One select_rfe call per feature count, 2 through 7 in increasing order; the
# tuple of return values is the cell's displayed result.
tuple(model.select_rfe(X, y_train_OC, n_features) for n_features in range(2, 8))

['lotsizesquarefeet', 'tax_rate']
['lotsizesquarefeet', 'tax_rate', 'cost_structure_sf']
['lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'tax_rate', 'cost_structure_sf']
['lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'tax_rate', 'cost_structure_sf', 'centroid_buildingqualitytypeid']
['lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'new_zip', 'tax_rate', 'cost_structure_sf', 'centroid_buildingqualitytypeid']
['lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'new_zip', 'tax_rate', 'cost_structure_sf', 'centroid_buildingqualitytypeid', 'centroid_new_zip']


(None, None, None, None, None, None)

In [24]:
# Feature lists copied from the RFE output for Orange County (top 2 through top 7).
OC_RFE_2 = [
    'lotsizesquarefeet',
    'tax_rate',
]
OC_RFE_3 = [
    'lotsizesquarefeet',
    'tax_rate',
    'cost_structure_sf',
]
OC_RFE_4 = [
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'tax_rate',
    'cost_structure_sf',
]
OC_RFE_5 = [
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'tax_rate',
    'cost_structure_sf',
    'centroid_buildingqualitytypeid',
]
OC_RFE_6 = [
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'new_zip',
    'tax_rate',
    'cost_structure_sf',
    'centroid_buildingqualitytypeid',
]
OC_RFE_7 = [
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'new_zip',
    'tax_rate',
    'cost_structure_sf',
    'centroid_buildingqualitytypeid',
    'centroid_new_zip',
]

In [25]:
# RFE top-2 features (Orange County): degree-2 polynomial regression.
X = train_OC[OC_RFE_2]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['RFE_2'] = lm_poly.predict(X_poly)

In [26]:
# RFE top-3 features (Orange County): degree-3 polynomial regression.
X = train_OC[OC_RFE_3]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['RFE_3'] = lm_poly.predict(X_poly)

In [27]:
# RFE top-4 features (Orange County): degree-4 polynomial regression.
X = train_OC[OC_RFE_4]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['RFE_4'] = lm_poly.predict(X_poly)

In [28]:
# RFE top-5 features (Orange County): degree-5 polynomial regression.
X = train_OC[OC_RFE_5]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['RFE_5'] = lm_poly.predict(X_poly)

In [29]:
# RFE top-6 features (Orange County): degree-6 polynomial regression.
X = train_OC[OC_RFE_6]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['RFE_6'] = lm_poly.predict(X_poly)

In [30]:
# RFE top-7 features (Orange County): degree-7 polynomial regression.
X = train_OC[OC_RFE_7]

poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# In-sample predictions on the training split.
predictions_OC['RFE_7'] = lm_poly.predict(X_poly)

In [31]:
# Compute the training RMSE of the baseline and of every model column
# (Orange County). The duplicated RMSE_rfe6 assignment has been removed.
(RMSE_bl, RMSE_MVP, RMSE_fe1, RMSE_fe2, RMSE_rfe2,
 RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, RMSE_rfe6, RMSE_rfe7) = (
    model.RMSE(predictions_OC['actual'], predictions_OC[col])
    for col in ('baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7')
)

In [32]:
# Evaluation table for Orange County: one row per model with its training RMSE.
OC_eval_df = pd.DataFrame({
    'models': np.array(['baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                        'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7']),
    'model_errors': np.array([RMSE_bl, RMSE_MVP, RMSE_fe1, RMSE_fe2, RMSE_rfe2,
                              RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, RMSE_rfe6, RMSE_rfe7]),
})

OC_eval_df

Unnamed: 0,models,model_errors
0,baseline,0.167
1,MVP,0.167
2,FE_1,0.166
3,FE_2,0.166
4,RFE_2,0.167
5,RFE_3,0.167
6,RFE_4,0.166
7,RFE_5,0.162
8,RFE_6,0.151
9,RFE_7,0.125


### Ventura County

In [33]:
# Ventura County training frame: the actual logerror plus a constant baseline
# equal to the mean training logerror.
predictions_VC = pd.DataFrame({'actual': y_train_VC['logerror']})
predictions_VC['baseline'] = y_train_VC['logerror'].mean()

In [34]:
# MVP model from before (Ventura County): degree-3 polynomial regression on the MVP features.
X = train_VC[MVP]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['MVP'] = lm_poly.predict(X_poly)

In [35]:
# Cluster & binned features, set FE_1 (Ventura County): degree-5 polynomial regression.
X = train_VC[FE_1]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['FE_1'] = lm_poly.predict(X_poly)

In [36]:
# Cluster & binned features, set FE_2 (Ventura County): degree-5 polynomial regression.
X = train_VC[FE_2]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['FE_2'] = lm_poly.predict(X_poly)

In [37]:
# Use RFE to select the top 2 - 7 features (Ventura County).
# First drop the columns that are excluded from selection
# (original note: "remove string columns").
X = train_VC.drop(
    [
        'transactiondate', 'median_income', 'county',
        'cluster_fancy', 'cluster_lot', 'census_tractandblock',
        'latitude', 'longitude', 'taxdelinquencyyear', 'taxamount',
        'age_bin', 'sf_bin', 'tax_bin', 'cluster_bins',
    ],
    axis='columns',
)

# One select_rfe call per feature count, 2 through 7 in increasing order; the
# tuple of return values is the cell's displayed result.
tuple(model.select_rfe(X, y_train_VC, n_features) for n_features in range(2, 8))

['lotsizesquarefeet', 'cost_structure_sf']
['lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'cost_structure_sf']
['bathroomcnt', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'cost_structure_sf']
['bathroomcnt', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'tax_rate', 'cost_structure_sf']
['bathroomcnt', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'tax_rate', 'cost_structure_sf', 'centroid_is_extra']
['bathroomcnt', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt', 'tax_rate', 'cost_structure_sf', 'centroid_roomcnt', 'centroid_is_extra']


(None, None, None, None, None, None)

In [38]:
# Feature lists copied from the RFE output for Ventura County (top 2 through top 7).
VC_RFE_2 = [
    'lotsizesquarefeet',
    'cost_structure_sf',
]
VC_RFE_3 = [
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'cost_structure_sf',
]
VC_RFE_4 = [
    'bathroomcnt',
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'cost_structure_sf',
]
VC_RFE_5 = [
    'bathroomcnt',
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'tax_rate',
    'cost_structure_sf',
]
VC_RFE_6 = [
    'bathroomcnt',
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'tax_rate',
    'cost_structure_sf',
    'centroid_is_extra',
]
VC_RFE_7 = [
    'bathroomcnt',
    'lotsizesquarefeet',
    'structuretaxvaluedollarcnt',
    'tax_rate',
    'cost_structure_sf',
    'centroid_roomcnt',
    'centroid_is_extra',
]

In [39]:
# RFE top-2 features (Ventura County): degree-2 polynomial regression.
X = train_VC[VC_RFE_2]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['RFE_2'] = lm_poly.predict(X_poly)

In [40]:
# RFE top-3 features (Ventura County): degree-3 polynomial regression.
X = train_VC[VC_RFE_3]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['RFE_3'] = lm_poly.predict(X_poly)

In [41]:
# RFE top-4 features (Ventura County): degree-4 polynomial regression.
X = train_VC[VC_RFE_4]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['RFE_4'] = lm_poly.predict(X_poly)

In [42]:
# RFE top-5 features (Ventura County): degree-5 polynomial regression.
X = train_VC[VC_RFE_5]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['RFE_5'] = lm_poly.predict(X_poly)

In [43]:
# RFE top-6 features (Ventura County): degree-6 polynomial regression.
X = train_VC[VC_RFE_6]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['RFE_6'] = lm_poly.predict(X_poly)

In [44]:
# RFE top-7 features (Ventura County): degree-7 polynomial regression.
X = train_VC[VC_RFE_7]

poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

# In-sample predictions on the training split.
predictions_VC['RFE_7'] = lm_poly.predict(X_poly)

In [45]:
# Compute the training RMSE of the baseline and of every model column
# (Ventura County). The duplicated RMSE_rfe6 assignment has been removed.
(RMSE_bl, RMSE_MVP, RMSE_fe1, RMSE_fe2, RMSE_rfe2,
 RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, RMSE_rfe6, RMSE_rfe7) = (
    model.RMSE(predictions_VC['actual'], predictions_VC[col])
    for col in ('baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7')
)

In [46]:
# Evaluation table for Ventura County: one row per model with its training RMSE.
VC_eval_df = pd.DataFrame({
    'models': np.array(['baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                        'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7']),
    'model_errors': np.array([RMSE_bl, RMSE_MVP, RMSE_fe1, RMSE_fe2, RMSE_rfe2,
                              RMSE_rfe3, RMSE_rfe4, RMSE_rfe5, RMSE_rfe6, RMSE_rfe7]),
})

VC_eval_df

Unnamed: 0,models,model_errors
0,baseline,0.216
1,MVP,0.214
2,FE_1,0.215
3,FE_2,0.215
4,RFE_2,0.216
5,RFE_3,0.215
6,RFE_4,0.209
7,RFE_5,0.191
8,RFE_6,0.169
9,RFE_7,0.158


### Check performance on the validation set

In [47]:
# Los Angeles validation frame: actual logerror plus the constant baseline.
valid_predictions_LA = pd.DataFrame({
    'actual': y_valid_LA.logerror,
    # The baseline is the *training* mean: using the validation target's own mean
    # would leak the values being predicted and make the baseline look too good.
    'baseline': y_train_LA.logerror.mean()
})

In [48]:
# MVP model from before (Los Angeles): refit on train, predict on validation.
X = train_LA[MVP]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[MVP]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['MVP'] = lm_poly.predict(X_valid_poly)

In [49]:
# Cluster & binned features, set FE_1 (Los Angeles): refit on train, predict on validation.
X = train_LA[FE_1]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[FE_1]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['FE_1'] = lm_poly.predict(X_valid_poly)

In [50]:
# Cluster & binned features, set FE_2 (Los Angeles): refit on train, predict on validation.
X = train_LA[FE_2]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[FE_2]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['FE_2'] = lm_poly.predict(X_valid_poly)

In [51]:
# RFE top-2 features (Los Angeles): refit on train, predict on validation.
X = train_LA[LA_RFE_2]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[LA_RFE_2]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['RFE_2'] = lm_poly.predict(X_valid_poly)

In [52]:
# RFE top-3 features (Los Angeles): refit on train, predict on validation.
X = train_LA[LA_RFE_3]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[LA_RFE_3]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['RFE_3'] = lm_poly.predict(X_valid_poly)

In [53]:
# RFE top-4 features (Los Angeles): refit on train, predict on validation.
X = train_LA[LA_RFE_4]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[LA_RFE_4]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['RFE_4'] = lm_poly.predict(X_valid_poly)

In [54]:
# RFE top-5 features (Los Angeles): refit on train, predict on validation.
X = train_LA[LA_RFE_5]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[LA_RFE_5]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['RFE_5'] = lm_poly.predict(X_valid_poly)

In [55]:
# RFE top-6 features (Los Angeles): refit on train, predict on validation.
X = train_LA[LA_RFE_6]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[LA_RFE_6]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['RFE_6'] = lm_poly.predict(X_valid_poly)

In [56]:
# RFE top-7 features (Los Angeles): refit on train, predict on validation.
X = train_LA[LA_RFE_7]

poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_LA)

X_valid = valid_LA[LA_RFE_7]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_LA['RFE_7'] = lm_poly.predict(X_valid_poly)

In [57]:
# Validation RMSE of the baseline and of every model column (Los Angeles),
# computed in the same order as the original one-per-line assignments.
(RMSE_blv, RMSE_MVPv, RMSE_fe1v, RMSE_fe2v, RMSE_rfe2v,
 RMSE_rfe3v, RMSE_rfe4v, RMSE_rfe5v, RMSE_rfe6v, RMSE_rfe7v) = (
    model.RMSE(valid_predictions_LA['actual'], valid_predictions_LA[col])
    for col in ('baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7')
)

In [58]:
# Add the validation RMSEs as a new column of the Los Angeles evaluation table,
# listed in the order baseline, MVP, FE_1, FE_2, RFE_2 ... RFE_7.
LA_eval_df['valid_errors'] = np.array((
    RMSE_blv, RMSE_MVPv, RMSE_fe1v, RMSE_fe2v, RMSE_rfe2v,
    RMSE_rfe3v, RMSE_rfe4v, RMSE_rfe5v, RMSE_rfe6v, RMSE_rfe7v,
))

LA_eval_df

Unnamed: 0,models,model_errors,valid_errors
0,baseline,0.153,0.16
1,MVP,0.153,0.16
2,FE_1,0.153,0.16
3,FE_2,0.152,0.163
4,RFE_2,0.153,0.16
5,RFE_3,0.153,0.161
6,RFE_4,0.153,0.208
7,RFE_5,0.152,3.336
8,RFE_6,0.152,84.944
9,RFE_7,0.15,80269.39


### Orange County (validation set)

In [59]:
# Orange County validation frame: actual logerror plus the constant baseline.
valid_predictions_OC = pd.DataFrame({
    'actual': y_valid_OC.logerror,
    # The baseline is the *training* mean: using the validation target's own mean
    # would leak the values being predicted and make the baseline look too good.
    'baseline': y_train_OC.logerror.mean()
})

In [60]:
# MVP model from before (Orange County): refit on train, predict on validation.
X = train_OC[MVP]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

# Use the shared MVP list (the same three columns that were hard-coded here) so
# the training and validation features cannot drift apart.
X_valid = valid_OC[MVP]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['MVP'] = lm_poly.predict(X_valid_poly)

In [61]:
# Cluster & binned features, set FE_1 (Orange County): refit on train, predict on validation.
X = train_OC[FE_1]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[FE_1]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['FE_1'] = lm_poly.predict(X_valid_poly)

In [62]:
# Cluster & binned features, set FE_2 (Orange County): refit on train, predict on validation.
X = train_OC[FE_2]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[FE_2]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['FE_2'] = lm_poly.predict(X_valid_poly)

In [63]:
# RFE top-2 features (Orange County): refit on train, predict on validation.
X = train_OC[OC_RFE_2]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[OC_RFE_2]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['RFE_2'] = lm_poly.predict(X_valid_poly)

In [64]:
# RFE top-3 features (Orange County): refit on train, predict on validation.
X = train_OC[OC_RFE_3]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[OC_RFE_3]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['RFE_3'] = lm_poly.predict(X_valid_poly)

In [65]:
# RFE top-4 features (Orange County): refit on train, predict on validation.
X = train_OC[OC_RFE_4]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[OC_RFE_4]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['RFE_4'] = lm_poly.predict(X_valid_poly)

In [66]:
# RFE top-5 features (Orange County): refit on train, predict on validation.
X = train_OC[OC_RFE_5]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[OC_RFE_5]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['RFE_5'] = lm_poly.predict(X_valid_poly)

In [67]:
# RFE top-6 features (Orange County): refit on train, predict on validation.
X = train_OC[OC_RFE_6]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[OC_RFE_6]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['RFE_6'] = lm_poly.predict(X_valid_poly)

In [68]:
# RFE top-7 features (Orange County): refit on train, predict on validation.
X = train_OC[OC_RFE_7]

poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_OC)

X_valid = valid_OC[OC_RFE_7]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_OC['RFE_7'] = lm_poly.predict(X_valid_poly)

In [69]:
# Validation RMSE of the baseline and of every model column (Orange County),
# computed in the same order as the original one-per-line assignments.
(RMSE_blv, RMSE_MVPv, RMSE_fe1v, RMSE_fe2v, RMSE_rfe2v,
 RMSE_rfe3v, RMSE_rfe4v, RMSE_rfe5v, RMSE_rfe6v, RMSE_rfe7v) = (
    model.RMSE(valid_predictions_OC['actual'], valid_predictions_OC[col])
    for col in ('baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7')
)

In [70]:
# Add the validation RMSEs as a new column of the Orange County evaluation table,
# listed in the order baseline, MVP, FE_1, FE_2, RFE_2 ... RFE_7.
OC_eval_df['valid_errors'] = np.array((
    RMSE_blv, RMSE_MVPv, RMSE_fe1v, RMSE_fe2v, RMSE_rfe2v,
    RMSE_rfe3v, RMSE_rfe4v, RMSE_rfe5v, RMSE_rfe6v, RMSE_rfe7v,
))

OC_eval_df

Unnamed: 0,models,model_errors,valid_errors
0,baseline,0.167,0.182
1,MVP,0.167,0.182
2,FE_1,0.166,69437359993.772
3,FE_2,0.166,0.182
4,RFE_2,0.167,0.185
5,RFE_3,0.167,0.204
6,RFE_4,0.166,1.273
7,RFE_5,0.162,26.542
8,RFE_6,0.151,32426461.598
9,RFE_7,0.125,159947.991


### Ventura County (validation set)

In [71]:
# Ventura County validation frame: actual logerror plus the constant baseline.
valid_predictions_VC = pd.DataFrame({
    'actual': y_valid_VC.logerror,
    # The baseline is the *training* mean: using the validation target's own mean
    # would leak the values being predicted and make the baseline look too good.
    'baseline': y_train_VC.logerror.mean()
})

In [72]:
# MVP model from before (Ventura County): refit on train, predict on validation.
X = train_VC[MVP]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[MVP]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['MVP'] = lm_poly.predict(X_valid_poly)

In [73]:
# Cluster & binned features, set FE_1 (Ventura County): refit on train, predict on validation.
X = train_VC[FE_1]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[FE_1]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['FE_1'] = lm_poly.predict(X_valid_poly)

In [74]:
# Cluster & binned features, set FE_2 (Ventura County): refit on train, predict on validation.
X = train_VC[FE_2]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[FE_2]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['FE_2'] = lm_poly.predict(X_valid_poly)

In [75]:
# RFE top-2 features (Ventura County): refit on train, predict on validation.
X = train_VC[VC_RFE_2]

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[VC_RFE_2]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['RFE_2'] = lm_poly.predict(X_valid_poly)

In [76]:
# RFE top-3 features (Ventura County): refit on train, predict on validation.
X = train_VC[VC_RFE_3]

poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[VC_RFE_3]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['RFE_3'] = lm_poly.predict(X_valid_poly)

In [77]:
# RFE top-4 features (Ventura County): refit on train, predict on validation.
X = train_VC[VC_RFE_4]

poly = PolynomialFeatures(degree=4)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[VC_RFE_4]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['RFE_4'] = lm_poly.predict(X_valid_poly)

In [78]:
# RFE top-5 features (Ventura County): refit on train, predict on validation.
X = train_VC[VC_RFE_5]

poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[VC_RFE_5]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['RFE_5'] = lm_poly.predict(X_valid_poly)

In [79]:
# RFE top-6 features (Ventura County): refit on train, predict on validation.
X = train_VC[VC_RFE_6]

poly = PolynomialFeatures(degree=6)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[VC_RFE_6]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['RFE_6'] = lm_poly.predict(X_valid_poly)

In [80]:
# RFE top-7 features (Ventura County): refit on train, predict on validation.
X = train_VC[VC_RFE_7]

poly = PolynomialFeatures(degree=7)
X_poly = poly.fit_transform(X)
# normalize=False was the default; the keyword was removed in scikit-learn 1.2.
lm_poly = LinearRegression().fit(X_poly, y_train_VC)

X_valid = valid_VC[VC_RFE_7]

# Apply the expansion fitted on the training data; do not refit on validation.
X_valid_poly = poly.transform(X_valid)
valid_predictions_VC['RFE_7'] = lm_poly.predict(X_valid_poly)

In [81]:
# Validation RMSE of the baseline and of every model column (Ventura County),
# computed in the same order as the original one-per-line assignments.
(RMSE_blv, RMSE_MVPv, RMSE_fe1v, RMSE_fe2v, RMSE_rfe2v,
 RMSE_rfe3v, RMSE_rfe4v, RMSE_rfe5v, RMSE_rfe6v, RMSE_rfe7v) = (
    model.RMSE(valid_predictions_VC['actual'], valid_predictions_VC[col])
    for col in ('baseline', 'MVP', 'FE_1', 'FE_2', 'RFE_2',
                'RFE_3', 'RFE_4', 'RFE_5', 'RFE_6', 'RFE_7')
)

In [82]:
# Add the validation RMSEs as a new column of the Ventura County evaluation table,
# listed in the order baseline, MVP, FE_1, FE_2, RFE_2 ... RFE_7.
VC_eval_df['valid_errors'] = np.array((
    RMSE_blv, RMSE_MVPv, RMSE_fe1v, RMSE_fe2v, RMSE_rfe2v,
    RMSE_rfe3v, RMSE_rfe4v, RMSE_rfe5v, RMSE_rfe6v, RMSE_rfe7v,
))

VC_eval_df

Unnamed: 0,models,model_errors,valid_errors
0,baseline,0.216,0.174
1,MVP,0.214,0.183
2,FE_1,0.215,0.174
3,FE_2,0.215,70697253775.255
4,RFE_2,0.216,0.176
5,RFE_3,0.215,0.223
6,RFE_4,0.209,0.995
7,RFE_5,0.191,145.558
8,RFE_6,0.169,43451934.333
9,RFE_7,0.158,557449191.839


Evaluate your best model on your test data set to get an idea of your model's out of sample error.