# Model 03: Optimal

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV

## Final Data Clean

In [2]:
df_train = pd.read_csv('../datasets/train_ready.csv')
neighborhood = pd.read_csv('../datasets/train_neighbor_dummy.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_area,street,regular_lot,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_floors,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,fancy_masonry,mas_vnr_area,exter_exc,exter_cond,conc_found,bsmt_exc,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_exc,central_air,stand_elec,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_exc,totrms_abvgrd,functional,fireplaces,garage_connect,garage_cars,garage_area,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,fence,misc_val,mo_sold,yr_sold,sale_type,saleprice,sell_diff,remod_diff,remod_x_sell,salepricelog,over_exc,qual_sqaure,exc_x_qual
0,2,153,535304180,20,RL,7922,Pave,1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1.0,5,7,1953,2007,Gable,CompShg,VinylSd,VinylSd,0,0.0,0,Gd,0,0,TA,No,GLQ,731.0,Unf,0.0,326.0,1057.0,GasA,0,1,1,1057,0,0,1057,1.0,0.0,1,0,3,1,1,5,Typ,0,0,1.0,246.0,1,0,52,0,0,0,No,0,1,2010,WD,109000,57,3,171,11.599103,1,25,25
1,6,2827,908186070,180,RM,3675,Pave,1,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1.5,6,5,2005,2006,Gable,CompShg,VinylSd,VinylSd,1,82.0,0,TA,1,1,TA,Gd,GLQ,547.0,Unf,0.0,0.0,547.0,GasA,1,1,1,1072,0,0,1072,1.0,0.0,2,0,2,1,0,5,Typ,0,0,2.0,525.0,1,0,44,0,0,0,No,0,6,2006,New,140000,1,0,0,11.849398,4,36,144
2,10,1044,527451290,160,RM,1680,Pave,1,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2.0,6,5,1971,1971,Gable,CompShg,HdBoard,HdBoard,1,232.0,0,TA,0,0,TA,No,ALQ,387.0,Unf,0.0,96.0,483.0,GasA,0,1,1,483,504,0,987,0.0,0.0,1,1,2,1,0,4,Typ,0,0,1.0,264.0,1,0,0,0,0,0,No,0,7,2008,COD,85400,37,37,1369,11.355101,0,36,0
3,11,2752,906380150,20,RL,7488,Pave,0,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,1.0,7,5,2005,2005,Gable,CompShg,VinylSd,VinylSd,0,0.0,1,TA,1,1,TA,Av,GLQ,393.0,Unf,0.0,815.0,1208.0,GasA,1,1,1,1208,0,0,1208,0.0,0.0,2,0,2,1,1,6,Typ,0,1,2.0,632.0,1,105,58,0,0,0,No,0,2,2006,WD,183600,1,1,1,12.120515,16,49,784
4,13,1177,533236070,160,FV,2645,Pave,1,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,Twnhs,2.0,8,5,1999,2000,Gable,CompShg,MetalSd,MetalSd,1,456.0,1,TA,1,1,TA,No,GLQ,813.0,Unf,0.0,147.0,960.0,GasA,1,1,1,962,645,0,1607,1.0,0.0,2,1,3,1,1,7,Typ,0,0,2.0,480.0,1,169,0,0,0,0,No,0,12,2008,ConLD,200000,9,8,72,12.206073,16,64,1024


In [5]:
df_train.drop(columns = ['Unnamed: 0'], inplace = True)
neighborhood.drop(columns = ['Unnamed: 0'], inplace = True)

In [6]:
df_train = pd.concat([df_train, neighborhood],axis=1)

## Linear Regression

For this model, I took all features from the previous model and turned them on and off and ran models until I got what I thought was the best score. Below is that model.

In [7]:
features = [#'overall_qual', 
            'gr_liv_area', 
            'garage_area', 
            #'garage_cars', 
            'total_bsmt_sf', 
            '1st_flr_sf', 
            #'year_built', 
            #'year_remod/add', 
            #'full_bath', 
            #'mas_vnr_area', 
            'totrms_abvgrd',
            
            #'sell_diff',
            'remod_diff',
            'garage_connect',
            'conc_found',
            'bsmt_exc',
            'kitchen_exc',
            'exter_exc',
            'over_exc',
            'exc_x_qual',
                        
            'n_Blueste',
            'n_BrDale',
            'n_BrkSide',
            'n_ClearCr',
            'n_CollgCr',
            'n_Crawfor',
            'n_Edwards',
            'n_Gilbert',
            'n_Greens',
            'n_IDOTRR',
            'n_Landmrk',
            'n_MeadowV',
            'n_Mitchel',
            'n_NAmes',
            'n_NPkVill',
            'n_NridgHt',
            'n_OldTown',
            'n_SWISU',
            'n_Sawyer',
            'n_SawyerW',
            'n_Somerst',
            'n_StoneBr',
            'n_Timber',
           ]

X = df_train[features]

y = df_train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = .8)

In [8]:
poly = PolynomialFeatures(include_bias = False)

X_overfit = poly.fit_transform(X)

# Taken from 3.07

In [9]:
lr = LinearRegression()

lr.fit(X_train, y_train)

predicts = lr.predict(X)

In [10]:
print(f"OLS Training R-Squared: {lr.score(X_train, y_train)}")
print(f"OLS Testing R-Squared: {lr.score(X_test, y_test)}")
print(f"OLS CV Training R-Squared: {cross_val_score(lr, X_train, y_train).mean()}")
print(f"OLS CV Testing R-Squared: {cross_val_score(lr, X_test, y_test).mean()}")
print(f"OLS Testing MSE: {mean_squared_error(y, predicts)}")
print(f"OLS Testing RMSE: {mean_squared_error(y, predicts, squared=False)}")

OLS Training R-Squared: 0.8730929135841867
OLS Testing R-Squared: 0.8344280717111642
OLS CV Training R-Squared: 0.8443345534082644
OLS CV Testing R-Squared: 0.685863329433106
OLS Testing MSE: 333578729.92908514
OLS Testing RMSE: 18264.137809628057


In [11]:
list(zip(X_train.columns, lr.coef_))

[('overall_qual', 4901.73194382301),
 ('gr_liv_area', 37.59405767484104),
 ('garage_area', 50.641324979279396),
 ('garage_cars', -8489.37376769896),
 ('total_bsmt_sf', 18.277777877835504),
 ('1st_flr_sf', 24.2253817584535),
 ('year_built', -275.31932996665336),
 ('year_remod/add', -229.3283196684442),
 ('full_bath', -2725.0383505634513),
 ('mas_vnr_area', 31.92744493982675),
 ('totrms_abvgrd', -773.0091136664859),
 ('sell_diff', -435.53149386020164),
 ('remod_diff', -481.52250402946316),
 ('garage_connect', 6015.065013662715),
 ('conc_found', 1188.9623426007563),
 ('bsmt_exc', 690.5719981949663),
 ('kitchen_exc', 8111.255143738437),
 ('exter_exc', -950.9182566785596),
 ('over_exc', -2572.4170190424998),
 ('exc_x_qual', 60.79488799343305),
 ('n_Blueste', 3778.01114642296),
 ('n_BrDale', -17031.97963343932),
 ('n_BrkSide', 5703.380279830907),
 ('n_ClearCr', -12267.05148470151),
 ('n_CollgCr', -294.8882129629669),
 ('n_Crawfor', 27413.45215376736),
 ('n_Edwards', -8352.992434507227),
 ('n

## Ridge

In [12]:
sc = StandardScaler()

Z_train = sc.fit_transform(X_train)

Z_test = sc.transform(X_test)

In [13]:
r_alphas = np.logspace(0, 5, 100)

ridge_cv = RidgeCV(alphas = r_alphas, scoring = "r2", cv = 5)

ridge_cv.fit(Z_train, y_train);

# Taken from 3.07

In [14]:
ridge = Ridge()

ridge.fit(X_train, y_train)

predicts = ridge.predict(X_test)

In [15]:
print(f"Ridge CV Training R-Squared: {ridge_cv.score(Z_train, y_train)}")
print(f"Ridge CV Testing R-Squared: {ridge_cv.score(Z_test, y_test)}")
print(f"Ridge Testing MSE: {mean_squared_error(y_test, predicts)}")
print(f"Ridge Testing RMSE: {mean_squared_error(y_test, predicts, squared=False)}")

Ridge CV Training R-Squared: 0.8728912164596317
Ridge CV Testing R-Squared: 0.8341720589996872
Ridge Testing MSE: 426109285.2672245
Ridge Testing RMSE: 20642.414715028484


In [16]:
list(zip(X_train.columns, ridge.coef_))

[('overall_qual', 4919.315469206787),
 ('gr_liv_area', 37.83309769213807),
 ('garage_area', 49.380274951879166),
 ('garage_cars', -7922.037098881096),
 ('total_bsmt_sf', 18.69137041141268),
 ('1st_flr_sf', 24.04266329892123),
 ('year_built', -271.54162845382655),
 ('year_remod/add', -219.48708932731583),
 ('full_bath', -2378.2804479391853),
 ('mas_vnr_area', 31.117062595311992),
 ('totrms_abvgrd', -869.8717020849059),
 ('sell_diff', -424.49820327518154),
 ('remod_diff', -476.55274243273624),
 ('garage_connect', 5912.44110591288),
 ('conc_found', 1250.2638158849013),
 ('bsmt_exc', 564.8926801279462),
 ('kitchen_exc', 7607.790211882988),
 ('exter_exc', -1133.453544093916),
 ('over_exc', -2543.32732001127),
 ('exc_x_qual', 61.73264019440009),
 ('n_Blueste', 2610.958340077737),
 ('n_BrDale', -14899.884952262144),
 ('n_BrkSide', 6034.0074939954975),
 ('n_ClearCr', -7608.395828332358),
 ('n_CollgCr', -40.658655299577575),
 ('n_Crawfor', 26173.496985653495),
 ('n_Edwards', -7607.657401204826)

## LASSO

In [28]:
l_alphas = np.logspace(-3, 0, 100)

lasso_cv = LassoCV(alphas = l_alphas, cv = 5, n_jobs= -1, max_iter = 1_500_000)

lasso_cv.fit(X_train, y_train);

# Taken from 3.07

In [29]:
lasso = Lasso()

lasso.fit(X_train, y_train)

predicts = lasso_cv.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


**NOTE:** Returning convergence error no matter how much I increase "max_iter". However, returning R2, squared error, and coefficent results in par with OLS and Ridge results. Also, never returning scores better than OLS or Ridge so other models will be considered instead 

In [19]:
print(f"LASSO CV Training R-Squared: {lasso_cv.score(X_train, y_train)} ")
print(f"LASSO CV Testing R-Squared: {lasso_cv.score(X_test, y_test)} ")
print(f"LASSO Testing MSE: {mean_squared_error(y_test, predicts)} ")
print(f"LASSO Testing RMSE: {mean_squared_error(y_test,predicts,squared=False)} ")

LASSO CV Training R-Squared: 0.8730919612634293 
LASSO CV Testing R-Squared: 0.8345698031142142 
LASSO Testing MSE: 426858531.9818095 
LASSO Testing RMSE: 20660.554977584932 


In [20]:
list(zip(X_train.columns, lasso.coef_))

[('overall_qual', 4907.327166698356),
 ('gr_liv_area', 37.609830017366825),
 ('garage_area', 50.58870625513754),
 ('garage_cars', -8467.165371264151),
 ('total_bsmt_sf', 18.32045875726241),
 ('1st_flr_sf', 24.173280466649846),
 ('year_built', -278.08599366208193),
 ('year_remod/add', -195.33590991823135),
 ('full_bath', -2700.105697574727),
 ('mas_vnr_area', 31.909635930570648),
 ('totrms_abvgrd', -781.3587587150586),
 ('sell_diff', -438.5966960968154),
 ('remod_diff', -447.7908778338086),
 ('garage_connect', 5998.524776119254),
 ('conc_found', 1179.090091273253),
 ('bsmt_exc', 677.6504972676255),
 ('kitchen_exc', 8074.385594320976),
 ('exter_exc', -948.5015513925929),
 ('over_exc', -2573.6591578498446),
 ('exc_x_qual', 60.86638009284193),
 ('n_Blueste', 3416.1860186135236),
 ('n_BrDale', -17079.659449193012),
 ('n_BrkSide', 5638.072192767282),
 ('n_ClearCr', -12010.616985642564),
 ('n_CollgCr', -341.5021567036506),
 ('n_Crawfor', 27325.764711022548),
 ('n_Edwards', -8385.761577844029)