# Model 01: Vanilla

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV

## Final Data Clean

In [2]:
df_train = pd.read_csv('../datasets/train_ready.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_area,street,regular_lot,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_floors,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,fancy_masonry,mas_vnr_area,exter_exc,exter_cond,conc_found,bsmt_exc,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_exc,central_air,stand_elec,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_exc,totrms_abvgrd,functional,fireplaces,garage_connect,garage_cars,garage_area,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,fence,misc_val,mo_sold,yr_sold,sale_type,saleprice,sell_diff,remod_diff,remod_x_sell,salepricelog,over_exc,qual_sqaure,exc_x_qual
0,2,153,535304180,20,RL,7922,Pave,1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1.0,5,7,1953,2007,Gable,CompShg,VinylSd,VinylSd,0,0.0,0,Gd,0,0,TA,No,GLQ,731.0,Unf,0.0,326.0,1057.0,GasA,0,1,1,1057,0,0,1057,1.0,0.0,1,0,3,1,1,5,Typ,0,0,1.0,246.0,1,0,52,0,0,0,No,0,1,2010,WD,109000,57,3,171,11.599103,1,25,25
1,6,2827,908186070,180,RM,3675,Pave,1,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1.5,6,5,2005,2006,Gable,CompShg,VinylSd,VinylSd,1,82.0,0,TA,1,1,TA,Gd,GLQ,547.0,Unf,0.0,0.0,547.0,GasA,1,1,1,1072,0,0,1072,1.0,0.0,2,0,2,1,0,5,Typ,0,0,2.0,525.0,1,0,44,0,0,0,No,0,6,2006,New,140000,1,0,0,11.849398,4,36,144
2,10,1044,527451290,160,RM,1680,Pave,1,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2.0,6,5,1971,1971,Gable,CompShg,HdBoard,HdBoard,1,232.0,0,TA,0,0,TA,No,ALQ,387.0,Unf,0.0,96.0,483.0,GasA,0,1,1,483,504,0,987,0.0,0.0,1,1,2,1,0,4,Typ,0,0,1.0,264.0,1,0,0,0,0,0,No,0,7,2008,COD,85400,37,37,1369,11.355101,0,36,0
3,11,2752,906380150,20,RL,7488,Pave,0,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,1.0,7,5,2005,2005,Gable,CompShg,VinylSd,VinylSd,0,0.0,1,TA,1,1,TA,Av,GLQ,393.0,Unf,0.0,815.0,1208.0,GasA,1,1,1,1208,0,0,1208,0.0,0.0,2,0,2,1,1,6,Typ,0,1,2.0,632.0,1,105,58,0,0,0,No,0,2,2006,WD,183600,1,1,1,12.120515,16,49,784
4,13,1177,533236070,160,FV,2645,Pave,1,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,Twnhs,2.0,8,5,1999,2000,Gable,CompShg,MetalSd,MetalSd,1,456.0,1,TA,1,1,TA,No,GLQ,813.0,Unf,0.0,147.0,960.0,GasA,1,1,1,962,645,0,1607,1.0,0.0,2,1,3,1,1,7,Typ,0,0,2.0,480.0,1,169,0,0,0,0,No,0,12,2008,ConLD,200000,9,8,72,12.206073,16,64,1024


In [5]:
df_train.drop(columns = ['Unnamed: 0'], inplace = True)

## Linear Regression

For this model, I will only be using data from columns that were ready to be used immediately from the original dataset. Only using data with a 0.5 correlation or higher.

In [6]:
features = ['overall_qual', 
                 'gr_liv_area', 
                 'garage_area', 
                 'garage_cars', 
                 'total_bsmt_sf', 
                 '1st_flr_sf', 
                 'year_built', 
                 'year_remod/add', 
                 'full_bath', 
                 'mas_vnr_area', 
                 'totrms_abvgrd'
                ]

X = df_train[features]

y = df_train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = .8)

In [8]:
lr = LinearRegression()

lr.fit(X_train, y_train)

predicts = lr.predict(X)

In [9]:
print(f"OLS Training R-Squared: {lr.score(X_train, y_train)}")
print(f"OLS Testing R-Squared: {lr.score(X_test, y_test)}")
print(f"OLS CV Training R-Squared: {cross_val_score(lr, X_train, y_train).mean()}")
print(f"OLS CV Testing R-Squared: {cross_val_score(lr, X_test, y_test).mean()}")
print(f"OLS Testing MSE: {mean_squared_error(y, predicts)}")
print(f"OLS Testing RMSE: {mean_squared_error(y, predicts, squared=False)}")

OLS Training R-Squared: 0.8242134886086818
OLS Testing R-Squared: 0.7879110776414348
OLS CV Training R-Squared: 0.8091403213116497
OLS CV Testing R-Squared: 0.7137833391150594
OLS Testing MSE: 453126215.5272161
OLS Testing RMSE: 21286.7615086752


In [10]:
list(zip(X_train.columns, lr.coef_))

[('overall_qual', 10889.413403980132),
 ('gr_liv_area', 41.59372449668531),
 ('garage_area', 52.37362352899845),
 ('garage_cars', -7130.999988680781),
 ('total_bsmt_sf', 25.688615452269396),
 ('1st_flr_sf', 31.095971568645744),
 ('year_built', 170.35352086524358),
 ('year_remod/add', 332.8711135118789),
 ('full_bath', 610.6640860285343),
 ('mas_vnr_area', 23.742522790750392),
 ('totrms_abvgrd', -2128.255637154037)]

## Ridge

In [11]:
sc = StandardScaler()

Z_train = sc.fit_transform(X_train)

Z_test = sc.transform(X_test)

In [12]:
r_alphas = np.logspace(0, 5, 100)

ridge_cv = RidgeCV(alphas = r_alphas, scoring = "r2", cv = 5)

ridge_cv.fit(Z_train, y_train);

# Taken from 3.07

In [13]:
ridge = Ridge()

ridge.fit(X_train, y_train)

predicts = ridge.predict(X_test)

In [14]:
print(f"Ridge CV Training R-Squared: {ridge_cv.score(Z_train, y_train)}")
print(f"Ridge CV Testing R-Squared: {ridge_cv.score(Z_test, y_test)}")
print(f"Ridge Testing MSE: {mean_squared_error(y_test, predicts)}")
print(f"Ridge Testing RMSE: {mean_squared_error(y_test, predicts, squared=False)}")

Ridge CV Training R-Squared: 0.8239128666377428
Ridge CV Testing R-Squared: 0.7872763618236251
Ridge Testing MSE: 546945723.3796468
Ridge Testing RMSE: 23386.87074791424


In [15]:
list(zip(X_train.columns, ridge.coef_))

[('overall_qual', 10854.784893889275),
 ('gr_liv_area', 41.61379739865114),
 ('garage_area', 51.86774978483421),
 ('garage_cars', -6947.762553855798),
 ('total_bsmt_sf', 25.716749317440353),
 ('1st_flr_sf', 31.097001044053922),
 ('year_built', 170.04944175778007),
 ('year_remod/add', 333.19145577877225),
 ('full_bath', 593.8917417056684),
 ('mas_vnr_area', 23.770143007221865),
 ('totrms_abvgrd', -2130.222869110631)]

## LASSO

In [16]:
l_alphas = np.logspace(-3, 0, 100)

lasso_cv = LassoCV(alphas = l_alphas, cv = 5, n_jobs= -1, max_iter = 50_000)

lasso_cv.fit(X_train, y_train);

# Taken from 3.07

In [17]:
lasso = Lasso()

lasso.fit(X_train, y_train)

predicts = lasso_cv.predict(X_test)

In [18]:
print(f"LASSO CV Training R-Squared: {lasso_cv.score(X_train, y_train)} ")
print(f"LASSO CV Testing R-Squared: {lasso_cv.score(X_test, y_test)} ")
print(f"LASSO Testing MSE: {mean_squared_error(y_test, predicts)} ")
print(f"LASSO Testing RMSE: {mean_squared_error(y_test,predicts,squared=False)} ")

LASSO CV Training R-Squared: 0.8242134787742492 
LASSO CV Testing R-Squared: 0.7879209374478212 
LASSO Testing MSE: 547226316.6536787 
LASSO Testing RMSE: 23392.868927382093 


In [19]:
list(zip(X_train.columns, lasso.coef_))

[('overall_qual', 10888.13840718997),
 ('gr_liv_area', 41.596206515940736),
 ('garage_area', 52.33421110287903),
 ('garage_cars', -7117.580311045099),
 ('total_bsmt_sf', 25.689249398482776),
 ('1st_flr_sf', 31.096304849260846),
 ('year_built', 170.37186028815003),
 ('year_remod/add', 332.90221773149057),
 ('full_bath', 602.36066828333),
 ('mas_vnr_area', 23.743107632664618),
 ('totrms_abvgrd', -2127.2124210673346)]