# Ames Housing Data: Modelling
---

## I. Model Building and Testing
---

In [161]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import missingno as msno

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn import metrics 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder #, PolynomialFeatures
# from sklearn.compose import ColumnTransformer
# from sklearn.neighbors import KNeighborsClassifier

import datetime

In [94]:
# Load the train and test CSVs from the ../datasets/Clean directory.
# `hs` is the frame the models below are fit on; `hs_test` is the frame they
# predict on when writing submission files.
hs = pd.read_csv('../datasets/Clean/train.csv')
hs_test = pd.read_csv('../datasets/Clean/test.csv')

In [95]:
# Show every column name of the training frame as a plain Python list.
list(hs.columns)

['Id',
 'ms_subclass',
 'ms_zoning',
 'lot_frontage',
 'lot_area',
 'street',
 'lot_shape',
 'land_contour',
 'utilities',
 'lot_config',
 'land_slope',
 'neighborhood',
 'condition_1',
 'condition_2',
 'bldg_type',
 'house_style',
 'overall_qual',
 'overall_cond',
 'year_built',
 'year_remod',
 'roof_style',
 'roof_matl',
 'exterior_1st',
 'exterior_2nd',
 'mas_vnr_type',
 'mas_vnr_area',
 'exter_qual',
 'exter_cond',
 'foundation',
 'bsmt_qual',
 'bsmt_cond',
 'bsmt_exposure',
 'bsmtfin_type_1',
 'bsmtfin_sf_1',
 'bsmtfin_type_2',
 'bsmtfin_sf_2',
 'bsmt_unf_sf',
 'total_bsmt_sf',
 'heating',
 'heating_qc',
 'central_air',
 'electrical',
 '1st_flr_sf',
 '2nd_flr_sf',
 'low_qual_fin_sf',
 'gr_liv_area',
 'bsmt_full_bath',
 'bsmt_half_bath',
 'full_bath',
 'half_bath',
 'bedroom_abvgr',
 'kitchen_abvgr',
 'kitchen_qual',
 'totrms_abvgrd',
 'functional',
 'fireplaces',
 'fireplace_qu',
 'garage_type',
 'garage_yr_blt',
 'garage_finish',
 'garage_cars',
 'garage_area',
 'garage_qual',
 '

In [156]:
# UPDATE FEATURES FOR TESTING HERE
# One feature per line so that adding or removing a predictor is a one-line diff.
feats_updated = [
    'overall_qual',
    'year_built',
    'year_remod',
    'total_bsmt_sf',
    'gr_liv_area',
    'full_bath',
    'fireplaces',
    'age',
    'garage_area',
    'kitchen_qual_Fa',
    'kitchen_qual_Gd',
    'kitchen_qual_TA',
    'was_remod',
    'bsmt_cat_finished',
    'bsmt_cat_unfinished',
    'grg_qual_num',
    'garage_cat_finished',
    'garage_cat_unfinished',
    'cond12_feeder_st',
    'cond12_near_park',
    'cond12_near_rr',
    'cond12_norm',
    'lotconfig_culdsac',
    'lotconfig_inside',
    'hi_bsmt_exposure',
    'nbr_rank',
]

### Modelling Sale Price (Not Log Transformed)

In [159]:
def mod_iteration(feats):
    """Fit an ordinary least-squares model on a train/test split of `hs`.

    Parameters
    ----------
    feats : list of str
        Column names of the module-level frame `hs` to use as predictors.

    Returns
    -------
    str
        One-line summary with the training R2, the testing R2, and the MSE
        and RMSE of the predictions on the held-out split (in the units of
        `SalePrice`).

    Side effects
    ------------
    Prints every fitted coefficient (one per feature) and the intercept.
    """
    # Fit regression to X_train and y_train (75% of training.csv)
    X = hs[feats]
    y = hs['SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 531)
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Predict SalePrice for the 25% held out of train.csv and compare to truth
    y_preds = lr.predict(X_test)
    MSE = metrics.mean_squared_error(y_test, y_preds)
    # RMSE is derived from MSE instead of passing `squared=False`: that keyword
    # was deprecated in scikit-learn 1.4 and later removed, so np.sqrt keeps
    # the cell runnable on every version and avoids a second pass over the data.
    RMSE = np.sqrt(MSE)

    for i, coef in zip(X.columns, lr.coef_):
        print(f"{i}: {coef}")
    print(f"intercept: {lr.intercept_}")

    return f"Training R2: {lr.score(X_train, y_train)}, Testing R2: {lr.score(X_test, y_test)}, MSE: {MSE}, RMSE: {RMSE}"

mod_iteration(feats_updated)

overall_qual: 11968.811813500542
year_built: -350.97358679281257
year_remod: 178.57242205684673
total_bsmt_sf: 33.89030292727547
gr_liv_area: 52.7116961463897
full_bath: -8242.235192230459
fireplaces: 6724.39613179342
age: -515.689687872751
garage_area: 43.26384218895882
kitchen_qual_Fa: -54131.27796334144
kitchen_qual_Gd: -52487.16789398767
kitchen_qual_TA: -57266.73408818462
was_remod: 8620.634674153202
bsmt_cat_finished: -21633.15879992531
bsmt_cat_unfinished: -31122.77411498547
grg_qual_num: 9514.049344319392
garage_cat_finished: -34260.88822052657
garage_cat_unfinished: -34943.03250356009
cond12_feeder_st: 8091.490014363354
cond12_near_park: 28535.396383037347
cond12_near_rr: 10555.166512402857
cond12_norm: 12994.632119788877
lotconfig_culdsac: 10451.268224975585
lotconfig_inside: 1023.6598948232844
hi_bsmt_exposure: 10867.301638715573
nbr_rank: 1549.66139866835
intercept: 379064.43169823266


'Training R2: 0.8801071212533926, Testing R2: 0.8710146049538188, MSE: 764756771.8888963, RMSE: 27654.23605686652'

In [160]:
def mod_runon_all(feats):
    """Fit OLS on all of `hs`, report in-sample metrics and write a submission CSV.

    Parameters
    ----------
    feats : list of str
        Column names present in both `hs` and `hs_test` to use as predictors.

    Returns
    -------
    str
        One-line summary with the in-sample R2, MSE and RMSE (units of
        `SalePrice`).

    Side effects
    ------------
    * Overwrites/creates the `SalePrice` column of the module-level `hs_test`
      with the model's predictions.
    * Writes `../datasets/Submissions/Features_Submission-<timestamp>.csv`
      with `Id` as the index column.
    * Prints every fitted coefficient and the intercept.
    """
    # Fit regression to entire data
    X = hs[feats]
    y = hs['SalePrice']
    lr_all = LinearRegression()
    lr_all.fit(X, y)

    # Predict SalePrice for entire data and compare to truth to get residuals
    y_preds_all = lr_all.predict(X)
    MSE = metrics.mean_squared_error(y, y_preds_all)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and later removed.
    RMSE = np.sqrt(MSE)

    # Use regression to predict SalePrice on Test.csv (unseen) data
    y_preds_all_test = lr_all.predict(hs_test[feats])
    hs_test['SalePrice'] = y_preds_all_test

    # Build the submission table. set_index returns a new frame, so we never
    # mutate a slice of hs_test in place (which can raise SettingWithCopyWarning).
    submit = hs_test[['Id', 'SalePrice']].set_index('Id')
    # The timestamp only has hour resolution: a re-run within the same hour
    # overwrites the previous file.
    dt = datetime.datetime.now().strftime("%m%d%Y%H")
    submit.to_csv(f'../datasets/Submissions/Features_Submission-{dt}.csv')

    for i, coef in zip(X.columns, lr_all.coef_):
        print(f"{i}: {coef}")
    print(f"intercept: {lr_all.intercept_}")

    return f"Full Data R2: {lr_all.score(X, y)}, MSE = {MSE}, RMSE = {RMSE}"

mod_runon_all(feats_updated)

# first submission = ['Overall Qual', 'Year Built', 'Year Remod/Add', 'BsmtFin SF 1', 'Total Bsmt SF', 'Gr Liv Area', 'Full Bath', 'Fireplaces', 'Garage Area']
# 6/2 submission - ['overall_qual', 'year_built', 'year_remod', 'total_bsmt_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'garage_area']

overall_qual: 12315.39092250356
year_built: -550.7827459455575
year_remod: 201.20428865982905
total_bsmt_sf: 33.89630121189707
gr_liv_area: 53.30002893636158
full_bath: -7954.517101227186
fireplaces: 6896.424727395184
age: -702.3287508636138
garage_area: 41.352757616707855
kitchen_qual_Fa: -52091.56661918211
kitchen_qual_Gd: -51928.867136748646
kitchen_qual_TA: -55603.27525338432
was_remod: 8084.878348721583
bsmt_cat_finished: -17903.659632606166
bsmt_cat_unfinished: -27825.453046849052
grg_qual_num: 6032.196108411356
garage_cat_finished: -24492.689418144815
garage_cat_unfinished: -25979.26751480429
cond12_feeder_st: 4486.534961553298
cond12_near_park: 22516.574951005234
cond12_near_rr: 6009.260236228417
cond12_norm: 10571.81508951031
lotconfig_culdsac: 9674.441487087695
lotconfig_inside: -424.9186121698786
hi_bsmt_exposure: 11512.567116312723
nbr_rank: 1475.6232003489029
intercept: 733217.8189078611


'Full Data R2: 0.8783938399951442, MSE = 755168061.9424928, RMSE = 27480.321358064444'

### Modelling Sale Price (Log Transformed)

In [157]:
def log_mod_iteration(feats):
    """Fit OLS to `log_price` on a train/test split of `hs` and report metrics.

    Parameters
    ----------
    feats : list of str
        Column names of the module-level frame `hs` to use as predictors.

    Returns
    -------
    str
        One-line summary. The two R2 values are computed against the
        `log_price` target the model was fit on; MSE and RMSE are computed
        after exponentiating both predictions and truth for the held-out split.

    Side effects
    ------------
    Prints `exp(coefficient)` for every feature and `exp(intercept)`.
    """
    # Fit regression to X_train and y_train (75% of training.csv)
    X = hs[feats]
    y = hs['log_price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 531)
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # Predict on the 25% held out of train.csv, undoing the log on both sides
    y_preds = np.exp(lr.predict(X_test))
    MSE = metrics.mean_squared_error(np.exp(y_test), y_preds)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and later removed.
    RMSE = np.sqrt(MSE)

    # Coefficients are exponentiated before printing
    for i, coef in zip(X.columns, np.exp(lr.coef_)):
        print(f"{i}: {coef}")
    print(f"intercept: {np.exp(lr.intercept_)}")

    return f"Training R2: {lr.score(X_train, y_train)}, Testing R2: {lr.score(X_test, y_test)}, MSE: {MSE}, RMSE: {RMSE}"

log_mod_iteration(feats_updated)

overall_qual: 1.0652899468860169
year_built: 0.9974293102625212
year_remod: 1.0020394205819831
total_bsmt_sf: 1.000144750054229
gr_liv_area: 1.0002573735553357
full_bath: 0.9853799720381564
fireplaces: 1.0520894038300466
age: 0.9965020241929878
garage_area: 1.0001360819100704
kitchen_qual_Fa: 0.833149471052506
kitchen_qual_Gd: 0.9062403849700669
kitchen_qual_TA: 0.875750340677795
was_remod: 1.0104580138701016
bsmt_cat_finished: 1.011526745224659
bsmt_cat_unfinished: 0.9434323505423419
grg_qual_num: 1.065723145327647
garage_cat_finished: 0.9225192140019572
garage_cat_unfinished: 0.9054621463126891
cond12_feeder_st: 1.0683359991349752
cond12_near_park: 1.1614438355263488
cond12_near_rr: 1.0814552213606787
cond12_norm: 1.0974132793724154
lotconfig_culdsac: 1.0296320884029058
lotconfig_inside: 0.9935256927877539
hi_bsmt_exposure: 1.0362133500536257
nbr_rank: 1.0064750820671917
intercept: 168662.3642301317


'Training R2: 0.9027066907883885, Testing R2: 0.8469346326344672, MSE: 646650684.6715598, RMSE: 25429.32725558346'

In [158]:
def log_mod_runon_all(feats):
    """Fit OLS to `log_price` on all of `hs` and write a submission CSV.

    Parameters
    ----------
    feats : list of str
        Column names present in both `hs` and `hs_test` to use as predictors.

    Returns
    -------
    str
        One-line summary. R2 is computed against the `log_price` target;
        MSE and RMSE compare exponentiated in-sample predictions with
        `hs['SalePrice']`.

    Side effects
    ------------
    * Overwrites/creates the `SalePrice` column of the module-level `hs_test`
      with the exponentiated predictions.
    * Writes `../datasets/Submissions/Features_Submission_logy-<timestamp>.csv`
      with `Id` as the index column.
    * Prints `exp(coefficient)` for every feature and `exp(intercept)`.
    """
    # Fit regression to entire data
    X = hs[feats]
    y = hs['log_price']
    lr_all = LinearRegression()
    lr_all.fit(X, y)

    # Predict for entire data (undoing the log) and compare to the raw price
    y_preds_all = np.exp(lr_all.predict(X))
    y_true = hs['SalePrice'] #Can use var from entire dataset
    MSE = metrics.mean_squared_error(y_true, y_preds_all)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and later removed.
    RMSE = np.sqrt(MSE)

    # Use regression to predict SalePrice on Test.csv (unseen) data
    y_preds_all_test = np.exp(lr_all.predict(hs_test[feats]))
    hs_test['SalePrice'] = y_preds_all_test

    # Build the submission table. set_index returns a new frame, so we never
    # mutate a slice of hs_test in place (which can raise SettingWithCopyWarning).
    submit = hs_test[['Id', 'SalePrice']].set_index('Id')
    # The timestamp only has hour resolution: a re-run within the same hour
    # overwrites the previous file.
    dt = datetime.datetime.now().strftime("%m%d%Y%H")
    submit.to_csv(f'../datasets/Submissions/Features_Submission_logy-{dt}.csv')

    for i, coef in zip(X.columns, np.exp(lr_all.coef_)):
        print(f"{i}: {coef}")
    print(f"intercept: {np.exp(lr_all.intercept_)}")

    return f"Full Data R2: {lr_all.score(X, y)}, MSE = {MSE}, RMSE = {RMSE}"

log_mod_runon_all(feats_updated)

overall_qual: 1.0700044474922827
year_built: 0.995966660473269
year_remod: 1.0020249902646061
total_bsmt_sf: 1.0001450608060531
gr_liv_area: 1.000262913671798
full_bath: 0.9825204135480938
fireplaces: 1.0491404286946246
age: 0.9951758871327632
garage_area: 1.0001120666758458
kitchen_qual_Fa: 0.8358095681887513
kitchen_qual_Gd: 0.9114755884314091
kitchen_qual_TA: 0.8882442964550927
was_remod: 1.01180134774983
bsmt_cat_finished: 1.0345746437414591
bsmt_cat_unfinished: 0.9608666836599141
grg_qual_num: 1.069009566005959
garage_cat_finished: 0.9127696831938725
garage_cat_unfinished: 0.8903533790078164
cond12_feeder_st: 1.0555217604870832
cond12_near_park: 1.135677042787556
cond12_near_rr: 1.0589808407380914
cond12_norm: 1.0820369337840932
lotconfig_culdsac: 1.0220627791521084
lotconfig_inside: 0.9861029420627985
hi_bsmt_exposure: 1.0410245317703117
nbr_rank: 1.006934041446833
intercept: 3180680.766252608


'Full Data R2: 0.8891974819498465, MSE = 546453367.2372748, RMSE = 23376.342041416035'

## II. Regularization
Our model is overfit: for the log-price model above, the $R^2$ on the training split (about 0.90) is comfortably higher than the $R^2$ on the testing split (about 0.85). This gap indicates that the model does not perform as well on unseen data as it does on the data it was fit to.

In [172]:
def log_mod_iteration_reg(feats):
    """Fit a cross-validated ridge model to `log_price` on a train/test split.

    Features are standardized with a `StandardScaler` fit on the training
    split only; the same scaler is then applied to the held-out split.

    Parameters
    ----------
    feats : list of str
        Column names of the module-level frame `hs` to use as predictors.

    Returns
    -------
    str
        One-line summary. The two R2 values are computed against the
        `log_price` target; MSE and RMSE are computed after exponentiating
        both predictions and truth for the held-out split.

    Side effects
    ------------
    Prints `exp(coefficient)` for every feature and `exp(intercept)`. Because
    the model is fit on standardized inputs, each coefficient refers to the
    scaled feature, not the raw column.
    """
    # Fit regression to X_train and y_train (75% of training.csv)
    X = hs[feats]
    y = hs['log_price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 602)

    # Scale features (fit on train only, then reuse for test)
    sc = StandardScaler()
    Z_train = sc.fit_transform(X_train)
    Z_test = sc.transform(X_test)

    # Ridge regression; alpha chosen by 10-fold CV over 150 values in [1e0, 1e5]
    r_alphas = np.logspace(0, 5, 150)
    ridge_cv = RidgeCV(alphas = r_alphas, scoring = 'r2', cv = 10)
    ridge_cv.fit(Z_train, y_train)

    # Predict on the 25% held out of train.csv, undoing the log on both sides
    y_preds = np.exp(ridge_cv.predict(Z_test))
    MSE = metrics.mean_squared_error(np.exp(y_test), y_preds)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and later removed.
    RMSE = np.sqrt(MSE)

    for i, coef in zip(X.columns, np.exp(ridge_cv.coef_)):
        print(f"{i}: {coef}")
    print(f"intercept: {np.exp(ridge_cv.intercept_)}")

    return f"Training R2: {ridge_cv.score(Z_train, y_train)}, Testing R2: {ridge_cv.score(Z_test, y_test)}, MSE: {MSE}, RMSE: {RMSE}"

log_mod_iteration_reg(feats_updated)

overall_qual: 1.0939220739005142
year_built: 1.0040755902306637
year_remod: 1.04140889428045
total_bsmt_sf: 1.068896287760467
gr_liv_area: 1.130277752658832
full_bath: 0.9990403216193918
fireplaces: 1.0325838321293963
age: 0.9815993923209081
garage_area: 1.0287925411210734
kitchen_qual_Fa: 0.9746523159954423
kitchen_qual_Gd: 0.9635254189291536
kitchen_qual_TA: 0.9524809579886162
was_remod: 1.0074591450994759
bsmt_cat_finished: 1.0121097642957746
bsmt_cat_unfinished: 0.9813774961844252
grg_qual_num: 1.019773950160603
garage_cat_finished: 1.0087295744366764
garage_cat_unfinished: 0.9932542232391477
cond12_feeder_st: 1.0065305360764778
cond12_near_park: 1.017656082728806
cond12_near_rr: 1.0106457226048629
cond12_norm: 1.0232917584382524
lotconfig_culdsac: 1.0039756971461884
lotconfig_inside: 0.9936471852136411
hi_bsmt_exposure: 1.018813569413983
nbr_rank: 1.040701465228548
intercept: 168613.18234487195


'Training R2: 0.9027254341349968, Testing R2: 0.839338444201079, MSE: 602963159.0604529, RMSE: 24555.308164640348'

In [173]:
def log_mod_runon_all_reg(feats):
    """Fit a cross-validated ridge model to `log_price` on all of `hs` and write a submission CSV.

    Features are standardized with a `StandardScaler` fit on `hs[feats]`; the
    same scaler is applied to `hs_test[feats]` before predicting.

    Parameters
    ----------
    feats : list of str
        Column names present in both `hs` and `hs_test` to use as predictors.

    Returns
    -------
    str
        One-line summary. R2 is computed against the `log_price` target;
        MSE and RMSE compare exponentiated in-sample predictions with
        `hs['SalePrice']`.

    Side effects
    ------------
    * Overwrites/creates the `SalePrice` column of the module-level `hs_test`
      with the exponentiated predictions.
    * Writes `../datasets/Submissions/Features_Submission_logy_reg-<timestamp>.csv`
      with `Id` as the index column.
    * Prints `exp(coefficient)` for every (standardized) feature and
      `exp(intercept)`.
    """
    # Fit regression to entire data
    X = hs[feats]
    y = hs['log_price']

    # Scale features
    sc_all = StandardScaler()
    Z_all = sc_all.fit_transform(X)

    # Ridge regression; alpha chosen by 10-fold CV over 200 values in [1e0, 1e8]
    r_alphas = np.logspace(0, 8, 200)
    ridge_cv_all = RidgeCV(alphas = r_alphas, scoring = 'r2', cv = 10)
    ridge_cv_all.fit(Z_all, y)

    # Predict for entire data (undoing the log) and compare to the raw price
    y_preds_all = np.exp(ridge_cv_all.predict(Z_all))
    y_true = hs['SalePrice'] #Can use var from entire dataset
    MSE = metrics.mean_squared_error(y_true, y_preds_all)
    # np.sqrt(MSE) replaces `squared=False`, which was deprecated in
    # scikit-learn 1.4 and later removed.
    RMSE = np.sqrt(MSE)

    # Use regression to predict SalePrice on Test.csv (unseen) data,
    # reusing the scaler fit on the training frame
    Z_all_test = sc_all.transform(hs_test[feats])
    y_preds_all_test = np.exp(ridge_cv_all.predict(Z_all_test))
    hs_test['SalePrice'] = y_preds_all_test

    # Build the submission table. set_index returns a new frame, so we never
    # mutate a slice of hs_test in place (which can raise SettingWithCopyWarning).
    submit = hs_test[['Id', 'SalePrice']].set_index('Id')
    # The timestamp only has hour resolution: a re-run within the same hour
    # overwrites the previous file.
    dt = datetime.datetime.now().strftime("%m%d%Y%H")
    submit.to_csv(f'../datasets/Submissions/Features_Submission_logy_reg-{dt}.csv')

    for i, coef in zip(X.columns, np.exp(ridge_cv_all.coef_)):
        print(f"{i}: {coef}")
    print(f"intercept: {np.exp(ridge_cv_all.intercept_)}")

    return f"Full Data R2: {ridge_cv_all.score(Z_all, y)}, MSE = {MSE}, RMSE = {RMSE}"

log_mod_runon_all_reg(feats_updated)

overall_qual: 1.0992338071239376
year_built: 1.0017788882893603
year_remod: 1.042831164938939
total_bsmt_sf: 1.0632978744993025
gr_liv_area: 1.1339008996049866
full_bath: 0.9918202497708329
fireplaces: 1.0316074361236327
age: 0.977453834472778
garage_area: 1.0248227320433363
kitchen_qual_Fa: 0.976005246027994
kitchen_qual_Gd: 0.9596957142957249
kitchen_qual_TA: 0.946915411562688
was_remod: 1.0054774287918284
bsmt_cat_finished: 1.0164468973281113
bsmt_cat_unfinished: 0.9826399951812711
grg_qual_num: 1.0341269779578652
garage_cat_finished: 0.9847683203681766
garage_cat_unfinished: 0.9721834983876779
cond12_feeder_st: 1.0105824740298153
cond12_near_park: 1.0167522522698196
cond12_near_rr: 1.0092103211098131
cond12_norm: 1.0251934357072556
lotconfig_culdsac: 1.0056837798213347
lotconfig_inside: 0.9941845934470174
hi_bsmt_exposure: 1.0178420016003606
nbr_rank: 1.0376121500256643
intercept: 168371.1380929812


'Full Data R2: 0.8888500552096913, MSE = 549697671.558395, RMSE = 23445.632249064965'

In [None]:
# SUBMISSION HISTORY


# first submission = ['Overall Qual', 'Year Built', 'Year Remod/Add', 'BsmtFin SF 1', 'Total Bsmt SF', 'Gr Liv Area', 'Full Bath', 'Fireplaces', 'Garage Area']
# 6/2 submission = ['overall_qual', 'year_built', 'year_remod', 'total_bsmt_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'garage_area']
# 6/3 8:30P submission = ['overall_qual', 'year_built', 'year_remod', 'total_bsmt_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'garage_area', 'age', 'was_remod', 'bsmt_cat_finished','bsmt_cat_unfinished']
# 6/3 10P submission = ['overall_qual', 'year_built', 'year_remod', 'total_bsmt_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'age', 'garage_area', 'kitchen_qual_Fa', 'kitchen_qual_Gd', 'kitchen_qual_TA', 'was_remod', 'bsmt_cat_finished','bsmt_cat_unfinished', 'grg_qual_num', 'garage_cat_finished', 'garage_cat_unfinished', 'cond12_feeder_st', 'cond12_near_park', 'cond12_near_rr', 'cond12_norm', 'lotconfig_culdsac', 'lotconfig_inside', 'hi_bsmt_exposure', 'nbr_rank']

In [None]:
# Interpreting Log transformations in a linear model: https://data.library.virginia.edu/interpreting-log-transformations-in-a-linear-model/

In [None]:
'Training R2: 0.8921525700749652, Testing R2: 0.8386870855035922, MSE: 708822632.901252, RMSE: 26623.72312245701'

In [None]:
'Full Data R2: 0.8790630647367969, MSE = 628333111.7919718, RMSE = 25066.573594968497'

In [None]:
'Training R2: 0.8966273309464855, Testing R2: 0.8405937419909588, MSE: 670075440.0093209, RMSE: 25885.81542098531'

In [None]:
'Full Data R2: 0.8829099879875415, MSE = 582529034.6266086, RMSE = 24135.638268473627'

In [None]:
'Training R2: 0.899604693015945, Testing R2: 0.8417923789760717, MSE: 669342144.1531007, RMSE: 25871.647495919173'

In [None]:
'Full Data R2: 0.8855391285382741, MSE = 568967239.703276, RMSE = 23853.03418232733'

In [None]:
'Training R2: 0.9008216061613761, Testing R2: 0.8439859776639766, MSE: 651311530.0776106, RMSE: 25520.805827356053'

In [None]:
'Full Data R2: 0.8870332739007128, MSE = 552934634.5430821, RMSE = 23514.56218055276'

In [None]:
'Training R2: 0.9027066907883885, Testing R2: 0.8469346326344672, MSE: 646650684.6715598, RMSE: 25429.32725558346'

In [None]:
'Full Data R2: 0.8891974819498465, MSE = 546453367.2372748, RMSE = 23376.342041416035'

feats_updated = ['overall_qual', 'year_built', 'year_remod', 'total_bsmt_sf', 'gr_liv_area', 'full_bath', 'fireplaces', 'age', 'garage_area', 'kitchen_qual_Fa',
 'kitchen_qual_Gd', 'kitchen_qual_TA', 'was_remod', 'bsmt_cat_finished','bsmt_cat_unfinished', 'grg_qual_num', 'garage_cat_finished', 'garage_cat_unfinished', 'cond12_feeder_st',
 'cond12_near_park', 'cond12_near_rr', 'cond12_norm', 'lotconfig_culdsac', 'lotconfig_inside', 'hi_bsmt_exposure', 'nbr_rank']

In [None]:
# fireplace qual
# garage qual
# has pool
# amenities_scr