# Ames Housing Data: Modeling
---

This notebook stores the final model chosen for our analysis. This model has the highest R^2 while minimizing RMSE compared to all other options tested. The chosen Linear Regression model is fit with a <u>Log-Transformed Target Variable</u> (Sale Price) and with <u>LassoCV regularization</u>.

For more information on the initial data cleaning, exploration, and visualization see the [initial notebook](../code/01_EDA_and_Cleaning.ipynb) of this analysis. For transforming our variables into model-ready form, see the [second notebook](../code/02_Feature_Engineering.ipynb) of this analysis.

For more information on the background, [data](https://jse.amstat.org/v19n3/decock/DataDocumentation.txt), and a summary of methods and findings, please see the associated [README](../Farah_Malik_Proj2_README.md) for this analysis.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import missingno as msno

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn import metrics 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder #, PolynomialFeatures
# from sklearn.compose import ColumnTransformer
# from sklearn.neighbors import KNeighborsClassifier

import datetime
import statsmodels.api as sm

In [6]:
import os
# Inspect the kernel's current working directory; the relative '../datasets/...'
# paths used by later cells are resolved against the working directory.
cwd = os.getcwd()
cwd  # bare last expression so the notebook displays the path

'C:\\Users\\farah\\Documents\\General Assembly DSI\\DSI-508\\Lessons\\301-302-lesson-linear-regression\\solution-code'

In [3]:
# Author-machine location of the project's datasets folder. Only switch into it
# when it actually exists: on any other machine an unconditional os.chdir()
# raises FileNotFoundError, whereas leaving the working directory alone lets
# the notebook's relative '../datasets/...' paths resolve from wherever the
# notebook was launched.
LOCAL_DATASETS_DIR = 'C:/Users/farah/Documents/General Assembly DSI/DSI-508/Projects/project-2/datasets'
if os.path.isdir(LOCAL_DATASETS_DIR):
    os.chdir(LOCAL_DATASETS_DIR)

In [4]:
#cwd = os.getcwd()
#cwd

In [5]:
# Parser settings shared by both files: with keep_default_na=False, only the
# strings listed in na_values are read as missing.
_na_opts = {'na_values': ['NaN', '', 'Missing'], 'keep_default_na': False}

# Cleaned training data (hs) and cleaned unseen test data (hs_test).
hs = pd.read_csv('../datasets/Clean/train.csv', **_na_opts)
hs_test = pd.read_csv('../datasets/Clean/test.csv', **_na_opts)

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/Clean/train.csv'

In [None]:
# UPDATE FEATURES FOR TESTING HERE
# One feature per line, grouped by prefix, so additions/removals diff cleanly.
feats_updated = [
    # numeric / ordinal house attributes
    'overall_qual',
    'year_built',
    'year_remod',
    'total_bsmt_sf',
    'gr_liv_area',
    'full_bath',
    'fireplaces',
    'age',
    'garage_area',
    # kitchen quality indicator columns
    'kitchen_qual_Fa',
    'kitchen_qual_Gd',
    'kitchen_qual_TA',
    # remodel flag
    'was_remod',
    # basement / garage columns
    'bsmt_cat_finished',
    'bsmt_cat_unfinished',
    'grg_qual_num',
    'garage_cat_finished',
    'garage_cat_unfinished',
    'garage_cat_rough_finished',
    # cond12_* columns
    'cond12_feeder_st',
    'cond12_near_park',
    'cond12_near_rr',
    'cond12_norm',
    # lotconfig_* columns
    'lotconfig_culdsac',
    'lotconfig_inside',
    # remaining engineered columns
    'hi_bsmt_exposure',
    'nbr_rank',
]

In [None]:
def log_mod_iteration_lreg(feats):
    """Fit LassoCV on log sale price using a train/hold-out split of `hs`.

    Reads the module-level DataFrame `hs`: the columns named in `feats` are the
    predictors and `hs['log_price']` is the target. The rows are split with
    `train_test_split` (fixed random_state, default split size), the features
    are standardized, and a LassoCV model is fit on the training part.

    Prints exp(coefficient) for every feature and exp(intercept).

    Parameters
    ----------
    feats : list of str
        Column names of `hs` to use as predictors.

    Returns
    -------
    str
        One-line summary with training and hold-out R2 (both scored on the
        log-price target) and hold-out MSE / RMSE (computed after
        exponentiating predictions and targets back to the price scale).
    """
    # Fit regression to X_train and y_train (75% of training.csv)
    X = hs[feats]
    y = hs['log_price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 602)

    # Scale features: the scaler is fit on the training rows only and then
    # applied unchanged to the hold-out rows.
    sc = StandardScaler()
    Z_train = sc.fit_transform(X_train)
    Z_test = sc.transform(X_test)

    # Run Lasso Regression (alpha chosen from l_alphas by 10-fold CV)
    l_alphas = np.logspace(-5, 0, 150)
    lasso_cv = LassoCV(alphas = l_alphas, cv = 10, max_iter=75_000)
    lasso_cv.fit(Z_train, y_train)

    # Predict SalePrice for 25% testing data within train.csv and compare to truth to get residuals
    y_preds = np.exp(lasso_cv.predict(Z_test)) # Undoing the logged price
    MSE = metrics.mean_squared_error(np.exp(y_test), y_preds)
    # RMSE is derived from MSE directly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE = np.sqrt(MSE)

    # Features are standardized and the target is logged, so exp(coef) is the
    # multiplicative change in predicted price per one-standard-deviation
    # increase in that feature.
    for i, coef in zip(X.columns, np.exp(lasso_cv.coef_)):
        print(f"{i}: {coef}")
    print(f"intercept: {np.exp(lasso_cv.intercept_)}")

    return f"Training R2: {lasso_cv.score(Z_train, y_train)}, Testing R2: {lasso_cv.score(Z_test, y_test)}, MSE: {MSE}, RMSE: {RMSE}"

log_mod_iteration_lreg(feats_updated)

In [None]:
def log_mod_runon_all_lreg(feats):
    """Fit LassoCV on all of `hs`, predict `hs_test`, and write a submission CSV.

    Reads the module-level DataFrames `hs` and `hs_test`. The columns named in
    `feats` are standardized and used to fit a LassoCV model of
    `hs['log_price']`; predictions are exponentiated back to the price scale.

    Side effects
    ------------
    * Sets `hs_test['SalePrice']` to the model's predictions.
    * Sets `hs['null_pred']` to the constant exp(mean(log_price)) baseline.
    * Writes `Id` / `SalePrice` to
      '../datasets/Submissions/Features_Submission_logy_lreg-<MMDDYYYYHH>.csv'
      (the timestamp has hour resolution, so a re-run within the same hour
      writes to the same file name).
    * Prints exp(coefficient) per feature, exp(intercept), and the null-model
      MSE / RMSE.

    Parameters
    ----------
    feats : list of str
        Column names present in both `hs` and `hs_test` to use as predictors.

    Returns
    -------
    str
        One-line summary with R2 on the full data (scored on the log-price
        target) and in-sample MSE / RMSE against `hs['SalePrice']`.
    """
    # Fit regression to entire data
    X = hs[feats]
    y = hs['log_price']

    # Scale features
    sc_all = StandardScaler()
    Z_all = sc_all.fit_transform(X)

    # Run Lasso Regression (alpha chosen from l_alphas by 10-fold CV)
    l_alphas = np.logspace(-5, 0, 150)
    lasso_cv_all = LassoCV(alphas = l_alphas, cv = 10, max_iter=75_000)
    lasso_cv_all.fit(Z_all, y)

    # Predict SalePrice for entire data and compare to truth to get residuals
    y_preds_all = np.exp(lasso_cv_all.predict(Z_all)) # Undoing the logged price
    y_true = hs['SalePrice'] #Can use var from entire dataset
    MSE = metrics.mean_squared_error(y_true, y_preds_all)
    # RMSE is derived from MSE directly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    RMSE = np.sqrt(MSE)

    # Use regression to predict SalePrice on Test.csv (unseen) data
    # first standard scale with the scaler fitted on the training data
    Z_all_test = sc_all.transform(hs_test[feats])
    y_preds_all_test = np.exp(lasso_cv_all.predict(Z_all_test))
    hs_test['SalePrice'] = y_preds_all_test

    # Null model for comparison: a constant prediction equal to
    # exp(mean(log_price)), stored on hs as the 'null_pred' column.
    hs['null_pred'] = np.exp(np.mean(y))
    null_pred = hs['null_pred']
    null_MSE = metrics.mean_squared_error(y_true, null_pred)
    null_RMSE = np.sqrt(null_MSE)

    # Submit Predictions to Kaggle
    # Chain set_index instead of mutating the column slice in place.
    submit = hs_test[['Id', 'SalePrice']].set_index('Id')
    dt = datetime.datetime.now().strftime("%m%d%Y%H")
    submit.to_csv(f'../datasets/Submissions/Features_Submission_logy_lreg-{dt}.csv')

    # Features are standardized and the target is logged, so exp(coef) is the
    # multiplicative change in predicted price per one-standard-deviation
    # increase in that feature.
    for i, coef in zip(X.columns, np.exp(lasso_cv_all.coef_)):
        print(f"{i}: {coef}")
    print(f"intercept: {np.exp(lasso_cv_all.intercept_)}")
    print(f"null_MSE: {null_MSE}, null_RMSE: {null_RMSE}")

    return f"Full Data R2: {lasso_cv_all.score(Z_all, y)}, MSE = {MSE}, RMSE = {RMSE}"

log_mod_runon_all_lreg(feats_updated)

In [None]:
# The below attempts to look at OLS measures on our model 

In [None]:
# Refit the LassoCV pipeline at module level (same steps as
# log_mod_runon_all_lreg) so the fitted scaler, model, and arrays are
# available as globals to the OLS cells that follow.
X = hs[feats_updated]
y = hs['log_price']

# Scale features
sc_all = StandardScaler()
Z_all = sc_all.fit_transform(X)

# Run Lasso Regression (alpha chosen from l_alphas by 10-fold CV)
l_alphas = np.logspace(-5, 0, 150)
lasso_cv_all = LassoCV(alphas = l_alphas, cv = 10, max_iter=75_000)
lasso_cv_all.fit(Z_all, y)

# Predict SalePrice for the entire training data
y_preds_all = np.exp(lasso_cv_all.predict(Z_all)) # Undoing the logged price
# NOTE(review): the trailing comment above originally had the statement
# `Z_wc = sm.add_constant(Z_all)` fused onto it, so that statement never ran;
# Z_wc is only assigned in a later cell.
y_true = hs['SalePrice']

# Scale the test.csv features with the scaler fitted on the training data,
# predict, undo the log, and store the predictions on hs_test.
Z_all_test = sc_all.transform(hs_test[feats_updated])
y_preds_all_test = np.exp(lasso_cv_all.predict(Z_all_test))
hs_test['SalePrice'] = y_preds_all_test

In [None]:
# Add an intercept column to the standardized test.csv features; the rows of
# Z_wc therefore correspond to hs_test, not to the training frame hs.
Z_wc = sm.add_constant(Z_all_test)

In [None]:
# Unfitted OLS of the back-transformed test predictions on the test design matrix.
# NOTE(review): `mod` is never fit or referenced in the cells visible after this one.
mod = sm.OLS(y_preds_all_test, Z_wc)

In [None]:
# Fit OLS on the TRAINING design matrix. y_true is hs['SalePrice'] (one value
# per training row), so it must be paired with Z_all, the standardized
# training features. Z_wc is built from Z_all_test, whose rows come from
# hs_test and do not correspond to y_true.
# NOTE(review): the Lasso model above is fit on y (log_price); swap y_true for
# y if the OLS summary should be on the same scale as that model.
ols = sm.OLS(y_true, sm.add_constant(Z_all)).fit()

In [None]:
# Display the regression summary table of the fitted OLS model.
ols.summary()