In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

from scipy.stats import pearsonr

  import pandas.util.testing as tm


In [2]:
#get the dataset

df_traindata_cleaned=pd.read_csv('../datasets/df_traindata_cleaned.csv')

X=df_traindata_cleaned.drop('SalePrice', axis=1)
y=df_traindata_cleaned['SalePrice']


In [3]:
len(X.columns)

35

In [4]:
# using Pipeline to do scaling with KFold

from sklearn.pipeline import Pipeline

ss = StandardScaler()
lr = LinearRegression()

# setting random seed as 42 for our KFold tests, shuffle = True
cv = KFold(n_splits=10,shuffle=True, random_state=33)




In [5]:
# KFold cross_val_score R^2 value for Linear Regression

pipeline_lr = Pipeline([('transformer', ss), ('estimator', lr)])

lr_mean_cross_val_score = cross_val_score(pipeline_lr, X, y, cv = cv).mean()

print(lr_mean_cross_val_score)


0.8055319473892262


In [6]:
# Find an optimal value for Ridge Regression alpha using RidgeCV
ridge_alphas = np.logspace(0, 5, 200)

optimal_ridge = RidgeCV(alphas=ridge_alphas, cv = cv)
optimal_ridge.fit(X, y)

print (optimal_ridge.alpha_)

15.167168884709232


In [7]:
# KFold cross_val_score R^2 value for Ridge Regression using optimal alpha

pipeline_ridge = Pipeline([('transformer', ss), ('estimator', optimal_ridge)])

optimal_ridge_mean_cross_val_score = cross_val_score(pipeline_ridge, X, y, cv = cv).mean()

print(optimal_ridge_mean_cross_val_score)




0.8067882235682259


In [8]:
# Find an optimal value for Lasso Regression alpha using LassoCV

optimal_lasso = LassoCV(n_alphas=500, cv=cv)
optimal_lasso.fit(X, y)

print (optimal_lasso.alpha_)

158407.69610015253


In [9]:
# KFold cross_val_score R^2 value for Lasso Regression using optimal alpha

pipeline_lasso = Pipeline([('transformer', ss), ('estimator', optimal_lasso)])

optimal_lasso_mean_cross_val_score = cross_val_score(pipeline_lasso, X, y, cv = cv).mean()

print(optimal_lasso_mean_cross_val_score)

0.8075325879249349


In [10]:
# Lasso performs slightly better than the Ridge, but similarly.
# Lasso deals primarily with the feature selection of valuable variables,
# eliminating ones that are not useful. This also takes care of multicollinearity,
# but in a different way: it will choose the "best" of the correlated variables
# and zero-out the other redundant ones.
# There may also be useless variables in the data which it is simply getting rid
# of entirely.

In [11]:
# Find optimal parameters for ElasticNet Regression using ElasticNetCV

l1_ratios = np.linspace(0.01, 1.0, 25)

optimal_enet = ElasticNetCV(l1_ratio=l1_ratios, n_alphas=100, cv=cv,)
optimal_enet.fit(X, y)

print (optimal_enet.alpha_)
print (optimal_enet.l1_ratio_)

158407.69610015253
1.0


In [12]:
# KFold cross_val_score R^2 value for ElasticNet Regression using optimal alpha and l1

pipeline_elastic = Pipeline([('transformer', ss), ('estimator', optimal_enet)])

optimal_elastic_mean_cross_val_score = cross_val_score(pipeline_elastic, X, y, cv = cv).mean()

print(optimal_elastic_mean_cross_val_score)

0.8074918830364943


In [13]:
# the optimal parameters of ElasticNet also describe a Lasso Regression, as optimal l1_ratio is 1.0

In [14]:
# Displaying coefficients for variables in the Ridge Regression Model

In [15]:
# applying optimal alpha to lasso model

ridge = Ridge(alpha=optimal_ridge.alpha_)

ridge.fit(X, y)

Ridge(alpha=15.167168884709232, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [16]:
ridge_coefs = pd.DataFrame({'variable':X.columns,
                            'coef':ridge.coef_,
                            'abs_coef':np.abs(ridge.coef_)})

ridge_coefs.sort_values('abs_coef', inplace=True, ascending=False)

ridge_coefs.head(20)

Unnamed: 0,variable,coef,abs_coef
18,Neighborhood_NoRidge,17004.712479,17004.712479
25,Overall Qual,13713.47194,13713.47194
14,Exter Qual,13651.159356,13651.159356
6,Kitchen Qual,13161.840707,13161.840707
16,Roof Style_Hip,12463.144295,12463.144295
2,Mas Vnr Type_Stone,7952.744627,7952.744627
10,Bsmt Cond,-6622.480969,6622.480969
11,Bsmt Qual,6022.440085,6022.440085
9,Bsmt Exposure,5473.103004,5473.103004
32,Garage Cars,5346.448467,5346.448467


In [17]:
# Displaying coefficients for variables in the Lasso Regression Model

In [18]:
# applying optimal alpha to lasso model

lasso = Lasso(alpha=optimal_lasso.alpha_)

lasso.fit(X, y)

Lasso(alpha=158407.69610015253, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [19]:
lasso_coefs = pd.DataFrame({'variable':X.columns,
                            'coef':lasso.coef_,
                            'abs_coef':np.abs(lasso.coef_)})

lasso_coefs.sort_values('abs_coef', inplace=True, ascending=False)

lasso_coefs.head(20)

Unnamed: 0,variable,coef,abs_coef
26,Year Built,474.864794,474.864794
33,Garage Area,75.561246,75.561246
34,Total Flr SF,45.677134,45.677134
28,Total Bsmt SF,34.410986,34.410986
29,Gr Liv Area,22.918702,22.918702
27,BsmtFin SF 1,16.547036,16.547036
24,Lot Area,0.220946,0.220946
30,TotRms AbvGrd,-0.0,0.0
31,Garage Yr Blt,0.0,0.0
18,Neighborhood_NoRidge,0.0,0.0


In [20]:
# Loading in the holdout dataset to test model

df_holdoutdata_cleaned=pd.read_csv('../datasets/df_holdoutdata_cleaned.csv')



In [21]:
# extracting Id column together with its values for later concat with y_holdouts
holdout_Id_column=pd.DataFrame(df_holdoutdata_cleaned['Id'])

print(holdout_Id_column.shape)
holdout_Id_column.tail()

(879, 1)


Unnamed: 0,Id
874,1662
875,1234
876,1373
877,1672
878,1939


In [22]:
X_holdout=df_holdoutdata_cleaned.drop(columns=['Id'])

print(X_holdout.shape)
X_holdout.tail()

(879, 35)


Unnamed: 0,Mas Vnr Type_BrkFace,Mas Vnr Type_None,Mas Vnr Type_Stone,Garage Finish,Garage Type_Attchd,Garage Type_Detchd,Kitchen Qual,Heating QC,BsmtFin Type 1,Bsmt Exposure,...,Overall Qual,Year Built,BsmtFin SF 1,Total Bsmt SF,Gr Liv Area,TotRms AbvGrd,Garage Yr Blt,Garage Cars,Garage Area,Total Flr SF
874,0,1,0,1,1,0,3,3,5,1,...,6,1974,931,1084,1877,8,1974.0,2,488,1877
875,1,0,0,2,1,0,4,5,4,1,...,6,1966,575,1104,1988,9,1966.0,2,480,1988
876,0,1,0,1,1,0,3,5,4,1,...,5,1968,250,952,1211,5,1968.0,1,322,1211
877,0,1,0,1,0,1,3,3,5,1,...,4,1971,616,864,864,5,1974.0,2,528,864
878,0,1,0,1,1,0,3,3,4,1,...,5,1955,793,923,925,5,1955.0,1,390,925


In [23]:
# Predicting SalePrice using Ridge Regression Model

In [24]:
y_holdout_ridge=ridge.predict(X_holdout)

In [25]:
print(y_holdout_ridge.shape)
pd.DataFrame(y_holdout_ridge)

(879,)


Unnamed: 0,0
0,136514.536040
1,170064.760142
2,217892.138106
3,113712.052023
4,159495.277102
...,...
874,178104.256238
875,215770.992966
876,131619.611898
877,104133.628823


In [26]:
final_result_ridge=pd.concat([holdout_Id_column,pd.DataFrame(y_holdout_ridge)], axis='columns')

In [27]:
submission_ridge=final_result_ridge.sort_values(['Id'], ascending=True)

In [28]:
submission_ridge.reset_index(inplace=True,drop=True)

In [29]:
print(submission_ridge.shape)
submission_ridge.head()

(879, 2)


Unnamed: 0,Id,0
0,2,105022.564315
1,4,277328.818476
2,6,187144.246408
3,7,229683.63773
4,17,243778.355268


In [30]:
submission_ridge.to_csv(r'../datasets/submission_ridge.csv', index = False)

In [31]:
# Predicting SalePrice using Lasso Regression Model

In [32]:
y_holdout_lasso=lasso.predict(X_holdout)

In [33]:
print(y_holdout_lasso.shape)
pd.DataFrame(y_holdout_lasso)

(879,)


Unnamed: 0,0
0,170487.647146
1,248259.496014
2,183715.217685
3,111904.536989
4,187642.789047
...,...
874,218362.819029
875,217844.733420
876,141530.091644
877,137911.305440


In [34]:
final_result_lasso=pd.concat([holdout_Id_column,pd.DataFrame(y_holdout_lasso)], axis='columns')

In [35]:
final_result_lasso.rename(columns={0: 'SalePrice'}, errors="raise",inplace=True)

In [36]:
print(final_result_lasso.shape)
final_result_lasso.head()

(879, 2)


Unnamed: 0,Id,SalePrice
0,2658,170487.647146
1,2718,248259.496014
2,2414,183715.217685
3,1989,111904.536989
4,625,187642.789047


In [37]:
submission_lasso=final_result_lasso.sort_values(['Id'], ascending=True)

In [38]:
submission_lasso.reset_index(inplace=True,drop=True)

In [39]:
print(submission_lasso.shape)
submission_lasso.head()

(879, 2)


Unnamed: 0,Id,SalePrice
0,2,149370.852948
1,4,272286.706127
2,6,199228.929113
3,7,204161.330487
4,17,191246.608878


In [40]:
submission_lasso.to_csv(r'../datasets/submission_lasso.csv', index = False)

In [41]:
## DONE!