In [1]:
import pandas as pd
import missingno as msno
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error

In [2]:
# Read the combined dataset from disk; every later cell derives its data from this frame.
combined_df = pd.read_csv(
    filepath_or_buffer='./datasets/combined_df.csv',
)

In [3]:
# Bookkeeping columns that are stripped before modelling.
meta_cols = ['Id', 'Label_test', 'Label_train']

# Feature names in column order: everything except the bookkeeping columns and the target.
coef_index = combined_df.columns.drop(meta_cols + ['SalePrice'])

# Separate the rows flagged Label_train == 1 from those flagged Label_test == 1.
is_train_row = combined_df['Label_train'] == 1
is_test_row = combined_df['Label_test'] == 1
new_train_data = combined_df.loc[is_train_row].drop(columns=meta_cols)
new_test_data = combined_df.loc[is_test_row].drop(columns=meta_cols)

# Hold out part of the labelled rows (default split size) for local evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    new_train_data.drop(columns='SalePrice'),
    new_train_data['SalePrice'],
    random_state=42,
)

# Only y_train is moved to log1p space here; y_test stays on the raw scale,
# and later cells apply log1p to it wherever a log-space comparison is needed.
y_train = np.log1p(y_train.values.ravel())

# fit OLS, Ridge, Lasso
Naively fit all three linear models, scoring each by $R^2$, then observe which performs best. Ridge appears to be the best. `cv=100` was chosen arbitrarily as a high number of folds that still keeps run times short.  
- The drop in $R^2$ from train to test is much smaller for Lasso (-0.95%) than for Ridge (-5.42%), implying that the Lasso regularisation is doing a good job of limiting overfitting, especially compared to Ridge.

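'*' ElasticNet was also tried, and it tended to behave like Lasso (its scores match LassoCV exactly). This is interesting because, even though ElasticNetCV chooses the `l1_ratio` with the best cross-validated score, RidgeCV still has a better score than LassoCV. Note that the `l1_ratio` grid only goes down to 0.01, so a pure Ridge penalty was never a candidate. ElasticNet is dropped from further analysis.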

In [4]:
# Plain least-squares baseline, fitted against the log1p-transformed target.
ols = LinearRegression().fit(X_train, y_train)

# The figure printed as "train" is the mean R^2 over 100 cross-validation folds
# (cross_val_score refits clones of `ols`), not the in-sample score.
ols_scores = cross_val_score(
    ols,
    X_train,
    y_train,
    scoring='r2',
    cv=100,
    n_jobs=-1,
)
print(f'ols train R^2:\t\t {ols_scores.mean():.4f}')

# y_test is still on the raw scale, so it is log1p-transformed before scoring.
ols_test_score = ols.score(X_test, np.log1p(y_test))
print(f'ols test R^2:\t\t {ols_test_score:.4f}')

ols train R^2:		 0.8687
ols test R^2:		 0.8892


In [7]:
# Ridge with alpha chosen among three candidates by 100-fold cross-validation.
ridge_cv = RidgeCV(alphas=[0.125, 1.25, 12.5], cv=100).fit(X_train, y_train)

# In-sample R^2 of the refitted model (log-target scale).
ridge_cv_score = ridge_cv.score(X_train, y_train)
print(f'Ridge train R^2:\t {ridge_cv_score:.4f}')

# Hold-out R^2; y_test is raw, so transform it to match the training target.
ridge_cv_test_score = ridge_cv.score(X_test, np.log1p(y_test))
print(f'Ridge test R^2:\t\t {ridge_cv_test_score:.4f}')

Ridge train R^2:	 0.9421
Ridge test R^2:		 0.8911


In [8]:
# Lasso with alpha picked by 100-fold cross-validation; no explicit alpha grid
# is passed, so LassoCV's own default candidates are used.
lasso_cv = LassoCV(cv=100).fit(X_train, y_train)

# In-sample R^2 of the selected model (log-target scale).
lasso_cv_score = lasso_cv.score(X_train, y_train)
print(f'lasso train R^2:\t {lasso_cv_score:.4f}')

# Hold-out R^2; y_test is raw, so transform it to match the training target.
lasso_cv_test_score = lasso_cv.score(X_test, np.log1p(y_test))
print(f'lasso test R^2:\t\t {lasso_cv_test_score:.4f}')

lasso train R^2:	 0.8757
lasso test R^2:		 0.8674


In [9]:
# Relative change in R^2 from the "train" figure to the hold-out figure, in percent.
# The OLS reference is the mean cross-validated R^2, whereas the Ridge and Lasso
# references are in-sample R^2 values.
for label, reference_r2, holdout_r2 in (
    ('ols', ols_scores.mean(), ols_test_score),
    ('ridge', ridge_cv_score, ridge_cv_test_score),
    ('lasso', lasso_cv_score, lasso_cv_test_score),
):
    print(f'{label} {(holdout_r2 - reference_r2) / reference_r2 * 100:.2f}%')

ols 2.37%
ridge -5.42%
lasso -0.95%


In [10]:
# Elastic net searched over a grid of L1/L2 mixing ratios with 100-fold CV.
# The grid runs from 0.01 up to 1 (pure L1).
elastic_cv = ElasticNetCV(
    l1_ratio=[0.01, .1, .3, .5, .7, .9, 1],
    cv=100,
).fit(X_train, y_train)

# In-sample R^2 of the selected model (log-target scale).
elastic_cv_score = elastic_cv.score(X_train, y_train)
print(f'elastic train R^2:\t {elastic_cv_score:.4f}')

# Hold-out R^2; y_test is raw, so transform it to match the training target.
elastic_cv_test_score = elastic_cv.score(X_test, np.log1p(y_test))
print(f'elastic test R^2:\t {elastic_cv_test_score:.4f}')

elastic train R^2:	 0.8757
elastic test R^2:	 0.8674


## RidgeCV
To fit Ridge, an optimal alpha is required. This is determined with RidgeCV over varying alpha values. As an additional step, MSE (`'neg_mean_squared_error'`) is used as the scoring metric because the competition uses RMSE.  
A for-loop over successively narrower alpha grids is used to sift out the optimal value, alpha = 1.25.

In [11]:
# Candidate alpha grids for RidgeCV, ordered from a coarse log-spaced sweep to
# progressively finer grids. Each inner list is searched on its own.
# No placeholder RidgeCV instance is created here: the estimator is built
# (and fitted) once per grid by the loop that consumes `alphas`, so an
# unfitted instance bound to `ridge_cv` at this point would never be used.
alphas = [
    [.01, .1, 1, 10, 100],
    [.1, .5, 1, 2, 5],
    [.5, .7, 1, 1.5, 2],
    [1, 1.25, 1.5, 1.75, 2],
    [1, 1.1, 1.25, 1.3, 1.5],
]

In [12]:
for alpha_grid in alphas:
    # Fresh RidgeCV per grid; alpha is selected by cross-validated (negated) MSE.
    # After the loop, `ridge_cv` is the model fitted on the last grid.
    ridge_cv = RidgeCV(
        alphas=alpha_grid,
        scoring='neg_mean_squared_error',
    ).fit(X_train, y_train)

    # Both diagnostics are in-sample and on the log-target scale.
    train_r2 = ridge_cv.score(X_train, y_train)
    y_pred = ridge_cv.predict(X_train)
    ridge_cv_mse = mean_squared_error(y_train, y_pred)

    print('=' * 24)
    print(f'alpha\t{ridge_cv.alpha_}\nR^2\t{train_r2:.4f}')
    print(f'tr MSE  {ridge_cv_mse:.5f}')

alpha	1.0
R^2	0.9429
tr MSE  0.00947
alpha	1.0
R^2	0.9429
tr MSE  0.00947
alpha	1.5
R^2	0.9414
tr MSE  0.00971
alpha	1.25
R^2	0.9421
tr MSE  0.00959
alpha	1.25
R^2	0.9421
tr MSE  0.00959


# Comparing MSE
Observing RMSE, OLS has the lowest error on the train set; on the test set Ridge is marginally lower (21,190 vs 21,371), with OLS within 1% of it. This may imply that the overfitted OLS may still be useful in modelling the multi-variable aspects of house buying.

OLS is simple, and we use CV to determine the expected OLS scores. The additional step of computing MSE (`'neg_mean_squared_error'`) is taken because the competition uses RMSE.

'*' We pay special attention to the fact that y is log-transformed: predictions are converted back with `expm1` before RMSE is computed.

In [13]:
# NOTE: this rebinds `ols_scores`, which earlier held per-fold R^2 values, to the
# per-fold negated MSE on the log-target scale.
ols_scores = cross_val_score(
    ols,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=100,
    n_jobs=-1,
)

In [14]:
# Training error on the original price scale: expm1 undoes the log1p on both
# the predictions and the (log-transformed) y_train.
y_pred = np.expm1(ols.predict(X_train))
ols_mse = mean_squared_error(np.expm1(y_train), y_pred)
train_mse = ols_mse ** .5  # despite the name, this holds the RMSE
print(f'train rmse\t\t{train_mse:,}')

# Hold-out error: y_test was never log-transformed, so only the predictions
# need to be converted back.
y_pred = np.expm1(ols.predict(X_test))
ols_mse = mean_squared_error(y_test, y_pred)
test_mse = ols_mse ** .5  # RMSE as well
print(f'test rmse\t\t{test_mse:,}')

# Relative increase of hold-out RMSE over training RMSE, in percent.
rmse_delta_pct = (test_mse - train_mse) / train_mse * 100
print(f'delta of train: \t{rmse_delta_pct:.2f}%')

train rmse		18,705.268739991196
test rmse		21,371.13367972099
delta of train: 	14.25%


In [15]:
# `ridge_cv` here is whichever RidgeCV model was fitted last.
# Training error on the original price scale: expm1 undoes the log1p on both
# the predictions and the (log-transformed) y_train.
y_pred = np.expm1(ridge_cv.predict(X_train))
ridge_cv_mse = mean_squared_error(np.expm1(y_train), y_pred)
train_mse = ridge_cv_mse ** .5  # despite the name, this holds the RMSE
print(f'train rmse\t\t{train_mse:,}')

# Hold-out error: y_test is already on the raw scale.
y_pred = np.expm1(ridge_cv.predict(X_test))
ridge_cv_mse = mean_squared_error(y_test, y_pred)
test_mse = ridge_cv_mse ** .5  # RMSE as well
print(f'test rmse\t\t{test_mse:,}')

# Relative increase of hold-out RMSE over training RMSE, in percent.
rmse_delta_pct = (test_mse - train_mse) / train_mse * 100
print(f'delta of train: \t{rmse_delta_pct:.2f}%')

train rmse		19,606.82807081815
test rmse		21,189.585405414226
delta of train: 	8.07%


In [16]:
# Training error on the original price scale: expm1 undoes the log1p on both
# the predictions and the (log-transformed) y_train.
y_pred = np.expm1(lasso_cv.predict(X_train))
lasso_cv_mse = mean_squared_error(np.expm1(y_train), y_pred)
train_mse = lasso_cv_mse ** .5  # despite the name, this holds the RMSE
print(f'train rmse\t\t{train_mse:,}')

# Hold-out error: y_test is already on the raw scale.
y_pred = np.expm1(lasso_cv.predict(X_test))
lasso_cv_mse = mean_squared_error(y_test, y_pred)
test_mse = lasso_cv_mse ** .5  # RMSE as well
print(f'test rmse\t\t{test_mse:,}')

# Relative change of hold-out RMSE versus training RMSE, in percent.
rmse_delta_pct = (test_mse - train_mse) / train_mse * 100
print(f'delta of train: \t{rmse_delta_pct:.2f}%')

train rmse		30,235.400253568747
test rmse		26,322.084295295113
delta of train: 	-12.94%


## LassoCV
LassoCV has a robust built-in method for determining the optimal alpha.