In [19]:
# Imports

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Ridge,Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler


In [20]:
# Load the training CSV into a DataFrame
train = pd.read_csv(filepath_or_buffer='./datasets/train_eng.csv')

## Baseline Score and Linear Model

- Without removing any features that were originally in the dataset or that I had created, I wanted to examine the baseline score for several different models: OLS, Ridge, and Lasso.

In [21]:
# Baseline features: every non-object column except the target ('SalePrice')
# and the 'Id' / 'PID' columns (presumably row identifiers -- confirm).
# The exclusions are applied in one filtering pass instead of list.remove(),
# which raises ValueError when a column is missing or was parsed as object dtype.
non_features = {'SalePrice', 'Id', 'PID'}
features = [
    col
    for col in train.select_dtypes(exclude=['object']).columns
    if col not in non_features
]

# Design matrix and target used by every model below
X = train[features]
y = train['SalePrice']

In [62]:
# Hold out part of the training data as an internal test set.
# random_state is fixed so the same rows land in each split on every run.
split_parts = train_test_split(X, y, random_state=44)
X_train, X_test, y_train, y_test = split_parts

In [63]:
# Standardise the predictors: the regularised models used later penalise
# coefficients, so features need to be on a comparable scale.
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
ss = StandardScaler()
ss.fit(X_train)

Z_train = ss.transform(X_train)
Z_test = ss.transform(X_test)

In [72]:
# Baseline: 5-fold cross-validated score of plain OLS on the scaled training split
lr = LinearRegression()
lr_scores = cross_val_score(estimator=lr, X=Z_train, y=y_train, cv=5)

# Report the average fold score and its spread across folds
print(np.mean(lr_scores), np.std(lr_scores))

-149882816262269.9 299765632524541.56


In [65]:
# Fit OLS on the scaled training split; the next cell scores it on the train and test
# splits derived from the 'train' dataset.
# The cause of the extreme cross_val_score above was not tracked down; moving on to
# fitting and will tune the model later.
# NOTE(review): the printed std is almost exactly 2x |mean|, which is what five fold
# scores look like when a single fold is hugely negative and the other four are
# ordinary -- inspect the individual values in `lr_scores` to confirm.
lr.fit(Z_train, y_train)

LinearRegression()

In [66]:
# Score the fitted OLS model on both splits, then report them together
ols_train_score = lr.score(Z_train, y_train)
ols_test_score = lr.score(Z_test, y_test)

print("Training OLS score:", ols_train_score)
print("Testing OLS score:", ols_test_score)

Training OLS score: 0.9055944594300793
Testing OLS score: 0.8986651723239815


## Next Step: Regularization (Lasso and Ridge)

### Ridge Regression

In [27]:
# First look at ridge with a hand-picked penalty, before searching over alpha
ridge_model = Ridge(alpha=10).fit(Z_train, y_train)

ridge_train_score = ridge_model.score(Z_train, y_train)
ridge_test_score = ridge_model.score(Z_test, y_test)
print("Training Ridge score:", ridge_train_score)
print("Testing Ridge score:", ridge_test_score)

Training Ridge score: 0.9052106024047347
Testing Ridge score: 0.8981286187906979


In [28]:
# Candidate ridge penalties: 100 log-spaced values from 1e0 to 1e5.
r_alpha = np.logspace(start=0, stop=5, num=100)

# Search the candidates with 5-fold cross-validation, scored by R^2.
ridge_cv = RidgeCV(alphas=r_alpha, scoring='r2', cv=5)

# Run the search and fit on the scaled training split.
ridge_cv.fit(Z_train, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
        cv=5, scoring='r2')

In [29]:
# Ridge penalty selected by the cross-validated search above
ridge_cv.alpha_

29.150530628251758

In [30]:
# Tuned ridge model: training-split score, then test-split score (one per line)
print(
    ridge_cv.score(Z_train, y_train),
    ridge_cv.score(Z_test, y_test),
    sep="\n",
)

0.904584254692416
0.8974301604887753


### Lasso Regression

In [31]:
# Set up a list of Lasso alphas to check.
l_alphas = np.logspace(0, 15, 1000)

# Cross-validate over our list of Lasso alphas.
# max_iter must be an integer: the previous `10e5` is the float 1000000.0, which
# recent scikit-learn releases reject during parameter validation.
lasso_cv = LassoCV(alphas=l_alphas, max_iter=1000000)

# Run the search and fit on the scaled training split using the best Lasso alpha.
lasso_cv.fit(Z_train, y_train)

LassoCV(alphas=array([1.00000000e+00, 1.03517796e+00, 1.07159340e+00, 1.10928986e+00,
       1.14831241e+00, 1.18870770e+00, 1.23052400e+00, 1.27381132e+00,
       1.31862140e+00, 1.36500781e+00, 1.41302599e+00, 1.46273336e+00,
       1.51418933e+00, 1.56745541e+00, 1.62259529e+00, 1.67967487e+00,
       1.73876240e+00, 1.79992851e+00, 1.86324631e+00, 1.92879151e+00,
       1.99664245e+00, 2.06688025e+0...
       4.51496777e+14, 4.67379511e+14, 4.83820966e+14, 5.00840799e+14,
       5.18459354e+14, 5.36697695e+14, 5.55577622e+14, 5.75121707e+14,
       5.95353313e+14, 6.16296626e+14, 6.37976681e+14, 6.60419396e+14,
       6.83651600e+14, 7.07701066e+14, 7.32596543e+14, 7.58367791e+14,
       7.85045620e+14, 8.12661920e+14, 8.41249705e+14, 8.70843150e+14,
       9.01477631e+14, 9.33189772e+14, 9.66017480e+14, 1.00000000e+15]),
        max_iter=1000000.0)

In [32]:
# Lasso penalty selected by the cross-validated search above
lasso_cv.alpha_

114.03996019700331

In [33]:
# Tuned Lasso model: training-split score, then test-split score (one per line)
print(
    lasso_cv.score(Z_train, y_train),
    lasso_cv.score(Z_test, y_test),
    sep="\n",
)

0.9045125126979532
0.8970302754472178


## Summary

In [34]:
# Coefficients of the tuned ridge model (fitted on the standardised features),
# shown for comparison with the Lasso coefficients in the next cell
ridge_cv.coef_

array([-3500.93690385,  2769.9884128 ,  3309.84959526,  -534.68090863,
       14164.87184796,  5781.77388338,  6679.20344512,   581.80563531,
        4480.10750912,  5712.86546057, -2464.29206064, -3255.00634785,
       -8629.30795702,  3989.61474977, -2520.01293296,  6037.7036877 ,
         548.39646494,  -173.63834894,  1743.02411009,  7449.07621941,
        1773.97977837, -1565.66440992,  8412.5816016 ,  8962.10788088,
          63.77949634, 14404.98213965,  1338.61878714,  -700.01328936,
       -1168.33673824, -3927.75935821, -3606.41292003, -2497.68823943,
        4813.45501434,  2855.6000389 ,  3952.62600422, -5355.92504281,
         265.25945619,  3305.17282601,  -750.8966911 ,  -388.18119883,
        -488.33954033,  6366.99463218,  1757.15122002, -2525.70735204,
        2219.09062938,   101.88931495,   279.74123408,   423.3138835 ,
        3572.94238346, -2645.29194908,  1982.9550014 ,  -302.0009938 ,
          16.19253531,  -180.22683679,   393.8257038 , -1093.96040298,
      

In [35]:
# Coefficients of the tuned Lasso model (fitted on the standardised features);
# entries that are exactly zero in the output are features the L1 penalty dropped
lasso_cv.coef_

array([-3619.40161254,  2726.80223367,  3251.14107333,  -397.5065467 ,
       14574.06855863,  5897.74568464,  6300.81338828,    59.27631862,
        4351.9888944 ,  8461.34651556,  -421.98732486, -4080.29639065,
       -9285.19182378,  4027.81416101, -2384.25813472,  6198.02062545,
         255.97847648,     0.        ,     0.        ,  9322.10448432,
        1621.35433947, -1478.49329299,    98.44041124,     0.        ,
        -823.05970296, 25763.75552116,  1173.1054339 ,  -590.69070829,
       -1150.67667693, -3596.58503428, -3429.65951261, -2060.07843637,
        4888.54419537,  2018.19698138,  3849.66182023, -5916.96663224,
          -0.        ,  3032.46575839,   -81.10056983,  -108.95345022,
          -0.        ,  5820.95482515,   859.14344588,  -906.73693232,
        2028.94829613,     0.        ,    84.00061171,   356.51135465,
        3430.70669022, -1919.32640966,  1876.01912234,  -118.99979128,
          -0.        ,   -53.55688646,   291.22184875, -1006.69027529,
      

In [36]:
# Side-by-side summary: for each fitted model print a banner, then its
# training-split score and test-split score.
fitted_models = (
    (" OLS ", lr),
    (" Ridge ", ridge_cv),
    (" Lasso ", lasso_cv),
)

for position, (banner, model) in enumerate(fitted_models):
    if position > 0:
        print()  # blank line between sections, none after the last one
    print(banner.center(18, "="))
    print(model.score(Z_train, y_train))
    print(model.score(Z_test, y_test))

0.9055944594300793
0.8986651723239815

0.904584254692416
0.8974301604887753

0.9045125126979532
0.8970302754472178
