#### IMPORTING THE NECESSARY LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

#### READING AND UNDERSTANDING THE DATA

In [2]:
# Show the kernel's current working directory. The explicit %-prefix is used
# because automagic (bare `pwd`) is only resolved for single-line cells.
%pwd

'C:\\Users\\User'

In [3]:
# Switch into the folder that holds train.csv. Explicit %-magic is used because
# bare `cd` relies on automagic, which is only resolved for single-line cells.
# NOTE(review): this assumes the kernel starts in a directory that contains a
# `Downloads` sub-folder; the notebook is not portable to other machines as-is.
%cd Downloads

C:\Users\User\Downloads


In [4]:
# Load the training data. The relative path is resolved against the working
# directory selected by the preceding `cd Downloads` cell.
df = pd.read_csv('train.csv')

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
# (rows, columns) of the raw data.
df.shape

(1460, 81)

In [8]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


#### DATA PRE-PROCESSING

In [9]:
# Remove the 'Id' column so it is not used as a feature.
# NOTE(review): 'Id' runs 1..1460 in the summary above, i.e. it looks like a
# plain row identifier — confirm it carries no information.
df = df.drop(columns = ['Id'])

In [10]:
# Collect the names of every numeric-dtype column; these are the columns
# rescaled in the next cell.
numeric_columns = [
    name for name in df.columns
    if pd.api.types.is_numeric_dtype(df[name])
]

print(numeric_columns)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']


In [11]:
# Min-max rescale every numeric column and write the result back into df.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, and numeric_columns includes the target 'SalePrice'. Test-set
# statistics therefore leak into the features and all errors reported later
# are in scaled units — consider fitting on the training split only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_values

In [12]:
# Preview the data after scaling the numeric columns.
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.235294,RL,0.150685,0.03342,Pave,,Reg,Lvl,AllPub,Inside,...,0.0,,,,0.0,0.090909,0.5,WD,Normal,0.241078
1,0.0,RL,0.202055,0.038795,Pave,,Reg,Lvl,AllPub,FR2,...,0.0,,,,0.0,0.363636,0.25,WD,Normal,0.203583
2,0.235294,RL,0.160959,0.046507,Pave,,IR1,Lvl,AllPub,Inside,...,0.0,,,,0.0,0.727273,0.5,WD,Normal,0.261908
3,0.294118,RL,0.133562,0.038561,Pave,,IR1,Lvl,AllPub,Corner,...,0.0,,,,0.0,0.090909,0.0,WD,Abnorml,0.145952
4,0.235294,RL,0.215753,0.060576,Pave,,IR1,Lvl,AllPub,FR2,...,0.0,,,,0.0,1.0,0.5,WD,Normal,0.298709


In [13]:
# Missing-value audit: nulls per column, restricted to the columns that have
# at least one, plus the percentage of rows affected.
null_counts = df.isna().sum()
columns_with_null = null_counts.loc[null_counts > 0]
percentage_null = columns_with_null.div(len(df)).mul(100)

In [14]:
# Tabulate the audit: one row per column that contains nulls.
null_info = (
    pd.DataFrame({
        'Null Count': columns_with_null,
        'Percentage Null': percentage_null,
    })
    .rename_axis('Column Name')
    .reset_index()
)

In [15]:
# Show the missing-value table.
null_info

Unnamed: 0,Column Name,Null Count,Percentage Null
0,LotFrontage,259,17.739726
1,Alley,1369,93.767123
2,MasVnrType,8,0.547945
3,MasVnrArea,8,0.547945
4,BsmtQual,37,2.534247
5,BsmtCond,37,2.534247
6,BsmtExposure,38,2.60274
7,BsmtFinType1,37,2.534247
8,BsmtFinType2,38,2.60274
9,Electrical,1,0.068493


In [16]:
# Discard columns judged too sparse to keep (e.g. 'Alley' is ~94% null and
# 'LotFrontage' ~18% null in the table above).
# NOTE(review): the remaining names are presumably dropped for the same
# reason; their null shares are not visible in the truncated table — confirm.
sparse_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'LotFrontage', 'FireplaceQu']
df = df.drop(columns=sparse_columns)

In [17]:
# Drop every row that still contains a null in any remaining column
# (1460 -> 1338 rows according to the info() output below).
df = df.dropna()

In [18]:
# Preview the data after null handling.
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.235294,RL,0.03342,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0.0,0.0,0.0,0.0,0.0,0.090909,0.5,WD,Normal,0.241078
1,0.0,RL,0.038795,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0.0,0.0,0.0,0.0,0.0,0.363636,0.25,WD,Normal,0.203583
2,0.235294,RL,0.046507,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0.0,0.0,0.0,0.0,0.0,0.727273,0.5,WD,Normal,0.261908
3,0.294118,RL,0.038561,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0,WD,Abnorml,0.145952
4,0.235294,RL,0.060576,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0.0,0.0,0.0,0.0,0.0,1.0,0.5,WD,Normal,0.298709


In [19]:
# Confirm that no column has missing values any more.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1338 non-null   float64
 1   MSZoning       1338 non-null   object 
 2   LotArea        1338 non-null   float64
 3   Street         1338 non-null   object 
 4   LotShape       1338 non-null   object 
 5   LandContour    1338 non-null   object 
 6   Utilities      1338 non-null   object 
 7   LotConfig      1338 non-null   object 
 8   LandSlope      1338 non-null   object 
 9   Neighborhood   1338 non-null   object 
 10  Condition1     1338 non-null   object 
 11  Condition2     1338 non-null   object 
 12  BldgType       1338 non-null   object 
 13  HouseStyle     1338 non-null   object 
 14  OverallQual    1338 non-null   float64
 15  OverallCond    1338 non-null   float64
 16  YearBuilt      1338 non-null   float64
 17  YearRemodAdd   1338 non-null   float64
 18  RoofStyl

In [20]:
# Names of the remaining categorical (object-dtype) columns; the list is the
# cell's displayed output.
object_columns = df.select_dtypes(include=['object'])
object_column_names = list(object_columns.columns)
object_column_names

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [21]:
# Encode the binary CentralAir flag as 1/0. The dataset codes this column as
# 'Y'/'N'; the previous 'Yes'/'No' mapping matched nothing and turned the whole
# column into NaN. The long spellings are kept for backward compatibility.
df['CentralAir'] = df['CentralAir'].map({'Y': 1, 'N': 0, 'Yes': 1, 'No': 0})

In [22]:
# CentralAir is handled separately, so exclude it from the columns that get
# dummy-encoded. A filter is used instead of list.remove() so that re-running
# this cell does not raise ValueError.
object_column_names = [name for name in object_column_names if name != 'CentralAir']

In [23]:
# Create dummy (one-hot) variables for the categorical columns.
# drop_first=True omits the first level of each column so that the dummies of
# one column are not perfectly collinear with each other.
status = pd.get_dummies(df[object_column_names], drop_first=True)
status.head()

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_HLS,LandContour_Low,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [24]:
# Append the dummy columns to the frame (column-wise, aligned on the index).
df = pd.concat([df, status], axis = 1)

In [25]:
# Remove the original categorical columns now that their dummy encodings have
# been appended to df.
# NOTE(review): 'CentralAir' was converted to a numeric flag in an earlier
# cell but is removed here as well, so it never reaches the models — confirm
# that this is intended.
df = df.drop(columns=[*object_column_names, 'CentralAir'])

#### TRAIN-TEST SPLIT

In [26]:
# 70/30 split with a fixed seed so the partition is reproducible.
df_train, df_test = train_test_split(df, train_size=0.7, random_state=100)
for part in (df_train, df_test):
    print(part.shape)

(936, 226)
(402, 226)


In [27]:
# Split the training frame into features and target. pop() removes
# 'SalePrice' from the frame in place, so X_train and df_train are the same
# object and this cell cannot be re-run without repeating the split.
X_train = df_train
y_train = X_train.pop('SalePrice')

In [28]:
# Same feature/target separation for the held-out frame (in place, see the
# training cell: X_test and df_test are the same object).
X_test = df_test
y_test = X_test.pop('SalePrice')

In [29]:
# Preview the (scaled) training target.
y_train.head()

1059    0.257048
571     0.118178
173     0.177892
732     0.260519
1282    0.160533
Name: SalePrice, dtype: float64

In [30]:
# Preview the training feature matrix.
X_train.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1059,0.176471,0.046624,0.555556,0.75,0.434783,0.0,0.3,0.052622,0.377883,0.0,...,0,0,0,0,1,0,0,0,1,0
571,0.0,0.028194,0.555556,0.625,0.630435,0.15,0.129375,0.073352,0.0,0.192637,...,0,0,0,0,1,0,0,0,0,0
173,0.0,0.041585,0.555556,0.5,0.644928,0.183333,0.306875,0.051028,0.253731,0.299658,...,0,0,0,0,0,0,0,0,1,0
732,0.235294,0.047227,0.666667,0.5,0.913043,0.816667,0.12625,0.044649,0.0,0.385702,...,0,0,0,0,1,0,0,0,1,0
1282,0.0,0.035056,0.444444,0.75,0.76087,0.966667,0.0,0.094259,0.097693,0.155822,...,0,0,0,0,1,0,0,0,1,0


#### BUILDING A LINEAR REGRESSION MODEL

In [31]:
# Baseline: ordinary least squares on all features, no regularisation.
# NOTE(review): the test R2 printed in the next cell is about -2e20 and several
# coefficients in the coefficient table are of order 1e10-1e11, which points to
# (near-)collinear features — presumably columns that are sums of other
# columns and/or redundant dummies; verify before relying on this model.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [32]:
# Score the unregularised linear model on both splits.
y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)

# Coefficient of determination.
r2_train_lr = r2_score(y_train, y_pred_train)
r2_test_lr = r2_score(y_test, y_pred_test)
# Residual sum of squares.
rss_train_lr = np.sum(np.square(y_train - y_pred_train))
rss_test_lr = np.sum(np.square(y_test - y_pred_test))
# Mean squared error.
mse_train_lr = mean_squared_error(y_train, y_pred_train)
mse_test_lr = mean_squared_error(y_test, y_pred_test)

# Order matters: the comparison table built later labels these positionally.
metric = [r2_train_lr, r2_test_lr, rss_train_lr, rss_test_lr, mse_train_lr, mse_test_lr]
print(*metric, sep='\n')

0.9315399644390137
-1.9971879376675167e+20
0.7658760891585316
9.679760643506574e+20
0.0008182436849984312
2.4079006575887e+18


#### RIDGE REGRESSION

In [33]:
# Candidate regularisation strengths for the grid searches below
# (28 values from 1e-4 to 1000; reused for both Ridge and Lasso).
params = {'alpha' : [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000]}

In [34]:
# Unfitted Ridge estimator used as the template for the grid search.
ridge = Ridge()

In [35]:
# 5-fold cross-validated search over the alpha grid for Ridge, ranked by
# (negated) mean absolute error.
folds = 5
model_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [36]:
# Alpha selected by the Ridge grid search.
print(model_cv.best_params_)

{'alpha': 3.0}


In [37]:
# Refit Ridge on the full training split with the alpha chosen by the grid
# search. The value is read from the search result instead of being retyped,
# so it cannot drift from the search output.
# model_cv is reused later for the Lasso search, hence the guard against
# running this cell out of order.
assert isinstance(model_cv.estimator, Ridge), 'model_cv does not hold the Ridge search; re-run the Ridge grid search first'
alpha = model_cv.best_params_['alpha']
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)
print(ridge.coef_)

[-2.28507806e-02  2.44022184e-02  7.98167332e-02  4.58613847e-02
  2.66984472e-02  1.12252220e-02  4.45356674e-02  6.47170391e-02
  6.19650413e-03  2.79043573e-03  6.23428926e-02  8.15101805e-02
  8.91773270e-02  1.01472017e-02  1.02708505e-01  2.19588475e-02
 -5.96943961e-04  3.22953615e-02  1.05034674e-02 -2.77618211e-03
 -1.82382042e-02  5.75118012e-02  2.51172935e-02 -1.05549803e-02
  2.36171478e-02  5.16564974e-02  2.02809232e-02  6.22623861e-03
  1.00059630e-02  1.34049341e-02  1.49991910e-02  4.98591897e-02
  1.28473356e-03 -3.99830939e-03 -7.83112909e-04  1.47755322e-02
  3.88894362e-03  1.20686605e-02  6.48699975e-03  1.83005675e-02
  1.29719847e-02  1.26639510e-02 -2.26655785e-03  2.06719257e-02
 -3.74909583e-03  1.58681100e-02 -1.05755765e-02  1.17391599e-02
 -1.44901728e-02 -4.82206471e-03 -3.78544747e-03  3.88391091e-03
 -1.28188414e-02 -2.04183942e-03  3.68159683e-04  5.65680292e-03
 -2.05787038e-04 -7.23737588e-03  2.24415205e-02 -2.12202626e-02
 -1.39585507e-02 -1.01026

In [38]:
# Score the Ridge model on both splits.
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# Coefficient of determination.
r2_train_rd = r2_score(y_train, y_pred_train)
r2_test_rd = r2_score(y_test, y_pred_test)
# Residual sum of squares.
rss_train_rd = np.sum(np.square(y_train - y_pred_train))
rss_test_rd = np.sum(np.square(y_test - y_pred_test))
# Mean squared error.
mse_train_rd = mean_squared_error(y_train, y_pred_train)
mse_test_rd = mean_squared_error(y_test, y_pred_test)

# Order matters: the comparison table built later labels these positionally.
metric2 = [r2_train_rd, r2_test_rd, rss_train_rd, rss_test_rd, mse_train_rd, mse_test_rd]
print(*metric2, sep='\n')

0.913369346783041
0.7915615367455991
0.9691544175133877
1.0102376422117307
0.0010354213862322518
0.0025130289607256983


#### LASSO REGRESSION

In [39]:
# Same 5-fold search as for Ridge, now with a Lasso estimator. This rebinds
# model_cv, so the Ridge search result is no longer available afterwards.
# NOTE(review): the alpha printed in the next cell (0.0001) is the smallest
# value in the grid, so the optimum may lie below the searched range.
lasso = Lasso()

folds = 5
model_cv = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


In [40]:
# Alpha selected by the Lasso grid search.
print(model_cv.best_params_)

{'alpha': 0.0001}


In [41]:
# Refit Lasso on the full training split with the alpha chosen by the grid
# search. The value is read from the search result instead of being retyped,
# so it cannot drift from the search output.
# model_cv is shared with the Ridge search, hence the guard against running
# this cell out of order.
assert isinstance(model_cv.estimator, Lasso), 'model_cv does not hold the Lasso search; re-run the Lasso grid search first'
alpha = model_cv.best_params_['alpha']
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)
print(lasso.coef_)

[-3.15563676e-02  1.21792156e-02  1.02479664e-01  6.06812588e-02
  4.12082145e-02  7.19636690e-03  4.45411395e-02  1.14179414e-01
  0.00000000e+00 -0.00000000e+00  7.20529848e-02  0.00000000e+00
  3.34439241e-02 -0.00000000e+00  3.85678064e-01  1.19932557e-02
 -0.00000000e+00  5.80175296e-03 -0.00000000e+00 -0.00000000e+00
 -3.85789081e-03  1.07242764e-02  1.69827875e-02 -0.00000000e+00
  1.10279296e-03  6.62251098e-02  8.55989993e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.04401208e-02  7.48096478e-02
 -0.00000000e+00 -1.99764653e-03 -0.00000000e+00  7.73634842e-03
  0.00000000e+00  7.40385252e-03 -0.00000000e+00  1.84777511e-04
  9.61927588e-03  0.00000000e+00 -4.93469174e-04  1.73031366e-02
  0.00000000e+00  1.31899976e-02 -0.00000000e+00  1.27076193e-02
 -7.43521714e-03 -0.00000000e+00 -2.38109608e-03  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  8.70671131e-03
  2.98841282e-03 -5.75817246e-04  2.66121484e-02 -1.16459736e-02
  0.00000000e+00 -0.00000

In [42]:
# Score the Lasso model on both splits.
y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

# Coefficient of determination.
r2_train_ls = r2_score(y_train, y_pred_train)
r2_test_ls = r2_score(y_test, y_pred_test)
# Residual sum of squares.
rss_train_ls = np.sum(np.square(y_train - y_pred_train))
rss_test_ls = np.sum(np.square(y_test - y_pred_test))
# Mean squared error.
mse_train_ls = mean_squared_error(y_train, y_pred_train)
mse_test_ls = mean_squared_error(y_test, y_pred_test)

# Order matters: the comparison table built later labels these positionally.
metric3 = [r2_train_ls, r2_test_ls, rss_train_ls, rss_test_ls, mse_train_ls, mse_test_ls]
print(*metric3, sep='\n')

0.9172669171324392
0.7623717476661062
0.9255515196773475
1.1517116448305929
0.0009888370936723798
0.002864954340374609


#### COMPARING MODELS

In [43]:
# Build a table to compare the model scores. Each metric list holds its values
# in the same order as the 'Metric' labels, so the columns line up row by row.
final_metric = pd.DataFrame({
    'Metric': ['R2Score(train)', 'R2Score(test)', 'RSS(Train)', 'RSS(Test)', 'MSE(Train)', 'MSE(Test)'],
    'Linear Regression': metric,
    'Ridge': metric2,
    'Lasso': metric3,
})
final_metric

Unnamed: 0,Metric,Linear Regression,Ridge,Lasso
0,R2Score(train),0.93154,0.913369,0.917267
1,R2Score(test),-1.997188e+20,0.791562,0.762372
2,RSS(Train),0.7658761,0.969154,0.925552
3,RSS(Test),9.679761e+20,1.010238,1.151712
4,MSE(Train),0.0008182437,0.001035,0.000989
5,MSE(Test),2.407901e+18,0.002513,0.002865


In [44]:
# Coefficients of the three fitted models side by side, one row per feature.
# The former `betas.rows = ...` assignment was removed: DataFrame has no
# `rows` attribute, so it only attached an unused attribute to the object.
betas = pd.DataFrame(
    {
        'Linear': lm.coef_,
        'Ridge': ridge.coef_,
        'Lasso': lasso.coef_,
    },
    index=X_train.columns,
)

In [45]:
# Display the coefficients: lift the row cap for DataFrame display (a global
# pandas option that stays in effect for later cells) and show the first 80 rows.
pd.set_option('display.max_rows', None)
betas.head(80)

Unnamed: 0,Linear,Ridge,Lasso
MSSubClass,-0.0233134,-0.022851,-0.031556
LotArea,0.1614345,0.024402,0.012179
OverallQual,0.07719604,0.079817,0.10248
OverallCond,0.07104006,0.045861,0.060681
YearBuilt,0.07635608,0.026698,0.041208
YearRemodAdd,0.00467946,0.011225,0.007196
MasVnrArea,0.04958887,0.044536,0.044541
BsmtFinSF1,-132174900000.0,0.064717,0.114179
BsmtFinSF2,-34519100000.0,0.006197,0.0
BsmtUnfSF,-54705980000.0,0.00279,-0.0


#### REFITTING RIDGE AND LASSO WITH DOUBLE THE OPTIMAL 'ALPHA'

In [46]:
# Ridge with twice the alpha selected by the grid search (3.0 -> 6.0).
# This replaces the previously fitted `ridge` model.
alpha = 6.0
ridge = Ridge(alpha=alpha).fit(X_train, y_train)
print(ridge.coef_)

[-2.29043024e-02  1.60274056e-02  7.21397993e-02  3.55504784e-02
  1.73382624e-02  1.38810150e-02  3.95548721e-02  4.65523691e-02
  3.71512149e-03  7.22976952e-03  4.66622589e-02  6.35703507e-02
  7.03574558e-02  8.38509242e-03  8.04679743e-02  2.26853674e-02
 -1.13303590e-03  3.62080639e-02  1.25576064e-02  5.92540454e-03
 -1.14331594e-02  5.49514943e-02  3.01468349e-02 -4.58305161e-03
  2.83271975e-02  4.62233022e-02  2.09909454e-02  7.20299591e-03
  6.75216016e-03  1.01140447e-02  1.53320373e-02  3.36997036e-02
  4.30287047e-04 -3.61100587e-03 -1.04935283e-03  1.02460854e-02
  9.13524139e-04  1.01480131e-02  2.16252635e-03  1.28311018e-02
  1.27364250e-02  9.49893396e-03 -3.51178987e-03  1.73097332e-02
 -2.48158797e-03  1.34822741e-02 -5.72042664e-03  1.12497050e-02
 -1.37240613e-02 -3.33192124e-03 -4.16211654e-03  4.47007889e-03
 -1.00351704e-02 -1.20081797e-03  4.32930275e-04  6.26491881e-03
 -8.26479070e-04 -8.60015017e-03  2.24294356e-02 -2.06295338e-02
 -1.73115292e-02 -9.00105

In [47]:
# R2 of the doubled-alpha Ridge model on both splits (overwrites the earlier
# r2_*_rd values).
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

r2_train_rd = r2_score(y_train, y_pred_train)
r2_test_rd = r2_score(y_test, y_pred_test)
print(r2_train_rd, r2_test_rd, sep='\n')

0.9026532741009903
0.8072930071895629


In [48]:
# Lasso with twice the alpha selected by the grid search (0.0001 -> 0.0002).
# This replaces the previously fitted `lasso` model.
alpha = 0.0002
lasso = Lasso(alpha=alpha).fit(X_train, y_train)
print(lasso.coef_)

[-3.39974645e-02  0.00000000e+00  1.08345172e-01  4.59235865e-02
  3.32118673e-02  9.80051721e-03  3.41614247e-02  9.98334038e-02
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.09338099e-02 -0.00000000e+00  4.19686345e-01  1.16071845e-02
 -0.00000000e+00  3.64167314e-03  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  6.67315797e-03  1.97652062e-02  0.00000000e+00
  2.80249982e-03  6.20073329e-02  6.83453095e-03  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  9.01078934e-03  2.99015488e-02
 -0.00000000e+00 -2.99795023e-04 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  4.51227301e-03 -2.00175719e-03  0.00000000e+00
  8.96996047e-03  0.00000000e+00 -1.70702184e-03  1.19304444e-02
  0.00000000e+00  9.14767948e-03 -0.00000000e+00  1.11217934e-02
 -4.73042897e-03 -0.00000000e+00 -2.24493491e-03  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  5.64715902e-03
  0.00000000e+00 -0.00000000e+00  2.38610091e-02 -9.68461733e-03
 -0.00000000e+00 -0.00000

In [49]:
# R2 of the doubled-alpha Lasso model on both splits (overwrites the earlier
# r2_*_ls values).
y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

r2_train_ls = r2_score(y_train, y_pred_train)
r2_test_ls = r2_score(y_test, y_pred_test)
print(r2_train_ls, r2_test_ls, sep='\n')

0.9073810899364011
0.7836726752311289


In [50]:
# Coefficient table rebuilt with the doubled-alpha Ridge and Lasso models
# (the 'Linear' column is unchanged), one row per feature.
# The former `betas.rows = ...` assignment was removed: DataFrame has no
# `rows` attribute, so it only attached an unused attribute to the object.
betas = pd.DataFrame(
    {
        'Linear': lm.coef_,
        'Ridge': ridge.coef_,
        'Lasso': lasso.coef_,
    },
    index=X_train.columns,
)

In [51]:
# Lift the display row cap (global pandas option) and show the whole
# coefficient table.
pd.set_option('display.max_rows', None)
betas

Unnamed: 0,Linear,Ridge,Lasso
MSSubClass,-0.0233134,-0.022904,-0.033997
LotArea,0.1614345,0.016027,0.0
OverallQual,0.07719604,0.07214,0.108345
OverallCond,0.07104006,0.03555,0.045924
YearBuilt,0.07635608,0.017338,0.033212
YearRemodAdd,0.00467946,0.013881,0.009801
MasVnrArea,0.04958887,0.039555,0.034161
BsmtFinSF1,-132174900000.0,0.046552,0.099833
BsmtFinSF2,-34519100000.0,0.003715,0.0
BsmtUnfSF,-54705980000.0,0.00723,-0.0
