In [71]:
# Let's import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

In [72]:
df = pd.read_csv('train.csv')

In [73]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [74]:
df.drop('Id',axis = 1,inplace=True)

In [75]:
df.shape

(1460, 80)

In [76]:
df.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [77]:
df.columns[df.isnull().sum()>0]

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

In [78]:
df[['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2','Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt','GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence','MiscFeature']].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [79]:
df.select_dtypes(include=np.number)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,349,0,0,0,0,0,0,2,2010,210000
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,0,60,0,0,0,0,2500,5,2010,266500
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,366,0,112,0,0,0,0,4,2010,142125


---
## Linear Regression
---

In [80]:
Scaler = MinMaxScaler()

In [81]:
df_linreg = df
numeric_features = df_linreg.select_dtypes(include=np.number).columns

In [82]:
df_linreg[numeric_features] = Scaler.fit_transform(df_linreg[numeric_features])

In [83]:
df_linreg.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.235294,RL,0.150685,0.03342,Pave,,Reg,Lvl,AllPub,Inside,...,0.0,,,,0.0,0.090909,0.5,WD,Normal,0.241078
1,0.0,RL,0.202055,0.038795,Pave,,Reg,Lvl,AllPub,FR2,...,0.0,,,,0.0,0.363636,0.25,WD,Normal,0.203583
2,0.235294,RL,0.160959,0.046507,Pave,,IR1,Lvl,AllPub,Inside,...,0.0,,,,0.0,0.727273,0.5,WD,Normal,0.261908
3,0.294118,RL,0.133562,0.038561,Pave,,IR1,Lvl,AllPub,Corner,...,0.0,,,,0.0,0.090909,0.0,WD,Abnorml,0.145952
4,0.235294,RL,0.215753,0.060576,Pave,,IR1,Lvl,AllPub,FR2,...,0.0,,,,0.0,1.0,0.5,WD,Normal,0.298709


In [84]:
non_numeric = df_linreg.columns[~df_linreg.columns.isin(df_linreg[numeric_features].columns)]

In [85]:
for column in non_numeric:
    df_linreg = pd.concat([df_linreg,pd.get_dummies(df[column])],axis = 1)

In [86]:
df_linreg.drop(non_numeric,1,inplace=True)
df_linreg.shape


(1460, 289)

In [87]:
df_linreg.isnull().sum()[df_linreg.isnull().sum()>0]

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64

In [88]:
df_linreg.dropna(inplace=True)

## Test - Train Split

In [89]:
np.random.seed(0)
df_train, df_test = train_test_split(df_linreg, train_size = 0.7, test_size = 0.3, random_state = 100)

In [90]:
y_train = df_train['SalePrice']
X_train = df_train.drop('SalePrice',1)
y_test = df_test['SalePrice']
X_test = df_test.drop('SalePrice',1)

## Linear Regression

In [91]:
X_train_lm = sm.add_constant(X_train)
lr = sm.OLS(y_train,X_train_lm).fit()

In [92]:
(lr.summary())

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.925
Method:,Least Squares,F-statistic:,43.81
Date:,"Tue, 01 Mar 2022",Prob (F-statistic):,6.519999999999999e-258
Time:,11:45:11,Log-Likelihood:,1777.9
No. Observations:,784,AIC:,-3104.0
Df Residuals:,558,BIC:,-2050.0
Df Model:,225,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MSSubClass,-0.0086,0.035,-0.248,0.804,-0.077,0.060
LotFrontage,0.0121,0.027,0.454,0.650,-0.040,0.064
LotArea,0.5174,0.081,6.354,0.000,0.357,0.677
OverallQual,0.0739,0.019,3.971,0.000,0.037,0.110
OverallCond,0.0655,0.015,4.423,0.000,0.036,0.095
YearBuilt,0.0655,0.023,2.823,0.005,0.020,0.111
YearRemodAdd,0.0078,0.007,1.163,0.246,-0.005,0.021
MasVnrArea,0.0593,0.017,3.443,0.001,0.025,0.093
BsmtFinSF1,0.1459,0.032,4.502,0.000,0.082,0.210

0,1,2,3
Omnibus:,225.724,Durbin-Watson:,2.093
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4495.713
Skew:,0.778,Prob(JB):,0.0
Kurtosis:,14.628,Cond. No.,1.33e+16


In [93]:
model = LinearRegression()

In [94]:
model.fit(X_train_lm,y_train)
print(model.score(X_train_lm,y_train))

0.9464219419690233


In [105]:
y_test_pred = model.predict(X_test)

In [108]:
model.score(sm.add_constant(X_test),y_test)

-4.502032961444353e+19

## Ridge

In [109]:
for i in [0,0.0001,0.001,0.01,0.1,1,10,100]:
    ridge = Ridge(alpha=i)
    ridge.fit(X_train_lm,y_train)
    y_test_pred = ridge.predict(sm.add_constant(X_test))
    print("lambda = {}, r2 score = ".format(i) + str(r2_score(y_test, y_test_pred))) 

lambda = 0, r2 score = -6.267793847783145e+26
lambda = 0.0001, r2 score = 0.8169182661652024
lambda = 0.001, r2 score = 0.8174297674472316
lambda = 0.01, r2 score = 0.8218295760165091
lambda = 0.1, r2 score = 0.8352607591189984
lambda = 1, r2 score = 0.8191394660504794
lambda = 10, r2 score = 0.7929764480239943
lambda = 100, r2 score = 0.7144805425859218


## Lasso 

In [110]:
values = np.linspace(0,0.001,100)
results = []
for i in values:
    lasso = Lasso(alpha=i)
    lasso.fit(X_train_lm,y_train)
    y_test_pred = lasso.predict((sm.add_constant(X_test)))
    results.append(r2_score(y_test,y_test_pred))
#     print("lambda = {}, r2 score = ".format(i) + str(r2_score(y_test, y_test_pred))) 
#     print('{:0.1f}%'.format((lasso.coef_==0).sum()/288*100))
#     print('-'*50)


In [113]:
max(results)

0.8340197176261149

In [111]:
values[results.index(max(results))]

9.090909090909092e-05

In [114]:
lasso = Lasso(alpha=0.0001)
lasso.fit(X_train_lm,y_train)
y_test_pred = lasso.predict((sm.add_constant(X_test)))
print(r2_score(y_test,y_test_pred))

0.8340034701705483


Obtained a R2 score of 0.835

### Exploring features

In [122]:
(lasso.coef_==0).sum()

173

In [115]:
coeff_df = pd.DataFrame(data={'Columns':X_train_lm.columns,'Coeff':lasso.coef_})

In [116]:
coeff_df.sort_values('Coeff',ascending=False,inplace=True)

In [117]:
coeff_df.tail(5)

Unnamed: 0,Columns,Coeff
70,Edwards,-0.014592
41,Grvl,-0.018333
129,Tar&Grv,-0.019955
36,C (all),-0.020569
124,ClyTile,-0.578775
