In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from QBUS2820 import rmse_jack, r2_jack 

In [88]:
data = pd.read_csv('TrainStandard.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')


In [89]:
final_train.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_Oth,SaleType_VWD,SaleType_WD,Utilities_AllPub,Utilities_NoSewr
8,0.469641,-0.795752,-0.108531,0.042812,-1.008823,0.307671,0.609567,1.370578,-0.306636,-0.456898,...,0,0,0,0,0,0,0,1,1,0
791,-1.701872,0.644464,-0.108531,3.867311,-1.008823,0.307671,0.609567,-1.045567,-0.306636,1.328502,...,0,0,0,0,0,0,0,1,1,0
571,-0.739243,-0.795752,-0.108531,0.042812,0.216394,0.307671,0.609567,-0.14402,0.970805,-1.349598,...,0,0,0,0,0,0,0,1,1,0
224,-0.059245,-0.795752,-0.108531,0.042812,0.216394,0.307671,0.609567,0.093988,-0.103528,-1.349598,...,0,0,0,0,0,0,0,1,1,0
498,0.164622,-0.795752,-0.108531,0.042812,0.216394,0.307671,0.609567,-1.045567,-0.306636,1.328502,...,0,0,0,0,0,0,0,1,1,0


### Standardised LASSO Regression

In [90]:
y_train
mu=y_train.mean()
sigma=y_train.std() 

standardPrice=(y_train-mu)/sigma

8      0.670170
791   -0.315526
571   -0.751233
224   -0.658377
498   -0.186957
262    0.127323
73    -1.309794
795    1.170161
670    0.113037
731    0.740169
436   -0.684091
117   -0.229814
245   -0.186957
735   -0.258385
493   -0.586950
385    0.170179
443    0.455888
700   -1.215510
592   -0.972658
242   -0.715519
555   -0.608378
311   -0.972658
394   -0.745876
81     2.098716
370   -0.686948
386    2.998700
698   -0.829803
487   -0.151244
683   -0.501237
148   -0.601236
         ...   
18    -0.615521
168    1.813007
228   -1.394079
292   -0.879802
377    1.241589
472   -0.559808
397   -0.286956
227    2.327283
674    1.084449
607    0.205893
328   -0.301241
283   -0.636949
747   -0.979800
746   -1.015514
603   -1.558361
504    0.977308
93    -0.686948
213   -0.472667
258   -0.136958
337   -0.658377
425    0.097323
291   -1.088370
673   -0.194100
643   -1.501220
106   -0.474095
115   -0.544094
158    0.898738
797   -0.478381
112   -0.572665
36    -0.629807
Name: SalePrice, Length:

In [91]:
lasso = LassoCV(cv=10)
lasso.fit(final_train, np.ravel(standardPrice)) 
pred_L = lasso.predict(final_test)

In [92]:
predFinal = (pred_L*sigma) + mu

In [73]:
print(mean_absolute_error(y_test, predFinal))
print(r2_jack(y_test, predFinal))

14657.4053287
(0.89835013360354243, 0.027066848620392148)


### R-square of 0.89. Really strong result. Let's try this on Kaggle

### Prediction

In [77]:
data = pd.read_csv('TrainStandard.csv')
y_train = data.pop('SalePrice')

In [78]:
y_train
mu=y_train.mean()
sigma=y_train.std() 

standardPrice=(y_train-mu)/sigma
standardPrice

0     -0.554354
1     -0.789949
2      1.109088
3     -0.076025
4     -0.447266
5     -0.104582
6      1.637392
7      0.495114
8      0.637899
9     -1.718050
10     0.238101
11    -1.168329
12     1.651670
13    -0.804228
14    -0.304481
15     1.594556
16     0.709291
17     0.495114
18    -0.647164
19    -0.363023
20    -0.425848
21     0.032848
22    -0.175975
23    -0.534364
24     0.569362
25    -0.791377
26     0.459418
27     0.058192
28     1.423215
29    -0.889898
         ...   
774    0.923468
775   -0.789949
776    1.737341
777   -0.825645
778   -0.504380
779   -0.475823
780   -0.482962
781    0.922040
782   -0.220238
783    2.822505
784    0.495114
785   -0.474038
786    6.277897
787   -0.111721
788   -0.147418
789   -0.897038
790   -0.363023
791   -0.347316
792   -0.218810
793    0.309494
794   -1.325392
795    1.137645
796    0.936319
797   -0.510091
798   -0.218810
799    1.002000
800   -0.404430
801    0.066759
802   -1.303974
803    0.009646
Name: SalePrice, Length:

In [79]:
kaggle = pd.read_csv('TestStandard.csv')

In [48]:
kaggle.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_Oth,SaleType_VWD,SaleType_WD,Utilities_AllPub,Utilities_NoSewr
0,-0.297105,-0.795752,-0.108531,0.042812,-1.008823,0.307671,0.609567,-0.451748,2.178763,-1.349598,...,0,0,0,0,0,0,0,1,1,0
1,0.581575,-0.795752,-0.108531,0.042812,-1.008823,0.307671,-0.308208,0.976302,-0.306636,-0.903248,...,0,0,0,0,0,0,0,1,1,0
2,-0.437022,0.656466,-0.108531,0.042812,0.216394,0.307671,0.609567,-1.045567,-0.306636,1.328502,...,0,0,0,0,0,0,0,1,1,0
3,0.416473,1.045324,-0.108531,0.042812,0.216394,0.307671,-0.308208,-1.045567,-0.306636,1.328502,...,0,0,0,0,0,0,0,1,1,0
4,0.715895,-0.795752,11.442596,0.042812,-1.008823,0.307671,0.609567,0.142071,-0.306636,0.882152,...,0,0,0,0,0,0,0,1,1,0


In [76]:
lasso = LassoCV(cv=10)
lasso.fit(data, np.ravel(standardPrice)) 
pred_L = lasso.predict(kaggle)

ValueError: X and y have inconsistent dimensions (804 != 482)

In [50]:
#Conver the scale back
prediction_L = (pred_L*sigma) + mu

In [51]:
#This is for the indices
ind = np.arange(1,1609)
headers = ['Id','Prediction']
prediction = pd.DataFrame({'Id':ind, 'Prediction':predictions})
prediction

Unnamed: 0,Id,Prediction
0,1,88882.901908
1,2,159498.269115
2,3,146777.246622
3,4,190390.487553
4,5,151326.812478
5,6,344254.124723
6,7,130237.174914
7,8,281655.860316
8,9,114911.926623
9,10,123133.054324


In [52]:
#Saving results into CSV file 
prediction.to_csv("PredictionsStandardised.csv", index=False)

### Nonstandard

In [56]:
data = pd.read_csv('TrainSale1.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')


In [57]:
lasso = LassoCV(cv=10)
lasso.fit(final_train, np.ravel(y_train)) 
pred_L = lasso.predict(final_test)

In [58]:
print(mean_absolute_error(y_test, pred_L))
print(r2_jack(y_test, pred_L))

21681.6745836
(0.78519115450633303, 0.037783889579167514)


### Standardising really does help with LASSO!

# Lets try Ridge Now

In [60]:
#Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(final_train, np.ravel(y_train))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(final_train, np.ravel(y_train))
pred_R = ridge.predict(final_test)

In [62]:
print(mean_absolute_error(y_test, pred_R))
print(r2_jack(y_test, pred_R))

14822.2660239
(0.89430728415431615, 0.030947905036899932)


In [69]:
#Ridge naturally performs well

### Try standardising it now

In [93]:
data = pd.read_csv('TrainStandard.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')

y_train
mu=y_train.mean()
sigma=y_train.std() 

standardPrice=(y_train-mu)/sigma

In [94]:
alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(final_train, np.ravel(standardPrice))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(final_train, np.ravel(standardPrice))
pred_R = ridge.predict(final_test)

In [95]:
predFinal2 = (pred_R*sigma) + mu
print(mean_absolute_error(y_test, predFinal))
print(r2_jack(y_test, predFinal))

14657.4053287
(0.89835013360354243, 0.027066848620392148)


In [96]:
combinedPred = (predFinal2 + predFinal)/2
print(mean_absolute_error(y_test, combinedPred))
print(r2_jack(y_test, combinedPred))

14686.4905954
(0.89945042272882647, 0.027118362975003994)
