In [118]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import warnings
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from QBUS2820 import rmse_jack, r2_jack 
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from QBUS2820 import forward
from sklearn.ensemble import ExtraTreesRegressor, BaggingRegressor, GradientBoostingRegressor,RandomForestRegressor
from sklearn import linear_model
from sklearn.neighbors import RadiusNeighborsRegressor,NearestNeighbors

### Using standardised data for LASSO, and Ridge

In [94]:
data = pd.read_csv('TrainStandard.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')

y_train
mu=y_train.mean()
sigma=y_train.std() 

standardPrice=(y_train-mu)/sigma

In [95]:
method = []
pred = []

In [96]:
#LASSO
lasso = LassoCV(cv=10)
lasso.fit(final_train, np.ravel(standardPrice)) 
pred_L = lasso.predict(final_test)
predFinalLasso = (pred_L*sigma) + mu
method.append('LASSO')
pred.append(predFinalLasso)

In [97]:
#RIDGE
alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(final_train, np.ravel(standardPrice))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(final_train, np.ravel(standardPrice))
pred_R = ridge.predict(final_test)
predFinalRidge = (pred_R*sigma) + mu
method.append('Ridge')
pred.append(predFinalRidge)

In [119]:
#ENET
#Elastic Net

enet = ElasticNetCV(l1_ratio=[0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.99], cv=10)
enet.fit(final_train, np.ravel(standardPrice))

from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=enet.alpha_, l1_ratio=enet.l1_ratio_)
enet.fit(final_train, np.ravel(standardPrice))
pred_E = enet.predict(final_test)
predFinalEnet = (pred_E*sigma) + mu
method.append('ENET')
pred.append(predFinalEnet)

### Now using a regular dataset

In [98]:
data = pd.read_csv('TrainSale1.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')

In [99]:
#Forward with non-standardised data
fwd = forward()
fwd.fit(final_train, y_train)
predFinalForwardNormal = fwd.predict(final_test)

In [100]:
method.append('Forward Sel')
pred.append(predFinalForwardNormal)

### Using esemble methods

### We specify a loss function of either least square (LS) or least absolute deviation (LAD)
### LAD does better if outliers present and LS does better if no outliers present

In [116]:
#We can fine tune and see if we are able to find superior model of adaboost
#We can adjust number of estimators, the learning rate, and the base estimator used
n_estimators = range(50, 400, 50)
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1]
lossType = ['square','linear']
param = []
adapred = []
#This is to keep track of which one to use
minMAE = 10000000000000
optEstAda=0 
optRateAda = 0

for i in n_estimators:
    for j in learning_rate:
        for k in lossType:
            regr = AdaBoostRegressor(loss=k, learning_rate = j, n_estimators = i)
            regr = regr.fit(final_train,y_train)
            predFinalAdaBoost = regr.predict(final_test)
            adapred.append(predFinalAdaBoost)
            param.append("{} estimators, {} rate and {} type".format(i, j, k))
            if((mean_absolute_error(y_test, predFinalAdaBoost)<minMAE) and (r2_score(y_test,predFinalAdaBoost)>0.8)):
                optEstAda = i
                optRateAda = j
                opttypeAda = k               
getResultTable(param,adapred)
#From this, we can figure out which tuning specification we should use 

Unnamed: 0,Test RMSE,SE,Jack R2,SE.1,MAE,R-square
"50 estimators, 0.0001 rate and square type",29649.544,3433.706,0.820,0.025,21269.327,0.820
"50 estimators, 0.0001 rate and linear type",29852.698,3033.374,0.818,0.021,21825.917,0.818
"50 estimators, 0.001 rate and square type",28800.346,3282.502,0.830,0.022,20752.061,0.830
"50 estimators, 0.001 rate and linear type",29583.824,2880.191,0.821,0.020,22100.676,0.821
"50 estimators, 0.01 rate and square type",28863.377,3103.781,0.830,0.021,21160.105,0.830
"50 estimators, 0.01 rate and linear type",28350.894,3007.248,0.836,0.020,20966.847,0.836
"50 estimators, 0.1 rate and square type",27659.435,2662.983,0.844,0.018,20637.702,0.844
"50 estimators, 0.1 rate and linear type",27369.429,2473.351,0.847,0.017,20326.353,0.847
"50 estimators, 0.2 rate and square type",26621.393,1778.866,0.855,0.015,20364.046,0.855
"50 estimators, 0.2 rate and linear type",28100.092,3912.520,0.839,0.029,20082.683,0.839


In [109]:
print()
print("Optimal for AdaBoost was {} estimators, {} rate and {} type".format(optEstAda, optRateAda, opttypeAda))


Optimal for AdaBoost was 350 estimators, 1 rate and linear type


### Fine tune parameters for random Forest

In [144]:
#We can fine tune and see if we are able to find superior model of randomForest too
#We can adjust number of estimators, the learning rate, and the base estimator used
n_estimators = range(5, 29, 3)
depth = range(1,5,1)
samSplit = [2,3]
featureType = ['auto','sqrt']
paramFor = []
Forpred = []
#This is to keep track of which one to use
minMAE = 10000000000000
optEstFor=0 
optDepthFor = 0
optSplit = 0
for i in n_estimators:
    for j in depth:
        for k in samSplit:
            for l in featureType:
                regr = RandomForestRegressor(n_estimators = i, max_depth=j, min_samples_split=k ,criterion ='mae', max_features = l)
                regr = regr.fit(final_train,y_train)
                predFinalRandomForest = regr.predict(final_test)           
                Forpred.append(predFinalRandomForest)
                paramFor.append("{} estimators, {} depth, {} split, and {} type".format(i, j, k, l))
                if((mean_absolute_error(y_test, predFinalRandomForest)<minMAE) and (r2_score(y_test,predFinalRandomForest)>0.8)):
                    optEstFor = i
                    optDepthFor = j
                    optSplit = k
                    opttypeFor = l               
getResultTable(paramFor,Forpred)
#From this, we can figure out which tuning specification we should use 

ValueError: Number of labels=804 does not match number of samples=482

In [104]:
print()

print("Optimal for Random For was {} estimators, {} depth, {} split and {} type".format(optEstFor, optDepthFor, optSplit, opttypeFor))


Optimal for Random For was 26 estimators, 4 depth, 3 split and sqrt type


### Finally for gradient boosting

In [105]:
#We can fine tune and see if we are able to find superior model of Gradientboost
#We can adjust number of estimators, the learning rate, and the base estimator used
n_estimators = range(50, 450, 50)
learning_rate = [0.01, 0.2, 0.6, 1]
depth = range(1,4,1)
samSplit = [2,3]
paramGrad = []
Gradpred = []
#This is to keep track of which one to use
minMAE = 10000000000000
optEstGrad=0 
optRateGrad = 0
optDepthGrad = 0
optSplitGrad = 0

for i in n_estimators:
    for j in learning_rate:
        for k in depth:
            for l in samSplit:
                regr = GradientBoostingRegressor(loss='lad', n_estimators = i, learning_rate = j, max_depth = k, min_samples_split=l)
                regr = regr.fit(final_train,y_train)
                predFinalGradBoostlad = regr.predict(final_test)          
                Gradpred.append(predFinalGradBoostlad)
                paramFor.append("{} estimators, {} rate, {} depth, and {} split".format(i, j, k, l))
                if((mean_absolute_error(y_test, predFinalGradBoostlad)<minMAE) and (r2_score(y_test,predFinalGradBoostlad)>0.8) ):
                    optEstGrad = i
                    optRateGrad = j
                    optDepthGrad = k
                    optSplitGrad = l               
getResultTable(paramGrad,Gradpred)
#From this, we can figure out which tuning specification we should use 

Unnamed: 0,Test RMSE,SE,Jack R2,SE.1,MAE,R-square


In [115]:
print()
print("Optimal for Gradient boost was {} estimators, {} rate,  {} depth, {} split".format(optEstGrad, optRateGrad, optDepthGrad, optSplitGrad))


Optimal for Gradient boost was 400 estimators, 1 rate,  3 depth, 3 split


In [148]:
#Now using Ensemble methods
#http://scikit-learn.org/stable/modules/ensemble.html
#http://scikit-learn.org/stable/modules/ensemble.html#adaboost

#Bagging and random forest use averages
#Boosting combine several weak models into a powerful model
#LS first
#Random Forest
regr = RandomForestRegressor(n_estimators = optEstFor, max_depth=optDepthFor, min_samples_split=optSplit ,criterion ='mae', max_features = opttypeFor)
regr = regr.fit(final_train,y_train)
predFinalRandomForest = regr.predict(final_test)        

#GradientBoosting aka Stochastic gradient boosting
regr = GradientBoostingRegressor(n_estimators = optEstGrad, learning_rate = optRateGrad, max_depth = optDepthGrad, min_samples_split=optSplitGrad)
regr = regr.fit(final_train,y_train)
predFinalGradBoostLS = regr.predict(final_test)

#Extremely Random forest
regr = ExtraTreesRegressor(max_depth=None, min_samples_split=2)
regr = regr.fit(final_train,y_train)
predFinalExtRandomForestLS = regr.predict(final_test)

regr = BaggingRegressor()
regr = regr.fit(final_train,y_train)
predFinalBagging = regr.predict(final_test)

regr = AdaBoostRegressor(loss=opttypeAda, learning_rate = optRateAda, n_estimators = optEstAda)
regr = regr.fit(final_train,y_train)
predFinalAdaBoost = regr.predict(final_test)

method.append('Random ForestLS')
pred.append(predFinalRandomForest)
method.append('GradientBoostingLS')
pred.append(predFinalGradBoostLS)
method.append('ExtraTreesRegressorLS')
pred.append(predFinalExtRandomForestLS)
method.append('BaggingRegressorLS')
pred.append(predFinalBagging)
method.append('AdaBoostRegressorLS')
pred.append(predFinalAdaBoost)


#GradientBoosting
regr = GradientBoostingRegressor(loss='lad', n_estimators = optEstGrad, learning_rate = optRateGrad, max_depth = optDepthGrad, min_samples_split=optSplitGrad)
regr = regr.fit(final_train,y_train)
predFinalGradBoostlad = regr.predict(final_test)   

#Extremely Random forest
regr = ExtraTreesRegressor(criterion='mae',max_depth=None,min_samples_split=2)
regr = regr.fit(final_train,y_train)
predFinalExtRandomForestlad = regr.predict(final_test)

regr = AdaBoostRegressor(loss='linear')
regr = regr.fit(final_train,y_train)
predFinalAdaBoostlad = regr.predict(final_test)

method.append('GradientBoostingLAD')
pred.append(predFinalGradBoostlad)
method.append('ExtraTreesRegressorLAD')
pred.append(predFinalExtRandomForestLS)

ValueError: n_estimators must be greater than zero, got 0.

### Some new techniques

In [111]:
#Bayesian Ridge Regression with normalisation of data
clf = linear_model.BayesianRidge(normalize=True)
clf.fit(final_train,y_train)
predFinalBayRidge = clf.predict(final_test)
method.append('Bayseian Ridge')
pred.append(predFinalBayRidge)

### KNN does not work for unsupervised learning/predictions

# Results of our models

In [None]:
# So I don't have to construct these tables manually 
def getResultTable(rows, predictions):
    columns=['Test RMSE', 'SE', 'Jack R2', 'SE', 'MAE', 'R-square']
    results=pd.DataFrame(0.0, columns=columns, index=rows) 
    
    for row,pred in zip(range(0,len(rows)),predictions):
        results.iloc[row,0], results.iloc[row,1] = rmse_jack(y_test, pred)
        results.iloc[row,2], results.iloc[row,3] = (r2_jack(y_test, pred))
        results.iloc[row,4] = mean_absolute_error(y_test, pred)
        results.iloc[row,5] = r2_score(y_test,pred)
    return results.round(3)

In [120]:
getResultTable(method,pred)

Unnamed: 0,Test RMSE,SE,Jack R2,SE.1,MAE,R-square
LASSO,22302.821,3273.858,0.898,0.027,14657.405,0.898
Ridge,22198.874,3205.593,0.899,0.027,14814.092,0.899
Forward Sel,23308.651,2581.009,0.889,0.022,15289.87,0.889
Random ForestLS,32890.253,4167.495,0.779,0.028,20847.5,0.779
GradientBoostingLS,35093.947,2416.598,0.748,0.04,24355.783,0.748
ExtraTreesRegressorLS,22010.162,1882.558,0.901,0.011,15477.165,0.901
BaggingRegressorLS,23734.254,2445.746,0.885,0.015,16902.539,0.885
AdaBoostRegressorLS,24834.308,2427.106,0.874,0.015,18086.837,0.874
GradientBoostingLAD,37042.179,2921.321,0.72,0.034,26811.292,0.72
ExtraTreesRegressorLAD,22010.162,1882.558,0.901,0.011,15477.165,0.901


# KAGGLE PREDICTION

## Using ExtraTreeLAD, Ridge, Lasso, and Bayesian Ridge

In [133]:
#Using standardised data for ridge and lasso
data = pd.read_csv('TrainStandard.csv')
y_train = data.pop('SalePrice')

y_train
mu=y_train.mean()
sigma=y_train.std() 

standardPrice=(y_train-mu)/sigma

In [134]:
kaggle1 = pd.read_csv('TestStandard.csv')

In [135]:
#RIDGE
alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(data, np.ravel(standardPrice))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(data, np.ravel(standardPrice))
pred_R_Kaggle = ridge.predict(kaggle1)
predTestRidge = (pred_R_Kaggle*sigma) + mu

#LASSO
lasso = LassoCV(cv=10)
lasso.fit(data, np.ravel(standardPrice)) 
pred_L_Kaggle = lasso.predict(kaggle1)
predTestLasso = (pred_L_Kaggle*sigma) + mu


In [137]:
#Using standard dataset now
data2 = pd.read_csv('TrainSale1.csv')
y_train2 = data2.pop('SalePrice')
kaggle2 = pd.read_csv('TestSale1.csv')

In [138]:
#Bayesian Ridge Regression with normalisation of data
clf = linear_model.BayesianRidge(normalize=True)
clf.fit(data2,y_train2)
predTestBayRidge = clf.predict(kaggle2)


In [139]:
regr = ExtraTreesRegressor(criterion='mae',max_depth=None,min_samples_split=2)
regr = regr.fit(data2,y_train2)
predTestRandomForest = regr.predict(kaggle2)

In [140]:
KagglePred1 = (predTestRandomForest+predTestBayRidge+predTestLasso+predTestRidge)/4
#This is for the indices
ind = np.arange(1,1609)
headers = ['Id','Prediction']
prediction2 = pd.DataFrame({'Id':ind, 'Prediction':KagglePred1})
prediction2
#Saving results into CSV file 
prediction2.to_csv("PredictionDay2_2.csv", index=False)

### Combine with forward selection

In [142]:
#Forward with non-standardised data
fwd = forward()
fwd.fit(data2, y_train2)
predFinalForward = fwd.predict(kaggle2)

In [146]:
kagglepred3 = (predTestLasso+predTestRidge+predFinalForward+predTestRandomForest)/4
#This is for the indices
ind = np.arange(1,1609)
headers = ['Id','Prediction']
prediction3 = pd.DataFrame({'Id':ind, 'Prediction':kagglepred3})
prediction3
#Saving results into CSV file 
prediction3.to_csv("PredictionDay2_3.csv", index=False)

## Different model completely and dataset

In [147]:
data = pd.read_csv('Train6.csv')
y_price = data.pop('SalePrice')

alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(data, np.ravel(y_price))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(data, np.ravel(y_price))

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost.fit(data,y_price)

#Forward selection
from QBUS2820 import forward

fwd = forward()
fwd.fit(data, y_price)


In [149]:
regr = AdaBoostRegressor(loss=opttypeAda, learning_rate = optRateAda, n_estimators = optEstAda)
regr = regr.fit(data,y_price)

In [150]:
regr_T = ExtraTreesRegressor(criterion='mae',max_depth=None,min_samples_split=2)
regr_T = regr.fit(data,y_price)

regr_B = BaggingRegressor()
regr_B = regr.fit(data,y_price)


In [151]:
kaggle = pd.read_csv('Test6.csv')

In [152]:
pred_B = regr_B.predict(kaggle)
pred_T = regr_T.predict(kaggle)
pred_F = fwd.predict(kaggle)
pred_G = GBoost.predict(kaggle)
pred_R = ridge.predict(kaggle)
pred_A = regr.predict(kaggle)

In [153]:
kagglepred4 = (pred_B+pred_T+pred_F+pred_G+pred_R+pred_A)/6
#This is for the indices
ind = np.arange(1,1609)
headers = ['Id','Prediction']
prediction4 = pd.DataFrame({'Id':ind, 'Prediction':kagglepred4})
prediction4
#Saving results into CSV file 
prediction4.to_csv("PredictionDay2_4.csv", index=False)

## Final attempt on previous model

In [155]:
kagglepred5 = (pred_A+pred_R+pred_F+pred_G)/4
#This is for the indices
ind = np.arange(1,1609)
headers = ['Id','Prediction']
prediction5 = pd.DataFrame({'Id':ind, 'Prediction':kagglepred5})
prediction5
#Saving results into CSV file 
prediction5.to_csv("PredictionDay2_5.csv", index=False)