In [230]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [231]:
data = pd.read_csv('Train6.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')


In [232]:
data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_Oth,SaleType_VWD,SaleType_WD,Utilities_AllPub,Utilities_NoSewr
0,864,0,0,1,3,5,3,646.0,0.0,0,...,0,0,0,0,0,0,0,1,1,0
1,1368,0,0,1,3,5,2,1078.0,0.0,5,...,0,0,0,0,0,0,0,1,1,0
2,1113,858,0,1,3,5,2,341.0,0.0,2,...,0,0,0,0,0,0,0,1,1,0
3,684,720,0,2,3,5,3,600.0,0.0,2,...,0,0,0,0,0,0,0,1,1,0
4,815,875,0,1,3,5,3,0.0,0.0,6,...,0,0,0,0,1,0,0,0,1,0


# Testing adaboosting + Trees

In [233]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html#sklearn.ensemble.AdaBoostRegressor
#http://scikit-learn.org/stable/auto_examples/ensemble/plot_adaboost_regression.html

regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=9),random_state=0)
regr = regr.fit(final_train,y_train)

In [234]:
pred = regr.predict(final_test)

In [235]:
mean_absolute_error(y_test, pred)

16639.76613059935

In [236]:
r2_score(y_test, pred)

0.88182299874358849

In [237]:
values = range(1,20)
maximum = 0
minimum = 1000000000000
optimal_node = 0
for i in range(1,25):
    regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=i),random_state=0)
    regr = regr.fit(final_train,y_train)
    pred = regr.predict(final_test)
    
    #Check to see if this layer is better since highest r2 score
    r2 = r2_score(y_test, pred)
    MAE = mean_absolute_error(y_test, pred)
    print("Node {}".format(i))
    print("R-square is {}".format(r2))
    print("MAE is {}".format(MAE))
    if r2>maximum and (MAE<minimum):
        optimal_node = i
        maximum = r2
        minimum = MAE

print(" ")        
print("Optimal nodes is {}".format(optimal_node))
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=optimal_node),random_state=0)

regr.fit(final_train, y_train)
y_1 = regr.predict(final_test)
accuracy = r2_score(y_test, y_1)
accuracy_2 = mean_absolute_error(y_test, y_1)

print("R-squared score: {}".format(accuracy))
print("Mean Absolute Error: {}".format(accuracy_2))

Node 1
R-square is 0.602396662829
MAE is 35670.101181
Node 2
R-square is 0.828513196305
MAE is 21876.6314771
Node 3
R-square is 0.856990915588
MAE is 18910.2758894
Node 4
R-square is 0.882043398342
MAE is 16963.6673744
Node 5
R-square is 0.881785592125
MAE is 17088.8933392
Node 6
R-square is 0.884945252707
MAE is 16793.4867847
Node 7
R-square is 0.879063658874
MAE is 16802.9531185
Node 8
R-square is 0.897214886703
MAE is 15932.8481544
Node 9
R-square is 0.881822998744
MAE is 16639.7661306
Node 10
R-square is 0.897943103512
MAE is 15717.7201378
Node 11
R-square is 0.888409275758
MAE is 16482.6942951
Node 12
R-square is 0.903188285759
MAE is 15667.9656898
Node 13
R-square is 0.88928519733
MAE is 16055.1131984
Node 14
R-square is 0.885565451642
MAE is 16168.6374593
Node 15
R-square is 0.891609068743
MAE is 15879.7516822
Node 16
R-square is 0.890782585772
MAE is 15946.4627329
Node 17
R-square is 0.897729021207
MAE is 15983.7038845
Node 18
R-square is 0.887982847988
MAE is 16076.9006211
Nod

## This is testing LASSO, RIDGE, and ENET

### LASSO

In [238]:
lasso = LassoCV(cv=10)
lasso.fit(final_train, np.ravel(y_train)) 
pred_L = lasso.predict(final_test)

### Enet

In [239]:
#Elastic Net
from sklearn.linear_model import ElasticNetCV
enet = ElasticNetCV(l1_ratio=[0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.99], cv=10)
enet.fit(final_train, np.ravel(y_train))

from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=enet.alpha_, l1_ratio=enet.l1_ratio_)
enet.fit(final_train, np.ravel(y_train))

ElasticNet(alpha=174760.54192474418, copy_X=True, fit_intercept=True,
      l1_ratio=0.98999999999999999, max_iter=1000, normalize=False,
      positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

### Ridge

In [240]:
#Ridge Regression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(final_train, np.ravel(y_train))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(final_train, np.ravel(y_train))

Ridge(alpha=5.603810954001732, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

### Predictions

In [241]:
pred_L = lasso.predict(final_test)
pred_E = enet.predict(final_test)
pred_R = ridge.predict(final_test)

regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=optimal_node),random_state=0)

regr.fit(final_train, y_train)
pred_T = regr.predict(final_test)

In [242]:
#Combining the 2 predictions
pred_combined = (pred_L+pred_T)/2
pred_combined2 = (pred_T+pred_R)/2
pred_combined3 = (pred_T+pred_E)/2
pred_combined4 = (pred_T+pred_R+pred_E+pred_T)/4

### Variable selection

In [254]:
from QBUS2820 import forward

fwd = forward()
fwd.fit(final_train, y_train)
pred_F = fwd.predict(final_test)

In [258]:
pred_combined5 = (pred_F+pred_T)/2
pred_combined6 = (pred_F+pred_T+pred_R)/3

### Other random models

### Kernel Ridge Regression

In [274]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

In [279]:
KRR = KernelRidge(alpha=0.1)

In [280]:
KRR.fit(final_train,y_train)
pred_K = KRR.predict(final_test)

### Gradient Boosting Regression 

In [281]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost.fit(final_train,y_train)
pred_GB = GBoost.predict(final_test)

In [286]:
pred_combined7 = (pred_F+pred_T+pred_R+pred_GB)/4
pred_combined8 = (pred_F+pred_T+pred_R+pred_GB+pred_K)/5

### Evaluation of the model

In [288]:
from QBUS2820 import rmse_jack, r2_jack 
columns=['Test RMSE', 'SE', 'Test R2', 'SE', 'MAE']
rows = ['Lasso', 'Adaboost Tree', 'Lasso + Tree', 'Ridge', 'Enet', 'Tree + Ridge', 'Tree + Enet', 'Tree + All 3', 'Forward', 'Forward + Tree', 'Forward + Tree + Ridge', 'KRR', 'GBoost','Forward Tree Ridge GB','Forward Tree Ridge GB KRR']
results=pd.DataFrame(0.0, columns=columns, index=rows) 

results.iloc[0,0], results.iloc[0,1] = rmse_jack(y_test, pred_L)
results.iloc[0,2], results.iloc[0,3] = (r2_jack(y_test, pred_L))
results.iloc[1,0], results.iloc[1,1] = rmse_jack(y_test, pred_T)
results.iloc[1,2], results.iloc[1,3] = (r2_jack(y_test, pred_T))
results.iloc[2,0], results.iloc[2,1] = rmse_jack(y_test, pred_combined)
results.iloc[2,2], results.iloc[2,3] = (r2_jack(y_test, pred_combined))
results.iloc[3,0], results.iloc[3,1] = rmse_jack(y_test, pred_R)
results.iloc[3,2], results.iloc[3,3] = (r2_jack(y_test, pred_R))
results.iloc[4,0], results.iloc[4,1] = rmse_jack(y_test, pred_E)
results.iloc[4,2], results.iloc[4,3] = (r2_jack(y_test, pred_E))
results.iloc[5,0], results.iloc[5,1] = rmse_jack(y_test, pred_combined2)
results.iloc[5,2], results.iloc[5,3] = (r2_jack(y_test, pred_combined2))
results.iloc[6,0], results.iloc[6,1] = rmse_jack(y_test, pred_combined3)
results.iloc[6,2], results.iloc[6,3] = (r2_jack(y_test, pred_combined3))
results.iloc[7,0], results.iloc[7,1] = rmse_jack(y_test, pred_combined4)
results.iloc[7,2], results.iloc[7,3] = (r2_jack(y_test, pred_combined4))
results.iloc[8,0], results.iloc[8,1] = (rmse_jack(y_test, pred_F))
results.iloc[8,2], results.iloc[8,3] = (r2_jack(y_test, pred_F))
results.iloc[9,0], results.iloc[9,1] = rmse_jack(y_test, pred_combined5)
results.iloc[9,2], results.iloc[9,3] = (r2_jack(y_test, pred_combined5))
results.iloc[10,0], results.iloc[10,1] = rmse_jack(y_test, pred_combined6)
results.iloc[10,2], results.iloc[10,3] = (r2_jack(y_test, pred_combined6))
results.iloc[11,0], results.iloc[11,1] = rmse_jack(y_test, pred_K)
results.iloc[11,2], results.iloc[11,3] = (r2_jack(y_test, pred_K))
results.iloc[12,0], results.iloc[12,1] = rmse_jack(y_test, pred_GB)
results.iloc[12,2], results.iloc[12,3] = (r2_jack(y_test, pred_GB))
results.iloc[13,0], results.iloc[13,1] = rmse_jack(y_test, pred_combined7)
results.iloc[13,2], results.iloc[13,3] = (r2_jack(y_test, pred_combined7))
results.iloc[14,0], results.iloc[14,1] = rmse_jack(y_test, pred_combined8)
results.iloc[14,2], results.iloc[14,3] = (r2_jack(y_test, pred_combined8))
results.iloc[0,4] = mean_absolute_error(y_test, pred_L)
results.iloc[1,4] = mean_absolute_error(y_test, pred_T)
results.iloc[2,4] = mean_absolute_error(y_test, pred_combined)
results.iloc[3,4] = mean_absolute_error(y_test, pred_R)
results.iloc[4,4] = mean_absolute_error(y_test, pred_E)
results.iloc[5,4] = mean_absolute_error(y_test, pred_combined2)
results.iloc[6,4] = mean_absolute_error(y_test, pred_combined3)
results.iloc[7,4] = mean_absolute_error(y_test, pred_combined4)
results.iloc[8,4] = mean_absolute_error(y_test, pred_F)
results.iloc[9,4] = mean_absolute_error(y_test, pred_combined5)
results.iloc[10,4] = mean_absolute_error(y_test, pred_combined6)
results.iloc[11,4] = mean_absolute_error(y_test, pred_K)
results.iloc[12,4] = mean_absolute_error(y_test, pred_GB)
results.iloc[13,4] = mean_absolute_error(y_test, pred_combined7)
results.iloc[14,4] = mean_absolute_error(y_test, pred_combined8)
results.round(3)

Unnamed: 0,Test RMSE,SE,Test R2,SE.1,MAE
Lasso,32421.452,3673.317,0.785,0.038,21681.675
Adaboost Tree,21765.585,2131.775,0.903,0.011,15667.966
Lasso + Tree,24119.075,2756.184,0.881,0.016,16262.404
Ridge,22742.014,3622.616,0.894,0.031,14822.266
Enet,33798.607,3404.502,0.767,0.037,23229.632
Tree + Ridge,19161.662,2034.263,0.925,0.01,13058.764
Tree + Enet,24684.927,2665.54,0.875,0.016,16876.821
Tree + All 3,21140.31,2419.839,0.909,0.013,14132.884
Forward,22240.102,2315.032,0.899,0.024,15109.307
Forward + Tree,18502.279,1372.057,0.93,0.008,13230.422


# Predicting on Kaggle Data

In [327]:
data = pd.read_csv('Train6.csv')
y_price = data.pop('SalePrice')

In [328]:
#regr = (DecisionTreeRegressor(max_depth=9))
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=optimal_node),random_state=0)
regr = regr.fit(data,y_price)



alphas = np.exp(np.linspace(-10,20,500)) 
ridge = RidgeCV(alphas=alphas, cv=10)
ridge.fit(data, np.ravel(y_price))
ridge = Ridge(alpha=ridge.alpha_)
ridge.fit(data, np.ravel(y_price))

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost.fit(data,y_price)

#Forward selection
from QBUS2820 import forward

fwd = forward()
fwd.fit(data, y_price)


In [329]:
kaggle = pd.read_csv('Test6.csv')
np.any(np.isnan(kaggle))
np.all(np.isfinite(kaggle))
kaggle = kaggle.replace(np.nan, 0, regex=True)
#kaggle.to_csv("Test1_edited.csv", index=False)


In [330]:
prediction_T = regr.predict(kaggle)
prediction_R = ridge.predict(kaggle)
pred_GB = GBoost.predict(kaggle)
pred_F = fwd.predict(kaggle)

In [340]:
prediction = (prediction_T+prediction_R+pred_F+pred_GB)/4
len(prediction)

1608

In [341]:
#This is for the indices
ind = np.arange(1,1609)

In [342]:
headers = ['Id','Prediction']
predictions = pd.DataFrame({'Id':ind, 'Prediction':prediction})
predictions

Unnamed: 0,Id,Prediction
0,1,93948.363916
1,2,161848.629679
2,3,141864.777252
3,4,192411.696709
4,5,151754.730469
5,6,330888.748432
6,7,128273.033201
7,8,280039.821416
8,9,116780.877276
9,10,123140.447783


In [343]:
#Saving results into CSV file 
predictions.to_csv("PredictionsFinal.csv", index=False)

In [306]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

In [313]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(data.values)
    rmse= np.sqrt(-cross_val_score(model, data, y_price, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [312]:
averaged_models = AveragingModels(models = (GBoost, ridge, regr))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 18630.1142 (2059.9310)



In [320]:
averaged_models.fit(data,y_price)
prediction = averaged_models.predict(kaggle)

In [321]:
headers = ['Id','Prediction']
predictions = pd.DataFrame({'Id':ind, 'Prediction':prediction})
predictions

Unnamed: 0,Id,Prediction
0,1,95263.306134
1,2,158733.856853
2,3,140288.486132
3,4,190663.037170
4,5,149878.058677
5,6,339970.158532
6,7,129233.121292
7,8,281331.946530
8,9,114598.091487
9,10,124152.596155


In [322]:
#Saving results into CSV file 
predictions.to_csv("Predictions6.csv", index=False)

In [324]:
data = pd.read_csv('Train6.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')
averaged_models.fit(final_train,y_train)
prediction = averaged_models.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

12782.9415182
(0.92637546092480094, 0.012194779117396599)


In [348]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor

### Trying out learning models

In [350]:
data = pd.read_csv('Train6.csv')
final_train = data.sample(frac=0.6, random_state=1)
final_test = data[data.index.isin(final_train.index)==False]
final_train.head()
y_train = final_train.pop('SalePrice')
y_test = final_test.pop('SalePrice')


regr = RandomForestRegressor()
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

regr = GradientBoostingRegressor()
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

regr = ExtraTreesRegressor()
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

regr = BaggingRegressor()
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))


regr = AdaBoostRegressor()
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

17349.3645963
(0.87681040063502624, 0.019429542755916837)
14736.963304
(0.90803696593536321, 0.016003155413911684)
14667.381677
(0.91830706286221209, 0.009264426944876461)
17395.0841615
(0.88328310351363315, 0.013689744351550731)
18883.7267235
(0.86334937375067777, 0.016383731965058431)


### Combining our regressor with ensemble techniques

In [357]:

regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=optimal_node),random_state=0)
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

regr = BaggingRegressor(DecisionTreeRegressor(max_depth=optimal_node),random_state=0)
regr = regr.fit(final_train,y_train)
prediction = regr.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))


15667.9656898
(0.90318828575875654, 0.011394514860448169)
17314.7883701
(0.87310459729234757, 0.019839352531541194)


In [358]:
from sklearn.svm import SVR

In [364]:
clf = SVR(C=10.0, epsilon=1.2)
clf = clf.fit(final_train,y_train)
prediction = clf.predict(final_test)
print(mean_absolute_error(y_test, prediction))
print(r2_jack(y_test, prediction))

50947.7504542
(-0.11446608721681084, 0.030382379704914194)
