In [2]:
# Reading the data
import pandas as pd
import numpy as np

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
X = df[["GrLivArea","YearBuilt"]]
y = df["SalePrice"]

In [4]:
# Splitting the train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8,random_state=42)

In [5]:
# Standardize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [9]:
# Importing sklearn models needed for blending
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
# Import metrics used for estimation
from sklearn.metrics import mean_squared_error

# [Problem 1]

In [7]:
def blending_model(models,weight,X_train, X_test, y_train, y_test):
    y_pred_list = []
    for model in models:
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        y_pred_list.append(y_pred)
        name = type(model).__name__
        mse = mean_squared_error(y_test,y_pred)
        print("MSE for {}: {}".format(name,mse))
    blending_pred = np.zeros((len(y_test)))
    for i in range(len(y_pred_list)):
        blending_pred += y_pred_list[i]*weight[i]
    print("MSE for blending model is : {}".format(mean_squared_error(y_test,blending_pred)))

In [71]:
models = [LinearRegression(),DecisionTreeRegressor(max_depth=3),Lasso(alpha=0.1)]
weight = [0.4, 0.3,0.3]
blending_model(models,weight,X_train_std,X_test_std,y_train,y_test)

MSE for LinearRegression: 2495554898.6683207
MSE for DecisionTreeRegressor: 2806312761.649392
MSE for Lasso: 2495554865.425528
MSE for blending model is : 2439504197.9325466


In [87]:
models2 = [DecisionTreeRegressor(),Ridge(),ElasticNet()]
weight2 = [0.6,0.3,0.1]
blending_model(models2,weight2,X_train_std,X_test_std,y_train,y_test)

MSE for DecisionTreeRegressor: 2245462576.40449
MSE for Ridge: 2496340156.4585676
MSE for ElasticNet: 3190751084.7226815
MSE for blending model is : 1950251897.069018


In [204]:
models3 = [RandomForestRegressor(max_depth=3),DecisionTreeRegressor(max_depth=3),ElasticNet(),LinearRegression()]
weight3 = [0.35,0.2,0.1,0.35]
blending_model(models3,weight3,X_train_std,X_test_std,y_train,y_test)

MSE for RandomForestRegressor: 2446547479.077316
MSE for DecisionTreeRegressor: 2806312761.649392
MSE for ElasticNet: 3190751084.7226815
MSE for LinearRegression: 2495554898.6683207
MSE for blending model is : 2440659076.8351493


# [Problem 2]

In [11]:
def baggingFromScratch(models, X_train, y_train, X_test, y_test):
    y_pred_final = np.zeros(len(X_test))
    for model in models:
        X_bagging, _ , y_bagging, _ = train_test_split(X_train,y_train, train_size=0.2,shuffle=True)
        model.fit(X_bagging,y_bagging)
        y_pred = model.predict(X_test)
        
        # Output the mse of single model
        mse = mean_squared_error(y_test,y_pred)
        name = type(model).__name__
        print("MSE for {}: {}".format(name,mse))
        
        # Adding up each model prediction to the final prediction
        y_pred_final += y_pred/len(models)

    print("MSE for the Bagging model: {}".format(mean_squared_error(y_test,y_pred_final)))

In [13]:
models_for_Bagging = []
for i in range(20):
    models_for_Bagging.append(DecisionTreeRegressor())
baggingFromScratch(models_for_Bagging,X_train_std,y_train,X_test_std,y_test)

MSE for DecisionTreeRegressor: 2901991070.770548
MSE for DecisionTreeRegressor: 3743849094.030822
MSE for DecisionTreeRegressor: 3102761398.332192
MSE for DecisionTreeRegressor: 4150162512.85274
MSE for DecisionTreeRegressor: 4036881589.472603
MSE for DecisionTreeRegressor: 3000367561.9528155
MSE for DecisionTreeRegressor: 3815496423.339041
MSE for DecisionTreeRegressor: 3402966252.3116436
MSE for DecisionTreeRegressor: 2768236496.359589
MSE for DecisionTreeRegressor: 2816452139.7089043
MSE for DecisionTreeRegressor: 3008892853.0
MSE for DecisionTreeRegressor: 3439989388.619863
MSE for DecisionTreeRegressor: 3781118825.400685
MSE for DecisionTreeRegressor: 2734420344.7636986
MSE for DecisionTreeRegressor: 4375594083.356164
MSE for DecisionTreeRegressor: 3354743756.202055
MSE for DecisionTreeRegressor: 4269040441.941781
MSE for DecisionTreeRegressor: 3315616582.8116436
MSE for DecisionTreeRegressor: 3805288525.452055
MSE for DecisionTreeRegressor: 3956005881.160198
MSE for the Bagging m

# [Problem 3]

In [14]:
from sklearn.model_selection import KFold

In [38]:
class Stacking_Regressor():
    """
    Class for creating a Stacking Model
    """
    def __init__(self, models, final_model, K):
        self.K = K
        self.models = models
        self.final_model = final_model
        self.fit_models = [ [] for _ in range(K) ]
    
    def fit(self, X, y):
        
        X = np.array(X)
        y = np.array(y)
        
        # Array to store estimated result of n-th depth
        self.y_pred = np.zeros((len(X),len(self.models)))
        
        kf = KFold(n_splits=self.K, shuffle=False)
        fold = 0
        for i, model in enumerate(self.models):
            for train_idx, valid_idx in kf.split(X):
                self.fit_models[fold].append(model.fit(X[train_idx],y[train_idx]))
                self.y_pred[valid_idx,i] = model.predict(X[valid_idx])
                fold += 1
            fold = 0
        
        self.final_model.fit(self.y_pred,y)
            
    def predict(self,X):
        self.final_estimated_result = np.zeros((X.shape[0],self.K))
        
        for i, fold in enumerate(self.fit_models):
            for model in fold:
                self.final_estimated_result[:,i] += model.predict(X)/len(fold)
        
        return final_model.predict(self.final_estimated_result)
        

In [50]:
models = [ElasticNet(), Ridge(), LinearRegression()]
K = 3
final_model = DecisionTreeRegressor(max_depth=3)
stack_mdl = Stacking_Regressor(models,final_model,K)
stack_mdl.fit(X_train_std,y_train)

In [51]:
print("MSE for stacking model: {}".format(mean_squared_error(y_test,y_pred)))

MSE for stacking model: 2435224203.1911225


In [52]:
blending_model(models,[0.3,0.4,0.5],X_train_std,X_test_std,y_train,y_test)

MSE for ElasticNet: 3190751084.7226815
MSE for Ridge: 2496340156.4585676
MSE for LinearRegression: 2495554898.6683207
MSE for blending model is : 3622574335.88682
