# Sprint 8

## Ensemble Learning

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("train.csv")
X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values
X[:10], y[:10]

(array([[1710, 2003],
        [1262, 1976],
        [1786, 2001],
        [1717, 1915],
        [2198, 2000],
        [1362, 1993],
        [1694, 2004],
        [2090, 1973],
        [1774, 1931],
        [1077, 1939]], dtype=int64),
 array([208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000,
        129900, 118000], dtype=int64))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1168, 2), (1168,), (292, 2), (292,))

In [4]:
s = StandardScaler()
s.fit(X_train)
X_train = s.transform(X_train)
X_test = s.transform(X_test)

In [5]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print(lr_pred[:10])
print(mean_squared_error(y_test, lr_pred))

[175938.78781975 122729.19598676 135261.23092683 233729.97912765
 225000.3471398  198669.47703876 167614.07775664 312053.6729979
 177128.05973106 112409.47690588]
2087542129.4462168


In [6]:
svm = SVR()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(svm_pred[:10])
print(mean_squared_error(y_test, svm_pred))

[161955.50890178 161734.38210749 161705.54250398 162070.98897002
 162149.37677863 161941.31944279 161882.72199242 162029.10235423
 161904.17932941 161739.20549816]
7451386269.047913


In [7]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)
print(dtr_pred[:10])
print(mean_squared_error(y_test, dtr_pred))

[157000. 125000. 142000. 375000. 173000. 141000. 124000. 466500. 149500.
 153900.]
2753675215.8359966


### [Problem 1] Blending scratch mounting

In [8]:
class ScratchBlending():
    
    def __init__(self, models, weights=None):
        self.models = models
        self.weights = weights
        
    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)
            
    def predict(self, X):
        y_preds = []
        for model in self.models:
            y_preds.append(model.predict(X))
        y_pred = np.average(y_preds, axis=0, weights=self.weights)
        return y_pred

In [9]:
sb_1 = ScratchBlending([LinearRegression(), DecisionTreeRegressor()], weights=[0.8, 0.2])
sb_1.fit(X_train, y_train)
sb_1_pred = sb_1.predict(X_test)
print(sb_1_pred[:10])
print(mean_squared_error(y_test, sb_1_pred))

[172151.0302558  125183.35678941 136608.98474147 261983.98330212
 213500.27771184 183935.58163101 158891.26220531 342942.93839832
 171602.44778485 120707.5815247 ]
1919521899.0450652


In [10]:
sb_2 = ScratchBlending([LinearRegression(), SVR(), DecisionTreeRegressor()])
sb_2.fit(X_train, y_train)
sb_2_pred = sb_2.predict(X_test)
print(sb_2_pred[:10])
print(mean_squared_error(y_test, sb_2_pred))

[164964.76557384 138821.19269808 146322.25781027 256933.65603256
 184883.24130614 161870.26549385 151165.59991635 313527.59178404
 162844.07968683 142682.89413468]
2587235003.384756


In [11]:
sb_3 = ScratchBlending([LinearRegression(), SVR(), DecisionTreeRegressor()], weights=[0.6, 0.1, 0.3])
sb_3.fit(X_train, y_train)
sb_3_pred = sb_3.predict(X_test)
print(sb_3_pred[:10])
print(mean_squared_error(y_test, sb_3_pred))

[168858.82358203 127310.9558028  139927.2928065  268945.08637359
 201465.14596174 177695.81816753 153956.71885322 343385.11403416
 167317.25377158 129789.60669334]
1944138389.9036586


### [Problem 2] Scratch mounting of bagging

In [12]:
class ScratchBagging():
    
    def __init__(self, model, N=3, train_size=0.8):
        self.models = []
        self.train_size = train_size
        for i in range(N):
            self.models.append(model)
            
    def fit(self, X, y):
        for model in self.models:
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=self.train_size, shuffle=True)
            model.fit(X_train, y_train)
            
    def predict(self, X):
        y_preds = []
        for model in self.models:
            y_preds.append(model.predict(X))
        y_pred = np.average(y_preds, axis=0)
        return y_pred

In [13]:
sbag = ScratchBagging(LinearRegression())
sbag.fit(X_train, y_train)
sbag_pred = sbag.predict(X_test)
print(sbag_pred[:10])
print(mean_squared_error(y_test, sbag_pred))

[179237.13583842 122025.70358414 137324.01653754 233668.10889843
 226917.82753204 198707.95869394 169843.05855766 311312.49494082
 178649.35127787 111659.66778822]
2097282106.1870823


In [14]:
sbag = ScratchBagging(SVR())
sbag.fit(X_train, y_train)
sbag_pred = sbag.predict(X_test)
print(sbag_pred[:10])
print(mean_squared_error(y_test, sbag_pred))

[163137.12522571 162961.64492067 162937.95820867 163225.21958965
 163288.98853826 163124.88722094 163079.38882711 163188.54259107
 163096.32909932 162965.24136808]
7407930767.40372


In [15]:
sbag = ScratchBagging(DecisionTreeRegressor())
sbag.fit(X_train, y_train)
sbag_pred = sbag.predict(X_test)
print(sbag_pred[:10])
print(mean_squared_error(y_test, sbag_pred))

[141000. 143000. 132000. 375000. 163500. 141000. 152000. 466500. 149500.
  89500.]
2501293107.25


### [Problem 3] Stacking scratch mounting

In [21]:
class ScratchStacking():

    def __init__(self, stage_0_models, stage_1_model, K=3):
        self.stage_0_models = stage_0_models
        self.stage_1_model = stage_1_model
        self.K = K
        self.fitted_models = []
    
    def fit(self, X, y):

        kf = KFold(n_splits=self.K)

        blend_data = np.zeros([X.shape[0], len(self.stage_0_models)])

        for i, model in enumerate(self.stage_0_models):

            model_list = []

            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model_list.append(model.fit(X_train, y_train))

                blend_data[test_index, i] = model.predict(X_test)

            self.fitted_models.append(model_list)

        self.stage_1_model.fit(blend_data, y)
    
    def predict(self, X):

        preds = [] 

        blend_data = np.zeros([X.shape[0], len(self.stage_0_models)])
        
        for i, models in enumerate(self.fitted_models):
            for j in range(self.K):
                pred = models[j].predict(X)
                blend_data[:, i] += pred
            blend_data[:, i] /= self.K
   
        return self.stage_1_model.predict(blend_data)

In [23]:
lr = LinearRegression()
svm = SVR()
dtr = DecisionTreeRegressor()

ss = ScratchStacking([lr, svm], dtr, K=3)
ss.fit(X_train, y_train)
ss_pred = ss.predict(X_test)
print(ss_pred[:10])
print(mean_squared_error(y_test, ss_pred))

[157000. 130000. 128500. 144000. 230000. 145000. 144000. 336000. 158900.
 105000.]
2906502181.917808
