In [153]:
import numpy as np 
import pandas as pd

In [154]:
house_data_root = pd.read_csv('./train.csv')
house_data_root.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [155]:
X = house_data_root.loc[:, ['GrLivArea', 'YearBuilt']].to_numpy()
y = house_data_root['SalePrice'].to_numpy()

display(X[:5])
display(y[:5])

array([[1710, 2003],
       [1262, 1976],
       [1786, 2001],
       [1717, 1915],
       [2198, 2000]])

array([208500, 181500, 223500, 140000, 250000])

In [156]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [157]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_test = scaler.transform(X_test)
X_train = scaler.transform(X_train)

print(X_test[:5])

[[-1.93408803 -0.73766614]
 [ 2.04171325  1.17230386]
 [-1.37784917 -1.0340408 ]
 [ 0.36553036 -0.40836097]
 [-1.16132666  0.08559679]]


In [158]:
from sklearn.metrics import mean_squared_error

<h3>[Problem 1] Blending scratch mounting</h3>

In [159]:
class ScratchBlendingRegressor():
    def __init__(self, models):
        self.models = models
    
    def fit(self, X_train, y_train):
        for i in range(0, len(self.models)):
            self.models[i].fit(X_train, y_train)
    
    def predict(self, X_test):
        self.y_pred = []
        for i in range(0, len(self.models)):
            self.y_pred.append(self.models[i].predict(X_test))
        return np.mean(self.y_pred, axis=0)

In [160]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [161]:
model1 = LinearRegression()
model2 = SVR(kernel='poly', degree=3, C=1, epsilon=0.2)
model3 = DecisionTreeRegressor(random_state=69)
blending = ScratchBlendingRegressor(
        models=[model1, model2, model3]
)

In [162]:
print("MSE")

blending.fit(X_train, y_train)
print("blending: {}".format(mean_squared_error(y_test, blending.predict(X_test))))

model1.fit(X_train, y_train)
print("LinearRegression(): {}".format(mean_squared_error(y_test, model1.predict(X_test))))

model2.fit(X_train, y_train)
print("SVR(): {}".format(mean_squared_error(y_test, model2.predict(X_test))))

model3.fit(X_train, y_train)
print("DecisionTree(): {}".format(mean_squared_error(y_test, model3.predict(X_test))))

MSE
blending: 2548592206.480154
LinearRegression(): 2046027454.2181535
SVR(): 6397002593.666542
DecisionTree(): 2763157707.3356166


With blending with 3 models, we can see it is better than SVR, DecisionTreeRegressor single model in terms of MSE.

<h3>[Problem 2] Scratch mounting of bagging</h3>

In [163]:
class ScratchBaggingRegressor():
    def __init__(self, models, max_samples):
        self.models = models
        self.max_samples = max_samples
    
    def fit(self, X_train, y_train):
        for i in range(0, len(self.models)):
            X_train_sub, _, y_train_sub, _ = train_test_split(X_train, y_train, train_size=self.max_samples, shuffle=True)
            indices = np.random.choice(y_train_sub.shape[0], size=np.round(y_train.shape[0] * (1 - self.max_samples)).astype(np.int64), replace=False)
            X_train_sub = np.append(X_train_sub, X_train_sub[indices, :], axis=0)
            y_train_sub = np.append(y_train_sub, y_train_sub[indices])
            self.models[i].fit(X_train_sub, y_train_sub)
    
    def predict(self, X_test):
        self.y_pred = []
        for i in range(0, len(self.models)):
            self.y_pred.append(self.models[i].predict(X_test))
        return np.mean(self.y_pred, axis=0)

In [164]:
bagging = ScratchBaggingRegressor(
        models=[model1, model2, model3],
        max_samples=0.7
)

In [165]:
print("MSE")

bagging.fit(X_train, y_train)
print("bagging: {}".format(mean_squared_error(y_test, bagging.predict(X_test))))

model1.fit(X_train, y_train)
print("LinearRegression(): {}".format(mean_squared_error(y_test, model1.predict(X_test))))

model2.fit(X_train, y_train)
print("SVR(): {}".format(mean_squared_error(y_test, model2.predict(X_test))))

model3.fit(X_train, y_train)
print("DecisionTree(): {}".format(mean_squared_error(y_test, model3.predict(X_test))))

MSE
bagging: 2518873508.1890936
LinearRegression(): 2046027454.2181535
SVR(): 6397002593.666542
DecisionTree(): 2763157707.3356166


With bagging with 3 models, we can see it is better than SVR, DecisionTreeRegressor single model in terms of MSE. Moreover, it is better than pure blending.

<h3>[Problem 3] Stacking scratch mounting</h3>

In [166]:
class ScratchStackingRegressor():
    def __init__(self, models, final_model, K, random_state):
        self.models = models
        self.final_model = final_model
        self.M = len(self.models)
        self.K = K
        self.random_state = random_state

    def fit(self, X_train, y_train):
        #stage 0
        np.random.seed(self.random_state)
        X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                        y_train,
                                                        test_size=1/self.K,
                                                        shuffle=True
                                                        )
        y_pred0 = []
        for i in range(0, self.M):
            self.models[i].fit(X_train, y_train)
            y_pred0.append(self.models[i].predict(X_test))
        y_pred0 = np.array(y_pred0).reshape(len(y_pred0[0]), len(y_pred0))
        
        #stage 1
        self.final_model.fit(y_pred0, y_test)

    def predict(self, X_test):
        #stage 0
        y_pred0 = []
        for i in range(0, self.M):
            y_pred0.append(self.models[i].predict(X_test))
        y_pred0 = np.array(y_pred0).reshape(len(y_pred0[0]), len(y_pred0))

        #stage 1
        return self.final_model.predict(y_pred0)

In [167]:
stacking = ScratchStackingRegressor(
        models=[model1, model2],
        final_model=model3,
        K=2,
        random_state=69
)

In [169]:
print("MSE")

stacking.fit(X_train, y_train)
print("stacking: {}".format(mean_squared_error(y_test, stacking.predict(X_test))))

model1.fit(X_train, y_train)
print("LinearRegression(): {}".format(mean_squared_error(y_test, model1.predict(X_test))))

model2.fit(X_train, y_train)
print("SVR(): {}".format(mean_squared_error(y_test, model2.predict(X_test))))

model3.fit(X_train, y_train)
print("DecisionTree(): {}".format(mean_squared_error(y_test, model3.predict(X_test))))

MSE
stacking: 13454954860.938356
LinearRegression(): 2046027454.2181535
SVR(): 6397002593.666542
DecisionTree(): 2763157707.3356166
