Modify the Gradient Boosting scratch code in our lecture such that:
- Notice that we are still using max_depth = 1.  Attempt to tweak min_samples_split and max_depth for the regression and see whether we can achieve a better MSE on our Boston data.
- Notice that we only wrote scratch code for gradient boosting for regression.  Add some code so that it also works for binary classification.  Load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and check that it works.
- Put everything into a class.

#### st122645

In [127]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.datasets import  load_boston, load_breast_cancer, load_digits
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score


class GradientBoosting:
    """Gradient boosting on regression trees, for regression or classification.

    The ensemble is a constant baseline (``models[0]``) followed by ``S``
    ``DecisionTreeRegressor`` stages; each stage is fitted to the residual
    ``y - h`` of the ensemble built so far.

    * ``regression=True``: the baseline is a mean ``DummyRegressor`` and the
      prediction is the raw additive score.
    * ``regression=False``: the baseline is a most-frequent
      ``DummyClassifier``. ``y`` given to ``fit`` is expected to be a 2-D
      one-hot array (the softmax is taken over axis 1), and ``predict``
      returns either class indices or class probabilities.

    Args:
        S: Number of boosted tree stages.
        learning_rate: Weight applied to every tree's prediction.
        max_depth: ``max_depth`` of each ``DecisionTreeRegressor``.
        min_samples_split: ``min_samples_split`` of each tree.
        regression: ``True`` for regression, ``False`` for classification.
    """

    def __init__(self, S=200, learning_rate=1, max_depth=1,
                 min_samples_split=2, regression=True):
        self.S = S
        self.learning_rate = learning_rate
        self.tree_params = {
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
        }
        # Normalise once so the baseline choice here and the branch taken in
        # predict() can never disagree for truthy non-``True`` values.
        self.regression = bool(regression)

        if self.regression:
            baseline = DummyRegressor(strategy="mean")
        else:
            baseline = DummyClassifier(strategy="most_frequent")

        # models[0] is the baseline, models[1:] are the boosted trees.
        self.models = [baseline] + [
            DecisionTreeRegressor(**self.tree_params) for _ in range(S)
        ]

    def grad(self, y, h):
        """Return the residual ``y - h`` that the next stage is fitted to.

        This is the negative gradient of the squared error w.r.t. ``h`` and,
        when ``h`` is a softmax output, of the cross-entropy w.r.t. the
        logits.
        """
        return y - h

    def softmax(self, yhat):
        """Return the row-wise softmax of a 2-D array of scores.

        The row maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but keeps ``np.exp`` from overflowing
        to ``inf`` (and the ratio from becoming ``nan``) on large scores.
        """
        scores = np.asarray(yhat, dtype=float)
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def fit(self, X, y):
        """Fit the baseline and then every boosted stage in order.

        Args:
            X: Training features.
            y: Targets; a one-hot 2-D array when ``regression`` is false.

        Returns:
            ``self``, so calls can be chained.
        """
        self.models[0].fit(X, y)
        h0 = self.models[0].predict(X)

        # Running sum of the scaled tree outputs. Keeping it avoids
        # re-predicting with every earlier stage on each round, and it adds
        # the terms in the same order as predict() does.
        boost = 0
        for tree in self.models[1:]:
            current = h0 + boost
            if not self.regression:
                current = self.softmax(current)
            tree.fit(X, self.grad(y, current))
            boost = boost + self.learning_rate * tree.predict(X)
        return self

    def predict(self, X, models=None, with_argmax=True):
        """Predict with the ensemble (or a prefix of it).

        Args:
            X: Features to predict on.
            models: Optional list ``[baseline, tree_1, ...]`` to use instead
                of ``self.models``.
            with_argmax: Classification only. When true return the index of
                the most probable class, otherwise the softmax probabilities.

        Returns:
            Raw additive scores for regression; class indices or class
            probabilities for classification.
        """
        if models is None:
            models = self.models

        boost = sum(self.learning_rate * m.predict(X) for m in models[1:])
        yhat = models[0].predict(X) + boost
        if self.regression:
            return yhat

        proba = self.softmax(yhat)
        return np.argmax(proba, axis=1) if with_argmax else proba
    


In [149]:
# Regression - Boston Dataset
# NOTE: load_boston only exists in scikit-learn < 1.2, so this cell needs an
# older release to run.

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scratch implementation with its constructor defaults.
clf = GradientBoosting()
clf.fit(X_train, y_train)
yhat = clf.predict(X_test)

print("CP dsai MSE: ", mean_squared_error(y_test, yhat))


# Reference model. The loss argument is left at its default (squared error):
# the old spelling loss='ls' was deprecated in scikit-learn 1.0 and later
# removed, and it selects the same loss as the default.
# n_estimators is also reused by the classification cells further down.
n_estimators = 200
sklearn_model = GradientBoostingRegressor(
    n_estimators=n_estimators,
    learning_rate=0.1,
    max_depth=3,
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn MSE: ", mean_squared_error(y_test, yhat_sk))

CP dsai MSE:  12.92985577558652
Sklearn MSE:  8.032644175128802


In [151]:
# Binary - Cancer Dataset

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# One-hot encode the training labels: row i of the identity matrix is the
# encoding of class i. Like the loop it replaces, this assumes the labels are
# the integers 0..n_classes-1.
n_classes = len(np.unique(y))
y_train_encoded = np.eye(n_classes)[y_train]

clf = GradientBoosting(regression=False)
clf.fit(X_train, y_train_encoded)
yhat = clf.predict(X_test)

print("CP dsai MSE: ", mean_squared_error(y_test, yhat))
print("CP dsai Accuracy Score: ", accuracy_score(y_test, yhat))


# n_estimators is defined in the regression cell above; run that cell first.
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=0.1,
    max_depth=1,
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn MSE: ", mean_squared_error(y_test, yhat_sk))
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

CP dsai MSE:  0.04093567251461988
CP dsai Accuracy Score:  0.9590643274853801
Sklearn MSE:  0.03508771929824561
Sklearn accuracy:  0.9649122807017544


In [152]:
# Multiclass - Digit Dataset

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# One-hot encode the training labels: row i of the identity matrix is the
# encoding of class i. Like the loop it replaces, this assumes the labels are
# the integers 0..n_classes-1.
n_classes = len(np.unique(y))
y_train_encoded = np.eye(n_classes)[y_train]

clf = GradientBoosting(regression=False)
clf.fit(X_train, y_train_encoded)
yhat = clf.predict(X_test)

# The MSE lines are kept for parity with the other cells; on digit labels the
# squared distance between class ids is not a meaningful error, so accuracy is
# the figure to compare.
print("CP dsai MSE: ", mean_squared_error(y_test, yhat))
print("CP dsai Accuracy Score: ", accuracy_score(y_test, yhat))


# n_estimators is defined in the regression cell above; run that cell first.
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate=0.1,
    max_depth=1,
)

yhat_sk = sklearn_model.fit(X_train, y_train).predict(X_test)
print("Sklearn MSE: ", mean_squared_error(y_test, yhat_sk))
print("Sklearn accuracy: ", accuracy_score(y_test, yhat_sk))

CP dsai MSE:  1.3351851851851853
CP dsai Accuracy Score:  0.924074074074074
Sklearn MSE:  0.8574074074074074
Sklearn accuracy:  0.9481481481481482
