# Exercise

# #1

In [1]:
#implement Bagging ensembles of Decision Trees from scratch

"""
Coding considerations:
1. Load any data you would like to apply
2. Make sure to subsample *with* replacement.  I.e., 
   we should allow the same training instance to be subsampled 
   for the same predictor.  For example, X[5] can appear multiple
   times in the nth predictor (it may not even appear at all!)
3. Once all predictors are trained, the bagging ensemble can make a prediction
   by simply aggregating, commonly using mode for classification and
   average for regression

"""

# #2

In [2]:
#implement AdaBoost for binary classification from scratch

"""
Coding considerations:
1. Load any binary classification problem you would like to apply
2. Given binary classification, AdaBoost expects input as -1 and 1
3. Once all predictors have been trained, the final predictions are
simply the np.sign of a_j * y_pred.  This works because if more
classifiers say that it is positive, then the sum will turn out
to be positive, thus we can simply look at the np.sign to know
the class
"""

# #3

In [3]:
#implement Gradient Boosting from scratch
#To add to the challenge, we shall create our algorithm that
#works for both binary classification and regression

"""
Coding considerations:
1. Load one binary classification and one regression problem you would like to apply
2. To make our algorithm work for both regression and binary classification
   We can perform this via a params called 'loss'
   if loss is defined as "mse", then its a regression problem and
   the residual errors shall be simply calculated as y - f(x)
   if loss is defined as "logistic", then its a classification problem
   and the residual errors shall be calculated as y - sigmoid(x)
   the reason of using sigmoid is obvious, since we want to create
   a mapping of x to something between 0 and 1 so it can be compared
   to y, thus we use sigmoid
3. You may wonder what classifier or regressor to use for this
   problem.  Luckily, if we define the loss function with sigmoid,
   we can simply use regressor as our estimator, since we already
   define sigmoid function that map any continuous value to values 
   between 0 and 1.  The only thing you have to do is to 
   create a tolerance function mapping >0.5 to 1 otherwise 0
4. When finding the loss, it is important to calculate the loss
   based on the total errors made by all models you have trained
   up to nth iteration
"""

# Solution

# #1

In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                test_size=0.3, shuffle=True, random_state=42)

In [5]:
from sklearn.tree import DecisionTreeClassifier
import random
import numpy as np
from scipy import stats
from sklearn.metrics import classification_report

n_estimators = 5
boostrap_ratio = 0.1
tree_params = {'max_depth': 2, 'criterion':'gini', 'min_samples_split': 5}
models = [DecisionTreeClassifier(**tree_params) for _ in range(n_estimators)]

#sample size for each predictor
sample_size = int(boostrap_ratio * len(X_train))

#2 for x and y
xsamples = []
ysamples = []

#subsamples for each model
for _ in range(n_estimators):
    xsample = []
    ysample = []
    ##sampling with replacement; i.e., sample can occur more than once
    #for the same predictor
    for _ in range(sample_size):
        idx = random.randrange(X_train.shape[0])
        xsample.append(X_train[idx])
        ysample.append(y_train[idx])
    xsamples.append(xsample)
    ysamples.append(ysample)

#convert to numpy for easier manipulation
#usually if we start with list.append and later convert to np
#is faster than using np.empty
xsamples = np.asarray(xsamples)
ysamples = np.asarray(ysamples)

#fitting each estimator
for i, model in enumerate(models):
    _X = xsamples[i, :]
    _y = ysamples[i, :]
    model.fit(_X, _y)
    
#make prediction and return the probabilities
predictions = []
for model in models:
    y_pred = model.predict(X_test)
    predictions.append(y_pred)
    
#we can use unpacking technique to remove the outerlist
#we need [0] because .mode return both list and the counts
[y_pred] = stats.mode(predictions)[0]

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      1.00      0.95        19
           1       1.00      0.85      0.92        13
           2       1.00      1.00      1.00        13

    accuracy                           0.96        45
   macro avg       0.97      0.95      0.96        45
weighted avg       0.96      0.96      0.95        45



# #2

In [6]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500)
y = np.where(y==0,-1,1)  #change our y to be -1 if it is 0, otherwise 1

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [7]:
from sklearn.metrics import accuracy_score

n_samples = X_train.shape[0]
n_estimators = 20
tree_params = {'max_depth': 1}
models = [DecisionTreeClassifier(**tree_params) for _ in range(n_estimators)]

#initially, we set our weight to 1/n
W = np.ones(shape=n_samples) / n_samples

#keep collection of a_j
a_js = np.zeros(shape=n_estimators)

for j, model in enumerate(models):
    
    #train weak learner
    model.fit(X_train, y_train, sample_weight=W)
    
    #compute the errors r_j
    y_pred = model.predict(X_train) 
    compare = y_pred != y_train  #this works since True * w = 1 * w
#     compare = np.array([int(x) for x in (y_pred != y_train)])
    
    r_j = np.sum(W * compare) / sum(W)
    
    #compute the predictor weight a_j
    #let eta to be 1 as adaboost default 
    #if predictor is doing well, a_j will be big
    eta = 1
    a_j = eta * np.log ((1 - r_j) / r_j)
    a_js[j] = a_j
    
    #update sample weight; divide sum of W to normalize
    W = (W * np.exp(a_j)) / sum (W)
    
        
#make weighted predictions
Hx = 0
for i, model in enumerate(models):
    y_pred = model.predict(X_test)
    Hx += a_js[i] * y_pred
    
y_pred = np.sign(Hx)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

          -1       0.80      0.93      0.86        68
           1       0.93      0.80      0.86        82

    accuracy                           0.86       150
   macro avg       0.86      0.87      0.86       150
weighted avg       0.87      0.86      0.86       150



# #3

In [8]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor

def mse(y, f):
    return y - f

def logit(y, f):
    return y - expit(f)

def fit(X, y, models, loss="mse"):
    
    if loss == "mse":
        loss_func = mse
    else:
        loss_func = logit
    
    models_trained = []
    
    #using DummyRegressor is a good technique for starting model
    first_model = DummyRegressor(strategy='constant', constant=0)
    first_model.fit(X, y)
    models_trained.append(first_model)
    
    #fit the estimators
    for i, model in enumerate(models):
        #predict using all the weak learners we trained up to
        #this point
        y_pred = predict(X, models_trained)
        
        #errors will be the total errors maded by models_trained
        residual = loss_func(y, y_pred)
        
        #fit the next model with residual
        model.fit(X, residual)
        
        models_trained.append(model)
        
    return models_trained
        
def predict(X, models):
    learning_rate = 0.1  ##hard code for now
    f0 = models[0].predict(X)  #first use the dummy model
    boosting = sum(learning_rate * model.predict(X) for model in models[1:])
    return f0 + boosting

def change_to_0_1(sample):
    return int(sample > 0.5)

def predict_class(X, models):
    probas = expit(predict(X, models))
    return np.array([change_to_0_1(proba) for proba in probas])

### Regression

In [9]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

X, y = load_boston(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, random_state=42)

n_estimators = 200
tree_params = {'max_depth': 1}
models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]

#fit the models
models = fit(X_train, y_train, models)

#predict
y_pred = predict(X_test, models)

#print metrics
print("Custom MSE: ", mean_squared_error(y_test, y_pred))


#Compare to sklearn: ls is the same as our mse
sklearn_model = GradientBoostingRegressor(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=1,
    loss='ls'
)

y_pred_sk = sklearn_model.fit(X_train, y_train).predict(X_test)

#print metrics
print("Sklearn MSE: ", mean_squared_error(y_test, y_pred_sk))


Custom MSE:  12.94555760240659
Sklearn MSE:  12.945557601580582


### Classification

In [10]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.3, random_state=42)

n_estimators = 200
tree_params = {'max_depth': 1}
models = [DecisionTreeRegressor(**tree_params) for _ in range(n_estimators)]

#fit the models
models = fit(X_train, y_train, models, loss="logistic")

#predict
y_pred = predict_class(X_test, models)

# #print metrics
print("Custom accuracy: ", accuracy_score(y_test, y_pred))

#Compare to sklearn: ls is the same as our mse
sklearn_model = GradientBoostingClassifier(
    n_estimators=n_estimators,
    learning_rate = 0.1,
    max_depth=1
)

y_pred_sk = sklearn_model.fit(X_train, y_train).predict(X_test)

#print metrics
print("Sklearn accuracy: ", accuracy_score(y_test, y_pred_sk))



Custom accuracy:  0.9532163742690059
Sklearn accuracy:  0.9649122807017544
