In [1]:
import pandas as pd

train = pd.read_csv("p2_data/data_train.csv", header=None)
target = train.shape[1]-1
y = train[target].to_numpy()
X = train.drop(columns=[target]).to_numpy()

test = pd.read_csv("p2_data/data_test.csv", header=None)
y_test = test[target]
X_test = test.drop(columns=[target])

A)<br/>
Implement Random Forest:
1. It gets the number of trees (n) and their height
2. Samples the dataset with replacement(bootstrap) n times
3. For each tree randomly selects a few features
4. Train all trees
5. sum the predictions of all trees and use argmax

In [2]:
from sklearn import tree
import random
from sklearn.utils import resample
import numpy as np

class RandomForest:
    def __init__(self, n, depth):
        self.n = n
        self.depth = depth
        self.trees = []
        self.features = []
        self.samplesX = []
        self.samplesy = []
        self.n_classes = 2
        self.build()
        
    def build(self):
        for i in range(self.n):
            clf = tree.DecisionTreeClassifier(max_depth=self.depth)
            self.trees.append(clf)
            
    def fit(self, X, y, n):
        self.n_classes = len(np.unique(y))
        for i in range(self.n):
            self.bootstrap(X, y)
            self.random_features(n, X.shape[1]-1)
            self.trees[i] = self.trees[i].fit(self.samplesX[i][:, self.features[i]], self.samplesy[i])
            
    def predict(self, X):
        predictions = np.zeros(shape=(len(X), self.n_classes))
        for i in range(self.n):
            predictions = np.add(predictions,self.trees[i].predict_proba(X.iloc[:, self.features[i]]))
            
        return np.argmax(predictions, axis=1)

    def bootstrap(self, X, y):
        r = random.randint(0, 1000)
        bootX = resample(X, replace=True, n_samples=len(X), random_state=r)
        booty = resample(y, replace=True, n_samples=len(X), random_state=r)
        self.samplesX.append(bootX)
        self.samplesy.append(booty)
        
    def random_features(self, n, n_features):
        features = []
        for i in range(n):
            features.append(random.randint(0, n_features))
        self.features.append(features)


In [3]:
clf = RandomForest(15, 3)
clf.fit(X, y, 3)

In [4]:
print(y_test)
predictions = clf.predict(X_test)
print(predictions)

0       8
1       8
2       8
3       9
4       9
       ..
3493    4
3494    2
3495    0
3496    0
3497    4
Name: 16, Length: 3498, dtype: int64
[8 8 0 ... 0 0 4]


In [5]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.7375643224699828

In [6]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[334,   2,   0,   0,   0,   0,   0,   1,  26,   0],
       [  0, 189, 149,  17,   3,   0,   3,   0,   0,   3],
       [  0,  10, 346,   3,   0,   0,   0,   5,   0,   0],
       [  0,   8,   0, 318,   0,   0,   1,   8,   0,   1],
       [  0,   1,   0,   0, 357,   0,   3,   0,   0,   3],
       [  4,   1,   0, 126,  11, 156,   8,   0,   8,  21],
       [  3,   0,   0,   3,   3,   0, 324,   3,   0,   0],
       [  0,  53,   8,   1,   4,   0,   4, 283,  11,   0],
       [ 85,   0,   0,   0,   0,  36,   1,  14, 200,   0],
       [  0,  46,   0, 176,  33,   0,   4,   3,   1,  73]])

B)<br/>
Implement AdaBoost:<br/>
In this implementation I found argmax more meaningfull than sign method because we are facing a multiclass problem.

In [7]:
from sklearn import tree
import random
from sklearn.utils import resample
import numpy as np

class AdaBoost:
    def __init__(self, n):
        self.n = n
        self.trees = [] 
        self.alphas = []
        self.build()
        
    def build(self):
        for i in range(self.n):
            stump = tree.DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
            self.trees.append(stump)
            
    def fit(self, X, y):
        n_samples, _ = X.shape
        
        # init weight
        w = np.full(n_samples, (1/n_samples))
        
        for i in range(self.n):
            self.trees[i] = self.trees[i].fit(X, y, sample_weight=w)
            predictions = self.trees[i].predict(X)
            
            missclassified = w[y != predictions]
            error = sum(missclassified)
            
            EPS = 1e-10
            alpha = 0.5 * np.log((1-error) / (error+EPS))
            self.alphas.append(alpha)
           
            w *= np.exp(-alpha * y * predictions)
            w /= np.sum(w)
            
            
    def predict(self, X):
        predictions = np.zeros(shape=(X.shape[0], 10))
        for i in range(self.n):
            predictions = np.add(predictions, self.alphas[i] * self.trees[i].predict_proba(X))

        return np.argmax(predictions, axis=1)


In [8]:
clf = AdaBoost(10)
clf.fit(X, y)

predictions = clf.predict(X_test)

accuracy_score(y_test, predictions)

0.10377358490566038

The AdaBoost algorithm I implemented predicts every sample zero. After some searching I found out that after traing a few trees, there is one tree that predicts every sample zero and from then the error doesn't change much so the weights don't change either and all the trees after this predict only zero.<br/>

C)

In [None]:
clf_5 = AdaBoost(5)
clf_5.fit(X, y)

predictions = clf_5.predict(X_test)
print(accuracy_score(y_test, predictions))

clf_20 = AdaBoost(20)
clf_20.fit(X, y)

predictions = clf_20.predict(X_test)
print(accuracy_score(y_test, predictions))

clf_50 = AdaBoost(50)
clf_50.fit(X, y)

predictions = clf_50.predict(X_test)
print(accuracy_score(y_test, predictions))

0.19925671812464266
0.10377358490566038


D)

XGBoost is an implementation of gradient boosting so explaining gradient boosting also explains xgboost.<br/>
XGBoost classifier seems like AdaBoost classifier.The major difference between AdaBoost and Gradient Boosting Algorithm is how the two algorithms identify the shortcomings of weak learners (eg. decision trees). While the AdaBoost model identifies the shortcomings by using high weight data points, gradient boosting performs the same by using gradients in the loss function. The loss function is a measure indicating how good model’s coefficients are at fitting the underlying data.

In [None]:
from xgboost import XGBClassifier


model = XGBClassifier()
model.fit(X, y)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))