In [1]:
import pandas as pd

train = pd.read_csv("p2_data/data_train.csv", header=None)
target = train.shape[1]-1
y = train[target].to_numpy()
X = train.drop(columns=[target]).to_numpy()

test = pd.read_csv("p2_data/data_test.csv", header=None)
y_test = test[target]
X_test = test.drop(columns=[target])


A)<br/>
Implement Random Forest:
1. It gets the number of trees (n) and their height
2. Samples the dataset with replacement(bootstrap) n times
3. For each tree randomly selects a few features
4. Train all trees
5. sum the predictions of all trees and use argmax

In [30]:
from sklearn import tree
import random
from sklearn.utils import resample
import numpy as np

class RandomForest:
    def __init__(self, n, depth):
        self.n = n
        self.depth = depth
        self.trees = []
        self.features = []
        self.samplesX = []
        self.samplesy = []
        self.n_classes = 2
        self.build()
        
    def build(self):
        for i in range(self.n):
            clf = tree.DecisionTreeClassifier(max_depth=self.depth)
            self.trees.append(clf)
            
    def fit(self, X, y, n):
        self.n_classes = len(np.unique(y))
        for i in range(self.n):
            self.bootstrap(X, y)
            self.random_features(n, X.shape[1]-1)
            self.trees[i] = self.trees[i].fit(self.samplesX[i][:, self.features[i]], self.samplesy[i])
            
    def predict(self, X):
        predictions = np.zeros(shape=(len(X), self.n_classes))
        for i in range(self.n):
            predictions = np.add(predictions,self.trees[i].predict_proba(X.iloc[:, self.features[i]]))
            
        return np.argmax(predictions, axis=1)

    def bootstrap(self, X, y):
        r = random.randint(0, 1000)
        bootX = resample(X, replace=True, n_samples=len(X), random_state=r)
        booty = resample(y, replace=True, n_samples=len(X), random_state=r)
        self.samplesX.append(bootX)
        self.samplesy.append(booty)
        
    def random_features(self, n, n_features):
        features = []
        for i in range(n):
            features.append(random.randint(0, n_features))
        self.features.append(features)


In [31]:
clf = RandomForest(15, 3)
clf.fit(X, y, 3)

In [32]:
print(y_test)
predictions = clf.predict(X_test)
print(predictions)

0       8
1       8
2       8
3       9
4       9
       ..
3493    4
3494    2
3495    0
3496    0
3497    4
Name: 16, Length: 3498, dtype: int64
[8 8 0 ... 0 0 4]


In [33]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.7655803316180675

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[311,   6,   0,   0,   1,   0,  17,   3,  25,   0],
       [  0, 144, 156,  59,   2,   0,   2,   0,   0,   1],
       [  0,   4, 357,   0,   0,   0,   1,   2,   0,   0],
       [  0,   6,   1, 326,   0,   0,   0,   2,   0,   1],
       [  0,   3,   0,   0, 355,   0,   5,   0,   0,   1],
       [  1,   0,   1,  98,   3, 166,  20,   0,   1,  45],
       [  2,   0,   2,   0,   2,   0, 325,   5,   0,   0],
       [  4,  32,  11,   1,  20,   0,   0, 293,   3,   0],
       [ 28,   0,   0,   0,   0,   6,   1,  23, 277,   1],
       [  0,  39,   0, 147,  19,   1,   6,   0,   0, 124]])

B)<br/>
Implement AdaBoost

In [35]:
from sklearn import tree
import random
from sklearn.utils import resample
import numpy as np

class AdaBoost:
    def __init__(self, n):
        self.n = n
        self.trees = [] 
        self.alphas = []
        self.build()
        
    def build(self):
        for i in range(self.n):
            stump = tree.DecisionTreeClassifier(max_depth=1)
            self.trees.append(stump)
            
    def fit(self, X, y):
        n_samples, _ = X.shape
        
        # init weight
        w = np.full(n_samples, (1/n_samples))
        
        for i in range(self.n):
            self.trees[i] = self.trees[i].fit(X, y, sample_weight=w)
            predictions = self.trees[i].predict(X)

#             print(predictions)
            
            missclassified = w[y != predictions]
            error = sum(missclassified)
            
            print(np.sum(w[y == predictions]))
            print(np.sum(w[y != predictions]))
            print("++++++++++++")
            
            EPS = 1e-10
            alpha = 0.5 * np.log((1-error) / (error+EPS))
            self.alphas.append(alpha)
           
            w *= np.exp(-alpha * y * predictions)
            w /= np.sum(w)
            
            
    def predict(self, X):
        predictions = np.array([])
        for i in range(self.n):
            np.append(predictions, self.alphas[i] * self.trees[i].predict_proba(X))
        print(predictions)
        
        y_pred = np.sum(predictions, axis=0)
        return np.sign(predictions)

In [36]:
clf = AdaBoost(10)
clf.fit(X, y)

# print(y)
predictions = clf.predict(X_test)
# print(predictions)

# accuracy_score(y, predictions)

0.2068321323725647
0.7931678676274352
++++++++++++
0.902268603311021
0.09773139668897887
++++++++++++
0.9999881625985556
1.1837401444314504e-05
++++++++++++
0.9999914273824037
8.572617596505707e-06
++++++++++++
0.9999919703297782
8.029670221883996e-06
++++++++++++
0.9999919687556711
8.03124432881917e-06
++++++++++++
0.9999919687556711
8.03124432881917e-06
++++++++++++
0.9999919687556711
8.03124432881917e-06
++++++++++++
0.9999919687556711
8.03124432881917e-06
++++++++++++
0.9999919687556711
8.03124432881917e-06
++++++++++++
[]


C)

In [37]:
clf_5 = AdaBoost(5)
clf_5.fit(X, y)

predictions = clf_5.predict(X_test)
accuracy_score(y_test, predictions)

clf_20 = AdaBoost(20)
clf_20.fit(X, y)

predictions = clf_20.predict(X_test)
accuracy_score(y_test, predictions)

clf_50 = AdaBoost(50)
clf_50.fit(X, y)

predictions = clf_50.predict(X_test)
accuracy_score(y_test, predictions)

0.2068321323725647
0.7931678676274352
++++++++++++
0.902268603311021
0.09773139668897887
++++++++++++
0.9999881625985556
1.1837401444314504e-05
++++++++++++
0.9999914273824037
8.572617596505707e-06
++++++++++++
0.9999919703297782
8.029670221883996e-06
++++++++++++
[]


ValueError: Found input variables with inconsistent numbers of samples: [3498, 0]

D)

In [38]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X, y)

y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

ModuleNotFoundError: No module named 'xgboost'