In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from random import seed
from random import randrange

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset CSV is read from this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Banknote authentication dataset: the goal is to predict whether a banknote is
# authentic from a number of measurements taken from a photograph of it.
# It is a binary classification problem: label 0 = authentic banknote, label 1 = fake one.
DATASET_CSV = '/content/drive/MyDrive/MachineLearning/datasets/data_banknote_authentication.csv'
banknoteData = pd.read_csv(DATASET_CSV)
# Last expression of the cell: rich preview of the first rows.
banknoteData.head()

Unnamed: 0,Variance of Wavelet Transformed image,Skewness of Wavelet Transformed image,Kurtosis of Wavelet Transformed image,Entropy of image,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [4]:
# Features: every column except the last one. Target: the last column.
X = banknoteData.iloc[:, :-1].to_numpy()
labels_01 = banknoteData.iloc[:, -1].to_numpy()

# AdaBoost works with labels in {-1, +1}, so class 0 is relabelled to -1
# (class 1 is left untouched).
y = np.where(labels_01 == 0, -1, labels_01)

In [5]:
# Hold out 20% of the samples for testing. The split is shuffled, so the seed is
# fixed: without it the partition (and every metric computed below) changes on
# each "Restart & Run All".
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
print('n samples in Train set: ', len(X_train))
print('n samples in Test set: ', len(X_test))

n samples in Train set:  1097
n samples in Test set:  275


# AdaBoost from scratch

*The idea is to rely on the boosting strategy, where a sequence of weak learners is combined into a stronger one; it is an ensemble method. By training the predictors sequentially, it is possible for each predictor to correct the previous one (its predecessor). \\
One way for a predictor to correct its predecessor is to pay a bit more attention to the training instances that the predecessor underfitted. In other words, the algorithm gives more weight to the misclassified training instances.
This results in a predictor that focuses more on the hard cases. \\
In AdaBoost the weak learners are usually decision trees of depth 1 (decision stumps).*

In [6]:
from sklearn.tree import DecisionTreeClassifier

n_weak_learners = 100
N = X_train.shape[0]

# The weighted error is clipped to [ERR_EPS, 1 - ERR_EPS] before it enters the
# log-ratio below. A stump that classifies every sample correctly has error 0,
# which would otherwise divide by zero, give an infinite alpha and turn the
# sample weights of all later rounds into NaN.
ERR_EPS = 1e-10

# Per-round bookkeeping:
#   sample_weights[i] -> weight vector used to fit stump i
#   stumps[i]         -> fitted stump of round i
#   stump_weights[i]  -> alpha of stump i (its say in the final vote)
#   errors[i]         -> weighted training error of stump i (unclipped)
sample_weights = np.zeros(shape=(n_weak_learners, N))
stumps = np.zeros(shape=n_weak_learners, dtype=object)
stump_weights = np.zeros(shape=n_weak_learners)
errors = np.zeros(shape=n_weak_learners)

#at the beginning every sample has the same weight (1/N)
sample_weights[0] = np.ones(shape=N)/N

#iterate over the M classifiers
for i in range(n_weak_learners):
  #fit a weak classifier on the current weights and predict the training labels
  w_i = sample_weights[i]
  #G_m is a decision tree of depth one called 'stump', i.e. a 2 terminal-node classification tree.
  #random_state is fixed so that fitting is reproducible from run to run.
  G_m = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2, random_state=0)
  G_m.fit(X_train, y_train, sample_weight=w_i)
  y_pred_st = G_m.predict(X_train)

  #weighted error: sum of the weights of the misclassified instances
  e_t = np.sum(w_i[y_pred_st != y_train])
  #alpha grows as the error shrinks; the clipped error keeps it finite
  e_safe = np.clip(e_t, ERR_EPS, 1 - ERR_EPS)
  alpha_m = 0.5*np.log((1 - e_safe)/e_safe)

  #y_train*y_pred_st is -1 for a misclassified sample and +1 for a correct one,
  #so misclassified samples are scaled by exp(+alpha) and correct ones by exp(-alpha)
  update_weights = w_i*np.exp(-alpha_m*y_train*y_pred_st)
  #renormalize so the weights sum to 1 again
  update_weights /= update_weights.sum()

  #store the weights for round i+1 unless this is the final iteration
  if i+1 < n_weak_learners:
    sample_weights[i+1] = update_weights

  stumps[i] = G_m
  stump_weights[i] = alpha_m
  errors[i] = e_t

**Test Phase**

In [12]:
#collect the test-set predictions of every stump, one row per stump
stump_preds = np.array([stump.predict(X_test) for stump in stumps])

#the final score is the linear combination of each stump prediction with its weight alpha_m
weighted_votes = np.dot(stump_weights, stump_preds)

#the predicted label is the sign of the score. np.sign would return 0 on an exact tie,
#which is not a valid class label, so ties are resolved explicitly to -1.
y_preds = np.where(weighted_votes > 0, 1, -1)

#accuracy computation
correct = np.sum(y_preds == y_test)
accuracy = correct/len(y_test)

cm = confusion_matrix(y_test, y_preds)
print('AdaBoost from scratch confusion matrix: \n', cm)
print('\n')
print('AdaBoost accuracy: ', accuracy)
print(classification_report(y_test, y_preds))

AdaBoost from scratch confusion matrix: 
 [[152   0]
 [  2 121]]


AdaBoost accuracy:  0.9927272727272727
              precision    recall  f1-score   support

          -1       0.99      1.00      0.99       152
           1       1.00      0.98      0.99       123

    accuracy                           0.99       275
   macro avg       0.99      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275



*Comparison with the Sklearn Model*

In [13]:
from sklearn.ensemble import AdaBoostClassifier

# Reference model: 100 depth-1 trees boosted with the discrete SAMME algorithm.
# learning_rate=0.5 scales each estimator weight log((1-err)/err) by 1/2, which
# mirrors the (1/2)*log((1-e_t)/e_t) alpha used in the from-scratch loop.
# random_state is fixed so the fitted ensemble is reproducible across runs.
adaBoost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=100,
    algorithm="SAMME", learning_rate=0.5, random_state=42)
#training (last expression of the cell, so the fitted estimator is displayed)
adaBoost.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME',
                   base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=100)

In [14]:
# Test phase: score the fitted scikit-learn ensemble on the held-out test set.
y_pred_sk = adaBoost.predict(X_test)

# Compute every metric first, then report them.
cm_sk = confusion_matrix(y_test, y_pred_sk)
accuracy_sk = accuracy_score(y_test, y_pred_sk)
report_sk = classification_report(y_test, y_pred_sk)

print('AdaBoost confusion matrix: \n', cm_sk)
print('\n')
print("Accuracy : ", accuracy_sk)
print(report_sk)

AdaBoost confusion matrix: 
 [[152   0]
 [  4 119]]


Accuracy :  0.9854545454545455
              precision    recall  f1-score   support

          -1       0.97      1.00      0.99       152
           1       1.00      0.97      0.98       123

    accuracy                           0.99       275
   macro avg       0.99      0.98      0.99       275
weighted avg       0.99      0.99      0.99       275

