# Import Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
from bisect import bisect_left

df = pd.read_csv('~/machine_learning/ML/iris.csv')
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


# Pre-Processing

Change `variety` column to numeric type and then get rid of third class to make this a binary classification algorithm. Also drop various feature columns to make prediction more difficult.

In [2]:
df['variety'] = df['variety'].map({'Setosa': -1,
                                  'Versicolor': 1,
                                  'Virginica':2},
                                 na_action=None)

df.drop(columns=['sepal.width', 'petal.width','petal.length'], inplace=True)

df = df[df['variety'] < 2]
df = df.sample(frac=1)

df.head()

Unnamed: 0,sepal.length,variety
15,5.7,-1
50,7.0,1
58,6.6,1
78,6.0,1
49,5.0,-1


# Base Classifier (Decision Tree Stumps)

### TreeNode Class



#### Attributes
+ `left`
    + points to the tree that is less than or equal to the threshold value
+ `right`
    + points to the tree that is greater than the threshold value
+ `feature`
    + holds the feature (columns) that the data was split
+ `threshold`
    + holds the specific value of the feature for splitting
+ `info_gain`
    + information gained by the decision node
+ `value`
    + if TreeNode is a leaf than this holds the majority class and prediction of the leaf, otherwise `None`
+ `observations`
    + this holds all the observations of the leaves' data

In [3]:
class TreeNode():
    def __init__(self, left=None, right=None, feature=None, threshold=None, info_gain=None, 
                 value=None, observations=None):
        
        self.threshold=threshold
        self.feature=feature
        self.right=right
        self.left=left
        self.info_gain=info_gain

        # Used for the leaf node
        self.value=value
        self.observations = observations
        

### DecisionTree Class



#### Attributes
+ `max_depth`
    + hperparameter to control how deep the tree is allowed to grow
+ `alpha`
    + this is the *amount of say* that this decision tree will have in the final prediction
+ `root`
    + points to the root of the DecisionTree
    
#### Methods
+ `dfs_build`
    + recursively builds the decision tree until the tree no longer contains enough samples or becomes too deep
+ `find_best_split`
    + Calculates all possible splits for a given dataframe and selects the feature and threshold with the largest information gain
+ `gain_from_gini`
    + calculates information gain from parent and children gini scores
+ `get_gini_score`
    + self-explanatory
+ `leafnode_value`
    + calculates the majority label of the given dataset and sets the TreeNode's `value` parameter to this majority label
+ `fit`
    + trains the decision tree classifier
+ `predict`
    + predicts the label of a test observation by traversing the DecisionTree until it reaches a leaf node and then outputs the leaf node's `value` attribute 
+ `set_alpha`
    + sets the alpha value of the `DecisionTree` to be used in the Adaboost prediction method

In [4]:
class DecisionTree():
    def __init__(self, max_depth=1):
        self.max_depth=max_depth

        self.root=None
        self.alpha=None
    
    def dfs_build(self, df, depth=0):
        X = df.iloc[:, :-2].to_numpy()
        Y = df.iloc[:, -2].to_numpy()
        num_samples, _ = np.shape(X)
        if depth < self.max_depth:
            node_info = self.find_best_split(df)
            if "info_gain" in node_info and node_info["info_gain"] > 0:
                left_subtree = self.dfs_build(node_info["left_df"], depth+1)
                right_subtree = self.dfs_build(node_info["right_df"], depth+1)
                return TreeNode(left_subtree, 
                               right_subtree, 
                               node_info["feature"], 
                               node_info["threshold"], 
                               node_info["info_gain"])
        leaf_value = self.leafnode_value(Y)
        return TreeNode(value = leaf_value, observations = df)
    
    def find_best_split(self, df):
        max_info_gain = -float("inf")
        TreeNode_stats = {}
        for col_index, feature in enumerate(df.columns):
            if col_index == len(df.columns) - 2:
                break
            threshold_vals = df[feature].unique()
            for threshold in threshold_vals:
                left_df = df[df[feature] <= threshold]
                right_df = df[df[feature] > threshold]
                if left_df.empty or right_df.empty:
                    continue
                G = self.gain_from_gini(left_df.iloc[:,-2].to_numpy(), 
                                        right_df.iloc[:,-2].to_numpy(), 
                                        df.iloc[:,-2].to_numpy(), 
                                        threshold)
                if G > max_info_gain:
                    max_info_gain = G
                    TreeNode_stats["info_gain"] = G
                    TreeNode_stats["feature"] = col_index 
                    TreeNode_stats["threshold"] = threshold
                    TreeNode_stats["left_df"] = left_df
                    TreeNode_stats["right_df"] = right_df
                    
        return TreeNode_stats
    
    def gain_from_gini(self, left_data, right_data, labels, threshold):
        
        samples = len(left_data) + len(right_data)
        weight_left = len(left_data) / samples
        weight_right = len(right_data) / samples

        parent_gini = self.get_gini_score(labels)
        leftchild_gini = self.get_gini_score(left_data)
        rightchild_gini = self.get_gini_score(right_data)

        gain = parent_gini - ((weight_left * leftchild_gini) + (weight_right * rightchild_gini))
        return round(gain,2)
    
    def get_gini_score(self, Y):
        class_labels = np.unique(Y)
        gini_score = 0
        for label in class_labels:
            label_prob = len(Y[Y == label]) / len(Y)
            gini_score +=label_prob ** 2
        return 1. - gini_score
    
    def leafnode_value(self, labels):
        pos = 0
        for val in labels:
            if val > 0:
                pos += 1
        neg = len(labels) - pos
        return 1 if pos >= neg else -1
    
    def fit(self, df):
        self.root = self.dfs_build(df)
        
    def set_alpha(self, A):
        self.alpha = A
        
    def predict(self, test, Tree):
        ptr = Tree.root
        while ptr.value is None:
            index = ptr.feature
            if test[index] <= ptr.threshold:
                ptr = ptr.left
            else:
                ptr = ptr.right
        return ptr.value

# Adaboost Classifier

### Adaboost Class



#### Attributes
+ `num_trees`
    + sets the number of trees that the classifier will use to predict
+ `trees`
    + array holding all the trees that are created (used in prediction)
    
#### Methods
+ `fit`
    + trains the Adaboost tree classifier
+ `newly_distributed`
    + creates a new dataframe based on the new sample weights of the previous tree's error values (selecting from a distribution of those error updated weights)
+ `update_weights`
    + updates the weights of the based on the error and alpha values of the `DecisionTree`
+ `tree_alpha`
    + calculates the alpha of the `DecisionTree` based on the errors predicted by the tree
+ `error`
    + calculates the error of the `TreeNode` (leaf node)
+ `predict`
    + makes a prediction based on the array of decision trees created and their corresponding alpha values, then essentially takes the *argmax* of the decision tree outputs except for both negative and positive values

In [5]:
class Adaboost():
    def __init__(self, num_trees=100):
        self.num_trees = num_trees

        self.trees = []

    def fit(self, df):
        for _ in range(self.num_trees):
            Tree = DecisionTree(max_depth=1)
            Tree.fit(df)
            A = self.tree_alpha(Tree)
            Tree.set_alpha(A)
            ptr = Tree.root
            left_df = self.update_weights(ptr.left.value, Tree.alpha, ptr.left.observations)
            right_df = self.update_weights(ptr.right.value, Tree.alpha, ptr.right.observations)
            new_arr = np.concatenate((left_df, right_df))
            normal = new_arr[:, -1].sum()
            new_arr[:, -1] = new_arr[:, -1] / normal
            new_arr = self.newly_distributed(new_arr)
            df = pd.DataFrame(new_arr, columns = df.columns)
            df['sample_weights'] = 1. / len(df)
            self.trees.append(Tree)
            
    def newly_distributed(self, matrix):
        running_sum = 0
        new = empty_array = np.empty((0, 3), int)
        for i, wt in enumerate(matrix[:, -1]):
            matrix[i, -1] = wt + running_sum
            running_sum += wt
        for i in range(len(matrix)):
            rand = np.random.uniform(low=0.0, high=1.0, size=None)
            index = bisect_left(matrix[:, -1], rand)
            new = np.append(new, np.array([matrix[index]]), axis=0)
        return new

    def update_weights(self, majority_label, alpha, data):
        matrix = data.to_numpy()
        for i, wt in enumerate(matrix[:, -1]):
            if matrix[i, -2] == majority_label:  
                matrix[i][-1] = wt * np.exp(-alpha)
            else:
                matrix[i][-1] = wt * np.exp(alpha)
        return matrix
    def tree_alpha(self, tree):
        Root = tree.root
        err = self.error(Root.left) + self.error(Root.right)
        return 0.5 * np.log((1.0 - err) / err)

    def error(self, child):
        majority_label = child.value
        matrix = child.observations.to_numpy()
        targets = matrix[:, -2]
        correct = 0
        for target in targets:
            if target == majority_label:
                correct += 1
        return (len(targets) - correct) / len(targets)
    
    def predict(self, df_pred):
        predictions = df_pred.to_numpy()
        res = []
        for row in predictions:
            forest_preds = []
            for tree in self.trees:
                pred = tree.predict(row, tree)
                forest_preds.append(pred * tree.alpha)
            res.append(1 if sum(forest_preds) >= 0 else -1)
        return res

# Shuffle and Split the Dataset

Shuffle the data using pandas df methods(in reality should use scikit-learn's *train_test_split* but chose to keep the theme consistent of not using any ML libraries)

In [6]:
# Shuffle the data and add a weights column
train_set = df.copy().iloc[10:, ]
test_set = df.copy().iloc[:10,]
train_set['sample_weights'] = 1. / len(train_set)

In [7]:
train_set.head()

Unnamed: 0,sepal.length,variety,sample_weights
79,5.7,1,0.011111
31,5.4,-1,0.011111
67,5.8,1,0.011111
57,4.9,1,0.011111
60,5.0,1,0.011111


In [8]:
test_set.head()

Unnamed: 0,sepal.length,variety
15,5.7,-1
50,7.0,1
58,6.6,1
78,6.0,1
49,5.0,-1


# Train the Classifier

In [9]:
classifier = Adaboost(num_trees = 10)
classifier.fit(train_set)

# Make Predictions

In [30]:
preds = classifier.predict(test_set)
targs = test_set['variety'].to_numpy()

In [31]:
def accuracy(preds, targs):
    running_sum = 0
    for prediction, target in zip(preds, targs):
        if prediction == target:
            running_sum += 1
    return float(running_sum) / len(targs)

print(accuracy(preds, targs))

0.9


# Compare with Scikit-Learn Adaboost Implementation

In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

In [11]:
AdaModel = AdaBoostClassifier(n_estimators=10, random_state=1)

In [22]:
feat = train_set.iloc[:, 0].to_numpy().reshape(-1, 1)
lab = train_set.iloc[:, 1].to_numpy()

test_feat = test_set.iloc[:, 0].to_numpy().reshape(-1,1)

In [20]:
model = AdaModel.fit(feat, lab)

In [23]:
y_pred = model.predict(test_feat)

In [25]:
test_lab = test_set.iloc[:, 1].to_numpy()
test_lab

array([-1,  1,  1,  1, -1, -1, -1, -1, -1,  1])

In [29]:
# Model Accuracy
print(f"Accuracy: {metrics.accuracy_score(test_lab, y_pred)}")

Accuracy: 0.9
