In [1]:
import pandas as pd 
import numpy as np 
import math
from learners import weak, strong 
import copy 

In [2]:
#Pre-processing - changing to one-hot encoding 
#it is assumed that catagorical datasets have the first entry as the label, can easily change index for any given data 
df = pd.read_csv("agaricus-lepiota.data", header=None).drop(columns=11) #only column with missing data 
vals_col = {}
pro_data = []

for c in df.columns:
    uvals = df[c].unique()
    l = uvals.tolist()
    vals_col[c] = l
    
for r in df.iterrows(): 
    r = r[1].to_numpy()
    r = np.insert(r,11, 0) # circumvent indexing problems 
    encoded = []
    for i,feature in enumerate(r):
        if i != 11: #skip column
            if i == 0: #must have features either -1, 1 for loss function to work 
                if feature == 'p': 
                    encoded.append(-1.0)
                else: 
                    encoded.append(1.0)
            else: 
                one_hot = np.zeros(len(vals_col[i])) 
                one_hot[vals_col[i].index(feature)] = 1.0
                encoded.append(one_hot.tolist())
    encoded.insert(0, 1/8124) #set equal weights
    pro_data.append(encoded)
train = pro_data[:6092] #75, 25 test train split 
test = pro_data[6093:]

In [3]:
%%writefile learners.py
class strong: 
    def __init__(self): 
        self.ensemble = [] 
        
    def pred(self, dp): 
        output = 0
        for i in self.ensemble: 
            output += i.partition(dp)*i.weight
        if output > 0: 
            return 1.0
        if output < 0: 
            return -1.0 
        
class weak: 
    def __init__(self, mark, direct): 
        self.marker = mark
        self.weight = 0  
        self.dir = direct
        
    def partition(self, dp): 
        x = self.marker[0]
        y = self.marker[1]
        val = dp[x][y]
        if self.dir == 'pos': 
            return 1.0 if val == 1.0 else -1.0
        if self.dir == 'neg': 
            return -1.0 if val == 1.0 else 1.0

Overwriting learners.py


In [4]:
#Exponential Loss function/minimizer 
def exp_loss(data): 
    losses = []
    for k,feature  in enumerate(data[0][2:]):  #iterate over every feature 
        for coord in range(len(feature)):   #iterate over all possible vals of feature
            preds_neg = []
            preds_pos = []
            neg_err = 0
            pos_err = 0 
            for dp in data: #for each of these, make prediction on every datapoint, considering both dir of decision
                mark = [k+2, coord] #accounting for fact that first two entries are weight, label
                w_pos = weak(mark, 'pos' )
                w_neg = weak(mark, 'neg')
                preds_pos.append(w_pos.partition(dp))
                preds_neg.append(w_neg.partition(dp))
            for i in range(len(data)):    #compare with actual labels, calculating error based on incorrectness                    
                if preds_pos[i] != data[i][1]:
                    pos_err += data[i][0]  #add weights 
                if preds_neg[i] != data[i][1]: 
                    neg_err += data[i][0]
            losses.append([pos_err, mark, 'pos']) #store all losses along with identifier of which coord was used to classify
            losses.append([neg_err, mark, 'neg'])
    min_err = None       #find smallest error 
    for err in losses: 
        if min_err == None: 
            min_err = err 
        else: 
            if err[0] < min_err[0]:
                if err[0] == 0.0: # I have no clue how this is even possible 
                    #print('uh oh')
                    continue 
                else: 
                    min_err = err 
                         #calculate weight of new weak learner, then instantiate and add to ensemble 
    alph = .5 * math.log((1-min_err[0])/min_err[0])
    new = weak(min_err[1], min_err[2])
    new.weight = alph 
                        #for updating data point weight
    return data, new   
            

In [5]:
#Weight updater 
def data_weight(data, new): 
    nw_raw = 0 
    for dp in data: #update 
        dp[0] = dp[0] * math.exp(-dp[1] * new.weight * new.partition(dp))
        nw_raw += dp[0]
    for dp in data: #normalize
        dp[0] = dp[0] / nw_raw
    return data

In [6]:
def conf(sl, test): 
    mat = np.array([[0,0],  #rows denote true values(top=1.0), cols denote predicted(LHS=1.0)
                  [0,0]])   #(1) - edible, (-1) - inedible 
    true_pos = 0 
    true_neg = 0
    false_pos = 0
    false_neg = 0 
    for dp in test:
        label = dp[1]
        pred = sl.pred(dp)
        if label == 1.0:
            if pred == 1.0:
                mat[0, 0] += 1  # True positive
            else:
                mat[0, 1] += 1  # False negative
        if label == -1.0:
            if pred == -1.0: 
                mat[1, 1] += 1  # True negative
            else: 
                mat[1, 0] += 1  # False positive
    return mat 

In [7]:
def train_test(tr, te):
    SL = strong()
    iterr = 0 
    best_SL = strong()
    best_acc = 0 
    while iterr <= 100: 
        if iterr == 0:  
            data,new_learner = exp_loss(tr)
        else: 
            data,new_learner = exp_loss(weighted_data)
        SL.ensemble.append(new_learner)
        weighted_data = data_weight(data, new_learner)
        if iterr%5 == 0: 
            c = conf(SL, te)
            print(f'Confusion matrix for {iterr} weak learners in the ensemble: \n{c}\n\n')
            acc = c[0,0] + c[1,1]
            if acc> best_acc: 
                best_acc = acc
                best_SL.ensemble = SL.ensemble.copy()
        iterr += 1 
    print(f'The most accurate Adaboost with decision stumps model is that with {len(best_SL.ensemble)} weak learners \n having an accuracy of {best_acc/8124}%' )
    return best_SL

In [8]:
train_test(train, test)

Confusion matrix for 0 weak learners in the ensemble: 
[[ 525    0]
 [1006  500]]


Confusion matrix for 5 weak learners in the ensemble: 
[[ 525    0]
 [  44 1462]]


Confusion matrix for 10 weak learners in the ensemble: 
[[ 525    0]
 [   8 1498]]


Confusion matrix for 15 weak learners in the ensemble: 
[[ 525    0]
 [   8 1498]]




KeyboardInterrupt: 