In [13]:
import pandas as pd
import numpy as np
from time import time
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
""" Read data to be used """
# The example runs on the spambase spam-detection dataset (comma-separated file).
data = pd.read_csv('spambase.csv', sep=',')
data.head()

""" Some preprocessing on data """
# Total number of columns in the raw table
m = data.shape[1]
# Inputs are the first 48 columns; the target is the final column.
# Slicing (rather than a scalar index) keeps y as a one-column DataFrame.
X = data.iloc[:, :48]
y = data.iloc[:, m - 1:]

# Pass the inputs through pandas' one-hot encoder
X_onehot = pd.get_dummies(X)

In [15]:
from sklearn.model_selection import train_test_split
""" Splitting training and testing data """
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_onehot,
    y,
    test_size=0.25,
    random_state=33,
)

In [16]:
# Report the mean of the target column in each split.
# NOTE: np.mean yields a fraction (e.g. 0.40), not a percentage, despite the "%" in the label.
# Presumably the labels are 0/1 so the mean is the spam share — verify against the dataset.
print("% of spams (train): ", np.mean(y_train))
print("% of spams (test): ", np.mean(y_test))

% of spams (train):  Spam    0.402899
dtype: float64
% of spams (test):  Spam    0.367507
dtype: float64


In [6]:
""" Import BRF classifiers """
from BoostedRandomForest import BoostedRandomForest
# Example usage
#brf = BoostedRandomForest()
#rf = BoostedRandomForest(weight_update=False, boosting=False)
#brf_wout_update = BoostedRandomForest(weight_update=False)

In [7]:
""" Default parameters """
# Baseline keyword arguments shared by the BoostedRandomForest experiments.
brf_params = dict(
    T=250,                # max number of trees
    depth_max=20,         # presumably the per-tree depth limit — confirm in BoostedRandomForest
    weight_update=True,
    boosting=True,
)


In [None]:
""" Without Early Stopping """
# Take an independent copy of the defaults. A plain assignment would only alias
# brf_params, so in-place edits made through any other *_params name would
# silently change this configuration as well.
brf_nonstop_params = dict(brf_params)
brf_nonstop_clf = BoostedRandomForest(**brf_nonstop_params)

In [None]:
# Train the "without early stopping" forest on the training split.
brf_nonstop_clf.fit(X_train, y_train)

In [None]:
# Pull the recorded training history off the fitted forest:
# training accuracies, epsilons and alphas.
brf_train_accs = brf_nonstop_clf.train_accs
brf_eps = brf_nonstop_clf.all_eps
brf_alphas = brf_nonstop_clf.all_alphas

# Show how many entries each record holds, one count per line.
print(len(brf_train_accs), len(brf_eps), len(brf_alphas), sep="\n")

In [None]:
""" Early Stopping with eps=[1e-20, 0.5]"""
# Build this configuration as a fresh dict layered on top of the defaults.
# Assigning brf_params directly would alias it, and the key assignments would
# then rewrite the shared defaults used by the other experiments.
brf_es_params = {
    **brf_params,
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Disabled here so performance after the early-stop point can still be evaluated
    'early_stop': False,
}

brf_es_clf = BoostedRandomForest(**brf_es_params)

In [None]:
# Train the forest configured with the eps bounds on the training split.
brf_es_clf.fit(X_train, y_train)

In [None]:
# Gather everything the fitted forest recorded during training.
brf_train_accs = brf_es_clf.train_accs   # training accuracies
brf_clf_ids = brf_es_clf.clf_ids         # ids of saved trees
brf_eps = brf_es_clf.all_eps             # epsilons
brf_alphas = brf_es_clf.all_alphas       # alphas
stop_index = brf_es_clf.stop_index       # stop index reported by the classifier

# Report the size of each record, then the stop index.
print("Len of accs: ", len(brf_train_accs))
print("Len of ids: ", len(brf_clf_ids))
print("Len of eps: ", len(brf_eps))
print("Len of alphs: ", len(brf_alphas))
print("Stop index: ", stop_index)

In [None]:
# Twin-axis figure: eps on the left y-axis, alphas on the right y-axis,
# both plotted against the tree index.
fig1, ax1 = plt.subplots()
ax1.set_title("Error rate vs Tree weights")
ax1.set_xlabel('Trees across Training')

# Left axis: epsilons
ax1.set_ylabel('eps', color='tab:orange')
ax1.plot(brf_eps, color='tab:orange')
ax1.tick_params(axis='y', labelcolor='tab:orange')

# Right axis (shares x with ax1): alphas
ax2 = ax1.twinx()
ax2.set_ylabel('alphas', color='tab:blue')
ax2.plot(brf_alphas, color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

# Horizontal guides at the configured eps bounds
ax1.axhline(y=brf_es_params['eps_ub'], color="tab:grey", label="eps_ub")
ax1.axhline(y=brf_es_params['eps_lb'], color="tab:grey", label="eps_lb")

# Vertical guide at the stop index
ax1.axvline(x=stop_index, color='tab:green', label="early_stop")

# Legend is anchored outside the plotting area, to the right
ax1.legend(loc='center right', bbox_to_anchor=(1.37, 0.9))
fig1.subplots_adjust(right=0.4)
fig1.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

In [None]:
# Training accuracy per tree, with a marker at the stop index.
fig, ax1 = plt.subplots()
fig.suptitle("Boosted Random Forest")
ax1.set_title("Accuracy vs #Trees")
ax1.set_xlabel('Trees across Training')
ax1.set_ylabel('Acc')

# Accuracy curve
ax1.plot(brf_train_accs, color='tab:blue', label="Acc")
ax1.tick_params(axis='y')

# Vertical line at the stop index
ax1.axvline(x=stop_index, color='red', label="early_stop")

ax1.legend()
plt.show()


In [11]:
""" Mean performance without early stop """
# Copy the shared defaults instead of aliasing them, so in-place edits made to
# any other parameter dict cannot change this experiment's configuration.
# NOTE(review): early_stop is not set here, so the BoostedRandomForest default
# applies — confirm that default really means "no early stop".
brf_ns_mean_params = dict(brf_params)

In [12]:
# Per-trial records: accuracy history, number of trees, and fit duration
brf_ns_accs, brf_ns_tree_cnts, brf_ns_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)

    # A fresh classifier per trial; only fit() is timed
    brf_ns_mean_clf = BoostedRandomForest(**brf_ns_mean_params)
    fit_began = time()
    brf_ns_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_began

    # Store this trial's results
    brf_ns_accs.append(brf_ns_mean_clf.train_accs)
    brf_ns_tree_cnts.append(len(brf_ns_mean_clf.clfs))
    brf_ns_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

Iteration:  0
Time used from start:  111.13352823257446
Iteration:  1


KeyboardInterrupt: 

In [None]:
# Find out forest with fewest tree
brf_ns_min_tree_cnt = min(brf_ns_tree_cnts)
print("Min #Tree: ", brf_ns_min_tree_cnt)

brf_ns_mean_tree_cnt = np.mean(brf_ns_tree_cnts)
print("Mean #Tree: ", brf_ns_mean_tree_cnt)

brf_ns_mean_time = np.mean(brf_ns_times)
print("Mean time used: ", brf_ns_mean_time)

# Trim records by min_tree_cnt
trimmed_accs = [ accs[:min_tree_cnt] for accs in brf_ns_accs ]
brf_ns_mean_accs = np.mean(trimmed_accs, axis=0)

print(brf_ns_mean_accs)


In [None]:
# Align

# Collect the averaged accuracies in a DataFrame.
# The dict literal previously ended in a dangling '' entry, which is a syntax error.
# TODO: add any further result columns and the CSV export once they are decided.
brf_ns_df = pd.DataFrame({'Mean_Acc': brf_ns_mean_accs})

In [27]:
""" Mean performance with early stop """
# Layer the early-stopping settings on a private copy of the defaults.
# Assigning brf_params directly would alias it: these keys would be written into
# the shared defaults, and any weight_update/boosting change made through
# another alias would silently apply to this experiment too.
brf_es_mean_params = {
    **brf_params,
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Enable early stopping
    'early_stop': True,
}

In [None]:
# Per-trial records: accuracy history, number of trees, and fit duration
brf_es_accs, brf_es_tree_cnts, brf_es_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)

    # A fresh classifier per trial; only fit() is timed
    brf_es_mean_clf = BoostedRandomForest(**brf_es_mean_params)
    fit_began = time()
    brf_es_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_began

    # Store this trial's results
    brf_es_accs.append(brf_es_mean_clf.train_accs)
    brf_es_tree_cnts.append(len(brf_es_mean_clf.clfs))
    brf_es_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

Iteration:  0


In [None]:
# Aggregate the trial records: smallest forest, mean forest size, mean fit time
brf_es_min_tree_cnt = min(brf_es_tree_cnts)
brf_es_mean_tree_cnt = np.mean(brf_es_tree_cnts)
brf_es_mean_time = np.mean(brf_es_times)

print("Min #Tree: ", brf_es_min_tree_cnt)
print("Mean #Tree: ", brf_es_mean_tree_cnt)
print("Mean time used: ", brf_es_mean_time)

# Cut each accuracy history to the smallest forest's length so that all
# histories line up and can be averaged position by position.
trimmed_accs = [history[:brf_es_min_tree_cnt] for history in brf_es_accs]
brf_es_mean_accs = np.mean(trimmed_accs, axis=0)

print(brf_es_mean_accs)



In [None]:
# Compare the mean accuracy curves of the non-stop and early-stop runs.
fig, ax1 = plt.subplots()
fig.suptitle("Boosted Random Forest")
ax1.set_title("With vs W/out Early Stops")
ax1.set_xlabel('Trees across Training')
ax1.set_ylabel('Acc')
ax1.tick_params(axis='y')

# Non-stop curve
ax1.plot(brf_ns_mean_accs, color='tab:blue', label="Non-stop")

# Early-stop curve, plus a vertical line at the smallest early-stopped forest size
ax1.plot(brf_es_mean_accs, color='tab:red', label="Early Stop")
ax1.axvline(x=brf_es_min_tree_cnt, color='tab:red', label="Early Stop line")

ax1.legend()
plt.show()

In [17]:
""" Mean performance without weight updates without early stop """
# Build the configuration on a private copy of the defaults. Assigning
# brf_params directly would alias it, so switching weight_update off here would
# also switch it off in the shared defaults and in every other aliased dict.
brf_wout_updates_mean_params = {
    **brf_params,
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Disable early stopping
    'early_stop': False,
    # Without weight update
    'weight_update': False,
    'boosting': True,
}

In [18]:
# Per-trial records: accuracy history, number of trees, and fit duration
brf_wout_updates_accs, brf_wout_updates_tree_cnts, brf_wout_updates_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)

    # A fresh classifier per trial; only fit() is timed
    brf_wout_updates_mean_clf = BoostedRandomForest(**brf_wout_updates_mean_params)
    fit_began = time()
    brf_wout_updates_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_began

    # Store this trial's results
    brf_wout_updates_accs.append(brf_wout_updates_mean_clf.train_accs)
    brf_wout_updates_tree_cnts.append(len(brf_wout_updates_mean_clf.clfs))
    brf_wout_updates_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

Iteration:  0
Time used from start:  103.55438208580017
Iteration:  1
Time used from start:  206.36199188232422
Iteration:  2
Time used from start:  310.11452293395996
Iteration:  3
Time used from start:  411.48009490966797
Iteration:  4
Time used from start:  513.5435709953308
Iteration:  5
Time used from start:  614.5802481174469
Iteration:  6
Time used from start:  715.6993541717529
Iteration:  7
Time used from start:  817.4803278446198
Iteration:  8
Time used from start:  918.9615759849548
Iteration:  9
Time used from start:  1020.5486431121826
Iteration:  10
Time used from start:  1121.1737480163574
Iteration:  11
Time used from start:  1225.4046170711517
Iteration:  12
Time used from start:  1342.1464121341705
Iteration:  13
Time used from start:  1450.8076400756836
Iteration:  14
Time used from start:  1560.9579169750214
Iteration:  15
Time used from start:  1662.5376110076904
Iteration:  16
Time used from start:  1763.1003708839417
Iteration:  17
Time used from start:  1864.764

In [22]:
# Aggregate the trial records: smallest forest, mean forest size, mean fit time
brf_wout_updates_min_tree_cnt = min(brf_wout_updates_tree_cnts)
brf_wout_updates_mean_tree_cnt = np.mean(brf_wout_updates_tree_cnts)
brf_wout_updates_mean_time = np.mean(brf_wout_updates_times)

print("Min #Tree: ", brf_wout_updates_min_tree_cnt)
print("Mean #Tree: ", brf_wout_updates_mean_tree_cnt)
print("Mean time used: ", brf_wout_updates_mean_time)

# Cut each accuracy history to the smallest forest's length so that all
# histories line up and can be averaged position by position.
trimmed_accs = [history[:brf_wout_updates_min_tree_cnt] for history in brf_wout_updates_accs]
brf_wout_updates_mean_accs = np.mean(trimmed_accs, axis=0)

# Best mean accuracy, followed by the full averaged curve
print("Max acc: ", max(brf_wout_updates_mean_accs))
print(brf_wout_updates_mean_accs)

Min #Tree:  250
Mean #Tree:  250.0
Mean time used:  103.25054233074188
Max acc:  0.9631449275362318
[0.87313043 0.89378261 0.9182029  0.92895652 0.93804348 0.94224638
 0.94708696 0.94837681 0.94985507 0.95123188 0.95236232 0.95313043
 0.95405797 0.95456522 0.95534783 0.95578261 0.95647826 0.95657971
 0.9574058  0.95695652 0.95718841 0.9574058  0.95757971 0.95842029
 0.9585942  0.95865217 0.95898551 0.95897101 0.95913043 0.95921739
 0.95934783 0.95950725 0.9595942  0.95966667 0.95978261 0.96008696
 0.96026087 0.96021739 0.96036232 0.96021739 0.96055072 0.96056522
 0.96072464 0.96082609 0.96107246 0.96126087 0.96163768 0.9617971
 0.96156522 0.96153623 0.96130435 0.9614058  0.96137681 0.96127536
 0.96136232 0.96156522 0.96149275 0.96184058 0.96189855 0.96185507
 0.96186957 0.96201449 0.96194203 0.96201449 0.96184058 0.96198551
 0.96172464 0.96188406 0.96201449 0.96176812 0.96168116 0.96166667
 0.96195652 0.96214493 0.96207246 0.96205797 0.96205797 0.96211594
 0.96204348 0.96223188 0.96228

In [23]:
""" Mean performane with RF """
# Build the configuration on a private copy of the defaults. Assigning
# brf_params directly would alias it, so turning boosting and weight updates
# off here would also turn them off in the shared defaults and in every other
# aliased parameter dict.
brf_rf_mean_params = {
    **brf_params,
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Disable early stopping
    'early_stop': False,
    # RF
    'weight_update': False,
    'boosting': False,
}

In [24]:
# Per-trial records: accuracy history, number of trees, and fit duration
brf_rf_accs, brf_rf_tree_cnts, brf_rf_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)

    # A fresh classifier per trial; only fit() is timed
    brf_rf_mean_clf = BoostedRandomForest(**brf_rf_mean_params)
    fit_began = time()
    brf_rf_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_began

    # Store this trial's results
    brf_rf_accs.append(brf_rf_mean_clf.train_accs)
    brf_rf_tree_cnts.append(len(brf_rf_mean_clf.clfs))
    brf_rf_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

Iteration:  0
Time used from start:  104.58639621734619
Iteration:  1
Time used from start:  213.47217631340027
Iteration:  2
Time used from start:  319.7023711204529
Iteration:  3
Time used from start:  422.6237721443176
Iteration:  4
Time used from start:  525.2742431163788
Iteration:  5
Time used from start:  627.7744829654694
Iteration:  6
Time used from start:  732.9789321422577
Iteration:  7
Time used from start:  837.5700690746307
Iteration:  8
Time used from start:  948.9854891300201
Iteration:  9
Time used from start:  1060.824354171753
Iteration:  10
Time used from start:  1163.0245592594147
Iteration:  11
Time used from start:  1264.7902400493622
Iteration:  12
Time used from start:  1366.9250831604004
Iteration:  13
Time used from start:  1469.8918342590332
Iteration:  14
Time used from start:  1572.4220700263977
Iteration:  15
Time used from start:  1679.1669890880585
Iteration:  16
Time used from start:  1799.1158320903778
Iteration:  17
Time used from start:  1914.205586

In [26]:
# Aggregate the trial records: smallest forest, mean forest size, mean fit time
brf_rf_min_tree_cnt = min(brf_rf_tree_cnts)
brf_rf_mean_tree_cnt = np.mean(brf_rf_tree_cnts)
brf_rf_mean_time = np.mean(brf_rf_times)

print("Min #Tree: ", brf_rf_min_tree_cnt)
print("Mean #Tree: ", brf_rf_mean_tree_cnt)
print("Mean time used: ", brf_rf_mean_time)

# Cut each accuracy history to the smallest forest's length so that all
# histories line up and can be averaged position by position.
trimmed_accs = [history[:brf_rf_min_tree_cnt] for history in brf_rf_accs]
brf_rf_mean_accs = np.mean(trimmed_accs, axis=0)

# Best mean accuracy, followed by the full averaged curve
print("Max acc: ", max(brf_rf_mean_accs))
print(brf_rf_mean_accs)

Min #Tree:  250
Mean #Tree:  250.0
Mean time used:  106.52943848371505
Max acc:  0.9626231884057969
[0.87492754 0.89104348 0.9173913  0.92457971 0.93628986 0.93915942
 0.94426087 0.94508696 0.94853623 0.94946377 0.95104348 0.95142029
 0.95295652 0.95343478 0.954      0.95498551 0.95502899 0.95544928
 0.95623188 0.95623188 0.9563913  0.9567971  0.95705797 0.95708696
 0.95757971 0.95762319 0.95842029 0.95781159 0.95814493 0.95817391
 0.95814493 0.95824638 0.95795652 0.95876812 0.95884058 0.95894203
 0.95913043 0.95911594 0.95908696 0.95928986 0.9595942  0.95931884
 0.95895652 0.95942029 0.95947826 0.95972464 0.95988406 0.95975362
 0.95973913 0.95982609 0.95982609 0.95995652 0.96008696 0.96002899
 0.96037681 0.96043478 0.9605942  0.9606087  0.96034783 0.96072464
 0.96068116 0.96075362 0.9607971  0.96072464 0.961      0.96081159
 0.96072464 0.96082609 0.96091304 0.96095652 0.96101449 0.96123188
 0.96126087 0.96127536 0.96128986 0.96114493 0.96118841 0.96134783
 0.9614058  0.96134783 0.9610