In [1]:
""" The following cells are required to be executed first, before running the experiments. """

In [None]:
""" REQUIRED: Import required packages """
import os
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [None]:
""" REQUIRED: Read and Prepare data"""
""" Read data to be used """
# The example dataset is a spam dataset stored in spambase.csv
data = pd.read_csv('spambase.csv', sep=",")
# NOTE: this is not the cell's last expression, so the preview is not rendered
data.head()

""" Some preprocessing on data """
# Total number of columns in the raw table
m = len(data.columns)
# Features: keep only the first 48 columns
X = data.iloc[:, :48]
# Target: the last column, sliced as a range so it stays a one-column DataFrame
y = data.iloc[:, m - 1:]

# One-hot encode the feature table
X_onehot = pd.get_dummies(X)

In [None]:
""" REQUIRED: Splitting training and testing data """
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_onehot,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
# Report the share of spam labels in each split.
# The labels are converted to a plain array first so the mean is one scalar
# (y_* are one-column DataFrames), and the fraction is scaled by 100 so the
# printed value matches the "%" in the label.
# Assumes the label column is coded 0/1 -- TODO confirm against the dataset.
print("% of spams (train): ", 100 * np.mean(np.asarray(y_train)))
print("% of spams (test): ", 100 * np.mean(np.asarray(y_test)))

In [None]:
""" REQUIRED: Import BRF Classifier Module """
from BoostedRandomForest import BoostedRandomForest
# Example usage
#brf = BoostedRandomForest()
#rf = BoostedRandomForest(weight_update=False, boosting=False)
#brf_wout_update = BoostedRandomForest(weight_update=False)

In [None]:
""" REQURED: Default parameters """
# Baseline keyword arguments passed to BoostedRandomForest(**params).
# These are sample values only; adjust them to the situation at hand.
brf_params = dict(
    T=250,
    depth_max=20,
    weight_update=True,
    boosting=True,
)


In [None]:
"""
Each of the following sections, delimited by comments enclosed in three quotation marks (\"\"\"),
represents a test case, which is described in the comment itself.
Please execute the cells of the desired section in order.
""" 

In [None]:
""" Mean performance without early stop """
# Build this section's configuration as its own dict. Binding the name
# straight to brf_params would make it the very same object, so any key
# written to brf_params elsewhere in the session (e.g. weight_update or
# boosting set to False) would silently carry over when this cell is re-run.
brf_ns_mean_params = dict(brf_params)
brf_ns_mean_params.update({
    # Full boosted random forest
    'weight_update': True,
    'boosting': True,
    # eps thresholds, same values as in the other sections; the Eps-vs-Alpha
    # plot reads eps_ub / eps_lb from this dict.
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Disable early stopping
    'early_stop': False,
})

In [None]:
# Per-trial records: train_accs of the classifier, number of fitted trees,
# and wall-clock fit time in seconds
brf_ns_accs, brf_ns_tree_cnts, brf_ns_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)
    # A fresh classifier for every trial
    brf_ns_mean_clf = BoostedRandomForest(**brf_ns_mean_params)

    # Time the fit call only
    fit_begin = time()
    brf_ns_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_begin

    # Store this trial's results
    brf_ns_accs.append(brf_ns_mean_clf.train_accs)
    brf_ns_tree_cnts.append(len(brf_ns_mean_clf.clfs))
    brf_ns_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

In [None]:
# Aggregate the per-trial records
brf_ns_min_tree_cnt = min(brf_ns_tree_cnts)        # size of the smallest forest
brf_ns_mean_tree_cnt = np.mean(brf_ns_tree_cnts)
brf_ns_mean_time = np.mean(brf_ns_times)

print("Min #Tree: ", brf_ns_min_tree_cnt)
print("Mean #Tree: ", brf_ns_mean_tree_cnt)
print("Mean time used: ", brf_ns_mean_time)

# Cut every accuracy record down to the smallest forest size so that all
# records have equal length, then average element-wise across the trials.
trimmed_accs = [record[:brf_ns_min_tree_cnt] for record in brf_ns_accs]
brf_ns_mean_accs = np.mean(trimmed_accs, axis=0)

print(brf_ns_mean_accs)


In [None]:
# Pad the mean-accuracy curve with -1 up to T entries, so the exported column
# always has T rows. T is read from the configuration instead of repeating
# the literal 250 (T is presumably the tree budget -- confirm against
# BoostedRandomForest).
n_total_trees = brf_ns_mean_params['T']
# Guard against a negative count: np.full rejects negative sizes
n_missing = max(n_total_trees - brf_ns_min_tree_cnt, 0)
padded_brf_ns_mean_accs = np.hstack([brf_ns_mean_accs, np.full(n_missing, -1.0)])

# Output to csv; make sure the target folder exists before writing into it
os.makedirs("Results", exist_ok=True)
brf_ns_mean_accs_df = pd.DataFrame(padded_brf_ns_mean_accs)
brf_ns_mean_accs_df.to_csv("Results/brf_ns_mean_accs.csv", index=False)

In [None]:
""" Mean performance with early stop """
# NOTE: this name is bound to the brf_params dict itself (no copy), so every
# key written below is also visible through brf_params.
brf_es_mean_params = brf_params
# Set the model flags explicitly so this configuration does not depend on
# whatever values the shared dict currently holds (it may have been modified
# elsewhere in the session before this cell is run).
brf_es_mean_params['weight_update'] = True
brf_es_mean_params['boosting'] = True
# eps bounds and exceed limit -- presumably the early-stop criteria;
# confirm against BoostedRandomForest
brf_es_mean_params['eps_ub'] = 0.5
brf_es_mean_params['eps_lb'] = 1e-20
brf_es_mean_params['eps_exceed_limit'] = 5
# Enable early stopping
brf_es_mean_params['early_stop'] = True

In [None]:
# Per-trial records: train_accs of the classifier, number of fitted trees,
# and wall-clock fit time in seconds
brf_es_accs, brf_es_tree_cnts, brf_es_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)
    # A fresh classifier for every trial
    brf_es_mean_clf = BoostedRandomForest(**brf_es_mean_params)

    # Time the fit call only
    fit_begin = time()
    brf_es_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_begin

    # Store this trial's results
    brf_es_accs.append(brf_es_mean_clf.train_accs)
    brf_es_tree_cnts.append(len(brf_es_mean_clf.clfs))
    brf_es_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

In [None]:
# Aggregate the per-trial records
brf_es_min_tree_cnt = min(brf_es_tree_cnts)        # size of the smallest forest
brf_es_mean_tree_cnt = np.mean(brf_es_tree_cnts)
brf_es_mean_time = np.mean(brf_es_times)

print("Min #Tree: ", brf_es_min_tree_cnt)
print("Mean #Tree: ", brf_es_mean_tree_cnt)
print("Mean time used: ", brf_es_mean_time)

# Cut every accuracy record down to the smallest forest size so that all
# records have equal length, then average element-wise across the trials.
trimmed_accs = [record[:brf_es_min_tree_cnt] for record in brf_es_accs]
brf_es_mean_accs = np.mean(trimmed_accs, axis=0)

print(brf_es_mean_accs)



In [None]:
# Pad the mean-accuracy curve with -1 up to T entries, so the exported column
# always has T rows. T is read from the configuration instead of repeating
# the literal 250 (T is presumably the tree budget -- confirm against
# BoostedRandomForest).
n_total_trees = brf_es_mean_params['T']
# Guard against a negative count: np.full rejects negative sizes
n_missing = max(n_total_trees - brf_es_min_tree_cnt, 0)
padded_brf_es_mean_accs = np.hstack([brf_es_mean_accs, np.full(n_missing, -1.0)])

# Output to csv; make sure the target folder exists before writing into it
os.makedirs("Results", exist_ok=True)
brf_es_mean_accs_df = pd.DataFrame(padded_brf_es_mean_accs)
brf_es_mean_accs_df.to_csv("Results/brf_es_mean_accs.csv", index=False)

In [None]:
""" Mean performance without weight updates without early stop """
# Work on a private copy so the flags set below stay local to this section
# instead of being written into the shared brf_params dict.
brf_wout_update_mean_params = dict(brf_params)
brf_wout_update_mean_params.update({
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Disable early stopping
    'early_stop': False,
    # Boosting kept on, but without weight update
    'weight_update': False,
    'boosting': True,
})

In [None]:
# Per-trial records: train_accs of the classifier, number of fitted trees,
# and wall-clock fit time in seconds
brf_wout_update_accs, brf_wout_update_tree_cnts, brf_wout_update_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)
    # A fresh classifier for every trial
    brf_wout_update_mean_clf = BoostedRandomForest(**brf_wout_update_mean_params)

    # Time the fit call only
    fit_begin = time()
    brf_wout_update_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_begin

    # Store this trial's results
    brf_wout_update_accs.append(brf_wout_update_mean_clf.train_accs)
    brf_wout_update_tree_cnts.append(len(brf_wout_update_mean_clf.clfs))
    brf_wout_update_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

In [None]:
# Aggregate the per-trial records
brf_wout_update_min_tree_cnt = min(brf_wout_update_tree_cnts)   # smallest forest
brf_wout_update_mean_tree_cnt = np.mean(brf_wout_update_tree_cnts)
brf_wout_update_mean_time = np.mean(brf_wout_update_times)

print("Min #Tree: ", brf_wout_update_min_tree_cnt)
print("Mean #Tree: ", brf_wout_update_mean_tree_cnt)
print("Mean time used: ", brf_wout_update_mean_time)

# Cut every accuracy record down to the smallest forest size so that all
# records have equal length, then average element-wise across the trials.
trimmed_accs = [
    record[:brf_wout_update_min_tree_cnt] for record in brf_wout_update_accs
]
brf_wout_update_mean_accs = np.mean(trimmed_accs, axis=0)

# Best value of the averaged curve, followed by the whole curve
print("Max acc: ", max(brf_wout_update_mean_accs))
print(brf_wout_update_mean_accs)

In [None]:
# Output to csv; make sure the target folder exists before writing into it
os.makedirs("Results", exist_ok=True)
brf_wout_update_mean_accs_df = pd.DataFrame(brf_wout_update_mean_accs)
brf_wout_update_mean_accs_df.to_csv("Results/brf_wout_update_mean_accs.csv", index=False)

In [None]:
""" Mean performane with RF """
# Work on a private copy so the flags set below stay local to this section
# instead of being written into the shared brf_params dict.
brf_rf_mean_params = dict(brf_params)
brf_rf_mean_params.update({
    'eps_ub': 0.5,
    'eps_lb': 1e-20,
    'eps_exceed_limit': 5,
    # Disable early stopping
    'early_stop': False,
    # RF: neither weight update nor boosting
    'weight_update': False,
    'boosting': False,
})

In [None]:
# Per-trial records: train_accs of the classifier, number of fitted trees,
# and wall-clock fit time in seconds
brf_rf_accs, brf_rf_tree_cnts, brf_rf_times = [], [], []

loop_start = time()
# Repeat the experiment 20 times so the results can be averaged
for trial in range(20):
    print("Iteration: ", trial)
    # A fresh classifier for every trial
    brf_rf_mean_clf = BoostedRandomForest(**brf_rf_mean_params)

    # Time the fit call only
    fit_begin = time()
    brf_rf_mean_clf.fit(X_train, y_train)
    fit_seconds = time() - fit_begin

    # Store this trial's results
    brf_rf_accs.append(brf_rf_mean_clf.train_accs)
    brf_rf_tree_cnts.append(len(brf_rf_mean_clf.clfs))
    brf_rf_times.append(fit_seconds)

    print("Time used from start: ", time() - loop_start)
    

In [None]:
# Aggregate the per-trial records
brf_rf_min_tree_cnt = min(brf_rf_tree_cnts)        # size of the smallest forest
brf_rf_mean_tree_cnt = np.mean(brf_rf_tree_cnts)
brf_rf_mean_time = np.mean(brf_rf_times)

print("Min #Tree: ", brf_rf_min_tree_cnt)
print("Mean #Tree: ", brf_rf_mean_tree_cnt)
print("Mean time used: ", brf_rf_mean_time)

# Cut every accuracy record down to the smallest forest size so that all
# records have equal length, then average element-wise across the trials.
trimmed_accs = [record[:brf_rf_min_tree_cnt] for record in brf_rf_accs]
brf_rf_mean_accs = np.mean(trimmed_accs, axis=0)

# Best value of the averaged curve, followed by the whole curve
print("Max acc: ", max(brf_rf_mean_accs))
print(brf_rf_mean_accs)



In [None]:
# Output to csv; make sure the target folder exists before writing into it
os.makedirs("Results", exist_ok=True)
brf_rf_mean_accs_df = pd.DataFrame(brf_rf_mean_accs)
brf_rf_mean_accs_df.to_csv("Results/brf_rf_mean_accs.csv", index=False)

In [None]:
""" Plot: With vs W/Out Early Stopping  """
# One figure, one axes: mean accuracy curves of the two BRF variants
fig1, ax1 = plt.subplots()
fig1.suptitle("Boosted Random Forest")
ax1.set_title("With vs W/out Early Stops")
ax1.set(xlabel='Trees across Training', ylabel='Acc')
ax1.tick_params(axis='y')

# Curve of the run without early stopping
ax1.plot(brf_ns_mean_accs, color='tab:blue', label="Non-stop")

# Curve of the run with early stopping, plus a vertical marker at the
# smallest tree count observed over the early-stop trials
ax1.plot(brf_es_mean_accs, color='tab:red', label="Early Stop")
ax1.axvline(x=brf_es_min_tree_cnt, color='tab:red', label="Early Stop line")

ax1.grid()
ax1.legend()
plt.show()

In [None]:
""" Plot: BRF vs BRF w/out update vs RF  """
# Only the first num_Tree entries of every mean-accuracy curve are drawn
num_Tree = 150

fig2, ax1 = plt.subplots()
plt.suptitle("Prediction Accuracy")
plt.title("BRF vs BRF w/out update vs RF")
ax1.set_xlabel('Trees across Training')
ax1.set_ylabel('Acc')
ax1.tick_params(axis='y')
# One tick every 10 trees
ax1.set_xticks(list(range(0, num_Tree+1, 10)))

# BRF
color = 'tab:blue'
ax1.plot(brf_ns_mean_accs[:num_Tree], color=color, label="BRF")
# W/out weight update
# (previously referenced `brf_wout_updates_mean_accs`, a name that is never
# defined, so this cell stopped with a NameError)
color = 'tab:orange'
ax1.plot(brf_wout_update_mean_accs[:num_Tree], color=color, label="BRF W/out Update")
# RF
color = 'tab:green'
ax1.plot(brf_rf_mean_accs[:num_Tree], color=color, label="RF")


plt.grid()
plt.legend()
plt.show()

In [None]:
""" Plot: Epsilons vs Alphas """

# The values below come from the classifier objects left over from the LAST
# trial of the respective training loops; they are not averaged over trials.
# Error rates
brf_ns_eps = brf_ns_mean_clf.all_eps
# Alpha
brf_ns_alphas = brf_ns_mean_clf.all_alphas
# Stop index
brf_es_stop_index = brf_es_mean_clf.stop_index


fig1, ax1 = plt.subplots()
# Figure-level heading plus axes-level title. Two consecutive plt.title()
# calls would leave only the second text visible.
plt.suptitle("Boosted Random Forest")
plt.title("Error rate (Eps) vs Tree weights (Alpha)")
ax1.set_xlabel('Trees across Training')

# Epsilons (left y-axis)
color = 'tab:orange'
ax1.set_ylabel('eps', color=color)
ax1.plot(brf_ns_eps, color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Second y-axis sharing the same x-axis
ax2 = ax1.twinx()
# Alphas (right y-axis)
color = 'tab:blue'
ax2.set_ylabel('alphas', color=color)
ax2.plot(brf_ns_alphas, color=color)
ax2.tick_params(axis='y', labelcolor=color)

# Epsilon limits, drawn on the eps axis
color = "tab:grey"
ax1.axhline(y=brf_ns_mean_params['eps_ub'], color=color, label="eps_ub")
ax1.axhline(y=brf_ns_mean_params['eps_lb'], color=color, label="eps_lb")

# Early stop
color = 'tab:green'
ax1.axvline(x=brf_es_stop_index, color=color, label="early_stop")

# Legend is anchored outside the axes, to the right
ax1.legend(loc='center right', bbox_to_anchor=(1.37, 0.9))
# NOTE(review): tight_layout() below recomputes the subplot parameters, so
# this adjustment probably has no lasting effect -- confirm before relying on it.
plt.subplots_adjust(right=0.4)
fig1.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()