In [1]:
import pandas as pd
import numpy as np

In [2]:
""" Read data to be used """
# The example dataset is about spam detection
data = pd.read_csv('spambase.csv', sep=",")
data.head()

""" Some preprocessing on data """
# Column count of the raw table (feature columns plus the trailing label column)
m = len(data.columns)
# Features: keep only the first 48 columns
X = data.iloc[:, :48]
# Labels: slice from the last column onward, so y stays a one-column DataFrame
y = data.iloc[:, m - 1:]

# Run the features through pandas' dummy (one-hot) encoding
X_onehot = pd.get_dummies(X)

In [3]:
from sklearn.model_selection import train_test_split
""" Splitting training and testing data """
# Hold out 25% of the rows as the test set; random_state is pinned so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.25, random_state=33)

In [4]:
""" Import BRF classifiers """
from BoostedRandomForest import BoostedRandomForest
from sklearn.ensemble import RandomForestClassifier
"""brf = BoostedRandomForest()
rf = BoostedRandomForest(weight_update=False, boosting=False)
brf_wout_update = BoostedRandomForest(weight_update=False)"""

'brf = BoostedRandomForest()\nrf = BoostedRandomForest(weight_update=False, boosting=False)\nbrf_wout_update = BoostedRandomForest(weight_update=False)'

In [5]:
""" Function for evaluating a classifier"""
from sklearn.metrics import accuracy_score
from time import time

def eval_clf(method, CLF, params, X_train, X_test, y_train, y_test, iterations) :
    """Repeatedly train a fresh classifier and record test accuracy and fit time.

    Parameters
    ----------
    method : str
        Selects the prediction call on the trained classifier:
        "Ensemble" -> ``clf.ensemble_predict``, "RF" -> ``clf.RF_predict``,
        any other value -> ``clf.predict``.
    CLF : callable
        Classifier class (or factory); a new instance is built as
        ``CLF(**params)`` in every round.
    params : dict
        Keyword arguments forwarded to ``CLF``.
    X_train, y_train
        Training data passed to ``clf.fit``.
    X_test, y_test
        Test inputs passed to the prediction call and the reference labels
        passed to ``accuracy_score``.
    iterations : int
        Number of train/evaluate rounds.

    Returns
    -------
    accs : list
        One ``accuracy_score`` value per round.
    times : list of float
        Seconds spent inside ``clf.fit`` per round (prediction is not timed).
    """
    # perf_counter is a monotonic, high-resolution clock, so measured intervals
    # cannot be distorted (or turn negative) when the system clock is adjusted.
    from time import perf_counter

    # Accuracy of each round
    accs = []
    # Fit duration of each round
    times = []

    for _ in range(iterations) :
        # Start every round from an untrained classifier
        clf = CLF(**params)

        # Time the training call only
        start = perf_counter()
        clf.fit(X_train, y_train)
        times.append(perf_counter() - start)

        # Pick the prediction entry point requested by the caller
        if method == "Ensemble" :
            pred = clf.ensemble_predict(X_test)
        elif method == "RF" :
            pred = clf.RF_predict(X_test)
        else :
            pred = clf.predict(X_test)

        # Score the predictions against the reference labels
        accs.append(accuracy_score(y_test, pred))

    # Return results and times used
    return accs, times

In [6]:
# Number of times each configuration is retrained; accuracy and fit time are averaged over these runs
iterations = 10
# Inclusive upper bound of the tree-count sweep (the sweeps below step from 5 in increments of 5)
max_tree = 150
# Inclusive upper bound of the max-depth sweep (the sweeps below step from 5 in increments of 5)
max_depth = 50

In [44]:
""" Boosted Random Forest """
# Fixed switches for this variant; 'T' and 'depth_max' are filled in per grid point
brf_params = {'weight_update': True, 'boosting': True}

# One row per (T, D) combination: [label, trees, depth, mean accuracy, mean fit time]
brf_results = []
start = time()
for T in range(5, max_tree+1, 5) :
    for D in range(5, max_depth+1, 5) :
        print('Max number of trees: {}, Max depth: {}'.format(T, D))
        # Apply the current grid point (tree budget first, then depth cap)
        brf_params.update(T=T, depth_max=D)
        # Repeat training/evaluation `iterations` times with these settings
        brf_accs, brf_times = eval_clf(
            "Ensemble", BoostedRandomForest, brf_params,
            X_train, X_test, y_train, y_test, iterations,
        )
        # Average over the repeated runs
        mean_acc, mean_time = np.mean(brf_accs), np.mean(brf_times)
        print("Mean acc: {}; Mean time: {}".format(mean_acc, mean_time))
        # Keep the summary row for the CSV export
        brf_results.append(['brf', T, D, mean_acc, mean_time])

        # Cumulative wall-clock time since the sweep started
        end = time()
        print("Total time spent for evaluation: ", end-start)


Max number of trees: 5, Max depth: 5
Mean acc: 0.90503909643788; Mean time: 0.1668393135070801
Total time spent for evaluation:  1.7423269748687744
Max number of trees: 5, Max depth: 10
Mean acc: 0.9067767158992179; Mean time: 0.1332714557647705
Total time spent for evaluation:  3.129426956176758
Max number of trees: 5, Max depth: 15
Mean acc: 0.9062554300608167; Mean time: 0.1437476634979248
Total time spent for evaluation:  4.628628969192505
Max number of trees: 5, Max depth: 20
Mean acc: 0.9006081668114684; Mean time: 0.17016661167144775
Total time spent for evaluation:  6.404349088668823
Max number of trees: 5, Max depth: 25
Mean acc: 0.903475238922676; Mean time: 0.1411504030227661
Total time spent for evaluation:  7.874499082565308
Max number of trees: 5, Max depth: 30
Mean acc: 0.9041702867072111; Mean time: 0.13598909378051757
Total time spent for evaluation:  9.289360046386719
Max number of trees: 5, Max depth: 35
Mean acc: 0.8968722849695917; Mean time: 0.13890743255615234
To

Mean acc: 0.9266724587315378; Mean time: 1.364100694656372
Total time spent for evaluation:  354.23609375953674
Max number of trees: 30, Max depth: 30
Mean acc: 0.9245004344048654; Mean time: 1.338101291656494
Total time spent for evaluation:  367.8916358947754
Max number of trees: 30, Max depth: 35
Mean acc: 0.9234578627280626; Mean time: 1.308420467376709
Total time spent for evaluation:  381.2423930168152
Max number of trees: 30, Max depth: 40
Mean acc: 0.919026933101651; Mean time: 1.2888355493545531
Total time spent for evaluation:  394.394082069397
Max number of trees: 30, Max depth: 45
Mean acc: 0.9217202432667246; Mean time: 1.3226890087127685
Total time spent for evaluation:  407.8956849575043
Max number of trees: 30, Max depth: 50
Mean acc: 0.9230234578627281; Mean time: 1.3178941249847411
Total time spent for evaluation:  421.3419818878174
Max number of trees: 35, Max depth: 5
Mean acc: 0.929800173761946; Mean time: 1.5077481269836426
Total time spent for evaluation:  436.74

Mean acc: 0.9192006950477845; Mean time: 3.1935617446899416
Total time spent for evaluation:  1675.4806818962097
Max number of trees: 60, Max depth: 5
Mean acc: 0.9342311033883579; Mean time: 4.022613286972046
Total time spent for evaluation:  1716.2968490123749
Max number of trees: 60, Max depth: 10
Mean acc: 0.9372719374456994; Mean time: 3.8017836570739747
Total time spent for evaluation:  1754.8719069957733
Max number of trees: 60, Max depth: 15
Mean acc: 0.9223284100781928; Mean time: 3.6806190490722654
Total time spent for evaluation:  1792.201476097107
Max number of trees: 60, Max depth: 20
Mean acc: 0.9275412684622069; Mean time: 3.6171693086624144
Total time spent for evaluation:  1828.8883039951324
Max number of trees: 60, Max depth: 25
Mean acc: 0.9205039096437879; Mean time: 3.5156487703323362
Total time spent for evaluation:  1864.5505430698395
Max number of trees: 60, Max depth: 30
Mean acc: 0.9143353605560381; Mean time: 3.393461298942566
Total time spent for evaluation:

Mean acc: 0.9155516941789749; Mean time: 5.961328983306885
Total time spent for evaluation:  6088.507203102112
Max number of trees: 85, Max depth: 30
Mean acc: 0.9172024326672459; Mean time: 6.303952097892761
Total time spent for evaluation:  6152.363680839539
Max number of trees: 85, Max depth: 35
Mean acc: 0.9087749782797566; Mean time: 6.185230827331543
Total time spent for evaluation:  6215.015074968338
Max number of trees: 85, Max depth: 40
Mean acc: 0.9013900955690703; Mean time: 5.889645767211914
Total time spent for evaluation:  6274.6492829322815
Max number of trees: 85, Max depth: 45
Mean acc: 0.8977410947002606; Mean time: 5.858444738388061
Total time spent for evaluation:  6333.966547966003
Max number of trees: 85, Max depth: 50
Mean acc: 0.9198957428323198; Mean time: 5.959149265289307
Total time spent for evaluation:  6394.278750896454
Max number of trees: 90, Max depth: 5
Mean acc: 0.9285838401390094; Mean time: 7.458756828308106
Total time spent for evaluation:  6469.75

Mean acc: 0.8921807124239793; Mean time: 9.477196288108825
Total time spent for evaluation:  10769.569358110428
Max number of trees: 115, Max depth: 5
Mean acc: 0.9318853171155517; Mean time: 15.436162543296813
Total time spent for evaluation:  10925.413156032562
Max number of trees: 115, Max depth: 10
Mean acc: 0.9318853171155516; Mean time: 15.394274926185608
Total time spent for evaluation:  11080.92603302002
Max number of trees: 115, Max depth: 15
Mean acc: 0.9173761946133796; Mean time: 13.806134724617005
Total time spent for evaluation:  11220.399357795715
Max number of trees: 115, Max depth: 20
Mean acc: 0.9198957428323196; Mean time: 12.192810654640198
Total time spent for evaluation:  11343.670815944672
Max number of trees: 115, Max depth: 25
Mean acc: 0.9087749782797567; Mean time: 11.890997052192688
Total time spent for evaluation:  11463.887438058853
Max number of trees: 115, Max depth: 30
Mean acc: 0.8981754995655953; Mean time: 12.060869884490966
Total time spent for eval

Mean acc: 0.9208514335360556; Mean time: 15.853023552894593
Total time spent for evaluation:  18801.081134080887
Max number of trees: 140, Max depth: 25
Mean acc: 0.9011294526498699; Mean time: 15.886356520652772
Total time spent for evaluation:  18961.376294136047
Max number of trees: 140, Max depth: 30
Mean acc: 0.8981754995655951; Mean time: 15.134086871147156
Total time spent for evaluation:  19114.065727949142
Max number of trees: 140, Max depth: 35
Mean acc: 0.8711555169417897; Mean time: 14.518351769447326
Total time spent for evaluation:  19260.557109832764
Max number of trees: 140, Max depth: 40
Mean acc: 0.8767158992180712; Mean time: 15.413547015190124
Total time spent for evaluation:  19416.091998815536
Max number of trees: 140, Max depth: 45
Mean acc: 0.8900955690703736; Mean time: 14.767343449592591
Total time spent for evaluation:  19565.125426769257
Max number of trees: 140, Max depth: 50
Mean acc: 0.9039096437880104; Mean time: 15.835038256645202
Total time spent for e

<function time.time>

In [48]:
# Output results to csv file
# Each row of brf_results is [method label, #trees, max depth, mean accuracy, mean fit time];
# the header list below names those five columns in that order.
brf_df = pd.DataFrame(brf_results)
brf_df.to_csv("brf_results.csv", header=['Method', '#Tree', 'Max_Depth', 'Mean_Acc', 'Mean_Time'], index=False) 

In [27]:
# Evaluate training accuracy on the fly
# Single fit with depth_max=20 and T=250, with both weight updates and boosting enabled.
# NOTE(review): the two sibling variants in this notebook use T=200 — confirm 250 is intended here.
brf_clf = BoostedRandomForest(depth_max=20, T=250, weight_update=True, boosting=True)
brf_clf.fit(X_train, y_train)

In [28]:
# Read the train_accs attribute of the fitted classifier
# (presumably training accuracies recorded during fit — confirm in BoostedRandomForest)
brf_train_accs = brf_clf.train_accs

In [29]:
# Number of recorded entries.
# NOTE(review): the saved output shows 195 although T=250 was requested — presumably fit stopped early; confirm.
print(len(brf_train_accs))

195


In [14]:
# Second name bound to the same fitted classifier object (plain assignment, no copy is made)
brf1 = brf_clf

In [55]:
# Output results to csv file
# The recorded values are written as a single column named 'Acc', without the index.
brf_train_accs_df = pd.DataFrame(brf_train_accs)
brf_train_accs_df.to_csv("brf_train_accs.csv", header=['Acc'], index=False) 

In [None]:
""" sklearn random forest """
# eval_clf returns exactly two lists (accuracies, fit times), so unpack two names;
# a third target would fail with "not enough values to unpack".
# The empty method string makes eval_clf fall through to the plain clf.predict call.
rf_accs, rf_times = eval_clf("", RandomForestClassifier, {}, X_train, X_test, y_train, y_test, iterations)

In [11]:
""" Boosted Random Forest without weights update """
# Fixed switches for this variant; 'T' and 'depth_max' are filled in per grid point
brf_wout_update_params = {'weight_update': False, 'boosting': True}

# One row per (T, D) combination: [label, trees, depth, mean accuracy, mean fit time]
brf_wout_update_results = []
start = time()
for T in range(5, max_tree+1, 5) :
    for D in range(5, max_depth+1, 5) :
        print('Max number of trees: {}, Max depth: {}'.format(T, D))
        # Apply the current grid point (tree budget first, then depth cap)
        brf_wout_update_params.update(T=T, depth_max=D)
        # Repeat training/evaluation `iterations` times with these settings
        brf_wout_update_accs, brf_wout_update_times = eval_clf(
            "Ensemble", BoostedRandomForest, brf_wout_update_params,
            X_train, X_test, y_train, y_test, iterations,
        )
        # Average over the repeated runs
        mean_acc, mean_time = np.mean(brf_wout_update_accs), np.mean(brf_wout_update_times)
        print("Mean acc: {}; Mean time: {}".format(mean_acc, mean_time))
        # Keep the summary row
        brf_wout_update_results.append(['brf_wout_update', T, D, mean_acc, mean_time])

        # Cumulative wall-clock time since the sweep started
        end = time()
        print("Total time spent for evaluation: ", end-start)

Iteraiton:  0
Iteraiton:  1
Iteraiton:  2
Iteraiton:  3
Iteraiton:  4
Iteraiton:  5
Iteraiton:  6
Iteraiton:  7
Iteraiton:  8
Iteraiton:  9
Iteraiton:  10
Iteraiton:  11
Iteraiton:  12
Iteraiton:  13
Iteraiton:  14
Iteraiton:  15
Iteraiton:  16
Iteraiton:  17
Iteraiton:  18
Iteraiton:  19


In [59]:
# Evaluate training accuracy on the fly
# Single fit with depth_max=20 and T=200; boosting stays on while weight updates are switched off.
brf_wout_update_clf = BoostedRandomForest(depth_max=20, T=200, weight_update=False, boosting=True)
brf_wout_update_clf.fit(X_train, y_train)

In [62]:
# Read the train_accs attribute of the fitted classifier
# (presumably training accuracies recorded during fit — confirm in BoostedRandomForest)
brf_wout_update_train_accs = brf_wout_update_clf.train_accs 

In [63]:
# Output results to csv file
# The recorded values are written as a single column named 'Acc', without the index.
brf_wout_update_train_accs_df = pd.DataFrame(brf_wout_update_train_accs)
brf_wout_update_train_accs_df.to_csv("brf_wout_update_train_accs.csv", header=['Acc'], index=False)

In [13]:
""" Random Forest """
# Fixed switches for this variant; 'T' and 'depth_max' are filled in per grid point
brf_rf_params = {'weight_update': False, 'boosting': False}

# One row per (T, D) combination: [label, trees, depth, mean accuracy, mean fit time]
brf_rf_results = []
start = time()
for T in range(5, max_tree+1, 5) :
    for D in range(5, max_depth+1, 5) :
        print('Max number of trees: {}, Max depth: {}'.format(T, D))
        # Apply the current grid point (tree budget first, then depth cap)
        brf_rf_params.update(T=T, depth_max=D)
        # Repeat training/evaluation `iterations` times, predicting through RF_predict
        brf_rf_accs, brf_rf_times = eval_clf(
            "RF", BoostedRandomForest, brf_rf_params,
            X_train, X_test, y_train, y_test, iterations,
        )
        # Average over the repeated runs
        mean_acc, mean_time = np.mean(brf_rf_accs), np.mean(brf_rf_times)
        print("Mean acc: {}; Mean time: {}".format(mean_acc, mean_time))
        # Keep the summary row
        brf_rf_results.append(['brf_rf', T, D, mean_acc, mean_time])

        # Cumulative wall-clock time since the sweep started
        end = time()
        print("Total time spent for evaluation: ", end-start)



Iteraiton:  0
Iteraiton:  1
Iteraiton:  2
Iteraiton:  3
Iteraiton:  4
Iteraiton:  5
Iteraiton:  6
Iteraiton:  7
Iteraiton:  8
Iteraiton:  9
Iteraiton:  10
Iteraiton:  11
Iteraiton:  12
Iteraiton:  13
Iteraiton:  14
Iteraiton:  15
Iteraiton:  16
Iteraiton:  17
Iteraiton:  18
Iteraiton:  19


In [53]:
# Evaluate training accuracy on the fly
# Single fit with depth_max=20 and T=200, with both weight updates and boosting switched off.
brf_rf_clf = BoostedRandomForest(depth_max=20, T=200, weight_update=False, boosting=False)
brf_rf_clf.fit(X_train, y_train)

In [60]:
# Read the train_accs attribute of the fitted classifier
# (presumably training accuracies recorded during fit — confirm in BoostedRandomForest)
brf_rf_train_accs = brf_rf_clf.train_accs

In [61]:
# Output results to csv file
# The recorded values are written as a single column named 'Acc', without the index.
brf_rf_train_accs_df = pd.DataFrame(brf_rf_train_accs)
brf_rf_train_accs_df.to_csv("brf_rf_train_accs.csv", header=['Acc'], index=False)