In [1]:
import pandas as pd
import numpy as np

In [2]:
""" Read data to be used """
# A spam-related dataset serves as the example for this notebook
data = pd.read_csv('spambase.csv', sep=",")
data.head()

""" Some preprocessing on data """
# Total number of columns in the loaded table
m = len(data.columns)
# Target: only the final column, sliced so it stays a one-column frame
y = data.iloc[:, (m - 1):]
# Predictors: the leading 48 columns; everything after them is left out
X = data.iloc[:, :48]

# One-hot encode the predictors
X_onehot = pd.get_dummies(X)

In [3]:
from sklearn.model_selection import train_test_split
""" Splitting training and testing data """
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_onehot,
    y,
    random_state=33,
    test_size=0.25,
)

In [4]:
""" Import BRF classifiers """
# Project-local boosted random forest implementation
from BoostedRandomForest import BoostedRandomForest
# scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier
# NOTE: the triple-quoted text below is a bare string expression, not executable
# code, so none of the three names in it are ever bound. Being the last
# expression of the cell, its text is simply echoed back as the cell output.
"""brf = BoostedRandomForest()
rf = BoostedRandomForest(weight_update=False, boosting=False)
brf_wout_update = BoostedRandomForest(weight_update=False)"""

'brf = BoostedRandomForest()\nrf = BoostedRandomForest(weight_update=False, boosting=False)\nbrf_wout_update = BoostedRandomForest(weight_update=False)'

In [36]:
""" Function for evaluating a classifier"""
from sklearn.metrics import accuracy_score
from time import time

def eval_clf(method, CLF, params, X_train, X_test, y_train, y_test, iterations) :
    """Train and score a classifier several times, each time from scratch.

    On every round a new ``CLF(**params)`` instance is built, fitted on the
    training split and scored on the test split with ``accuracy_score``.
    Only the ``fit`` call is timed; prediction and scoring are not.

    Parameters
    ----------
    method : str
        Selects which prediction method is called on the fitted classifier:
        ``"Ensemble"`` -> ``ensemble_predict``, ``"RF"`` -> ``RF_predict``,
        any other value -> ``predict``.
    CLF : callable
        Classifier class (or factory), invoked as ``CLF(**params)``.
    params : dict
        Keyword arguments forwarded to ``CLF``.
    X_train, y_train
        Training data, passed to ``fit`` unchanged.
    X_test, y_test
        Test features used for prediction and the labels they are scored against.
    iterations : int
        Number of independent train/score rounds.

    Returns
    -------
    accs : list
        Test accuracy of each round, in order.
    times : list of float
        Seconds spent inside ``fit`` for each round, in order.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; wall-clock time() can jump if the system clock is adjusted.
    from time import perf_counter

    # The prediction method does not change between rounds, so resolve it once
    if method == "Ensemble" :
        predictor_name = "ensemble_predict"
    elif method == "RF" :
        predictor_name = "RF_predict"
    else :
        predictor_name = "predict"

    # Accuracy of each round
    accs = []
    # Training time of each round
    times = []

    for _ in range(iterations) :
        # Fresh, unfitted classifier for this round
        clf = CLF(**params)

        # Time the training step only
        start = perf_counter()
        clf.fit(X_train, y_train)
        times.append(perf_counter() - start)

        # Predict with the selected method and score against the test labels
        pred = getattr(clf, predictor_name)(X_test)
        accs.append(accuracy_score(y_test, pred))

    # Return results and times used
    return accs, times

In [40]:
# Upper bound on the number of trees per classifier
max_tree = 150
# Upper bound on the depth of each tree
max_depth = 50
# How many times a classifier is trained and scored to obtain results
iterations = 10

In [None]:
""" Boosted Random Forest """
# Options shared by every configuration in the sweep; 'T' and 'depth_max'
# are overwritten on each pass of the loops below
brf_params = {'weight_update': True, 'boosting': True}


# One row per (T, D) pair: ['brf', T, D, mean accuracy, mean training time]
brf_results = []
start = time()
for T in range(5, max_tree+1, 5) :
    for D in range(5, max_depth+1, 5) :
        print('Max number of trees: {}, Max depth: {}'.format(T, D))
        # Set max number of trees
        brf_params['T'] = T
        # Set max depth for each tree
        brf_params['depth_max'] = D
        # Train and score the classifier `iterations` times with this configuration
        brf_accs, brf_times = eval_clf("Ensemble", BoostedRandomForest, brf_params, X_train, X_test, y_train, y_test, iterations)
        # Calculate mean results
        mean_acc = np.mean(brf_accs)
        mean_time = np.mean(brf_times)
        print("Mean acc: {}; Mean time: {}".format(mean_acc, mean_time))
        # Record results
        brf_results.append(['brf', T, D, mean_acc, mean_time])

        # Report cumulative time used for evaluation so far
        end = time()
        print("Total time spent for evaluation: ", end-start)
# Timestamp taken once the whole sweep has finished
end = time()

Max number of trees: 5, Max depth: 5
Mean acc: 0.90503909643788; Mean time: 0.1668393135070801
Total time spent for evaluation:  1.7423269748687744
Max number of trees: 5, Max depth: 10
Mean acc: 0.9067767158992179; Mean time: 0.1332714557647705
Total time spent for evaluation:  3.129426956176758
Max number of trees: 5, Max depth: 15
Mean acc: 0.9062554300608167; Mean time: 0.1437476634979248
Total time spent for evaluation:  4.628628969192505
Max number of trees: 5, Max depth: 20
Mean acc: 0.9006081668114684; Mean time: 0.17016661167144775
Total time spent for evaluation:  6.404349088668823
Max number of trees: 5, Max depth: 25
Mean acc: 0.903475238922676; Mean time: 0.1411504030227661
Total time spent for evaluation:  7.874499082565308
Max number of trees: 5, Max depth: 30
Mean acc: 0.9041702867072111; Mean time: 0.13598909378051757
Total time spent for evaluation:  9.289360046386719
Max number of trees: 5, Max depth: 35
Mean acc: 0.8968722849695917; Mean time: 0.13890743255615234
To

In [33]:
# Scratch check: list every multiple of 10 from 10 up to max_tree
for T in (10 * step for step in range(1, max_tree // 10 + 1)) :
    print(T)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150


In [15]:
# Average accuracy over the recorded boosted-random-forest runs
print(np.asarray(brf_accs).mean())

0.9294960903562121


In [9]:
# Baseline: scikit-learn's RandomForestClassifier with default parameters.
# The empty method string makes eval_clf fall through to the plain `predict` call.
# eval_clf returns exactly two lists (accuracies, training times), so only two
# names are unpacked; unpacking three raises ValueError.
rf_accs, rf_times = eval_clf("", RandomForestClassifier, {}, X_train, X_test, y_train, y_test, iterations)

Iteraiton:  0
Iteraiton:  1
Iteraiton:  2
Iteraiton:  3
Iteraiton:  4




Iteraiton:  5
Iteraiton:  6
Iteraiton:  7
Iteraiton:  8
Iteraiton:  9




Iteraiton:  10
Iteraiton:  11
Iteraiton:  12
Iteraiton:  13
Iteraiton:  14




Iteraiton:  15
Iteraiton:  16
Iteraiton:  17
Iteraiton:  18
Iteraiton:  19




In [10]:
# Per-run accuracies, then per-run training times, one list per line
print(rf_accs, rf_times, sep="\n")

[0.9391833188531712, 0.9435273675065161, 0.9409209383145091, 0.9391833188531712, 0.9417897480451781, 0.9357080799304952, 0.9409209383145091, 0.9548218940052129, 0.9313640312771503, 0.9374456993918332, 0.9383145091225021, 0.9435273675065161, 0.9304952215464813, 0.9313640312771503, 0.9365768896611643, 0.9391833188531712, 0.9374456993918332, 0.944396177237185, 0.944396177237185, 0.9409209383145091]
[0.050575971603393555, 0.0408172607421875, 0.038609981536865234, 0.0412139892578125, 0.054486989974975586, 0.0455319881439209, 0.040560245513916016, 0.041316986083984375, 0.03984522819519043, 0.04445695877075195, 0.04268288612365723, 0.038713932037353516, 0.04107213020324707, 0.039511680603027344, 0.043576717376708984, 0.04264068603515625, 0.04109787940979004, 0.039585113525390625, 0.040570735931396484, 0.046916961669921875]


In [16]:
# Average accuracy over the recorded random-forest baseline runs
print(np.asarray(rf_accs).mean())

0.9395742832319721


In [11]:
""" Boosted Random Forest without weights update """
# Boosting stays enabled while the weight update is switched off
brf_wout_update_params = {'weight_update': False, 'boosting': True}
# eval_clf returns exactly two lists (accuracies, training times), so only two
# names are unpacked; unpacking three raises ValueError.
brf_wout_update_accs, brf_wout_update_times = eval_clf("Ensemble", BoostedRandomForest, brf_wout_update_params, X_train, X_test, y_train, y_test, iterations)

Iteraiton:  0
Iteraiton:  1
Iteraiton:  2
Iteraiton:  3
Iteraiton:  4
Iteraiton:  5
Iteraiton:  6
Iteraiton:  7
Iteraiton:  8
Iteraiton:  9
Iteraiton:  10
Iteraiton:  11
Iteraiton:  12
Iteraiton:  13
Iteraiton:  14
Iteraiton:  15
Iteraiton:  16
Iteraiton:  17
Iteraiton:  18
Iteraiton:  19


In [17]:
# Per-run accuracies, then per-run training times, one list per line
print(brf_wout_update_accs, brf_wout_update_times, sep="\n")

[0.9105125977410947, 0.9122502172024327, 0.9087749782797567, 0.9148566463944396, 0.9218071242397915, 0.9139878366637706, 0.9087749782797567, 0.9139878366637706, 0.9096437880104257, 0.9105125977410947, 0.9122502172024327, 0.9131190269331017, 0.9209383145091226, 0.9218071242397915, 0.9122502172024327, 0.9131190269331017, 0.9122502172024327, 0.9165942658557776, 0.9113814074717637, 0.9165942658557776]
[2.740494966506958, 2.6867198944091797, 2.941295862197876, 3.1523473262786865, 2.7550299167633057, 2.806382179260254, 2.691952705383301, 2.7052652835845947, 2.705120086669922, 2.702498197555542, 2.7824108600616455, 2.688153028488159, 2.708527088165283, 2.7030251026153564, 2.7201852798461914, 2.685549259185791, 2.7275540828704834, 2.6926229000091553, 2.7203118801116943, 2.751677989959717]


In [18]:
# Average accuracy over the runs without weight update
print(np.asarray(brf_wout_update_accs).mean())

0.9137706342311034


In [13]:
""" Random Forest """
# Both boosting and the weight update are switched off; predictions come from RF_predict
brf_rf_params = {'weight_update': False, 'boosting': False}
# eval_clf returns exactly two lists (accuracies, training times), so only two
# names are unpacked; unpacking three raises ValueError.
brf_rf_accs, brf_rf_times = eval_clf("RF", BoostedRandomForest, brf_rf_params, X_train, X_test, y_train, y_test, iterations)

Iteraiton:  0
Iteraiton:  1
Iteraiton:  2
Iteraiton:  3
Iteraiton:  4
Iteraiton:  5
Iteraiton:  6
Iteraiton:  7
Iteraiton:  8
Iteraiton:  9
Iteraiton:  10
Iteraiton:  11
Iteraiton:  12
Iteraiton:  13
Iteraiton:  14
Iteraiton:  15
Iteraiton:  16
Iteraiton:  17
Iteraiton:  18
Iteraiton:  19


In [19]:
# Per-run accuracies, then per-run training times, one list per line
print(brf_rf_accs, brf_rf_times, sep="\n")

[0.9131190269331017, 0.9079061685490878, 0.9122502172024327, 0.9105125977410947, 0.9148566463944396, 0.9157254561251086, 0.9157254561251086, 0.9113814074717637, 0.9113814074717637, 0.9139878366637706, 0.9139878366637706, 0.9122502172024327, 0.9157254561251086, 0.9052997393570807, 0.9148566463944396, 0.9131190269331017, 0.9131190269331017, 0.9113814074717637, 0.9096437880104257, 0.9148566463944396]
[2.7335221767425537, 2.8847408294677734, 2.6722030639648438, 2.8215560913085938, 2.7033348083496094, 2.742257833480835, 2.7540910243988037, 2.787626028060913, 2.8061649799346924, 2.896130084991455, 2.816082239151001, 2.769460916519165, 3.3437459468841553, 2.7175989151000977, 2.710498094558716, 2.714344024658203, 2.7678780555725098, 2.76701021194458, 2.8184142112731934, 2.7731950283050537]


In [20]:
# Average accuracy over the runs with boosting and weight update disabled
print(np.asarray(brf_rf_accs).mean())

0.9125543006081667
