In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from BoostedRandomForest import BoostedRandomForest
import pandas as pd
import numpy as np

In [4]:
# Read data to be used
# In this example, a dataset about spamming is used
data = pd.read_csv('Data/Spamebase/spambase.csv',sep=",")
data.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,Spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [5]:
# Some preprocessing on data

# Number of features
m = data.shape[1]
# Remove unwanted features
X = data.iloc[:,0:48]
y = data.iloc[:,(m-1):]

# Turn data into onehot format
X_onehot = pd.get_dummies(X)

In [12]:
# Splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.25, random_state=33)

In [7]:
""" Create a simple BRF classifier """
# Create a BRF classifier with default parameters (to be explained later)
brf = BoostedRandomForest()

# Train BRF classifier
brf.fit(X_train, y_train)

In [20]:
# Give prediction 
pred = brf.ensemble_predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, pred)
print(acc)

0.9278887923544744


In [22]:
# To get the training accuracies during training
train_accs = brf.train_accs
print("Training accuracies: ", train_accs[:5])

# To get the number of tree trained in the forest
num_tree = len(brf.clfs)
print("#Trees: ", num_tree)

Training accuracies:  [0.756231884057971, 0.856231884057971, 0.8857971014492754, 0.9043478260869565, 0.9144927536231884]
#Trees:  49


In [16]:
""" Create a BRF classifier w/out weight update """
# This can be done by setting 'weight_update' to False
brf_wout_update = BoostedRandomForest(weight_update=False)
# Train classifier
brf_wout_update.fit(X_train, y_train)

In [17]:
# Give prediction 
pred = brf.ensemble_predict(X_test)

# Calculate accuracy
acc = accuracy_score(y_test, pred)
print(acc)

0.9278887923544744


In [14]:
""" Create a Random Forest Classifier """
# RF classifiers can be created by setting 'weight_update' and 'boosting' to False
rf = BoostedRandomForest(weight_update=False, boosting=False)

# Train RF classifier
rf.fit(X_train, y_train)

In [15]:
# For RF classifiers, prediction are given by RF_predict()
pred = rf.RF_predict(X_test)
# Calculate accuracy
acc = accuracy_score(y_test, pred)
print(acc)

0.9131190269331017


In [18]:
""" Enable Early Stop for BRF """
# Similarly, it can be done by setting 'early_stop' to True.
# Meanwhile, it is needed to set the parameters for early stopping
brf_es_params = {"eps_ub": 0.5,
                 "eps_lb": 1e-20,
                 "early_stop": True,
                }
brf_es = BoostedRandomForest(**brf_es_params)

# Train classifier
brf_es.fit(X_train, y_train)

In [19]:
# For RF classifiers, prediction are given by RF_predict()
pred = brf_es.ensemble_predict(X_test)
# Calculate accuracy
acc = accuracy_score(y_test, pred)
print(acc)

0.9426585577758471


In [None]:
""" 
The following creates a BRF classifier with default values for all parameters.
For more information about the parameters and class variables, 
please refer to the annotations given in the definition of module in BoostedRandomForest.py.

If one is interested, please check case_evaluation.ipynb, which gives codes used in the experiment,
for more usage of the module.
"""
default_brf = BoostedRandomForest(T=50, sample_portion=0.6, 
                                  depth_max=5, criterion='entropy', 
                                  eps_ub=1, eps_lb=0, 
                                  eps_exceed_limit=5, early_stop=False,
                                  weight_update=True, boosting=True, 
                                  debug_msg=False, verbose=False)