In [31]:
import numpy as np
import pandas as pd

# Data

In [32]:
np.random.seed(2)

dataset_size = ["small", "medium", "large", "beta"][-1]

dataset_info = {
    "small": {
        "dataset_name": "wine",
        "class_name": "Class",
        "drop_fields": []
    },
    "medium": {
        "dataset_name": "breast-cancer-wisconsin",
        "class_name": "Class",
        "drop_fields": ["Sample code number"]
    },
    "large": {
        "dataset_name": "seismic-bumps",
        "class_name": "class",
        "drop_fields": []
    },
    "beta": {
        "dataset_name": "Acoustic_Extinguisher_Fire_Dataset",
        "class_name": "Class",
        "drop_fields": []
    }
}

dataset_name = dataset_info[dataset_size]["dataset_name"]
class_name = dataset_info[dataset_size]["class_name"]
drop_fields = dataset_info[dataset_size]["drop_fields"]

df = pd.read_csv('../data/' + dataset_name + ".csv")
df = df.drop(drop_fields, axis=1)
df = df.iloc[np.random.permutation(len(df))]

if dataset_name == "breast-cancer-wisconsin":
    df[class_name].replace({2: 0, 4: 1}, inplace=True)

n_cut = int(0.8*len(df))
df_trn = df[:n_cut]
df_tst = df[n_cut:]

X_trn = df_trn.drop(class_name, axis=1)
y_trn = df_trn[class_name]

X_tst = df_tst.drop(class_name, axis=1)
y_tst = df_tst[class_name]

In [33]:
from sklearn.preprocessing import KBinsDiscretizer


def discretize_df(X_trn, X_tst, n_bins=3):

    cols_to_discretize = [
        col for col in X_trn.columns if X_trn[col].dtype == float or X_trn[col].dtype == int]

    if len(cols_to_discretize) == 0:
        return X_trn, X_tst

    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

    X_trn_aux = X_trn[cols_to_discretize].to_numpy()
    X_tst_aux = X_tst[cols_to_discretize].to_numpy()

    if len(X_trn_aux.shape) > 1:
        X_trn_aux = est.fit_transform(X_trn_aux)
        X_tst_aux = est.transform(X_tst_aux)

    else:
        X_trn_aux = X_trn_aux[:, None]
        X_tst_aux = X_tst_aux[:, None]

        X_trn_aux = est.fit_transform(X_trn_aux)
        X_tst_aux = est.transform(X_tst_aux)

        X_trn_aux = X_trn_aux.reshape(X_trn_aux.shape[0])
        X_tst_aux = X_tst_aux.reshape(X_tst_aux.shape[0])

    X_trn_aux = X_trn_aux.astype(int).astype(str)
    X_tst_aux = X_tst_aux.astype(int).astype(str)

    X_trn[cols_to_discretize] = X_trn_aux
    X_tst[cols_to_discretize] = X_tst_aux

    dict_replace = {
        3: {'0': 'L', '1': 'M', '2': 'H'},
        5: {'0': 'LL', '1': 'L', '2': 'M', '3': 'H', '4': 'HH'}
    }

    X_trn = X_trn.replace(dict_replace[n_bins])
    X_tst = X_tst.replace(dict_replace[n_bins])

    return X_trn, X_tst


In [34]:
X_trn, X_tst = discretize_df(X_trn, X_tst)

# Random Forest

In [35]:
from RandomForest import RandomForest
from sklearn.model_selection import GridSearchCV

M = X_trn.shape[1]

M = X_trn.shape[1]
CV_dict_params = {'NT': [1, 10, 25, 50, 75, 100],
                  'F': list(set([1, 3, int(np.log2(M + 1)), int(np.sqrt(M))]))
                  }

clf_RF = GridSearchCV(RandomForest(), param_grid=CV_dict_params, verbose=3)
clf_RF.fit(X_trn.to_numpy(), y_trn.to_numpy())


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .........................F=1, NT=1;, score=0.860 total time=   0.0s
[CV 2/5] END .........................F=1, NT=1;, score=0.858 total time=   0.0s
[CV 3/5] END .........................F=1, NT=1;, score=0.853 total time=   0.0s
[CV 4/5] END .........................F=1, NT=1;, score=0.856 total time=   0.0s
[CV 5/5] END .........................F=1, NT=1;, score=0.860 total time=   0.0s
[CV 1/5] END ........................F=1, NT=10;, score=0.779 total time=   0.4s
[CV 2/5] END ........................F=1, NT=10;, score=0.772 total time=   0.4s
[CV 3/5] END ........................F=1, NT=10;, score=0.780 total time=   0.4s
[CV 4/5] END ........................F=1, NT=10;, score=0.781 total time=   0.4s
[CV 5/5] END ........................F=1, NT=10;, score=0.791 total time=   0.4s
[CV 1/5] END ........................F=1, NT=25;, score=0.847 total time=   0.9s
[CV 2/5] END ........................F=1, NT=25;

GridSearchCV(estimator=RandomForest(),
             param_grid={'F': [1, 2, 3], 'NT': [1, 10, 25, 50, 75, 100]},
             verbose=3)

In [36]:
sorted(clf_RF.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_F',
 'param_NT',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [37]:
for F_, NT_, score_ in zip(clf_RF.cv_results_['param_F'], 
                           clf_RF.cv_results_['param_NT'],
                           clf_RF.cv_results_['mean_test_score']):
    print(f'(F, NT) = {(F_, NT_)} \t--> \t F1-Score = {round(score_, 3)}')

(F, NT) = (1, 1) 	--> 	 F1-Score = 0.857
(F, NT) = (1, 10) 	--> 	 F1-Score = 0.78
(F, NT) = (1, 25) 	--> 	 F1-Score = 0.844
(F, NT) = (1, 50) 	--> 	 F1-Score = 0.838
(F, NT) = (1, 75) 	--> 	 F1-Score = 0.837
(F, NT) = (1, 100) 	--> 	 F1-Score = 0.845
(F, NT) = (2, 1) 	--> 	 F1-Score = 0.857
(F, NT) = (2, 10) 	--> 	 F1-Score = 0.859
(F, NT) = (2, 25) 	--> 	 F1-Score = 0.867
(F, NT) = (2, 50) 	--> 	 F1-Score = 0.869
(F, NT) = (2, 75) 	--> 	 F1-Score = 0.866
(F, NT) = (2, 100) 	--> 	 F1-Score = 0.867
(F, NT) = (3, 1) 	--> 	 F1-Score = 0.855
(F, NT) = (3, 10) 	--> 	 F1-Score = 0.869
(F, NT) = (3, 25) 	--> 	 F1-Score = 0.868
(F, NT) = (3, 50) 	--> 	 F1-Score = 0.869
(F, NT) = (3, 75) 	--> 	 F1-Score = 0.869
(F, NT) = (3, 100) 	--> 	 F1-Score = 0.869


In [38]:
best_RF = clf_RF.best_estimator_
print(best_RF.get_params())

{'F': 3, 'NT': 50, 'df_columns': None}


In [39]:
y_trn_hat = best_RF.predict(X_trn.to_numpy())
y_tst_hat = best_RF.predict(X_tst.to_numpy())

In [40]:
print(best_RF.importance / best_RF.importance.sum())
ordered_RF_importance = np.flip(np.argsort(best_RF.importance))
X_trn.columns[ordered_RF_importance]


[0.18987342 0.39636076 0.09177215 0.13686709 0.05221519 0.13291139]


Index(['Fuel', 'Size', 'Desibel', 'Frequency', 'Distance', 'Airflow'], dtype='object')

In [41]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

print(f'Accuracy (test): {round(accuracy_score(y_tst.to_numpy(), y_tst_hat), 3)}')
print(f'Precision (test): {round(precision_score(y_tst.to_numpy(), y_tst_hat), 3)}')
print(f'Recall (test): {round(recall_score(y_tst.to_numpy(), y_tst_hat), 3)}')
print(f'F1 Score (test): {round(f1_score(y_tst.to_numpy(), y_tst_hat), 3)}')

# print(f'Accuracy (train): {accuracy_score(y_trn.to_numpy(), y_trn_hat)}')
# print(f'Precision (train): {precision_score(y_trn.to_numpy(), y_trn_hat)}')
# print(f'Recall (train): {recall_score(y_trn.to_numpy(), y_trn_hat)}')
# print(f'F1 Score (train): {f1_score(y_trn.to_numpy(), y_trn_hat)}')


Accuracy (test): 0.878
Precision (test): 0.881
Recall (test): 0.875
F1 Score (test): 0.878


In [42]:
for F_ in CV_dict_params['F']:
    for NT_ in CV_dict_params['NT']:
        clf = RandomForest(NT=NT_, F=F_, df_columns=X_trn.columns)
        print(f'(F, NT) = {(F_, NT_)}')
        clf.fit(X_trn.to_numpy(), y_trn.to_numpy(), verbose=1)
        y_tst_hat = clf.predict(X_tst.to_numpy())        
        print(f'Accuracy (test): {round(accuracy_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print(f'Precision (test): {round(precision_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print(f'Recall (test): {round(recall_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print(f'F1 Score (test): {round(f1_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print()

(F, NT) = (1, 1)
Feature importance: Index(['Airflow', 'Frequency', 'Desibel', 'Distance', 'Fuel', 'Size'], dtype='object')
Accuracy (test): 0.859
Precision (test): 0.848
Recall (test): 0.874
F1 Score (test): 0.861

(F, NT) = (1, 10)
Feature importance: Index(['Size', 'Airflow', 'Desibel', 'Distance', 'Frequency', 'Fuel'], dtype='object')
Accuracy (test): 0.727
Precision (test): 0.768
Recall (test): 0.649
F1 Score (test): 0.703

(F, NT) = (1, 25)
Feature importance: Index(['Distance', 'Size', 'Fuel', 'Airflow', 'Desibel', 'Frequency'], dtype='object')
Accuracy (test): 0.819
Precision (test): 0.778
Recall (test): 0.893
F1 Score (test): 0.831

(F, NT) = (1, 50)
Feature importance: Index(['Distance', 'Fuel', 'Size', 'Airflow', 'Desibel', 'Frequency'], dtype='object')
Accuracy (test): 0.827
Precision (test): 0.79
Recall (test): 0.89
F1 Score (test): 0.837

(F, NT) = (1, 75)
Feature importance: Index(['Distance', 'Airflow', 'Size', 'Fuel', 'Frequency', 'Desibel'], dtype='object')
Accuracy (

# Decision Tree

In [43]:
from DecisionForest import DecisionForest
from sklearn.model_selection import GridSearchCV

M = X_trn.shape[1]
CV_dict_params = {'NT': [1, 10, 25, 50, 75, 100],
                  'F': list(set([int(M/4), int(M/2), int(3*M/4), -1]))
                  }

clf_DF = GridSearchCV(DecisionForest(), param_grid=CV_dict_params, verbose=3)
clf_DF.fit(X_trn.to_numpy(), y_trn.to_numpy())


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END .........................F=1, NT=1;, score=0.701 total time=   0.0s
[CV 2/5] END .........................F=1, NT=1;, score=0.708 total time=   0.0s
[CV 3/5] END .........................F=1, NT=1;, score=0.710 total time=   0.0s
[CV 4/5] END .........................F=1, NT=1;, score=0.707 total time=   0.0s
[CV 5/5] END .........................F=1, NT=1;, score=0.708 total time=   0.0s
[CV 1/5] END ........................F=1, NT=10;, score=0.844 total time=   0.4s
[CV 2/5] END ........................F=1, NT=10;, score=0.839 total time=   0.4s
[CV 3/5] END ........................F=1, NT=10;, score=0.840 total time=   0.4s
[CV 4/5] END ........................F=1, NT=10;, score=0.842 total time=   0.4s
[CV 5/5] END ........................F=1, NT=10;, score=0.843 total time=   0.4s
[CV 1/5] END ........................F=1, NT=25;, score=0.827 total time=   0.9s
[CV 2/5] END ........................F=1, NT=25

GridSearchCV(estimator=DecisionForest(),
             param_grid={'F': [1, 3, 4, -1], 'NT': [1, 10, 25, 50, 75, 100]},
             verbose=3)

In [44]:
sorted(clf_DF.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_F',
 'param_NT',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [45]:
for F_, NT_, score_ in zip(clf_DF.cv_results_['param_F'], 
                           clf_DF.cv_results_['param_NT'],
                           clf_DF.cv_results_['mean_test_score']):
    print(f'(F, NT) = {(F_, NT_)} \t--> \t F1-Score = {round(score_, 3)}')

(F, NT) = (1, 1) 	--> 	 F1-Score = 0.707
(F, NT) = (1, 10) 	--> 	 F1-Score = 0.841
(F, NT) = (1, 25) 	--> 	 F1-Score = 0.827
(F, NT) = (1, 50) 	--> 	 F1-Score = 0.835
(F, NT) = (1, 75) 	--> 	 F1-Score = 0.841
(F, NT) = (1, 100) 	--> 	 F1-Score = 0.84
(F, NT) = (3, 1) 	--> 	 F1-Score = 0.84
(F, NT) = (3, 10) 	--> 	 F1-Score = 0.868
(F, NT) = (3, 25) 	--> 	 F1-Score = 0.866
(F, NT) = (3, 50) 	--> 	 F1-Score = 0.87
(F, NT) = (3, 75) 	--> 	 F1-Score = 0.866
(F, NT) = (3, 100) 	--> 	 F1-Score = 0.868
(F, NT) = (4, 1) 	--> 	 F1-Score = 0.85
(F, NT) = (4, 10) 	--> 	 F1-Score = 0.875
(F, NT) = (4, 25) 	--> 	 F1-Score = 0.877
(F, NT) = (4, 50) 	--> 	 F1-Score = 0.877
(F, NT) = (4, 75) 	--> 	 F1-Score = 0.877
(F, NT) = (4, 100) 	--> 	 F1-Score = 0.877
(F, NT) = (-1, 1) 	--> 	 F1-Score = 0.873
(F, NT) = (-1, 10) 	--> 	 F1-Score = 0.875
(F, NT) = (-1, 25) 	--> 	 F1-Score = 0.87
(F, NT) = (-1, 50) 	--> 	 F1-Score = 0.879
(F, NT) = (-1, 75) 	--> 	 F1-Score = 0.884
(F, NT) = (-1, 100) 	--> 	 F1-Score

In [46]:
best_DF = clf_DF.best_estimator_
print(best_DF.get_params())

{'F': -1, 'NT': 100, 'df_columns': None}


In [47]:
print(best_DF.importance / best_DF.importance.sum())
ordered_DF_importance = np.flip(np.argsort(best_DF.importance))
X_trn.columns[ordered_RF_importance]

[0.14899264 0.38701894 0.07166124 0.26432622 0.0369164  0.09108457]


Index(['Fuel', 'Size', 'Desibel', 'Frequency', 'Distance', 'Airflow'], dtype='object')

In [48]:
y_tst_hat = best_DF.predict(X_tst.to_numpy())

In [49]:
print(f'Accuracy (test): {round(accuracy_score(y_tst.to_numpy(), y_tst_hat), 3)}')
print(f'Precision (test): {round(precision_score(y_tst.to_numpy(), y_tst_hat), 3)}')
print(f'Recall (test): {round(recall_score(y_tst.to_numpy(), y_tst_hat), 3)}')
print(f'F1 Score (test): {round(f1_score(y_tst.to_numpy(), y_tst_hat), 3)}')

Accuracy (test): 0.894
Precision (test): 0.911
Recall (test): 0.873
F1 Score (test): 0.892


In [50]:
for F_ in CV_dict_params['F']:
    for NT_ in CV_dict_params['NT']:
        clf = DecisionForest(NT=NT_, F=F_, df_columns=X_trn.columns)
        print(f'(F, NT) = {(F_, NT_)}')
        clf.fit(X_trn.to_numpy(), y_trn.to_numpy(), verbose=1)
        y_tst_hat = clf.predict(X_tst.to_numpy())        
        print(f'Accuracy (test): {round(accuracy_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print(f'Precision (test): {round(precision_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print(f'Recall (test): {round(recall_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print(f'F1 Score (test): {round(f1_score(y_tst.to_numpy(), y_tst_hat), 3)}')
        print()

(F, NT) = (1, 1)
Feature importance: Index(['Frequency', 'Airflow', 'Desibel', 'Distance', 'Fuel', 'Size'], dtype='object')
Accuracy (test): 0.621
Precision (test): 0.576
Recall (test): 0.92
F1 Score (test): 0.708

(F, NT) = (1, 10)
Feature importance: Index(['Frequency', 'Airflow', 'Desibel', 'Distance', 'Size', 'Fuel'], dtype='object')
Accuracy (test): 0.847
Precision (test): 0.834
Recall (test): 0.865
F1 Score (test): 0.849

(F, NT) = (1, 25)
Feature importance: Index(['Frequency', 'Fuel', 'Airflow', 'Desibel', 'Distance', 'Size'], dtype='object')
Accuracy (test): 0.82
Precision (test): 0.779
Recall (test): 0.893
F1 Score (test): 0.832

(F, NT) = (1, 50)
Feature importance: Index(['Frequency', 'Desibel', 'Distance', 'Fuel', 'Airflow', 'Size'], dtype='object')
Accuracy (test): 0.833
Precision (test): 0.802
Recall (test): 0.884
F1 Score (test): 0.841

(F, NT) = (1, 75)
Feature importance: Index(['Fuel', 'Frequency', 'Airflow', 'Distance', 'Size', 'Desibel'], dtype='object')
Accuracy (