In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.utils import resample

# Data

In [3]:
# Fix NumPy's global RNG so the row shuffle performed later is repeatable.
np.random.seed(2)

# Index 0 / 1 / 2 picks the small / medium / large dataset.
dataset_size = ("small", "medium", "large")[0]

# Per-dataset settings: CSV base name, label column, and columns to discard.
dataset_info = {
    "small": {"dataset_name": "wine", "class_name": "Class", "drop_fields": []},
    "medium": {"dataset_name": "breast-cancer-wisconsin", "class_name": "Class",
               "drop_fields": ["Sample code number"]},
    "large": {"dataset_name": "agaricus-lepiota", "class_name": "Class", "drop_fields": []},
}

# Pull the three settings of the chosen dataset out in one step.
dataset_name, class_name, drop_fields = (
    dataset_info[dataset_size][key_]
    for key_ in ("dataset_name", "class_name", "drop_fields")
)

# Load the selected CSV and discard the columns listed in drop_fields.
df = pd.read_csv('../data/' + dataset_name + ".csv")
df = df.drop(drop_fields, axis=1)
# Shuffle the rows; the order is driven by NumPy's global RNG state.
df = df.iloc[np.random.permutation(len(df))]

# Re-encode the class labels as 0/1 for the two datasets handled here.
# The result is assigned back to the frame: calling
# Series.replace(..., inplace=True) on the temporary `df[class_name]` is a
# chained in-place update, which leaves `df` untouched under pandas
# Copy-on-Write and emits a warning on pandas 2.x.
if dataset_name == "breast-cancer-wisconsin":
    df = df.assign(**{class_name: df[class_name].replace({2: 0, 4: 1})})

if dataset_name == "agaricus-lepiota":
    # infer_objects() keeps the recoded column numeric on pandas versions
    # where replace() no longer downcasts object columns automatically.
    df = df.assign(
        **{class_name: df[class_name].replace({'p': 1, 'e': 0}).infer_objects()}
    )

# 80/20 hold-out split on the shuffled rows.
n_cut = int(0.8*len(df))
df_trn = df[:n_cut]
df_tst = df[n_cut:]

# Separate the feature columns from the label column for each split.
X_trn = df_trn.drop(class_name, axis=1)
y_trn = df_trn[class_name]

X_tst = df_tst.drop(class_name, axis=1)
y_tst = df_tst[class_name]

# Random Forest

In [None]:
from RandomForest_df import RandomForest_df

# Grid of candidate hyper-parameters. M is the number of feature columns.
# NOTE(review): NT is presumably the number of trees and F the number of
# features considered per split — confirm against RandomForest_df.
M = X_trn.shape[1]
CV_dict_params = {'NT': [1, 10, 25, 50, 75, 100],
                  'F': sorted(list(set([1, 3, int(np.log2(M + 1)), int(np.sqrt(M))])))
                  }

best_F, best_NT, best_score = None, None, -1
all_metrics_CV = []

# 5-fold cross-validation over contiguous slices of the training rows.
N = len(X_trn)
for F_ in CV_dict_params['F']:
    for NT_ in CV_dict_params['NT']:
        all_scores = np.zeros(5)
        for run in range(5):

            # Boolean mask that is True on the rows of the held-out fold.
            ind_tst_ = np.full(N, False)
            ind_tst_[int(N*run/5):int(N*(run+1)/5)] = True

            # Fit on the four remaining folds and validate on the held-out
            # one. (The masks used to be swapped, so every model was fitted
            # on a single fold and scored on the other four.)
            X_trn_, y_trn_ = X_trn[~ind_tst_], y_trn[~ind_tst_]
            X_tst_, y_tst_ = X_trn[ind_tst_], y_trn[ind_tst_]

            clf = RandomForest_df(NT=NT_, F=F_)
            clf.fit(X_trn_, y_trn_)
            all_scores[run] = clf.score(X_tst_, y_tst_)

        # Mean validation score of this (F, NT) pair across the five folds.
        score_ = all_scores.mean()
        all_metrics_CV.append(score_)

        # Strict '>' keeps the first configuration on ties.
        if score_ > best_score:
            best_F, best_NT, best_score = F_, NT_, score_
        # NOTE(review): the label says F1-Score; the value is whatever
        # RandomForest_df.score returns — confirm it is an F1 score.
        print(f'(F, NT) = {(F_, NT_)} \t--> \t F1-Score = {round(score_, 3)}')

# One mean score per (F, NT) pair, in grid order (F outer, NT inner).
all_metrics_CV = np.array(all_metrics_CV)


In [None]:
# Report the configuration selected by cross-validation, then refit a
# Random Forest with it on the complete training split.
print(f'Best Parameters (F, NT): {(best_F, best_NT)}')
best_RF = RandomForest_df(F=best_F, NT=best_NT)
best_RF.fit(X_trn, y_trn, verbose=3)


In [6]:
# Predictions of the tuned Random Forest on the training and test splits
# (training split first, as before).
y_trn_hat, y_tst_hat = (best_RF.predict(X_) for X_ in (X_trn, X_tst))

In [None]:
# Accuracy / precision / recall / F1 of the tuned Random Forest, first on the
# test split and then on the training split.
for split_, y_true_, y_hat_ in (('test', y_tst.to_numpy(), y_tst_hat),
                                ('train', y_trn.to_numpy(), y_trn_hat)):
    if split_ == 'train':
        print('-'*15)  # separator between the two reports
    # scikit-learn's default average='binary' raises ValueError when the
    # targets hold more than two classes, so fall back to macro averaging in
    # that case. Binary problems are scored exactly as before.
    avg_ = 'binary' if len(np.union1d(y_true_, y_hat_)) <= 2 else 'macro'
    print(f'Accuracy ({split_}): {round(accuracy_score(y_true_, y_hat_), 3)}')
    print(f'Precision ({split_}): {round(precision_score(y_true_, y_hat_, average=avg_), 3)}')
    print(f'Recall ({split_}): {round(recall_score(y_true_, y_hat_, average=avg_), 3)}')
    print(f'F1 Score ({split_}): {round(f1_score(y_true_, y_hat_, average=avg_), 3)}')


In [None]:
# Refit a Random Forest on the full training split for every (F, NT) pair of
# the grid and record its importance ranking and test-split metrics.
all_importances = []
all_metrics = []

y_true_ = y_tst.to_numpy()  # loop-invariant, converted once

for F_ in CV_dict_params['F']:
    for NT_ in CV_dict_params['NT']:
        clf = RandomForest_df(NT=NT_, F=F_)
        print(f'(F, NT) = {(F_, NT_)}')
        clf.fit(X_trn, y_trn, verbose=3)
        # Indices of clf.importance ordered from largest to smallest value.
        all_importances.append(np.flip(np.argsort(clf.importance)))
        # NOTE: this reuses (and overwrites) the notebook-level y_tst_hat.
        y_tst_hat = clf.predict(X_tst)

        # scikit-learn's default average='binary' raises ValueError when the
        # targets hold more than two classes, so fall back to macro averaging
        # in that case. Binary problems are scored exactly as before.
        avg_ = 'binary' if len(np.union1d(y_true_, y_tst_hat)) <= 2 else 'macro'

        acc = accuracy_score(y_true_, y_tst_hat)
        prec = precision_score(y_true_, y_tst_hat, average=avg_)
        rec = recall_score(y_true_, y_tst_hat, average=avg_)
        f1_ = f1_score(y_true_, y_tst_hat, average=avg_)

        print(f'Accuracy (test): {round(acc, 3)}')
        print(f'Precision (test): {round(prec, 3)}')
        print(f'Recall (test): {round(rec, 3)}')
        print(f'F1 Score (test): {round(f1_, 3)}')
        print('-'*15)

        all_metrics.append([acc, prec, rec, f1_])

# One row per (F, NT) pair, in grid order (F outer, NT inner).
all_importances = np.array(all_importances)
all_metrics = np.array(all_metrics)

In [None]:
# Display the mean cross-validation score of every (F, NT) pair, in grid order (F outer loop, NT inner loop).
all_metrics_CV

In [None]:
# NOTE(review): this cell displays all_metrics_CV again, exactly like the cell directly above; it looks redundant — confirm before removing.
all_metrics_CV

In [None]:
# Transposed view: one row per metric (accuracy, precision, recall, F1), one column per (F, NT) pair.
print(all_metrics.transpose())

In [None]:
# all_metrics_CV is one-dimensional, so transposing it leaves it unchanged.
print(all_metrics_CV.transpose())

In [14]:
# from utils.print_latex import print_table

# print("IMPORTANCES")
# print_table(all_importances.T)
# print("-"*15)

# print("ACC - PRECISION - RECALL - F1 (TEST)")
# print_table(all_metrics.T)
# print("-"*15)

# print("F1 (CV)")
# print_table(all_metrics_CV)
# print("-"*15)


In [15]:
# from sys import modules
# del modules["utils.print_latex"]

# Decision Forest

In [None]:
from DecisionForest_df import DecisionForest_df

# Grid of candidate hyper-parameters. M is the number of feature columns.
# NOTE(review): NT is presumably the number of trees and F the number of
# features given to each tree; -1 looks like a sentinel interpreted by
# DecisionForest_df — confirm against that class.
M = X_trn.shape[1]
CV_dict_params = {'NT': [1, 10, 25, 50, 75, 100],
                  'F': sorted(list(set([int(M/4), int(M/2), int(3*M/4)]))) + [-1]
                  }

best_F, best_NT, best_score = None, None, -1
all_metrics_CV = []

# 5-fold cross-validation over contiguous slices of the training rows.
N = len(X_trn)
for F_ in CV_dict_params['F']:
    for NT_ in CV_dict_params['NT']:
        all_scores = np.zeros(5)
        for run in range(5):

            # Boolean mask that is True on the rows of the held-out fold.
            ind_tst_ = np.full(N, False)
            ind_tst_[int(N*run/5):int(N*(run+1)/5)] = True

            # Fit on the four remaining folds and validate on the held-out
            # one. (The masks used to be swapped, so every model was fitted
            # on a single fold and scored on the other four.)
            X_trn_, y_trn_ = X_trn[~ind_tst_], y_trn[~ind_tst_]
            X_tst_, y_tst_ = X_trn[ind_tst_], y_trn[ind_tst_]

            clf = DecisionForest_df(NT=NT_, F=F_)
            clf.fit(X_trn_, y_trn_)
            all_scores[run] = clf.score(X_tst_, y_tst_)

        # Mean validation score of this (F, NT) pair across the five folds.
        score_ = all_scores.mean()
        all_metrics_CV.append(score_)
        # Strict '>' keeps the first configuration on ties.
        if score_ > best_score:
            best_F, best_NT, best_score = F_, NT_, score_
        # NOTE(review): the label says F1-Score; the value is whatever
        # DecisionForest_df.score returns — confirm it is an F1 score.
        print(f'(F, NT) = {(F_, NT_)} \t--> \t F1-Score = {round(score_, 3)}')

# One mean score per (F, NT) pair, in grid order (F outer, NT inner).
all_metrics_CV = np.array(all_metrics_CV)


In [None]:
# Report the configuration selected by cross-validation, refit a Decision
# Forest with it on the complete training split, and show the indices of its
# importance values ordered from largest to smallest.
print(f'Best Parameters (F, NT): {(best_F, best_NT)}')
best_DF = DecisionForest_df(F=best_F, NT=best_NT)
best_DF.fit(X_trn, y_trn, verbose=3)
print(np.flip(np.argsort(best_DF.importance)))

In [20]:
# Predictions of the tuned Decision Forest on the training and test splits
# (training split first, as before).
y_trn_hat, y_tst_hat = (best_DF.predict(X_) for X_ in (X_trn, X_tst))

In [None]:
# Accuracy / precision / recall / F1 of the tuned Decision Forest, first on
# the test split and then on the training split.
for split_, y_true_, y_hat_ in (('test', y_tst.to_numpy(), y_tst_hat),
                                ('train', y_trn.to_numpy(), y_trn_hat)):
    if split_ == 'train':
        print('-'*15)  # separator between the two reports
    # scikit-learn's default average='binary' raises ValueError when the
    # targets hold more than two classes, so fall back to macro averaging in
    # that case. Binary problems are scored exactly as before.
    avg_ = 'binary' if len(np.union1d(y_true_, y_hat_)) <= 2 else 'macro'
    print(f'Accuracy ({split_}): {round(accuracy_score(y_true_, y_hat_), 3)}')
    print(f'Precision ({split_}): {round(precision_score(y_true_, y_hat_, average=avg_), 3)}')
    print(f'Recall ({split_}): {round(recall_score(y_true_, y_hat_, average=avg_), 3)}')
    print(f'F1 Score ({split_}): {round(f1_score(y_true_, y_hat_, average=avg_), 3)}')


In [None]:
# Refit a Decision Forest on the full training split for every (F, NT) pair
# of the grid and record its importance ranking and test-split metrics.
all_importances = []
all_metrics = []

y_true_ = y_tst.to_numpy()  # loop-invariant, converted once

for F_ in CV_dict_params['F']:
    for NT_ in CV_dict_params['NT']:
        clf = DecisionForest_df(NT=NT_, F=F_)
        print(f'(F, NT) = {(F_, NT_)}')
        clf.fit(X_trn, y_trn, verbose=3)
        # NOTE: this reuses (and overwrites) the notebook-level y_tst_hat.
        y_tst_hat = clf.predict(X_tst)

        # Indices of clf.importance ordered from largest to smallest value.
        all_importances.append(np.flip(np.argsort(clf.importance)))

        # scikit-learn's default average='binary' raises ValueError when the
        # targets hold more than two classes, so fall back to macro averaging
        # in that case. Binary problems are scored exactly as before.
        avg_ = 'binary' if len(np.union1d(y_true_, y_tst_hat)) <= 2 else 'macro'

        acc = accuracy_score(y_true_, y_tst_hat)
        prec = precision_score(y_true_, y_tst_hat, average=avg_)
        rec = recall_score(y_true_, y_tst_hat, average=avg_)
        f1_ = f1_score(y_true_, y_tst_hat, average=avg_)

        print(f'Accuracy (test): {round(acc, 3)}')
        print(f'Precision (test): {round(prec, 3)}')
        print(f'Recall (test): {round(rec, 3)}')
        print(f'F1 Score (test): {round(f1_, 3)}')
        print('-'*15)

        all_metrics.append([acc, prec, rec, f1_])

# One row per (F, NT) pair, in grid order (F outer, NT inner).
all_importances = np.array(all_importances)
all_metrics = np.array(all_metrics)

In [None]:
# Transposed view: one row per metric (accuracy, precision, recall, F1), one column per (F, NT) pair.
print(all_metrics.transpose())


In [None]:
# all_metrics_CV is one-dimensional, so transposing it leaves it unchanged.
print(all_metrics_CV.transpose())

In [26]:
# from utils.print_latex import print_table

# print("IMPORTANCES")
# print_table(all_importances.T)
# print("-"*15)

# print("ACC - PRECISION - RECALL - F1 (TEST)")
# print_table(all_metrics.T)
# print("-"*15)

# print("F1 (CV)")
# print_table(all_metrics_CV)
# print("-"*15)
