In [13]:
import numpy as np
import pandas as pd
import sklearn
# Imported explicitly: `import sklearn` alone does not guarantee that the
# `sklearn.ensemble` submodule is loaded when it is accessed as an attribute.
import sklearn.ensemble
from IPython.core.display import display
from imblearn.datasets import fetch_datasets
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging

# Benchmark MRBBagging against a plain BaggingClassifier and a single decision
# tree on the datasets returned by fetch_datasets(), using stratified k-fold
# cross-validation. For every dataset the mean geometric-mean score and the
# mean accuracy over the folds are collected and shown as two tables.

MAX_SAMPLES = 1000  # datasets with more rows than this are skipped
N_SPLITS = 4  # number of stratified folds
N_ESTIMATORS = 30  # ensemble size shared by MRBBagging and BaggingClassifier
G_MEAN_CORRECTION = 0.01  # forwarded as `correction` to geometric_mean_score
ACC_DECIMALS = 3  # per-fold accuracy is rounded to this many decimals

# Column labels of the result tables, in display order. The "MRBBgging"
# spelling is kept as-is because it is the label used in the result tables.
MODEL_LABELS = ("MRBBgging", "Bagging", "Tree")

# Seed NumPy's global RNG once, before any estimator is fitted.
# NOTE(review): the estimators are created without an explicit random_state,
# so they presumably draw from this global RNG - confirm if exact
# reproducibility matters.
np.random.seed(0)

datasets = fetch_datasets()
results_g_mean = dict()
results_acc = dict()

for dataset_name, dataset_values in datasets.items():
    if len(dataset_values.data) > MAX_SAMPLES:
        continue
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target
    results_g_mean[dataset_name] = dict()
    results_acc[dataset_name] = dict()
    skf = StratifiedKFold(n_splits=N_SPLITS)

    # Per-fold scores for each model, keyed by its table label.
    g_mean_scores = {label: [] for label in MODEL_LABELS}
    acc_scores = {label: [] for label in MODEL_LABELS}

    # These objects are created once per dataset and re-fitted on every fold.
    mrbbagging = MRBBagging()
    decision_tree_classifier = tree.DecisionTreeClassifier()
    tree_classifiers = {
        i: tree.DecisionTreeClassifier() for i in range(N_ESTIMATORS)
    }
    bagging = sklearn.ensemble.BaggingClassifier(
        base_estimator=decision_tree_classifier,
        n_estimators=N_ESTIMATORS,
    )

    for train_index, test_index in skf.split(X, y):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]

        # Fit order (MRBBagging, Bagging, Tree) is deliberate: changing it
        # could change the order in which random numbers are consumed.
        mrbbagging.fit(train_X, train_y, len(train_X), N_ESTIMATORS, tree_classifiers)
        bagging.fit(train_X, train_y)
        decision_tree = tree.DecisionTreeClassifier()
        decision_tree.fit(train_X, train_y)

        fold_predictions = {
            "MRBBgging": mrbbagging.predict(test_X),
            "Bagging": bagging.predict(test_X),
            "Tree": decision_tree.predict(test_X),
        }

        # Score every model the same way on this fold.
        for label in MODEL_LABELS:
            y_pred = np.array(fold_predictions[label])
            g_mean_scores[label].append(
                geometric_mean_score(test_y, y_pred, correction=G_MEAN_CORRECTION)
            )
            acc_scores[label].append(
                round(accuracy_score(test_y, y_pred), ACC_DECIMALS)
            )

    # Average the per-fold scores; insertion order fixes the column order.
    for label in MODEL_LABELS:
        results_g_mean[dataset_name][label] = np.asarray(g_mean_scores[label]).mean()
        results_acc[dataset_name][label] = np.asarray(acc_scores[label]).mean()

# One row per dataset, one column per model.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
display(df)

display("ACC")
df = pd.DataFrame(results_acc).T
display(df)


ecoli


spectrometer


libras_move


arrhythmia


oil


'G-MEAN'

Unnamed: 0,MRBBgging,Bagging,Tree
ecoli,0.753246,0.573694,0.672289
spectrometer,0.915183,0.820636,0.843906
libras_move,0.427223,0.389321,0.517469
arrhythmia,0.944426,0.904326,0.83144
oil,0.435809,0.487741,0.624286


'ACC'