In [1]:

def read_data(directory, filename):
    """Load a comma-separated dataset file that has an ``@data`` header marker.

    Every line up to and including the ``@data`` marker is treated as header
    and discarded. Each remaining non-blank line is split on commas: the last
    field becomes the label, the preceding fields become the features. Values
    are returned as strings exactly as they appear between the commas (only
    whitespace at the two ends of a line is stripped).

    Parameters
    ----------
    directory : str
        Path prefix of the file. It is concatenated with ``filename`` as-is,
        so it must already end with a path separator.
    filename : str
        Name of the file inside ``directory``.

    Returns
    -------
    X_train : numpy.ndarray
        2-D array with one row per data line and all fields but the last.
    y_train : numpy.ndarray
        1-D array holding the last field of every data line.

    Raises
    ------
    ValueError
        If the file has no ``@data`` marker, has no data rows after it, or
        its data rows do not all have the same number of fields.
    """
    with open(directory + filename) as f:
        rows = [line.strip().split(",") for line in f]

    # Locate the marker line; matching is case-insensitive so that both
    # "@data" and "@DATA" spellings are accepted.
    marker_position = None
    for position, fields in enumerate(rows):
        if len(fields) == 1 and fields[0].lower() == "@data":
            marker_position = position
            break
    if marker_position is None:
        raise ValueError("no '@data' marker found in " + directory + filename)

    # A blank line splits into [''] and would make the array ragged, so it is
    # dropped instead of being treated as a record.
    records = [fields for fields in rows[marker_position + 1:] if fields != [""]]
    if not records:
        raise ValueError("no data rows after '@data' in " + directory + filename)

    expected_width = len(records[0])
    if any(len(fields) != expected_width for fields in records):
        raise ValueError("data rows have differing numbers of fields in "
                         + directory + filename)

    data = np.array(records)
    X_train, y_train = data[:, :-1], data[:, -1]

    return X_train, y_train

In [None]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import sklearn
from IPython.core.display import display


from imblearn.datasets import fetch_datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.tree import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging

from imblearn.metrics import geometric_mean_score
from multi_imbalance.datasets._data_loader import load_datasets
from sklearn.datasets.base import Bunch


# Seed NumPy's global RNG once, before any estimator is built or fitted.
# scikit-learn estimators created without an explicit random_state draw from
# this global state, so the order of the work below affects the results.
np.random.seed(0)

datasets = OrderedDict()
directory = "./benchmarks/mrbbagging/data/"
# os.listdir returns entries in arbitrary order; sorting makes the dataset
# order (and hence the sequence of RNG draws) the same on every machine.
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    # Skip anything that is not a regular file (e.g. a .ipynb_checkpoints
    # directory), which read_data could not open.
    if not os.path.isfile(os.path.join(directory, filename)):
        continue
    X, y = read_data(directory, filename)
    datasets[filename] = Bunch(data=X, target=y, DESCR=filename)

# Maps dataset name -> {classifier label -> mean G-mean over the folds}.
results_g_mean = dict()

# Number of base trees in every ensemble, kept in one place so the three
# ensembles are always compared with the same size.
n_estimators = 50

for dataset_name, dataset_values in datasets.items():
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target
    skf = StratifiedKFold(n_splits=5)

    # NOTE(review): the ensembles and their classifier dicts are created once
    # per dataset and re-fitted on every fold, while the single tree is
    # re-created per fold. This assumes MRBBagging.fit fully resets state on
    # each call - confirm against the MRBBagging implementation.
    mrbbagging = MRBBagging()
    decision_tree_classifier = tree.DecisionTreeClassifier()
    tree_classifiers = {i: tree.DecisionTreeClassifier() for i in range(n_estimators)}

    omrbbagging = MRBBagging()
    otree_classifiers = {i: tree.DecisionTreeClassifier() for i in range(n_estimators)}

    bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier,
                                                 n_estimators=n_estimators)

    # Per-fold G-mean scores, keyed by the column label of the results table.
    fold_scores = {"UMRBBgging": [], "OMRBBgging": [], "Bagging": [], "Tree": []}

    for train_index, test_index in skf.split(X, y):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]

        # The fit order is kept fixed because it determines the order in
        # which the seeded global RNG is consumed.
        mrbbagging.fit(train_X, train_y, tree_classifiers)
        # The positional False presumably switches the undersampling variant
        # off (the labels suggest "U" vs "O" sampling) - TODO confirm against
        # the MRBBagging.fit signature.
        omrbbagging.fit(train_X, train_y, otree_classifiers, False)
        bagging.fit(train_X, train_y)
        decision_tree = tree.DecisionTreeClassifier()
        decision_tree.fit(train_X, train_y)

        fitted_models = (
            ("UMRBBgging", mrbbagging),
            ("OMRBBgging", omrbbagging),
            ("Bagging", bagging),
            ("Tree", decision_tree),
        )

        # Predict with every model first, then score, mirroring the original
        # sequence of calls.
        predictions = [(label, model.predict(test_X)) for label, model in fitted_models]
        for label, prediction in predictions:
            fold_scores[label].append(
                geometric_mean_score(test_y, np.array(prediction), correction=0.01)
            )

    # Average the fold scores of each classifier for this dataset.
    results_g_mean[dataset_name] = {
        label: np.asarray(scores).mean() for label, scores in fold_scores.items()
    }

# Report the mean G-mean scores: after transposing, each row is a dataset and
# each column is a classifier label.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).transpose()
# Persist the table as CSV (path is relative to the current working directory).
df.to_csv("./mrbbagging_results")
display(df)
