In [6]:
def read_data(directory, filename):
    """Read the data section of an ARFF-style text file as string arrays.

    Every line up to and including the ``@data`` marker is skipped. Each
    remaining non-empty, non-comment line is split on commas; the last field
    of a row is treated as the label and the preceding fields as features.
    Values are NOT converted: both returned arrays hold strings.

    Parameters
    ----------
    directory : str
        Path prefix. It is concatenated with ``filename`` as-is, so it has to
        end with a path separator.
    filename : str
        Name of the file below ``directory``.

    Returns
    -------
    X_train : numpy.ndarray
        2-D array with all columns except the last one.
    y_train : numpy.ndarray
        1-D array with the last column.

    Raises
    ------
    ValueError
        If the file has no ``@data`` marker or no rows after it.
    """
    path = directory + filename
    with open(path) as f:
        lines = [line.strip() for line in f]

    # ARFF keywords are case-insensitive, so "@DATA" must be accepted too.
    marker = next(
        (pos for pos, line in enumerate(lines) if line.lower() == "@data"),
        None,
    )
    if marker is None:
        raise ValueError("no '@data' section found in " + path)

    # Blank lines and '%' comment lines carry no values; keeping them would
    # yield rows of different lengths and break the 2-D slicing below.
    rows = [
        line.split(",")
        for line in lines[marker + 1:]
        if line and not line.startswith("%")
    ]
    if not rows:
        raise ValueError("no data rows after '@data' in " + path)

    data = np.array(rows)
    X_train, y_train = data[:, :-1], data[:, -1]

    return X_train, y_train

In [7]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import sklearn
from IPython.core.display import display


from imblearn.datasets import fetch_datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging

from imblearn.metrics import geometric_mean_score
from multi_imbalance.datasets._data_loader import load_datasets
from sklearn.datasets.base import Bunch

# Seed NumPy's global random number generator before any model is built.
np.random.seed(0)

# Load every file of the benchmark directory as a (features, labels) pair.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the order of the printed names and of the result rows stable.
datasets = OrderedDict()
directory = "./benchmarks/mrbbagging/data/"
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    X, y = read_data(directory, filename)
    datasets[filename] = Bunch(data=X, target=y, DESCR=filename)
results_g_mean = dict()

for dataset_name, dataset_values in datasets.items():
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target

    # read_data() returns the labels as strings; map them to integer codes.
    le = LabelEncoder()
    y = le.fit_transform(y)
    results_g_mean[dataset_name] = dict()
    skf = StratifiedKFold(n_splits=5)
    omrbbagging_means, mrbbagging_means, bagging_means, tree_means = [], [], [], []

    # NOTE(review): the first MRBBagging argument (50) presumably is the
    # ensemble size, mirroring n_estimators=50 of the bagging baseline - confirm.
    mrbbagging = MRBBagging(50, tree.DecisionTreeClassifier())
    decision_tree_classifier = tree.DecisionTreeClassifier()
    omrbbagging = MRBBagging(50, tree.DecisionTreeClassifier(), undersampling = False)
    bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier, n_estimators=50)

    for train_index, test_index in skf.split(X, y):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]

        # The single-tree baseline gets a fresh estimator on every fold.
        decision_tree = tree.DecisionTreeClassifier()

        # Each model paired with the list collecting its per-fold scores.
        fold_models = [
            (mrbbagging, mrbbagging_means),
            (omrbbagging, omrbbagging_means),
            (bagging, bagging_means),
            (decision_tree, tree_means),
        ]

        # Fit all models first, in this fixed order, then score them.
        for model, _ in fold_models:
            model.fit(train_X, train_y)

        for model, fold_scores in fold_models:
            predicted = np.array(model.predict(test_X))
            fold_scores.append(geometric_mean_score(test_y, predicted, correction=0.01))

    # Average the per-fold G-mean of every model for this dataset.
    results_g_mean[dataset_name]["UMRBBgging"] = np.asarray(mrbbagging_means).mean()
    results_g_mean[dataset_name]["OMRBBgging"] = np.asarray(omrbbagging_means).mean()
    results_g_mean[dataset_name]["Bagging"] = np.asarray(bagging_means).mean()
    results_g_mean[dataset_name]["Tree"] = np.asarray(tree_means).mean()

# One row per dataset, one column per model.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
df.to_csv("./mrbbagging_results")
display(df)


cleveland-sm.arff


cleveland.arff


ecoli.arff


ecoliimUpp.arff


glass.arff


glass37.arff


new-thyroid.arff


thyroid.arff


vehicle.arff


yeast-me2-me3.arff


yeast-sm.arff


yeast.arff


'G-MEAN'

Unnamed: 0,cleveland-sm.arff,cleveland.arff,ecoli.arff,ecoliimUpp.arff,glass.arff,glass37.arff,new-thyroid.arff,thyroid.arff,vehicle.arff,yeast-me2-me3.arff,yeast-sm.arff,yeast.arff
UMRBBgging,0.291408,0.266973,0.812153,0.775065,0.680074,0.791637,0.923435,0.98524,0.700117,0.839618,0.486769,0.492058
OMRBBgging,0.338391,0.135417,0.778688,0.753166,0.671456,0.665075,0.912214,0.973431,0.701354,0.701334,0.375448,0.454912
Bagging,0.262546,0.150828,0.781659,0.646945,0.566181,0.37875,0.908349,0.935807,0.718078,0.523958,0.315928,0.234905
Tree,0.324042,0.199457,0.695989,0.628147,0.388908,0.631354,0.895354,0.92746,0.682335,0.496972,0.319057,0.192241


In [9]:
# Save the transposed results table. The transpose is written directly and
# `df` itself is left untouched: rebinding `df` to its own transpose would flip
# the table once more on every re-run of this cell, so the orientation of the
# saved file would depend on how often the cell had been executed.
df.T.to_csv("./mrbbagging_results")

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


def read_data(directory, filename):
    """Read the data section of an ARFF-style text file as string arrays.

    Every line up to and including the ``@data`` marker is skipped. Each
    remaining non-empty, non-comment line is split on commas; the last field
    of a row is treated as the label and the preceding fields as features.
    Values are NOT converted: both returned arrays hold strings.

    Parameters
    ----------
    directory : str
        Path prefix. It is concatenated with ``filename`` as-is, so it has to
        end with a path separator.
    filename : str
        Name of the file below ``directory``.

    Returns
    -------
    X_train : numpy.ndarray
        2-D array with all columns except the last one.
    y_train : numpy.ndarray
        1-D array with the last column.

    Raises
    ------
    ValueError
        If the file has no ``@data`` marker or no rows after it.
    """
    path = directory + filename
    with open(path) as f:
        lines = [line.strip() for line in f]

    # ARFF keywords are case-insensitive, so "@DATA" must be accepted too.
    marker = next(
        (pos for pos, line in enumerate(lines) if line.lower() == "@data"),
        None,
    )
    if marker is None:
        raise ValueError("no '@data' section found in " + path)

    # Blank lines and '%' comment lines carry no values; keeping them would
    # yield rows of different lengths and break the 2-D slicing below.
    rows = [
        line.split(",")
        for line in lines[marker + 1:]
        if line and not line.startswith("%")
    ]
    if not rows:
        raise ValueError("no data rows after '@data' in " + path)

    data = np.array(rows)
    X_train, y_train = data[:, :-1], data[:, -1]

    return X_train, y_train


def preprocess_dataset(data):
    """Turn a raw table into a feature matrix and integer-encoded labels.

    The last column of ``data`` is taken as the target and encoded with
    ``LabelEncoder``. In the remaining columns, object-dtype columns are
    treated as categorical: the byte string ``b'?'`` is regarded as a missing
    value, missing entries are filled with the column's first mode, and the
    columns are one-hot encoded. All other columns have their missing entries
    filled with the column mean.

    Parameters
    ----------
    data : array-like
        Anything accepted by ``pandas.DataFrame``; the target must be the
        last column.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix after imputation and one-hot encoding.
    y : numpy.ndarray
        Encoded labels as returned by ``LabelEncoder.fit_transform``.
    """
    df = pd.DataFrame(data)

    # pop() removes the target column from the frame, leaving only features.
    y_index = len(df.columns) - 1
    y = df.pop(df.columns[y_index])
    y = LabelEncoder().fit_transform(y)

    categorical_feature_mask = df.dtypes == object
    categorical_cols = df.columns[categorical_feature_mask].tolist()
    non_categorical_cols = df.columns[~categorical_feature_mask].tolist()

    if categorical_cols:
        # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
        df[categorical_cols] = df[categorical_cols].replace({b'?': np.nan})
        # First row of mode() holds one most-frequent value per column.
        mode = df.mode().iloc[0]
        df[categorical_cols] = df.filter(categorical_cols).fillna(mode)

    if non_categorical_cols:
        mean = df.filter(non_categorical_cols).mean()
        df[non_categorical_cols] = df.filter(non_categorical_cols).fillna(mean)

    X = pd.get_dummies(df, columns=categorical_cols)
    return X.to_numpy(), y

In [4]:
import os
import random
import warnings
from collections import OrderedDict, Counter

import numpy as np
import pandas as pd
import sklearn
from IPython.core.display import display


from imblearn.datasets import fetch_datasets
from scipy.io import arff

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging

from imblearn.metrics import geometric_mean_score
from multi_imbalance.datasets._data_loader import load_datasets
from sklearn.datasets.base import Bunch
from numpy import genfromtxt

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Seed both global generators: Python's `random` and NumPy's. scikit-learn
# estimators created without a random_state draw from NumPy's global RNG, so
# seeding `random` alone does not make the cross-validation runs repeatable.
random.seed(0)
np.random.seed(0)

def read_data_fs(directory, filename):
    """Load an ARFF file through scipy and return only its records.

    ``arff.loadarff`` yields a ``(data, meta)`` pair; the metadata is dropped.
    The path is ``directory`` and ``filename`` concatenated as-is.
    """
    arff_path = directory + filename
    records = arff.loadarff(arff_path)[0]
    return records


def get_prediction(classifier, train_X, train_y, test_X, test_y):
    """Fit ``classifier`` on the training split and score it on the test split.

    The classifier is fitted in place, its predictions for ``test_X`` are
    converted to a NumPy array, and the value returned is
    ``geometric_mean_score(test_y, predictions, correction=0.01)``.
    """
    classifier.fit(train_X, train_y)
    predicted_labels = np.array(classifier.predict(test_X))
    score = geometric_mean_score(test_y, predicted_labels, correction=0.01)
    return score


# Files that are loaded with read_data() (raw string columns) instead of
# read_data_fs() + preprocess_dataset(); their labels are encoded in the
# evaluation loop below.
raw_text_datasets = ["vehicle.arff", "cleveland.arff"]

# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the order of the printed names and of the result rows stable.
datasets = OrderedDict()
directory = "./benchmarks/mrbbagging/data_fs/"
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    # Match on the extension; a plain substring test would also accept names
    # that merely contain "arff" somewhere.
    if not filename.endswith(".arff"):
        continue
    if filename not in raw_text_datasets:
        data = read_data_fs(directory, filename)
        X, y = preprocess_dataset(data)
    else:
        X, y = read_data(directory, filename)
    datasets[filename] = Bunch(data=X, target=y, DESCR=filename)
results_g_mean = dict()

# The hepatitis data is a plain comma-separated file, loaded separately.
# NOTE(review): preprocess_dataset() takes the LAST column as the target -
# confirm that this is where hepatitis.data stores its class label.
hepatitis = genfromtxt('./benchmarks/mrbbagging/data_fs/hepatitis.data', delimiter=',')

hepatitis_x, hepatitis_y = preprocess_dataset(hepatitis)

datasets["hepatitis"] = Bunch(data=hepatitis_x, target=hepatitis_y)


for dataset_name, dataset_values in datasets.items():
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target

    # Only the datasets read with read_data() still carry string labels.
    if dataset_name in raw_text_datasets:
        le = LabelEncoder()
        y = le.fit_transform(y)

    results_g_mean[dataset_name] = dict()
    skf = StratifiedKFold(n_splits=10)

    # Per-fold G-mean scores, one list per evaluated model.
    umrbbagging_fs_means, mrbbagging_means, bagging_means, tree_means, all_random_means = [], [], [], [], []
    all_random_umrbbagging_sqrt_means, umrbbagging_fs_sqrt_means = [], []

    # NOTE(review): the first MRBBagging argument (30) presumably is the
    # ensemble size, mirroring n_estimators=30 of the bagging baseline - confirm.
    mrbbagging = MRBBagging(30, tree.DecisionTreeClassifier())
    decision_tree_classifier = tree.DecisionTreeClassifier()
    umrbbagging_fs = MRBBagging(30, tree.DecisionTreeClassifier(), undersampling=True, feature_selection=True)
    all_random_umrbbagging = MRBBagging(30, tree.DecisionTreeClassifier(),
                                        undersampling=True, feature_selection=True, all_random=True)
    umrbbagging_fs_sqrt = MRBBagging(30, tree.DecisionTreeClassifier(), undersampling=True,
                                     feature_selection=True, half_features=False)
    all_random_umrbbagging_sqrt = MRBBagging(30, tree.DecisionTreeClassifier(), undersampling=True, feature_selection=True,
                                             all_random =True, half_features=False)
    bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier, n_estimators=30)

    for train_index, test_index in skf.split(X, y):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]

        # Every model is refitted on the fold's training part and scored on
        # its test part.
        mrbbagging_means.append(get_prediction(mrbbagging, train_X, train_y, test_X, test_y))
        umrbbagging_fs_means.append(get_prediction(umrbbagging_fs, train_X, train_y, test_X, test_y))
        bagging_means.append(get_prediction(bagging, train_X,train_y, test_X, test_y))
        tree_means.append(get_prediction(decision_tree_classifier, train_X, train_y, test_X, test_y))
        all_random_means.append(get_prediction(all_random_umrbbagging, train_X, train_y, test_X, test_y))
        all_random_umrbbagging_sqrt_means.append(get_prediction(all_random_umrbbagging_sqrt, train_X, train_y, test_X, test_y))
        umrbbagging_fs_sqrt_means.append(get_prediction(umrbbagging_fs_sqrt,train_X, train_y, test_X, test_y))

    # Average the per-fold scores; insertion order fixes the column order of
    # the results table.
    results_g_mean[dataset_name]["uMRBBgging"] = np.asarray(mrbbagging_means).mean()

    results_g_mean[dataset_name]["uMRBBagging + FS 50%"] = np.asarray(umrbbagging_fs_means).mean()

    results_g_mean[dataset_name]["uMRBBagging + All random 50%"] = np.asarray(all_random_means).mean()
    results_g_mean[dataset_name]["uMRBBagging + FS sqrt"] = np.asarray(umrbbagging_fs_sqrt_means).mean()
    results_g_mean[dataset_name]["uMRBBagging + All random sqrt"] = np.asarray(all_random_umrbbagging_sqrt_means).mean()

    results_g_mean[dataset_name]["Bagging"] = np.asarray(bagging_means).mean()

    results_g_mean[dataset_name]["Tree"] = np.asarray(tree_means).mean()

# One row per dataset, one column per model.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
df.to_csv("./mrbbagging_results")
display(df)


abdominal_pain.arff


cleveland.arff


credit-g.arff


scrotal_pain.arff


vehicle.arff


hepatitis


'G-MEAN'

Unnamed: 0,uMRBBgging,uMRBBagging + FS 50%,uMRBBagging + All random 50%,uMRBBagging + FS sqrt,uMRBBagging + All random sqrt,Bagging,Tree
abdominal_pain.arff,0.795822,0.813956,0.820663,0.813821,0.845172,0.793706,0.776824
cleveland.arff,0.132067,0.118641,0.166136,0.12935,0.105572,0.090807,0.132281
credit-g.arff,0.721796,0.71291,0.704812,0.687719,0.696327,0.628911,0.602853
scrotal_pain.arff,0.754145,0.763116,0.7632,0.732072,0.695449,0.734856,0.725562
vehicle.arff,0.711543,0.691162,0.728724,0.639588,0.697312,0.698009,0.659123
hepatitis,0.571506,0.614541,0.641805,0.590059,0.611497,0.569158,0.552441
