In [20]:
def read_data(directory, filename):
    """Load the data section of a comma-separated ARFF file as string arrays.

    Every line up to and including the ``@data`` marker is skipped. Each
    remaining line that is neither blank nor a ``%`` comment is split on
    commas, and the last field of every row is treated as the target.

    Parameters
    ----------
    directory : str
        Path prefix. It is concatenated with ``filename`` as-is, so it must
        already end with a path separator.
    filename : str
        Name of the file inside ``directory``.

    Returns
    -------
    X_train : numpy.ndarray
        2-D array holding every column except the last. Values are kept as
        strings; no numeric conversion is done here.
    y_train : numpy.ndarray
        1-D array holding the last column, also as strings.

    Raises
    ------
    ValueError
        If the file has no ``@data`` marker or no rows after it.
    """
    with open(directory + filename) as f:
        lines = [line.strip() for line in f]

    # ARFF keywords are case-insensitive, so accept "@DATA" as well as "@data".
    data_start = None
    for position, line in enumerate(lines):
        if line.lower() == '@data':
            data_start = position + 1
            break
    if data_start is None:
        raise ValueError("no @data section found in " + directory + filename)

    # Blank lines and '%' comment lines would otherwise become ragged rows.
    rows = [
        [field.strip() for field in line.split(",")]
        for line in lines[data_start:]
        if line and not line.startswith('%')
    ]
    if not rows:
        raise ValueError("no data rows found in " + directory + filename)

    data = np.array(rows)
    X_train, y_train = data[:, :-1], data[:, -1]

    return X_train, y_train

In [18]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import sklearn
from IPython.core.display import display


from imblearn.datasets import fetch_datasets

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging

from imblearn.metrics import geometric_mean_score
from multi_imbalance.datasets._data_loader import load_datasets
from sklearn.datasets.base import Bunch

# Seed NumPy's global RNG up front so repeated runs draw the same numbers.
np.random.seed(0)

# Load every ARFF benchmark file. os.listdir() returns entries in arbitrary
# order, so the listing is sorted to keep the run order (and therefore the
# sequence of random draws) identical on every machine. Anything that is not
# an .arff file is skipped instead of being handed to the parser.
datasets = OrderedDict()
directory = "./benchmarks/mrbbagging/data/"
for filename in sorted(os.listdir(directory)):
    if not filename.lower().endswith(".arff"):
        continue
    X, y = read_data(directory, filename)
    datasets[filename] = Bunch(data=X, target=y, DESCR=filename)

# Result column names, in the order they appear in the final table.
classifier_names = ["UMRBBgging", "OMRBBgging", "Bagging", "Tree"]
results_g_mean = dict()

for dataset_name, dataset_values in datasets.items():
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target

    # Class labels are read as strings; map them to integer codes.
    y = LabelEncoder().fit_transform(y)

    skf = StratifiedKFold(n_splits=5)

    mrbbagging = MRBBagging(100, tree.DecisionTreeClassifier())
    decision_tree_classifier = tree.DecisionTreeClassifier()
    omrbbagging = MRBBagging(100, tree.DecisionTreeClassifier(), undersampling=False)
    bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier, n_estimators=100)

    # One list of per-fold G-mean scores for each result column.
    fold_scores = {name: [] for name in classifier_names}

    for train_index, test_index in skf.split(X, y):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]

        # The three ensembles are refit on every fold; the single tree is
        # created anew for each fold.
        fold_classifiers = [
            ("UMRBBgging", mrbbagging),
            ("OMRBBgging", omrbbagging),
            ("Bagging", bagging),
            ("Tree", tree.DecisionTreeClassifier()),
        ]

        # Fit all classifiers first and predict afterwards, in this fixed
        # order, so the global RNG is consumed in the same sequence each run.
        for _, classifier in fold_classifiers:
            classifier.fit(train_X, train_y)
        for name, classifier in fold_classifiers:
            prediction = np.array(classifier.predict(test_X))
            fold_scores[name].append(geometric_mean_score(test_y, prediction, correction=0.01))

    # Average the per-fold scores into one number per classifier.
    results_g_mean[dataset_name] = {
        name: np.asarray(fold_scores[name]).mean() for name in classifier_names
    }

# One row per dataset, one column per classifier.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
df.to_csv("./mrbbagging_results")
display(df)


cleveland-sm.arff


cleveland.arff


ecoli.arff


ecoliimUpp.arff


glass.arff


glass37.arff


new-thyroid.arff


thyroid.arff


vehicle.arff


yeast-me2-me3.arff


yeast-sm.arff


yeast.arff


'G-MEAN'

Unnamed: 0,UMRBBgging,OMRBBgging,Bagging,Tree
cleveland-sm.arff,0.350256,0.294053,0.267132,0.271424
cleveland.arff,0.194268,0.150758,0.109574,0.149635
ecoli.arff,0.776484,0.763488,0.762315,0.69089
ecoliimUpp.arff,0.810825,0.742094,0.637851,0.629918
glass.arff,0.680015,0.680238,0.495077,0.349969
glass37.arff,0.696252,0.688583,0.37875,0.603165
new-thyroid.arff,0.931386,0.904289,0.905013,0.909126
thyroid.arff,0.985752,0.973887,0.935351,0.927858
vehicle.arff,0.704071,0.702581,0.701265,0.688747
yeast-me2-me3.arff,0.846731,0.694054,0.557151,0.553868


In [8]:
# Write the results table to disk as CSV. `df` is not defined in this cell,
# so the benchmark cell that builds it has to be run first.
df.to_csv(path_or_buf="./mrbbagging_results")

In [1]:
def read_data(directory, filename):
    """Load the data section of a comma-separated ARFF file as string arrays.

    Every line up to and including the ``@data`` marker is skipped. Each
    remaining line that is neither blank nor a ``%`` comment is split on
    commas, and the last field of every row is treated as the target.

    Parameters
    ----------
    directory : str
        Path prefix. It is concatenated with ``filename`` as-is, so it must
        already end with a path separator.
    filename : str
        Name of the file inside ``directory``.

    Returns
    -------
    X_train : numpy.ndarray
        2-D array holding every column except the last. Values are kept as
        strings; no numeric conversion is done here.
    y_train : numpy.ndarray
        1-D array holding the last column, also as strings.

    Raises
    ------
    ValueError
        If the file has no ``@data`` marker or no rows after it.
    """
    with open(directory + filename) as f:
        lines = [line.strip() for line in f]

    # ARFF keywords are case-insensitive, so accept "@DATA" as well as "@data".
    data_start = None
    for position, line in enumerate(lines):
        if line.lower() == '@data':
            data_start = position + 1
            break
    if data_start is None:
        raise ValueError("no @data section found in " + directory + filename)

    # Blank lines and '%' comment lines would otherwise become ragged rows.
    rows = [
        [field.strip() for field in line.split(",")]
        for line in lines[data_start:]
        if line and not line.startswith('%')
    ]
    if not rows:
        raise ValueError("no data rows found in " + directory + filename)

    data = np.array(rows)
    X_train, y_train = data[:, :-1], data[:, -1]

    return X_train, y_train


def preprocess_dataset(data):
    """Split a raw table into an imputed, one-hot encoded X and integer labels y.

    The last column of ``data`` is taken as the class label and encoded with
    ``LabelEncoder``. Of the remaining columns, those with ``object`` dtype
    are treated as categorical and all others as numeric:

    * categorical: ``'?'`` / ``b'?'`` markers become missing values, missing
      values are filled with the column mode, and the column is then one-hot
      encoded;
    * numeric: missing values are filled with the column mean.

    Parameters
    ----------
    data : array-like
        Anything accepted by ``pandas.DataFrame``; one row per sample.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix after imputation and one-hot encoding.
    y : numpy.ndarray
        Integer-encoded class labels.
    """
    df = pd.DataFrame(data)

    # The class label lives in the last column.
    y = df.pop(df.columns[-1])
    y = LabelEncoder().fit_transform(y)

    categorical_feature_mask = df.dtypes == object
    categorical_cols = df.columns[categorical_feature_mask].tolist()
    non_categorical_cols = df.columns[~categorical_feature_mask].tolist()

    if categorical_cols:
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0. Both the bytes and the str form of the marker are handled.
        categorical = df[categorical_cols].replace({b'?': np.nan, '?': np.nan})
        # mode() can return several rows on ties; row 0 holds the first mode.
        df[categorical_cols] = categorical.fillna(categorical.mode().iloc[0])

    if non_categorical_cols:
        numeric = df[non_categorical_cols]
        df[non_categorical_cols] = numeric.fillna(numeric.mean())

    # With nothing to encode the frame is already the final feature table.
    X = pd.get_dummies(df, columns=categorical_cols) if categorical_cols else df

    return X.to_numpy(), y

In [2]:
import os
import random
import warnings
from collections import OrderedDict, Counter

import numpy as np
import pandas as pd
import sklearn
from IPython.core.display import display


from imblearn.datasets import fetch_datasets
from scipy.io import arff

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging

from imblearn.metrics import geometric_mean_score
from multi_imbalance.datasets._data_loader import load_datasets
from sklearn.datasets.base import Bunch
from numpy import genfromtxt

# Silence all warnings for the rest of the session so the benchmark output
# stays readable.
warnings.filterwarnings("ignore")

# Seed both global generators. random.seed() only affects Python's `random`
# module; scikit-learn estimators created without an explicit random_state
# draw from NumPy's global generator, which has to be seeded separately for
# the run to be repeatable.
random.seed(0)
np.random.seed(0)


def read_data_fs(directory, filename):
    """Read an ARFF file with SciPy and return only its records.

    ``directory`` and ``filename`` are concatenated as-is to form the path.
    ``arff.loadarff`` yields a ``(data, meta)`` pair; the metadata is dropped.
    """
    arff_path = directory + filename
    return arff.loadarff(arff_path)[0]


def get_prediction(classifier, train_X, train_y, test_X, test_y):
    """Fit ``classifier`` on the training split and score it on the test split.

    Despite the name, the return value is a score, not the predictions: the
    result of ``geometric_mean_score(test_y, predictions, correction=0.01)``.
    ``classifier`` is fitted in place and must provide ``fit`` and ``predict``.
    """
    classifier.fit(train_X, train_y)
    predicted_labels = np.array(classifier.predict(test_X))
    g_mean = geometric_mean_score(test_y, predicted_labels, correction=0.01)
    return g_mean


# Load the ARFF benchmark files. os.listdir() returns entries in arbitrary
# order, so the listing is sorted to keep the run order the same everywhere.
# The directory also holds .data files, so only names ending in .arff are
# passed to the ARFF reader.
datasets = OrderedDict()
directory = "./benchmarks/mrbbagging/data_fs/"
for filename in sorted(os.listdir(directory)):
    if not filename.lower().endswith(".arff"):
        continue
    X, y = read_data(directory, filename)
    datasets[filename] = Bunch(data=X, target=y, DESCR=filename)

# Add the plain comma-separated datasets.
# NOTE(review): preprocess_dataset() takes the LAST column as the class label.
# Confirm that each of these .data files stores its class there; if a file
# keeps the class in the first column, the wrong column is used as the target.
for csv_name in ("hepatitis", "dermatology", "lymphography"):
    raw_table = genfromtxt(directory + csv_name + ".data", delimiter=',')
    features, labels = preprocess_dataset(raw_table)
    datasets[csv_name] = Bunch(data=features, target=labels)

# Result column names in the order they appear in the final table.
report_order = [
    "uMRBBgging",
    "uMRBBagging + FS 50%",
    "uMRBBagging + All random 50%",
    "uMRBBagging + FS sqrt",
    "uMRBBagging + All random sqrt",
    "Bagging",
    "Tree",
]
results_g_mean = dict()

for dataset_name, dataset_values in datasets.items():
    print(dataset_name)
    X, y = dataset_values.data, dataset_values.target

    # Map class labels to integer codes.
    y = LabelEncoder().fit_transform(y)

    skf = StratifiedKFold(n_splits=5)

    mrbbagging = MRBBagging(150, tree.DecisionTreeClassifier())
    decision_tree_classifier = tree.DecisionTreeClassifier()
    umrbbagging_fs = MRBBagging(50, tree.DecisionTreeClassifier(), undersampling=True, feature_selection=True)
    all_random_umrbbagging = MRBBagging(50, tree.DecisionTreeClassifier(),
                                        undersampling=True, feature_selection=True, random_fs=True)
    umrbbagging_fs_sqrt = MRBBagging(50, tree.DecisionTreeClassifier(), undersampling=True,
                                     feature_selection=True, half_features=False)
    all_random_umrbbagging_sqrt = MRBBagging(50, tree.DecisionTreeClassifier(), undersampling=True,
                                             feature_selection=True, random_fs=True, half_features=False)
    bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier, n_estimators=150)

    # (result column, classifier) pairs listed in evaluation order. That order
    # differs from the report order and is kept fixed so the classifiers
    # consume random numbers in the same sequence on every run.
    evaluation_order = [
        ("uMRBBgging", mrbbagging),
        ("uMRBBagging + FS 50%", umrbbagging_fs),
        ("Bagging", bagging),
        ("Tree", decision_tree_classifier),
        ("uMRBBagging + All random 50%", all_random_umrbbagging),
        ("uMRBBagging + All random sqrt", all_random_umrbbagging_sqrt),
        ("uMRBBagging + FS sqrt", umrbbagging_fs_sqrt),
    ]

    # One list of per-fold G-mean scores for each result column.
    fold_scores = {name: [] for name in report_order}

    for train_index, test_index in skf.split(X, y):
        train_X, test_X = X[train_index], X[test_index]
        train_y, test_y = y[train_index], y[test_index]

        for name, classifier in evaluation_order:
            fold_scores[name].append(get_prediction(classifier, train_X, train_y, test_X, test_y))

    # Average the per-fold scores into one number per classifier.
    results_g_mean[dataset_name] = {
        name: np.asarray(fold_scores[name]).mean() for name in report_order
    }

# One row per dataset, one column per classifier.
display("G-MEAN")
df = pd.DataFrame(results_g_mean).T
df.to_csv("./mrbbagging_results")
display(df)

cleveland.arff


glass.arff


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Hania\Desktop\git\multi-imbalance\venv\lib\site-packages\IPython\core\interactiveshell.py", line 2869, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-3e91ef49630b>", line 105, in <module>
    umrbbagging_fs_sqrt_means.append(get_prediction(umrbbagging_fs_sqrt,train_X, train_y, test_X, test_y))
  File "<ipython-input-2-3e91ef49630b>", line 38, in get_prediction
    prediction = classifier.predict(test_X)
  File "C:\Users\Hania\Desktop\git\multi-imbalance\multi_imbalance\ensemble\mrbbagging.py", line 208, in predict
    return self._select_classes(data)
  File "C:\Users\Hania\Desktop\git\multi-imbalance\multi_imbalance\ensemble\mrbbagging.py", line 194, in _select_classes
    voting_matrix = self._count_votes(data)
  File "C:\Users\Hania\Desktop\git\multi-imbalance\multi_imbalance\ensemble\mrbbagging.py", line 186, in _count_votes
    classes = self.classifiers[classifier_id].predict(new_data)

KeyboardInterrupt: 