In [12]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging
from multi_imbalance.utils.data import construct_flat_2pc_df

%matplotlib inline

sns.set_style('darkgrid')


# UCI E. coli data set: whitespace-separated values, no header row, so the
# column names are supplied here.
data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data'
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv(data, sep=r'\s+', header=None,
                 names=['name', '1', '2', '3', '4', '5', '6', '7', 'class'])

# Shuffle the rows with a fixed seed so the row order -- and therefore the
# cross-validation folds built from X and y further down -- is the same on
# every run.
df = df.sample(frac=1, random_state=42)

# Columns at positions 1-7 are the features; 'class' holds the label.
X, y = df.iloc[:, 1:8].to_numpy(), df['class'].to_numpy()
print(df)

           name     1     2     3    4     5     6     7 class
222  NLPA_ECOLI  0.75  0.55  1.00  1.0  0.40  0.47  0.30   imL
219  EMRB_ECOLI  0.71  0.52  0.48  0.5  0.64  1.00  0.99    im
264  OMPA_ECOLI  0.74  0.90  0.48  0.5  0.57  0.53  0.29    om
136  UBIC_ECOLI  0.30  0.37  0.48  0.5  0.43  0.18  0.30    cp
255  TYRP_ECOLI  0.86  0.55  0.48  0.5  0.63  0.81  0.83   imU
..          ...   ...   ...   ...  ...   ...   ...   ...   ...
85   PROB_ECOLI  0.40  0.46  0.48  0.5  0.52  0.49  0.56    cp
185  NANT_ECOLI  0.20  0.46  0.48  0.5  0.57  0.78  0.81    im
184   MTR_ECOLI  0.74  0.70  0.48  0.5  0.66  0.65  0.69    im
310  MALM_ECOLI  0.74  0.47  0.48  0.5  0.50  0.57  0.42    pp
284   AGP_ECOLI  0.74  0.49  0.48  0.5  0.42  0.54  0.36    pp

[336 rows x 9 columns]


In [3]:
import sklearn.ensemble

# Models under comparison. MRBBagging is created with its defaults here and
# receives its settings later, when fit() is called inside the CV loop.
mrbbagging = MRBBagging()
decision_tree_classifier = tree.DecisionTreeClassifier()
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot is the base learner in both.
bagging = sklearn.ensemble.BaggingClassifier(decision_tree_classifier, n_estimators=30)

In [5]:
import numpy as np
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import StratifiedKFold

# Compare the three classifiers with 4-fold stratified cross-validation and
# score every fold with imblearn's geometric_mean_score.
skf = StratifiedKFold(n_splits=4)
mrbbagging_means = []
bagging_means = []
tree_means = []

for train_index, test_index in skf.split(X, y):
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]

    # Refit every model on this fold's training part only.
    # NOTE(review): 30 is presumably the number of base classifiers (it matches
    # the n_estimators=30 given to `bagging`) -- confirm against the installed
    # MRBBagging API.
    mrbbagging.fit(train_X, train_y, 30, tree.DecisionTreeClassifier())
    bagging.fit(train_X, train_y)
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(train_X, train_y)

    # Score each fitted model on the held-out part of the fold and store the
    # result in that model's list.
    for fitted_model, fold_scores in (
        (mrbbagging, mrbbagging_means),
        (bagging, bagging_means),
        (decision_tree, tree_means),
    ):
        predicted = np.asarray(fitted_model.predict(test_X))
        fold_scores.append(geometric_mean_score(test_y, predicted, correction=0.01))

# Report the per-model average over the folds. The metric is the geometric
# *mean* score, so the labels say "mean" rather than "median".
print("MRBBagging mean of geometric mean scores: ")
print(np.asarray(mrbbagging_means).mean())
print("Bagging mean of geometric mean scores: ")
print(np.asarray(bagging_means).mean())
print("Tree mean of geometric mean scores: ")
print(np.asarray(tree_means).mean())


MRBBagging mean of geometric median scores: 
0.47718553514115575
Bagging mean of geometric median scores: 
0.1941281437773299
Tree mean of geometric median scores: 
0.18913395386991894
