In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging
from multi_imbalance.utils.data import construct_flat_2pc_df

%matplotlib inline

sns.set_style('darkgrid')


data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data'
df = pd.read_csv(data, delim_whitespace=True, header=None,
                 names=['name', '1', '2', '3', '4', '5', '6', '7', 'class'])

df = df.sample(frac=1)

X, y = df.iloc[:, 1:8].to_numpy(), df['class'].to_numpy()
print(df)

            name     1     2     3    4    5     6     7 class
POP2_YEAST  0.50  0.45  0.53  0.21  0.5  0.0  0.28  0.22   NUC
PAP_YEAST   0.64  0.52  0.50  0.26  0.5  0.0  0.51  0.31   NUC
CBP3_YEAST  0.47  0.47  0.57  0.48  0.5  0.0  0.48  0.22   MIT
PT22_YEAST  0.56  0.68  0.52  0.39  0.5  0.0  0.36  0.22   MIT
PR39_YEAST  0.32  0.48  0.49  0.17  0.5  0.0  0.47  0.22   NUC
...          ...   ...   ...   ...  ...  ...   ...   ...   ...
TPS3_YEAST  0.40  0.44  0.50  0.13  0.5  0.0  0.52  0.39   CYT
IF2G_YEAST  0.51  0.35  0.46  0.15  0.5  0.0  0.51  0.42   CYT
EST1_YEAST  0.35  0.60  0.47  0.13  0.5  0.0  0.52  0.43   NUC
ST12_YEAST  0.38  0.51  0.56  0.21  0.5  0.0  0.48  0.31   NUC
PRC6_YEAST  0.53  0.43  0.57  0.24  0.5  0.0  0.39  0.28   CYT

[1484 rows x 9 columns]


In [3]:
import sklearn.ensemble

mrbbagging = MRBBagging()
decision_tree_classifier = tree.DecisionTreeClassifier()
tree_classifiers = dict()
for i in range(30):
    tree_classifiers[i] = tree.DecisionTreeClassifier()
bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier, n_estimators=30)

In [5]:
import numpy as np
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=4)
skf.get_n_splits(X, y)
mrbbagging_means = []
bagging_means = []
tree_means = []

for train_index, test_index in skf.split(X, y):
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]

    mrbbagging.fit(train_X, train_y, tree_classifiers)
    bagging.fit(train_X, train_y)
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(train_X, train_y)
    
    prediction_mrbbagging = mrbbagging.predict(test_X)
    prediction_bagging = bagging.predict(test_X)
    prediction_tree = decision_tree.predict(test_X)

    ypred_mrbbagging = np.array(prediction_mrbbagging)
    mrbbagging_means.append(geometric_mean_score(test_y, ypred_mrbbagging, correction=0.01))

    ypred_bagging = np.array(prediction_bagging)
    bagging_means.append(geometric_mean_score(test_y, ypred_bagging, correction=0.01))

    ypred_tree = np.array(prediction_tree)
    tree_means.append(geometric_mean_score(test_y, ypred_tree, correction=0.01))

print("MRBBagging mean of geometric median scores: ")
print(np.asarray(mrbbagging_means).mean())
print("Bagging mean of geometric median scores: ")
print(np.asarray(bagging_means).mean())
print("Tree mean of geometric median scores: ")
print(np.asarray(tree_means).mean())


MRBBagging mean of geometric median scores: 
0.42860356419772105
Bagging mean of geometric median scores: 
0.19222089566616907
Tree mean of geometric median scores: 
0.16437837366840433
