In [17]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import tree

from multi_imbalance.ensemble.mrbbagging import MRBBagging
from multi_imbalance.utils.data import construct_flat_2pc_df

%matplotlib inline

sns.set_style('darkgrid')


# TODO replace it by correct file in repository
ecoli_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data'
df = pd.read_csv(ecoli_url, delim_whitespace=True, header=None,
                 names=['name', '1', '2', '3', '4', '5', '6', '7', 'class'])

df = df.sample(frac=1)

X, y = df.iloc[:, 1:8].to_numpy(), df['class'].to_numpy()
print(df)

            name     1     2     3    4    5     6     7 class
RL4A_YEAST  0.56  0.35  0.47  0.41  0.5  0.0  0.50  0.22   CYT
YBI8_YEAST  0.45  0.50  0.43  0.16  0.5  0.0  0.52  0.28   ME3
SKN1_YEAST  0.38  0.42  0.31  0.35  0.5  0.0  0.52  0.22   ME3
RL12_YEAST  0.59  0.51  0.48  0.12  0.5  0.0  0.54  0.31   CYT
USO1_YEAST  0.50  0.47  0.45  0.14  0.5  0.0  0.53  0.32   ME3
...          ...   ...   ...   ...  ...  ...   ...   ...   ...
NIN1_YEAST  0.47  0.50  0.55  0.19  0.5  0.0  0.62  0.22   CYT
FU34_YEAST  0.37  0.34  0.32  0.22  0.5  0.0  0.56  0.22   ME3
PTP1_YEAST  0.53  0.45  0.53  0.18  0.5  0.0  0.42  0.22   CYT
ACP_YEAST   0.52  0.53  0.58  0.69  0.5  0.0  0.50  0.22   MIT
S160_YEAST  0.46  0.49  0.55  0.18  0.5  0.0  0.56  0.22   NUC

[1484 rows x 9 columns]


In [19]:
import sklearn.ensemble

mrbbagging = MRBBagging()
decision_tree_classifier = tree.DecisionTreeClassifier()
tree_classifiers = dict()
for i in range(30):
    tree_classifiers[i] = tree.DecisionTreeClassifier()
bagging = sklearn.ensemble.BaggingClassifier(base_estimator=decision_tree_classifier, n_estimators=30)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [24]:
import numpy as np
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=4)
skf.get_n_splits(X, y)
mrbbagging_means = []
bagging_means = []
tree_means = []

for train_index, test_index in skf.split(X, y):
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]

    mrbbagging.fit(train_X, train_y, len(train_X), 30, tree_classifiers)
    bagging.fit(train_X, train_y)
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(train_X, train_y)
    
    prediction_mrbbagging = mrbbagging.predict(test_X)
    prediction_bagging = bagging.predict(test_X)
    prediction_tree = decision_tree.predict(test_X)

    ypred_mrbbagging = np.array(prediction_mrbbagging)
    mrbbagging_means.append(geometric_mean_score(test_y, ypred_mrbbagging, correction=0.01))

    ypred_bagging = np.array(prediction_bagging)
    bagging_means.append(geometric_mean_score(test_y, ypred_bagging, correction=0.01))

    ypred_tree = np.array(prediction_tree)
    tree_means.append(geometric_mean_score(test_y, ypred_tree, correction=0.01))

print("MRBBagging mean of geometric median scores: ")
print(np.asarray(mrbbagging_means).mean())
print("Bagging mean of geometric median scores: ")
print(np.asarray(bagging_means).mean())
print("Tree mean of geometric median scores: ")
print(np.asarray(tree_means).mean())


MRBBagging mean of geometric median scores: 
0.43058505648811196
Bagging mean of geometric median scores: 
0.23026223549909342
Tree mean of geometric median scores: 
0.1751724367698185
