Unzip datasets and prepare data:

In [4]:
import os

import seaborn as sns
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from multi_imbalance.datasets import load_datasets
from multi_imbalance.ensemble.soup_bagging import SOUPBagging
from multi_imbalance.utils.data import load_arff_dataset
from multi_imbalance.utils.min_int_maj import maj_int_min

%matplotlib inline
sns.set_style("darkgrid")

In [5]:
# Pick the "new_ecoli" entry out of the datasets bundled with multi_imbalance.
dataset = load_datasets()["new_ecoli"]

# Feature matrix and class labels, as exposed by the dataset object.
X = dataset.data
y = dataset.target

# Preview the first five feature rows and, on the following line(s),
# the first five labels.
print(X[:5], y[:5], sep="\n")

[[0.49 0.29 0.48 0.5  0.56 0.24 0.35]
 [0.07 0.4  0.48 0.5  0.54 0.35 0.44]
 [0.56 0.4  0.48 0.5  0.49 0.37 0.46]
 [0.59 0.49 0.48 0.5  0.52 0.45 0.36]
 [0.23 0.32 0.48 0.5  0.55 0.25 0.35]]
[0 0 0 0 0]


In [6]:
# Hold out 25% of the samples for testing. The split is seeded so that a
# "Restart Kernel -> Run All" reproduces the same train/test partition
# (the same seed value is used for the ShuffleSplit cross-validation later on).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [7]:
# Base learner: k-nearest neighbours with its default settings.
clf = KNeighborsClassifier()

# SOUPBagging ensemble built on the base learner, with 50 classifiers.
# maj_int_min["new_ecoli"] is the library-provided entry for this dataset
# (presumably its majority/intermediate/minority class grouping -- confirm
# against the multi_imbalance documentation).
vote_classifier = SOUPBagging(
    clf,
    maj_int_min=maj_int_min["new_ecoli"],
    n_classifiers=50,
)
vote_classifier.fit(X_train, y_train)

# Score the held-out predictions with the geometric mean of per-class recall.
# NOTE(review): `correction` is expected to replace a zero recall of an
# unrecognised class with 0.001 so the product does not collapse to 0 --
# confirm against the imbalanced-learn documentation.
y_pred = vote_classifier.predict(X_test)
geometric_mean_score(y_test, y_pred, correction=0.001)

0.7550748879971014

In [8]:
# Read the new_ecoli ARFF file, located two directories above the current
# working directory. Note that this rebinds X and y from the earlier cells.
arff_path = f"{os.getcwd()}/../../data/arff/new_ecoli.arff"
X, y = load_arff_dataset(arff_path)

# Standardise the features, then classify with SOUPBagging (default arguments).
clf = make_pipeline(StandardScaler(), SOUPBagging())

# Five random train/test shuffles with 30% held out each time; seeded for
# repeatable splits.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

# One score per split, computed with the pipeline's default scorer.
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)



[0.85148515 0.8019802  0.89108911 0.84158416 0.86138614]
