# Evaluating on real datasets

In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import cohen_kappa_score
from collections import Counter
import numpy as np
import pandas as pd

# Single seed passed as `random_state` to the splitters, samplers and classifiers below.
seed = 0

In [2]:
from sklearn.datasets import load_iris

# Iris: bundled with scikit-learn; stratified 70/30 train/test partition.
iris = load_iris()
X1 = iris.data
y1 = iris.target
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=seed)
# Five stratified splits are drawn, but only the final one is used downstream,
# so pick it explicitly instead of overwriting variables inside a loop.
train_index, test_index = list(sss.split(X1, y1))[-1]
X1_train, X1_test = X1[train_index], X1[test_index]
y1_train, y1_test = y1[train_index], y1[test_index]

In [3]:
from maatpy.dataset import Dataset

# Yeast: read from CSV through maatpy's Dataset; 'Class' is the target column and
# 'Sequence Name' is passed as `ignore`.
yeast = Dataset()
yeast.load_from_csv('datasets/yeast_data.csv', output_column='Class', ignore='Sequence Name')
X2 = yeast.data
y2 = yeast.target
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=seed)
# Five stratified splits are drawn, but only the final one is used downstream,
# so pick it explicitly instead of overwriting variables inside a loop.
train_index, test_index = list(sss.split(X2, y2))[-1]
X2_train, X2_test = X2[train_index], X2[test_index]
y2_train, y2_test = y2[train_index], y2[test_index]

  from numpy.core.umath_tests import inner1d


In [4]:
# Satimage is read from two separate CSV files (train / test), so no shuffle split
# is applied to it; 'X37' is the target column in both files.
sat_trn, sat_tst = Dataset(), Dataset()
sat_trn.load_from_csv('datasets/sat_train.csv', output_column='X37')
sat_tst.load_from_csv('datasets/sat_tst.csv', output_column='X37')
X3_train, y3_train = sat_trn.data, sat_trn.target
X3_test, y3_test = sat_tst.data, sat_tst.target

In [5]:
# Abalone: read from CSV with 'Rings' as the target column; stratified 70/30 partition.
abalone = Dataset()
abalone.load_from_csv('datasets/abalone_data.csv', output_column='Rings')
X4 = abalone.data
y4 = abalone.target
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=seed)
# Five stratified splits are drawn, but only the final one is used downstream,
# so pick it explicitly instead of overwriting variables inside a loop.
train_index, test_index = list(sss.split(X4, y4))[-1]
X4_train, X4_test = X4[train_index], X4[test_index]
y4_train, y4_test = y4[train_index], y4[test_index]

In [6]:
# Heart: read from CSV with 'X14' as the target column; stratified 70/30 partition.
heart = Dataset()
heart.load_from_csv('datasets/heart_data.csv', output_column='X14')
X5 = heart.data
y5 = heart.target
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=seed)
# Five stratified splits are drawn, but only the final one is used downstream,
# so pick it explicitly instead of overwriting variables inside a loop.
train_index, test_index = list(sss.split(X5, y5))[-1]
X5_train, X5_test = X5[train_index], X5[test_index]
y5_train, y5_test = y5[train_index], y5[test_index]

In [7]:
# Benchmark registry: name -> [X_train, y_train, X_test, y_test, number of attributes].
# The evaluation loops unpack the fifth element as `features` but do not use it.
datasets = {
    'Iris': [X1_train, y1_train, X1_test, y1_test, 4],
    'yeast': [X2_train, y2_train, X2_test, y2_test, 8],
    'satimage': [X3_train, y3_train, X3_test, y3_test, 36],
    'abalone': [X4_train, y4_train, X4_test, y4_test, 8],
    'heart': [X5_train, y5_train, X5_test, y5_test, 13],
}

In [8]:
def _dataset_summary(X, y):
    """Return [#attributes, #samples, #classes, minority-class share in %] for one dataset.

    X: 2-D array exposing ``shape``; only the column count is read.
    y: 1-D sequence of hashable class labels.
    """
    n_samples = len(y)
    # The rarest class is the one with the smallest count.
    rarest_count = min(Counter(y).values())
    minority_share = round((rarest_count / float(n_samples)) * 100, 3)
    return [X.shape[1], n_samples, len(np.unique(y)), minority_share]


# Satimage is loaded as separate train/test files, so its label statistics are taken
# over both parts together; concatenate once instead of once per statistic.
y3_all = np.concatenate([y3_train, y3_test])

info = {'Iris': _dataset_summary(X1, y1),
        'Yeast': _dataset_summary(X2, y2),
        'Satimage': _dataset_summary(X3_train, y3_all),
        'Abalone': _dataset_summary(X4, y4),
        'Heart': _dataset_summary(X5, y5)}


In [9]:
# One row per dataset, displayed in alphabetical order of the dataset name.
df = pd.DataFrame.from_dict(
    info,
    orient='index',
    columns=['#Attributes', '#Samples', '#Classes', '%Minority class'],
)
df.sort_index()

Unnamed: 0,#Attributes,#Samples,#Classes,%Minority class
Abalone,8,4098,16,0.781
Heart,13,270,2,44.444
Iris,4,150,3,33.333
Satimage,36,6435,6,9.728
Yeast,8,1484,10,0.337


## AdaCost

In [10]:
from sklearn.ensemble import AdaBoostClassifier
from maatpy.classifiers import AdaCost

# Cohen's kappa on the held-out split: plain AdaBoost first, then each AdaCost variant.
results = {}
for dataset, (X_train, y_train, X_test, y_test, features) in datasets.items():
    dataset_scores = results[dataset] = {}

    baseline = AdaBoostClassifier(random_state=seed)
    baseline.fit(X_train, y_train)
    dataset_scores['adaboost'] = cohen_kappa_score(y_test, baseline.predict(X_test))

    for variant in ('adacost', 'adac1', 'adac2', 'adac3'):
        booster = AdaCost(algorithm=variant, random_state=seed)
        booster.fit(X_train, y_train)
        dataset_scores[variant] = cohen_kappa_score(y_test, booster.predict(X_test))

  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  np.sum(sample_weight))
  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_weight > 0) | (estimator_weight < 0)))
  ((sample_we

In [11]:
# Kappa table for the AdaCost comparison: one column per dataset, one row per classifier.
df1 = pd.DataFrame(results)
df1

Unnamed: 0,Iris,yeast,satimage,abalone,heart
adaboost,0.866667,0.258505,0.639031,0.0627,0.526316
adac1,0.933333,0.0,0.10335,0.011134,0.464865
adac2,0.933333,0.004627,0.086508,0.0,0.0
adac3,0.5,0.004627,0.20414,0.012779,0.0
adacost,0.933333,0.004627,0.10335,0.011134,0.628099


## BalancedRandomForestClassifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from maatpy.classifiers import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline

# Cohen's kappa for three forests per dataset: plain, maatpy's balanced variant,
# and random under-sampling chained with the plain forest.
results = {}
for dataset, (X_train, y_train, X_test, y_test, features) in datasets.items():
    dataset_scores = results[dataset] = {}

    forest = RandomForestClassifier(random_state=seed)
    forest.fit(X_train, y_train)
    dataset_scores['RandomForestClassifier'] = cohen_kappa_score(
        y_test, forest.predict(X_test))

    balanced_forest = BalancedRandomForestClassifier(random_state=seed, class_weight='balanced')
    balanced_forest.fit(X_train, y_train)
    dataset_scores['BalancedRandomForestClassifier'] = cohen_kappa_score(
        y_test, balanced_forest.predict(X_test))

    # The forest object created above is handed to the pipeline as its final step;
    # its own score has already been recorded at this point.
    rus_forest = make_pipeline(RandomUnderSampler(random_state=seed), forest)
    rus_forest.fit(X_train, y_train)
    dataset_scores['Pipeline(RUS, RF)'] = cohen_kappa_score(
        y_test, rus_forest.predict(X_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
# Kappa table for the random-forest comparison: datasets as columns, classifiers as rows.
df1 = pd.DataFrame(results)
df1

Unnamed: 0,Iris,yeast,satimage,abalone,heart
BalancedRandomForestClassifier,0.933333,0.195164,0.85537,0.108234,0.55
"Pipeline(RUS, RF)",0.933333,0.122419,0.846959,0.106745,0.623955
RandomForestClassifier,0.933333,0.507936,0.876295,0.12463,0.59322


## SMOTEBoost

In [14]:
# Drop every sample of class 1 from yeast before the SMOTE-based experiments.
# NOTE(review): presumably class 1 is too small for SMOTE's neighbour search - confirm.
keep_mask = y2 != 1
X_mod = X2[keep_mask]
y_mod = y2[keep_mask]
print(X_mod.shape)
print(Counter(y_mod))
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=seed)
# Five stratified splits are drawn, but only the final one is used downstream,
# so pick it explicitly instead of overwriting variables inside a loop.
train_index, test_index = list(sss.split(X_mod, y_mod))[-1]
Xm_train, Xm_test = X_mod[train_index], X_mod[test_index]
ym_train, ym_test = y_mod[train_index], y_mod[test_index]
# This replaces the 'yeast' entry in place: every loop over `datasets` executed after
# this cell sees the reduced data, unlike the AdaCost and random-forest sections above.
datasets['yeast'] = [Xm_train, ym_train, Xm_test, ym_test, 8]

(1479, 8)
Counter({0: 463, 7: 429, 6: 244, 5: 163, 4: 51, 3: 44, 2: 35, 9: 30, 8: 20})


In [15]:
from maatpy.classifiers import SMOTEBoost
from imblearn.over_sampling import SMOTE
from SMOTEBoost import SMOTEBoost as SMOTEBoost1

# Result label -> estimator class, in the order the scores are recorded.
# Each class is instantiated with only `random_state=seed`.
contenders = [
    ('AdaBoost', AdaBoostClassifier),
    ('maatpy.SMOTEBoost', SMOTEBoost),
    ('dialnd.SMOTEBoost', SMOTEBoost1),
]

results = {}
for dataset, (X_train, y_train, X_test, y_test, features) in datasets.items():
    dataset_scores = results[dataset] = {}
    for label, make_estimator in contenders:
        estimator = make_estimator(random_state=seed)
        estimator.fit(X_train, y_train)
        dataset_scores[label] = cohen_kappa_score(y_test, estimator.predict(X_test))

In [16]:
# Kappa table for the SMOTEBoost comparison: datasets as columns, classifiers as rows.
df1 = pd.DataFrame(results)
df1

Unnamed: 0,Iris,yeast,satimage,abalone,heart
AdaBoost,0.866667,0.260323,0.639031,0.0627,0.526316
dialnd.SMOTEBoost,0.9,0.116109,0.5223,0.046797,0.608696
maatpy.SMOTEBoost,0.933333,0.475467,0.35446,0.073587,0.482192


### Debugging SMOTEBoost satimage issues

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import cohen_kappa_score

# Step 1: AdaBoost trained on the satimage training set as loaded.
print(Counter(y3_train))
clf = AdaBoostClassifier(random_state=seed)
clf.fit(X3_train, y3_train)
print(cohen_kappa_score(y3_test, clf.predict(X3_test)))

# Step 2: the same classifier object fitted again, this time on SMOTE-resampled data.
X_res, y_res = SMOTE(random_state=seed).fit_sample(X3_train, y3_train)
print(Counter(y_res))
clf.fit(X_res, y_res)
print(cohen_kappa_score(y3_test, clf.predict(X3_test)))

# Step 3: SMOTEBoost kappa for every ensemble size from 1 to 50; `scores` is plotted next.
scores = []
for n_estimators in range(1, 51):
    booster = SMOTEBoost(n_estimators=n_estimators, random_state=seed)
    booster.fit(X3_train, y3_train)
    scores.append(cohen_kappa_score(y3_test, booster.predict(X3_test)))

Counter({0: 1072, 5: 1038, 2: 961, 1: 479, 4: 470, 3: 415})
0.6390306934036836
Counter({2: 1072, 3: 1072, 4: 1072, 5: 1072, 1: 1072, 0: 1072})
0.6361094411558388


In [18]:
from matplotlib import pyplot as plt

# Kappa as a function of the number of estimators (1 to 50) from the SMOTEBoost sweep.
fig, ax = plt.subplots()
ax.plot(list(range(1, 51)), scores)
ax.set_ylabel('kappa score')
ax.set_xlabel('n_estimator')
plt.show()

<Figure size 640x480 with 1 Axes>

## SMOTEBagging

In [19]:
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from maatpy.classifiers import SMOTEBagging
from maatpy.classifiers import SMOTEBoost

# Cohen's kappa for three bagging ensembles per dataset.
results = {}
for dataset, (X_train, y_train, X_test, y_test, features) in datasets.items():
    dataset_scores = results[dataset] = {}
    # Fresh estimators for every dataset, listed in the order their scores are recorded.
    contenders = [
        ('BaggingClassifier', BaggingClassifier(random_state=seed)),
        ('BalancedBaggingClassifier', BalancedBaggingClassifier(random_state=seed)),
        ('SMOTEBagging', SMOTEBagging(random_state=seed, k_neighbors=3)),
    ]
    for label, estimator in contenders:
        estimator.fit(X_train, y_train)
        dataset_scores[label] = cohen_kappa_score(y_test, estimator.predict(X_test))

In [20]:
# Kappa table for the bagging comparison: datasets as columns, classifiers as rows.
df1 = pd.DataFrame(results)
df1

Unnamed: 0,Iris,yeast,satimage,abalone,heart
BaggingClassifier,0.933333,0.493312,0.871468,0.123046,0.621849
BalancedBaggingClassifier,0.933333,0.368569,0.859283,0.124696,0.672269
SMOTEBagging,0.933333,0.491887,0.866121,0.140777,0.621849


## SMOTEENN

In [21]:
from imblearn.combine import SMOTEENN as SMOTEENN1
from maatpy.samplers import SMOTEENN as SMOTEENN2
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from maatpy.pipeline import make_pipeline

# Cohen's kappa of a linear SVC trained after SMOTEENN resampling,
# using imbalanced-learn's sampler versus maatpy's.
clf = SVC(kernel="linear", random_state=seed)
results = {}
for dataset in datasets:
    X_train, y_train, X_test, y_test, features = datasets[dataset]
    results[dataset] = {}

    original = make_pipeline(SMOTEENN1(random_state=seed), clf)
    original.fit(X_train, y_train)
    y_pred = original.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred)
    results[dataset]['imblearn.SMOTEENN'] = kappa

    try:
        modified = make_pipeline(SMOTEENN2(random_state=seed), clf)
        modified.fit(X_train, y_train)
        y_pred = modified.predict(X_test)
        kappa = cohen_kappa_score(y_test, y_pred)
        results[dataset]['maatpy.SMOTEENN'] = kappa
    except ValueError as err:
        # Keep scoring the remaining datasets, but report why this entry is missing
        # instead of silently leaving an unexplained gap in the results table.
        print('maatpy.SMOTEENN failed on {}: {}'.format(dataset, err))
        results[dataset]['maatpy.SMOTEENN'] = None

In [22]:
# Kappa table for the SMOTEENN comparison; a missing value marks a pipeline that raised.
df1 = pd.DataFrame(results)
df1

Unnamed: 0,Iris,yeast,satimage,abalone,heart
imblearn.SMOTEENN,0.966667,0.24943,0.790968,0.047812,0.311346
maatpy.SMOTEENN,0.966667,0.482592,0.826216,,0.521008


In [23]:
from imblearn.under_sampling import EditedNearestNeighbours

# Class counts of the abalone training split before and after Edited Nearest Neighbours.
print(Counter(y4_train))
enn = EditedNearestNeighbours(random_state=seed)
X_res, y_res = enn.fit_sample(X4_train, y4_train)
print(Counter(y_res))

Counter({5: 482, 6: 444, 4: 398, 7: 341, 3: 274, 8: 187, 2: 181, 9: 142, 10: 88, 1: 80, 11: 72, 12: 47, 13: 41, 0: 40, 14: 29, 15: 22})
Counter({15: 22, 4: 15, 5: 13, 6: 12, 0: 7, 3: 7, 12: 2, 2: 1, 7: 1})


## SMOTETomek

In [24]:
from imblearn.combine import SMOTETomek as SMOTETomek1
from maatpy.samplers import SMOTETomek as SMOTETomek2
# Imported here as well so this cell does not depend on the SMOTEENN cell having run;
# these are the same SVC and make_pipeline that cell brings into scope.
from sklearn.svm import SVC
from maatpy.pipeline import make_pipeline

# Cohen's kappa of a linear SVC trained after SMOTETomek resampling,
# using imbalanced-learn's sampler versus maatpy's.
clf = SVC(kernel="linear", random_state=seed)
results = {}
for dataset in datasets:
    X_train, y_train, X_test, y_test, features = datasets[dataset]
    results[dataset] = {}

    original = make_pipeline(SMOTETomek1(random_state=seed), clf)
    original.fit(X_train, y_train)
    y_pred = original.predict(X_test)
    kappa = cohen_kappa_score(y_test, y_pred)
    results[dataset]['imblearn.SMOTETomek'] = kappa

    modified = make_pipeline(SMOTETomek2(random_state=seed), clf)
    try:
        modified.fit(X_train, y_train)
        y_pred = modified.predict(X_test)
        kappa = cohen_kappa_score(y_test, y_pred)
        results[dataset]['maatpy.SMOTETomek'] = kappa
    except ValueError as err:
        # Keep scoring the remaining datasets, but report why this entry is missing
        # instead of silently leaving an unexplained gap in the results table.
        print('maatpy.SMOTETomek failed on {}: {}'.format(dataset, err))
        results[dataset]['maatpy.SMOTETomek'] = None

In [25]:
# Kappa table for the SMOTETomek comparison: datasets as columns, samplers as rows.
df1 = pd.DataFrame(results)
df1

Unnamed: 0,Iris,yeast,satimage,abalone,heart
imblearn.SMOTETomek,0.966667,0.368849,0.819,0.106814,0.55
maatpy.SMOTETomek,0.966667,0.45723,0.821952,0.136378,0.573816
