In [44]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from multi_imbalance.resampling.spider import SPIDER3
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from multi_imbalance.datasets import load_datasets
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.SOUP import SOUP
from collections import Counter
from imblearn.metrics import geometric_mean_score


In [6]:
def read_train_and_test_data(overlap, imbalance_ratio, i):
    with open(f"data/3class-{imbalance_ratio}-overlap-{overlap}-learn-{i}.arff") as f:
        content = f.readlines()
    content = [x.strip().split(",") for x in content][5:]
    data = np.array(content)
    X_train, y_train = data[:, :-1].astype(float), data[:, -1].astype(object)

    with open(f"data/3class-{imbalance_ratio}-overlap-{overlap}-test-{i}.arff") as f:
        content = f.readlines()
    content = [x.strip().split(",") for x in content][5:]
    data = np.array(content)
    X_test, y_test = data[:, :-1].astype(float), data[:, -1].astype(object)

    return X_train, y_train, X_test, y_test


def train_and_test():
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    labels = ['MIN', 'INT', 'MAJ']
    return [confusion_matrix(y_test, y_pred, labels=labels)[i, i] / confusion_matrix(y_test, y_pred, labels=labels)[i,:].sum() for i,label in enumerate(labels)]


In [7]:
result = []

for imbalance_ratio in ["70-30-0-0", "40-50-10-0", "30-40-15-15"]:
    print(f"Imbalance ratio: {imbalance_ratio}")
    for overlap in range(0, 3):
        print(f"Overlap: {overlap}")
        min_tpr = []
        int_tpr = []
        maj_tpr = []
        for i in range(1,11):
            X_train, y_train, X_test, y_test = read_train_and_test_data(overlap, imbalance_ratio, i)
            cost = np.ones((3, 3))
            for i in range(3):
                cost[i][i] = 0

            clf = SPIDER3(k=5, cost=cost, majority_classes=['MAJ'],
                          intermediate_classes=['INT'], minority_classes=['MIN'])
            X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
            min_t, int_t, maj_t = train_and_test()
            min_tpr.append(min_t)
            int_tpr.append(int_t)
            maj_tpr.append(maj_t)
        result.append([np.array(min_tpr).mean(), np.array(int_tpr).mean(), np.array(maj_tpr).mean()])

np.savetxt("spider-results.csv", np.asarray(result), delimiter=",")

Imbalance ratio: 70-30-0-0
Overlap: 0
Overlap: 1
Overlap: 2
Imbalance ratio: 40-50-10-0
Overlap: 0
Overlap: 1
Overlap: 2
Imbalance ratio: 30-40-15-15
Overlap: 0
Overlap: 1
Overlap: 2


In [6]:
np.random.seed(0)
datasets = load_datasets()
results = dict()

In [9]:
datasets.keys()

odict_keys(['balance_scale', 'cleveland', 'cmc', 'dermatology', 'ecoli', 'glass', 'hayes_roth', 'new_thyroid', 'winequailty_red', 'yeast'])

In [68]:
for k in datasets.keys():
    X, y = datasets[k].data, datasets[k].target
    no_classes = np.unique(y).size
    print(k)
    print(Counter(y))

balance_scale
Counter({2: 288, 1: 288, 0: 49})
cleveland
Counter({0: 160, 1: 54, 2: 35, 3: 35, 4: 13})
cmc
Counter({0: 629, 2: 511, 1: 333})
dermatology
Counter({0: 111, 2: 71, 1: 60, 4: 48, 3: 48, 5: 20})
ecoli
Counter({0: 143, 1: 77, 7: 52, 4: 35, 5: 20, 6: 5, 3: 2, 2: 2})
glass
Counter({1: 76, 0: 70, 5: 29, 2: 17, 3: 13, 4: 9})
hayes_roth
Counter({0: 51, 1: 51, 2: 30})
new_thyroid
Counter({0: 150, 1: 35, 2: 30})
winequailty_red
Counter({2: 681, 3: 638, 4: 199, 1: 53, 5: 18, 0: 10})
yeast
Counter({0: 463, 7: 429, 6: 244, 5: 163, 4: 51, 3: 44, 2: 35, 9: 30, 8: 20, 1: 5})


In [69]:
maj_int_min = {
    'balance_scale' : {
        'maj': [2, 1],
        'int': [],
        'min': [0]
    }, 
    'cleveland': {
        'maj': [0],
        'int': [1],
        'min': [2,3,4]
    }, 
    'cmc': {
        'maj': [0],
        'int': [2],
        'min': [1]
    }, 
    'dermatology': {
        'maj': [0],
        'int': [2,1,4,3],
        'min': [5]
    }, 
    'ecoli': {
        'maj': [0,1],
        'int': [7,4,5],
        'min': [6,3,2]
    }, 
    'glass': {
        'maj': [1,0],
        'int': [5],
        'min': [2,3,4]
    }, 
    'hayes_roth': {
        'maj': [0,1],
        'int': [],
        'min': [2]
    }, 
    'new_thyroid': {
        'maj': [0],
        'int': [],
        'min': [1,2]
    }, 
    'winequailty_red': {
        'maj': [2,3],
        'int': [4],
        'min': [1,5,0]
    }, 
    'yeast': {
        'maj': [0,7],
        'int': [6, 5],
        'min': [4,3,2,9,8,1]
    }
}

In [64]:
X, y = datasets['balance_scale'].data, datasets['balance_scale'].target
no_classes = np.unique(y).size
print(Counter(y))

Counter({2: 288, 1: 288, 0: 49})


In [32]:
cost = np.ones((no_classes, no_classes))
np.fill_diagonal(cost, 0)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [52]:
clf = SPIDER3(k=5, cost=cost, majority_classes=[2,1], intermediate_classes=[], minority_classes=[0])
X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)

In [53]:
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

In [54]:
score = geometric_mean_score(y_test, y_pred)
score

0.45478372810266815

In [70]:
np.random.seed(0)
datasets = load_datasets()
results = dict()

for dataset_name, dataset_values in datasets.items():
    results[dataset_name]=dict()
    for resample in ['base', 'soup','mdo', 'spider']:
        clf_tree = DecisionTreeClassifier(random_state=0)
        X, y = dataset_values.data, dataset_values.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
        if resample == 'base':
            pass
        elif resample=='soup':
            soup = SOUP()
            X_train, y_train = soup.fit_transform(X_train, y_train, shuffle=True)
        elif resample=='mdo':
            try:
                mdo = MDO()
                X_train,y_train = mdo.fit_transform(X_train, y_train)
            except Exception as e:
                print(dataset_name, e)
                pass
        elif resample=='spider':
            no_classes = np.unique(y).size
            cnt = Counter(y)
            cost = np.ones((no_classes, no_classes))
            np.fill_diagonal(cost, 0)
            
            clf = SPIDER3(k=5, cost=cost, majority_classes=maj_int_min[dataset_name]['maj'], intermediate_classes=maj_int_min[dataset_name]['int'], minority_classes=maj_int_min[dataset_name]['min'])
            X_train, y_train = clf.fit_transform(X_train.astype(np.float64), y_train)
        clf_tree.fit(X_train, y_train)
        y_pred = clf_tree.predict(X_test)
        score = geometric_mean_score(y_test, y_pred)
        results[dataset_name][resample]=round(score,3)

df = pd.DataFrame(results).T
df

  explained_variance_ = (S ** 2) / (n_samples - 1)
  variables_variance = np.diag(np.cov(uncorrelated_samples, rowvar=False))
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


cleveland Input must be 1- or 2-d.


  a = np.sum(X / V)


cmc Range exceeds valid bounds


Unnamed: 0,base,soup,mdo,spider
balance_scale,0.37,0.548,0.465,0.661
cleveland,0.0,0.273,0.0,0.251
cmc,0.434,0.507,0.434,0.446
dermatology,0.864,0.908,0.896,0.923
ecoli,0.0,0.0,0.0,0.0
glass,0.704,0.635,0.527,0.745
hayes_roth,0.67,0.687,0.652,0.41
new_thyroid,0.84,0.887,0.917,0.864
winequailty_red,0.0,0.241,0.0,0.306
yeast,0.0,0.445,0.389,0.0
