In [102]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.cluster
import sklearn.model_selection
import sklearn.preprocessing
from IPython.display import display

import evaluation
import intracluster_smote
import mnist_utils
from intracluster_smote import IntraclusterSmote

In [40]:
# Import Ecoli Dataset
# The file is whitespace-delimited and has no header row, so the column names
# are supplied here: an identifier, seven feature columns and the class label.
ecoli_data_raw = pd.read_csv(
    'datasets/ecoli.data.txt',
    sep=r'\s+',  # same parsing as the deprecated delim_whitespace=True
    header=None,
    names=['Sequence Name','mcg','gvh','lip','chg','aac','alm1','alm2','class'],
)
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
ecoli_data_target = ecoli_data_raw['class'].values
ecoli_data = ecoli_data_raw.drop(['Sequence Name','class'], axis=1).values
# Rescale the feature columns with a min-max scaler; the fitted scaler is kept
# because the `datasets` configuration stores it under 'Scaler'.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# validation rows contribute to the min/max - confirm this is intended.
ecoli_scaler = sklearn.preprocessing.MinMaxScaler()
ecoli_data = ecoli_scaler.fit_transform(ecoli_data)
# Split the scaled features and their labels. The previous version passed
# `ecoli_train` / `ecoli_train_target` here, which do not exist on a fresh
# kernel (NameError) and, when re-run, re-split an already split training set.
# NOTE(review): the split is unseeded, so it differs between runs.
ecoli_train, ecoli_validate, ecoli_train_target, ecoli_validate_target = sklearn.model_selection.train_test_split(ecoli_data, ecoli_data_target)

In [96]:
def _mnist_pair_entry(classes, minority_class, ratio=1/5, k=2):
    """Build the configuration entry for a two-class, imbalanced MNIST subset.

    `classes` and `ratio` are forwarded to `mnist_utils.load_binary_imbalanced`;
    `ratio` is also recorded as the entry's 'Imbalance Ratio'.
    """
    return {
        'Data': mnist_utils.load_binary_imbalanced(classes=classes, ratio=ratio),
        'Imbalance Ratio': ratio,
        'Minority Class': minority_class,
        # Scaler fitted on the two extremes 0 and 255.
        'Scaler': sklearn.preprocessing.MinMaxScaler().fit([[0],[255]]),
        'k': k,
    }


# Experiment configuration, keyed by a human-readable dataset name.
# Each entry holds:
#   'Data'            - ((train, train_target), (validate, validate_target))
#   'Imbalance Ratio' - ratio handed to IntraclusterSmote.compute_synthetic_count
#   'Minority Class'  - label compared against the training targets
#   'Scaler'          - fitted MinMaxScaler associated with the dataset
#   'k'               - cluster count handed to the clustering methods
datasets = {
    'MNIST Imbalanced 7+8': _mnist_pair_entry([7,8], 8),
    'MNIST Imbalanced 1+7': _mnist_pair_entry([1,7], 7),
    'Ecoli': {
        'Data': (
            (ecoli_train, ecoli_train_target),
            (ecoli_validate, ecoli_validate_target),
        ),
        'Imbalance Ratio': 1/5.46,
        'Minority Class': 'pp',
        'Scaler': ecoli_scaler,
        'k': 2,
    },
}

In [127]:
def _single_cluster(X, k):
    """Put every row of X into cluster 0; k is accepted but ignored."""
    return np.zeros(X.shape[0])


def _random_clusters(X, k):
    """Draw a cluster id from range(k) for each row of X via numpy's global RNG."""
    row_count = X.shape[0]
    return np.random.choice(k, size=row_count)


def _kmeans_clusters(X, k):
    """Fit scikit-learn KMeans with k clusters on X and return its labels."""
    model = sklearn.cluster.KMeans(n_clusters=k)
    return model.fit(X).labels_


# Clustering strategies under comparison. Each callable takes (X, k) and
# returns one cluster label per row of X.
clustering_methods = {
    'No Clustering': _single_cluster,
    'Random Clustering': _random_clusters,
    'K-Means': _kmeans_clusters,
}

In [None]:
# Stage 1: for every (dataset, clustering method) pair, oversample the training
# set with IntraclusterSmote and classify the validation set.
classification_results = {}
for dataset_name, dataset in datasets.items():
    (train_set, train_set_target), (validation_set, validation_set_target) = dataset['Data']
    for method_name, clustering_method in clustering_methods.items():
        # upsample
        synthetic_count = IntraclusterSmote.compute_synthetic_count(
            train_set.shape[0], dataset['Imbalance Ratio']
        )
        oversampler = IntraclusterSmote(synthetic_count)
        # Mask of rows whose target equals the dataset's minority class.
        minority_mask = (train_set_target == dataset['Minority Class'])
        # Cluster labels are computed on the whole training set.
        cluster_labels = clustering_method(train_set, dataset['k'])
        oversampled_train_set, oversampled_train_set_target = oversampler.fit(
            train_set, train_set_target, minority_mask, cluster_labels
        )
        # classify
        result_key = (dataset_name, method_name)
        classification_results[result_key] = evaluation.classify(
            oversampled_train_set, oversampled_train_set_target, validation_set
        )

# Stage 2: score each classifier's output against the validation targets.
# `res` maps classifier name -> that classifier's output (presumably its
# predictions for the validation set - confirm in evaluation.classify).
# Note that `clustering_method` is the method's *name* in this loop.
classification_evaulation = {}
for (dataset_name, clustering_method), res in classification_results.items():
    validation_targets = datasets[dataset_name]['Data'][1][1]
    for classifier_name, classification in res.items():
        evaluation_key = (dataset_name, clustering_method, classifier_name)
        classification_evaulation[evaluation_key] = evaluation.evaluate_classification(
            classification, validation_targets
        )

In [134]:
# The dict keys are (dataset, clustering method, classifier) tuples; transposing
# turns each key into a row of the rendered table.
evaluation_table = pd.DataFrame(classification_evaulation).T
display(evaluation_table)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Weighted F-Measure
Ecoli,K-Means,Gradient Boosting,0.373442
Ecoli,K-Means,Logistic Regression,0.714286
Ecoli,K-Means,Random Forest,0.37037
Ecoli,K-Means,Support Vector,0.714286
Ecoli,No Clustering,Gradient Boosting,0.393939
Ecoli,No Clustering,Logistic Regression,0.714286
Ecoli,No Clustering,Random Forest,0.652778
Ecoli,No Clustering,Support Vector,0.714286
Ecoli,Random Clustering,Gradient Boosting,0.539989
Ecoli,Random Clustering,Logistic Regression,0.714286
