In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from IPython.core.display import display
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

from multi_imbalance.ensemble.SOUPBagging import SOUPBagging
from multi_imbalance.resampling.SOUP import SOUP
from multi_imbalance.resampling.MDO import MDO
from multi_imbalance.resampling.GlobalCS import GlobalCS

from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import SMOTE
from multi_imbalance.resampling.spider import SPIDER3

from sklearn.neighbors import KNeighborsClassifier
import warnings
import logging
from multi_imbalance.utils.data import load_arff_datasets
from multi_imbalance.utils.min_int_maj import maj_int_min
# Silence the root logger: only records at CRITICAL level are still emitted.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

from IPython.display import clear_output
# Clear this cell's output area; wait=True defers clearing until new output arrives.
clear_output(wait=True)

In [None]:
# def green_valid_backgroud(s):
#     correct = ['1czysty-cut', '2delikatne-cut', '3mocniej-cut','4delikatne-bezover-cut', 'cmc', 'dermatology', 'new_ecoli','new_vehicle','thyroid-newthyroid']
#     return ['background-color: green' if v in correct else '' for v in list(s.index)]
# 


def bold_max(s):
    """Return one CSS string per entry of ``s``, bolding every maximum.

    Entries equal to ``s.max()`` map to ``'font-weight: bold'``; every other
    entry maps to the empty string. Ties are all emphasised.
    """
    css_by_flag = {True: 'font-weight: bold', False: ''}
    peak = s.max()
    return [css_by_flag[bool(hit)] for hit in (s == peak)]
    
def print_scores(scores, only_read_dt = False):
    """Display a G-mean table followed by mean-score and mean-rank summaries.

    ``scores`` is converted to a DataFrame and transposed before display.
    When ``only_read_dt`` is true, the first four rows of that table are
    dropped before anything is shown.

    Three tables are displayed:
      1. the score table, with each row's maximum rendered in bold;
      2. the per-column mean (NaNs replaced by the column median first),
         sorted from best to worst;
      3. the per-column mean of the row-wise ranks (rank 1 = highest score
         in the row), sorted ascending.
    """
    display("G-MEAN")
    table = pd.DataFrame(scores).T
    if only_read_dt:
        table = table.iloc[4:]
    display(table.style.apply(bold_max, axis=1))

    # Missing scores are imputed with the column median for the summaries only.
    filled = table.fillna(table.median())

    mean_scores = filled.mean().sort_values(ascending=False)
    display(pd.DataFrame(mean_scores, columns=['Mean G-mean']))

    mean_ranks = filled.rank(axis=1, ascending=False).mean().sort_values()
    display(pd.DataFrame(mean_ranks, columns=['Mean rank']))
# print_scores(scores_knn)

In [None]:
def resample_data(resample, seed, X_train, y_train, no_classes, dataset_name):
    """Resample one training fold with the strategy named by ``resample``.

    Parameters
    ----------
    resample : str
        Strategy name: ``'base'`` (no resampling), ``'soup'``, ``'global'``,
        ``'smote'``, ``'mdo'``, ``'spider'``, or any name containing
        ``'soupbg'`` / ``'mrbbag'`` (bagging variants; data is returned
        untouched).
    seed : int
        Random seed forwarded to SMOTE and MDO.
    X_train, y_train : array-like
        Training features and labels. Resamplers that take a copy
        (SOUP, GlobalCS, SMOTE, MDO) never see the caller's arrays.
    no_classes : int
        Number of classes; sizes the SPIDER3 cost matrix.
    dataset_name : str
        Key into ``maj_int_min``; only looked up for ``'mdo'`` and ``'spider'``.

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled)``.

    Raises
    ------
    ValueError
        If ``resample`` names no known strategy.
    """
    if resample == 'base':
        X_train_resampled, y_train_resampled = X_train, y_train
    elif resample == 'soup':
        soup = SOUP(k=3)
        X_train_resampled, y_train_resampled = soup.fit_transform(np.copy(X_train), np.copy(y_train))
    elif resample == 'global':
        global_cs = GlobalCS()
        X_train_resampled, y_train_resampled = global_cs.fit_transform(np.copy(X_train), np.copy(y_train), shuffle=False)
    elif resample == 'smote':
        smote = SMOTE(random_state=seed)
        # fit_resample is the supported imbalanced-learn API; the legacy
        # fit_sample alias no longer exists in current releases.
        X_train_resampled, y_train_resampled = smote.fit_resample(np.copy(X_train), np.copy(y_train))
    elif resample == 'mdo':
        mdo = MDO(k=3, k1_frac=0.5, seed=seed)
        X_train_resampled, y_train_resampled = mdo.fit_transform(np.copy(X_train), np.copy(y_train), maj_int_min[dataset_name])
    elif resample == 'spider':
        # Uniform misclassification cost: 1 everywhere, 0 on the diagonal.
        cost = np.ones((no_classes, no_classes))
        np.fill_diagonal(cost, 0)
        class_groups = maj_int_min[dataset_name]
        clf = SPIDER3(k=5, cost=cost, majority_classes=class_groups['maj'], intermediate_classes=class_groups['int'], minority_classes=class_groups['min'])
        X_train_resampled, y_train_resampled = clf.fit_transform(X_train.astype(np.float64), y_train)
    elif 'soupbg' in resample or 'mrbbag' in resample:
        # Bagging ensembles resample internally, so the fold is passed through.
        X_train_resampled, y_train_resampled = X_train, y_train
    else:
        raise ValueError(f'Bad type: {resample!r}')
    return X_train_resampled, y_train_resampled



def _build_classifier(clf_name, res, seed):
    """Create the base learner ``clf_name``, wrapped in SOUPBagging if ``res`` asks for it."""
    if clf_name == 'knn':
        clf = KNeighborsClassifier(n_neighbors=3)
    elif clf_name == 'tree':
        clf = DecisionTreeClassifier(random_state=seed)
    else:
        raise ValueError(f'Unknown classifier: {clf_name!r}')

    # 'soupbgNNN' encodes the ensemble size in its numeric suffix
    # (e.g. 'soupbg015' -> 15 base classifiers).
    prefix = 'soupbg'
    suffix = res[len(prefix):]
    if res.startswith(prefix) and suffix.isdigit():
        clf = SOUPBagging(clf, n_classifiers=int(suffix))
    return clf


def test_resampling(res, dataset_values, dataset_name):
    """Evaluate one resampling/ensemble variant on one dataset.

    Runs 10 repetitions of 5-fold stratified cross-validation (the repetition
    index seeds the split, the resampler and the decision tree). In every fold
    the features are standardised with statistics from the training part only,
    the training part is resampled via ``resample_data`` and both a 3-NN and a
    decision-tree classifier are fitted and scored on the test part.

    Parameters
    ----------
    res : str
        Variant name understood by ``resample_data``; names of the form
        ``'soupbgNNN'`` additionally wrap each classifier in ``SOUPBagging``
        with ``NNN`` base classifiers.
    dataset_values : object
        Must expose ``.data`` (features) and ``.target`` (labels), both
        indexable by integer index arrays.
    dataset_name : str
        Key into ``maj_int_min`` giving the dataset's minority classes.

    Returns
    -------
    defaultdict
        Maps ``'g_mean_<clf>'`` and ``'g_mean_<clf>_minority'`` (``<clf>`` in
        ``knn``, ``tree``) to the mean over repetitions of the per-repetition
        mean fold score.
    """
    X, y = dataset_values.data, dataset_values.target

    no_classes = np.unique(y).size
    minority_class = maj_int_min[dataset_name]['min']
    run_data = defaultdict(lambda: defaultdict(list))  # {metric: {run_number: [fold scores]}}
    for i in range(10):
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit the scaler on the training part only to avoid test leakage.
            normalizer = StandardScaler().fit(X_train)
            X_train = normalizer.transform(X_train)
            X_test = normalizer.transform(X_test)
            X_train_resampled, y_train_resampled = resample_data(res, i, X_train, y_train, no_classes, dataset_name)

            for clf_name in ['knn', 'tree']:
                clf = _build_classifier(clf_name, res, i)
                clf.fit(X_train_resampled, y_train_resampled)
                y_pred = clf.predict(X_test)

                gmean = geometric_mean_score(y_test, y_pred, correction=0.001)
                # Same metric restricted to the dataset's minority classes.
                minority_gmean = geometric_mean_score(y_test, y_pred, labels=minority_class, correction=0.001)
                run_data['g_mean_{}'.format(clf_name)][str(i)].append(gmean)
                run_data['g_mean_{}_minority'.format(clf_name)][str(i)].append(minority_gmean)

    # Average the folds inside each repetition first, then across repetitions.
    result_data = defaultdict(int)
    for metric_name, runs in run_data.items():
        result_data[metric_name] = np.mean([np.mean(fold_scores) for fold_scores in runs.values()])

    return result_data


def provide_test_and_get_scores(dataset, clf_res):
    """Run ``test_resampling`` for every dataset / variant pair and collect scores.

    Parameters
    ----------
    dataset : mapping
        ``{dataset_name: dataset_values}``; each value is handed to
        ``test_resampling`` unchanged.
    clf_res : iterable of str
        Names of the resampling/ensemble variants to evaluate.

    Returns
    -------
    defaultdict
        ``scores[metric][dataset_name][variant]`` holding each averaged metric
        rounded to 4 decimal places.
    """
    scores = defaultdict(lambda: defaultdict(dict))
    # Iterate over the arguments, not over notebook-level globals, so the
    # function works for whatever collection the caller passes in.
    for dataset_name, dataset_values in tqdm_notebook(dataset.items(), total=len(dataset), desc='1st loop'):
        for resample in clf_res:
            result_data = test_resampling(resample, dataset_values, dataset_name)
            for key, value in result_data.items():
                scores[key][dataset_name][resample] = round(value, 4)
    return scores

# Variants to evaluate: SOUPBagging with 5, 15, 30, 50 and 100 base classifiers.
clf_res_names =['soupbg005','soupbg015', 'soupbg030', 'soupbg050', 'soupbg100']
# Benchmark datasets; iterated with .items() inside provide_test_and_get_scores.
datasets = load_arff_datasets()
# scores[metric][dataset_name][variant] -> averaged score rounded to 4 decimals.
scores = provide_test_and_get_scores(datasets, clf_res_names)


### Wszystkie zbiory danych:
#### Drzewo

In [None]:
# Overall G-mean of the decision tree for every dataset and variant.
print_scores(scores['g_mean_tree'])


#### kNN (k = 3)

In [None]:
# Overall G-mean of the kNN classifier (n_neighbors=3) for every dataset and variant.
print_scores(scores['g_mean_knn'])
# print_scores(avg_acc)


### Rzeczywiste zbiory danych
#### Drzewo

In [None]:
# Decision-tree G-mean with the first four rows (datasets) of the table dropped.
print_scores(scores['g_mean_tree'],only_read_dt=True)


#### kNN

In [None]:
# kNN G-mean with the first four rows (datasets) of the table dropped.
print_scores(scores['g_mean_knn'],only_read_dt=True)

### Wyniki dla klas mniejszościowych

### Wszystkie zbiory danych:
#### Drzewo

In [None]:
# Decision-tree G-mean computed over the minority classes only.
print_scores(scores['g_mean_tree_minority'])


#### kNN (k = 3)

In [None]:
# kNN (n_neighbors=3) G-mean computed over the minority classes only.
print_scores(scores['g_mean_knn_minority'])
# print_scores(avg_acc)


### Porównanie baggingów - głosowanie przez średnią

In [None]:
# clf_res_names =['base','soup','soupbg005','soupbg015','soupbg030', 'soupbg050', 'soupbg100']
# # clf_res_names =['base','global','smote','mdo','soup']
# # clf_res_names =['base','global','soup']
# # datasets = load_arff_datasets()
# scores = provide_test_and_get_scores(datasets, clf_res_names)
# 
# 

### Wszystkie zbiory danych:
#### Drzewo

In [None]:
# print_scores(scores['g_mean_tree'])
# 
# 

#### kNN (k = 3)

In [None]:
# print_scores(scores['g_mean_knn'])
# # print_scores(avg_acc)
# 
# 

# print_scores(scores['g_mean_knn'])
# # print_scores(avg_acc)
# 
# 

In [None]:
# kNN (n_neighbors=3) G-mean computed over the minority classes only.
print_scores(scores['g_mean_knn_minority'])
# print_scores(avg_acc)


### Porównanie baggingów - głosowanie przez średnią

In [None]:
# clf_res_names =['base','soup','soupbg005','soupbg015','soupbg030', 'soupbg050', 'soupbg100']
# # clf_res_names =['base','global','smote','mdo','soup']
# # clf_res_names =['base','global','soup']
# # datasets = load_arff_datasets()
# scores = provide_test_and_get_scores(datasets, clf_res_names)
# 
# 

### Wszystkie zbiory danych:
#### Drzewo

In [None]:
# print_scores(scores['g_mean_tree'])
# 
# 

#### kNN (k = 3)

In [None]:
# print_scores(scores['g_mean_knn'])
# # print_scores(avg_acc)
# 
# 

