In [1]:
from multi_imbalance.datasets import load_datasets
import experiment_v2 as ex
from sklearn.model_selection import StratifiedKFold
from collections import Counter, defaultdict
from multi_imbalance.utils.metrics import gmean_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from utils import plot_embeddings
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [2]:
datasets = load_datasets()

In [3]:
ds_name = 'dermatology'
test_on_all_folds = True
test_all_ds = True

In [4]:
datasets = ex.one_hot_encode_all(datasets)

In [5]:
from experiment import dt_name_to_cols_to_encode, dt_name_minority_classes

In [6]:
from sklearn.neighbors import KNeighborsClassifier

def calc_safety_minority_majority_5(X,y, minority_classes):
    result_minority = {
        "safe":0,
        "borderline":0,
        "rare":0,
        "outlier":0
    }
    
    result_majority = {
        "safe":0,
        "borderline":0,
        "rare":0,
        "outlier":0
    }
    
    result = {True: result_minority, False: result_majority}
    
    
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X, y)
    neighbors = neigh.kneighbors(X, 5+1, False)
    
    for i, (x, nbors) in enumerate(list(zip(X, neighbors))):
        same_class_nbrs = (y[nbors[1:]] == y[i]).sum()
        
        cond = y[i] in minority_classes
        
        if same_class_nbrs >= 4:
            result[cond]['safe'] += 1
        elif same_class_nbrs >= 2:
            result[cond]['borderline'] += 1
        elif same_class_nbrs == 1:
            result[cond]['rare'] += 1
        else:
            result[cond]['outlier'] += 1
    return result[True], result[False]

In [7]:
def standardize(X_train, X_test):
    means = X_train.mean(axis=0)
    stds = X_train.std(axis=0)
    return (X_train - means) / (stds+1e-6), (X_test - means) / (stds + 1e-6)

In [8]:
result_dict = defaultdict(list)

safety_dict = defaultdict(list)

if test_all_ds:
    ds_names = dt_name_to_cols_to_encode.keys()
else:
    ds_names = [ds_name]
    
for dataset_name in ds_names:
# for dataset_name in list(datasets.keys()):
    print(dataset_name)
    
    safety_dict["dataset"].append(dataset_name)
    

    X, y = datasets[dataset_name]['data'], datasets[dataset_name]['target']
    X_encoded, y = datasets[f"{dataset_name}_encoded"]['data'], datasets[f"{dataset_name}_encoded"]['target']
    
    X_encoded, _ = standardize(X_encoded, X_encoded)
    
    minority_classes = dt_name_minority_classes[dataset_name]
    
    number_of_minority_examples = sum([label in minority_classes for label in y])
        
    s_dict_minority, s_dict_majority = calc_safety_minority_majority_5(X_encoded, y, minority_classes)
    for k,v in s_dict_minority.items():
        safety_dict[f"minority-{k}"].append(v / number_of_minority_examples)
        
    for k,v in s_dict_majority.items():
        safety_dict[f"majority-{k}"].append(v / (X_encoded.shape[0] - number_of_minority_examples))

cmc
dermatology
hayes-roth
new_vehicle
new_yeast
1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
glass
new_ecoli
new_led7digit
new_winequality-red
thyroid-newthyroid


In [9]:
safety_df = pd.DataFrame.from_dict(safety_dict)

In [10]:
safety_df[sorted(safety_df.filter(regex=("dataset|minority")).columns)].style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,dataset,minority-borderline,minority-outlier,minority-rare,minority-safe
0,cmc,0.477477,0.213213,0.192192,0.117117
1,dermatology,0.45,0.0,0.1,0.45
2,hayes-roth,0.451613,0.0,0.0,0.548387
3,new_vehicle,0.146283,0.004796,0.023981,0.82494
4,new_yeast,0.266667,0.35,0.161111,0.222222
5,1czysty-cut,0.077778,0.0,0.0,0.922222
6,2delikatne-cut,0.330556,0.055556,0.130556,0.483333
7,3mocniej-cut,0.35,0.15,0.291667,0.208333
8,4delikatne-bezover-cut,0.230556,0.041667,0.083333,0.644444
9,balance-scale,0.0,0.755102,0.244898,0.0


In [11]:
safety_df[sorted(safety_df.filter(regex=("dataset|majority")).columns)].style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,dataset,majority-borderline,majority-outlier,majority-rare,majority-safe
0,cmc,0.524561,0.074561,0.221053,0.179825
1,dermatology,0.17341,0.011561,0.028902,0.786127
2,hayes-roth,0.775194,0.0,0.03876,0.186047
3,new_vehicle,0.111888,0.002331,0.027972,0.857809
4,new_yeast,0.425613,0.110429,0.180215,0.283742
5,1czysty-cut,0.039286,0.0,0.010714,0.95
6,2delikatne-cut,0.115476,0.003571,0.014286,0.866667
7,3mocniej-cut,0.265476,0.00119,0.010714,0.722619
8,4delikatne-bezover-cut,0.132143,0.005952,0.027381,0.834524
9,balance-scale,0.619792,0.003472,0.168403,0.208333
