In [1]:
from multi_imbalance.datasets import load_datasets
import experiment_v2 as ex
from sklearn.model_selection import StratifiedKFold
from collections import Counter, defaultdict
from multi_imbalance.utils.metrics import gmean_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from utils import plot_embeddings
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis



In [2]:
datasets = load_datasets()

In [3]:
ds_name = 'dermatology'
test_on_all_folds = True
test_all_ds = True

In [4]:
datasets = ex.one_hot_encode_all(datasets)

In [5]:
from experiment import dt_name_to_cols_to_encode
from experiment import dt_name_minority_classes

In [6]:
from sklearn.neighbors import KNeighborsClassifier

def calc_safety5(X,y, clazz=None):
    result = {
        "safe":0,
        "borderline":0,
        "rare":0,
        "outlier":0
    }
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(X, y)
    neighbors = neigh.kneighbors(X, 5+1, False)
    for i, (x, nbors) in enumerate(list(zip(X, neighbors))):
        if y[i] == clazz:
            same_class_nbrs = (y[nbors[1:]] == y[i]).sum()
            if same_class_nbrs >= 4:
                result['safe'] += 1
            elif same_class_nbrs >= 2:
                result['borderline'] += 1
            elif same_class_nbrs == 1:
                result['rare'] += 1
            else:
                result['outlier'] += 1
    return result

In [7]:
def standardize(X_train, X_test):
    means = X_train.mean(axis=0)
    stds = X_train.std(axis=0)
    return (X_train - means) / (stds+1e-6), (X_test - means) / (stds + 1e-6)

In [8]:
result_dict = defaultdict(list)

safety_dict = defaultdict(list)

if test_all_ds:
    ds_names = dt_name_to_cols_to_encode.keys()
else:
    ds_names = [ds_name]
    
for dataset_name in ds_names:
# for dataset_name in list(datasets.keys()):
    print(dataset_name)
    
    safety_dict["dataset"].append(dataset_name)
    

    X, y = datasets[dataset_name]['data'], datasets[dataset_name]['target']
    X_encoded, y = datasets[f"{dataset_name}_encoded"]['data'], datasets[f"{dataset_name}_encoded"]['target']
    
    X_encoded, _ = standardize(X_encoded, X_encoded)
        
        
    min_classes = dt_name_minority_classes[dataset_name]
    
    partial_safeness = defaultdict(list)
    
    for min_class in min_classes:
        # safeness is calculated only for examples from a specific class
        s_dict = calc_safety5(X_encoded, y, clazz=min_class)
        for k,v in s_dict.items():
            partial_safeness[k].append(v / np.sum(y==min_class))
            
    for k,v in partial_safeness.items():
        safety_dict[f"raw-{k}"].append(np.mean(v))

cmc
dermatology
hayes-roth
new_vehicle
new_yeast
1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
glass
new_ecoli
new_led7digit
new_winequality-red
thyroid-newthyroid


In [9]:
safety_df = pd.DataFrame.from_dict(safety_dict)

In [10]:
safety_df[sorted(safety_df.filter(regex=("dataset|raw")).columns)].style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,dataset,raw-borderline,raw-outlier,raw-rare,raw-safe
0,cmc,0.465465,0.198198,0.213213,0.123123
1,dermatology,0.45,0.0,0.1,0.45
2,hayes-roth,0.451613,0.0,0.0,0.548387
3,new_vehicle,0.149544,0.005025,0.024031,0.821401
4,new_yeast,0.216303,0.374665,0.1648,0.244232
5,1czysty-cut,0.089583,0.0,0.0,0.910417
6,2delikatne-cut,0.31875,0.072917,0.152083,0.45625
7,3mocniej-cut,0.325,0.175,0.30625,0.19375
8,4delikatne-bezover-cut,0.233333,0.05625,0.0875,0.622917
9,balance-scale,0.0,0.857143,0.142857,0.0
