In [1]:
from multi_imbalance.datasets import load_datasets
import experiment_v2 as ex
from sklearn.model_selection import StratifiedKFold
from collections import Counter, defaultdict
from multi_imbalance.utils.metrics import gmean_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from utils import plot_embeddings
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [2]:
# Load the datasets through multi_imbalance's `load_datasets`; later cells index
# the result by dataset name and read each entry's 'data' and 'target'.
datasets = load_datasets()

In [3]:
# Dataset analysed when `test_all_ds` is False.
ds_name = 'balance-scale'
# Not referenced by any cell visible in this part of the notebook --
# presumably consumed by a fold-wise evaluation elsewhere; verify before removing.
test_on_all_folds = True
# When True the safety analysis iterates over every key of
# `dt_name_to_cols_to_encode` instead of only `ds_name`.
test_all_ds = True

In [4]:
# Replace the loaded datasets with the output of `ex.one_hot_encode_all`.
# NOTE(review): this rebinds `datasets`, so re-running only this cell feeds
# already-encoded data back into the encoder -- confirm the helper is
# idempotent, or re-run the load cell first.
datasets = ex.one_hot_encode_all(datasets)

In [5]:
from experiment import dt_name_to_cols_to_encode, dt_name_minority_classes

In [6]:
from sklearn.neighbors import KNeighborsClassifier

def calc_safety_minority_majority_5(X, y, minority_classes, metric=None):
    """Count safe / borderline / rare / outlier examples using 5 nearest neighbours.

    Every example is labelled by how many of its 5 nearest neighbours (the
    example itself excluded) carry the same class label:

    * 4 or 5 -> "safe"
    * 2 or 3 -> "borderline"
    * 1      -> "rare"
    * 0      -> "outlier"

    Parameters
    ----------
    X : array of examples; it is both the set the neighbour index is fitted on
        and the set that is queried.
    y : numpy array of class labels aligned with the rows of ``X`` (it is
        indexed with an array of neighbour positions).
    minority_classes : container of labels treated as minority classes.
    metric : callable ``metric(row_a, row_b) -> distance``; forwarded unchanged
        to ``CustomKNN``, which decides how a ``None`` value is handled.

    Returns
    -------
    (minority_counts, majority_counts) : two dicts keyed by
        "safe", "borderline", "rare" and "outlier" holding absolute counts.
    """
    categories = ("safe", "borderline", "rare", "outlier")
    minority_counts = dict.fromkeys(categories, 0)
    majority_counts = dict.fromkeys(categories, 0)

    knn = CustomKNN(n_neighbors=5, metric=metric)
    knn.fit(X, y)
    neighbourhoods = knn.kneighbors(X, 5, False)

    for idx, nbr_indices in enumerate(neighbourhoods):
        # Number of neighbours that share the label of the current example.
        agreeing = (y[nbr_indices] == y[idx]).sum()
        counts = minority_counts if y[idx] in minority_classes else majority_counts

        if agreeing >= 4:
            label = 'safe'
        elif agreeing >= 2:
            label = 'borderline'
        elif agreeing == 1:
            label = 'rare'
        else:
            label = 'outlier'
        counts[label] += 1

    return minority_counts, majority_counts

In [7]:
def standardize(X_train, X_test):
    """Z-score both splits using statistics of the training split only.

    Parameters
    ----------
    X_train : array whose column-wise mean and standard deviation define the
        transformation.
    X_test : array transformed with the statistics of ``X_train``.

    Returns
    -------
    (X_train_scaled, X_test_scaled)
    """
    center = X_train.mean(axis=0)
    # The small epsilon keeps constant columns from dividing by zero.
    scale = X_train.std(axis=0) + 1e-6
    train_scaled = (X_train - center) / scale
    test_scaled = (X_test - center) / scale
    return train_scaled, test_scaled

In [8]:
class CustomKNN:
    """Brute-force nearest-neighbour lookup over a precomputed distance matrix.

    ``fit`` evaluates ``metric`` for every ordered pair of training rows and
    stores the result in ``D``. ``kneighbors`` then answers queries purely by
    row position: the i-th row of the query is assumed to be the i-th row the
    index was fitted on, so it is only meaningful for the training set itself
    (or a leading slice of it).

    Parameters
    ----------
    n_neighbors : default number of neighbours returned by ``kneighbors``.
    metric : callable ``metric(row_a, row_b) -> float``. When ``None`` the
        Euclidean distance is used, so the class also works with callers that
        pass no metric.
    """

    def __init__(self, n_neighbors, metric=None):
        self.k = n_neighbors
        self.metric = metric
        self.D = None  # Distance matrix, filled in by fit()

    @staticmethod
    def _euclidean(x1, x2):
        """Euclidean distance between two rows (fallback when no metric is given)."""
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return float(np.linalg.norm(diff))

    def fit(self, X, y):
        """Precompute the pairwise distance matrix of ``X``; ``y`` is ignored."""
        metric = self.metric if self.metric is not None else self._euclidean
        X = np.asarray(X)
        n_rows = X.shape[0]
        self.D = np.zeros((n_rows, n_rows))
        # Both D[i, j] and D[j, i] are evaluated because the metric is not
        # required to be symmetric; the diagonal stays 0.
        for i, x1 in enumerate(X):
            for j, x2 in enumerate(X):
                if i != j:
                    self.D[i, j] = metric(x1, x2)

    def kneighbors(self, X, k1=None, *args):
        """Return, for each row position of ``X``, the indices of its nearest rows.

        Parameters
        ----------
        X : the rows to query; only the number of rows is used (see class docs).
        k1 : number of neighbours per row; defaults to ``n_neighbors``.
        *args : accepted and ignored, for call compatibility with
            ``kneighbors(X, k, return_distance)``-style calls.

        Returns
        -------
        numpy array with one row of neighbour indices per query row; the row's
        own index is never included.

        Raises
        ------
        RuntimeError : if called before ``fit``.
        """
        if self.D is None:
            raise RuntimeError("CustomKNN.kneighbors called before fit")
        if k1 is None:
            k1 = self.k

        result = []
        for i, _ in enumerate(X):
            # Take one extra candidate because the row itself (distance 0)
            # is normally among the closest and is dropped afterwards.
            candidates = np.argsort(self.D[i, :])[:k1 + 1]
            without_self = [idx for idx in candidates if idx != i]
            # If the row itself was not among the candidates (e.g. many
            # zero-distance duplicates) there is one index too many.
            result.append(without_self[:k1])
        return np.array(result)

In [9]:
from collections import Counter
class HVDM:
    """Heterogeneous value-difference distance between two examples.

    Per attribute ``a`` the distance contribution is

    * categorical attribute: ``sum_c |P(c | a = x) - P(c | a = y)|`` where the
      conditional class probabilities are estimated from ``X`` / ``y``;
    * numeric attribute: ``|x - y| / (4 * std_a)``.

    ``metric`` returns the plain sum of these contributions.

    NOTE(review): Wilson & Martinez define HVDM as the square root of the sum
    of *squared* per-attribute distances; this implementation sums the
    unsquared terms. Left as is so existing results stay comparable --
    confirm this is intended.

    Parameters
    ----------
    X : 2-D numeric array, one row per example.
    y : class labels aligned with the rows of ``X`` (any hashable labels).
    categorical_cols_indices : column indices of ``X`` treated as categorical;
        ``None`` means no categorical columns.
    """

    def __init__(self, X, y, categorical_cols_indices):
        self.X = X
        self.y = y
        self.stds = np.std(X, axis=0)
        self.categorical_cols_indices = categorical_cols_indices

        columns = np.asarray(X).T
        # N_a_x[a][v]: number of examples whose attribute a equals v.
        self.N_a_x = [Counter(col) for col in columns]
        # N_a_x_c[a][(v, c)]: number of examples with attribute a == v and class c.
        self.N_a_x_c = [Counter(zip(col, y)) for col in columns]

        # Iterate over the labels that actually occur instead of assuming
        # they are exactly 0..C-1.
        self.classes = np.unique(y)
        self.C = self.classes.size

        # Set membership is O(1); the check runs once per attribute per pair.
        self._categorical = frozenset(
            categorical_cols_indices if categorical_cols_indices is not None else ()
        )
        # A constant column has std 0; scale it by 1 so that identical values
        # give distance 0 instead of 0/0 = NaN.
        self._scales = np.where(self.stds == 0, 1.0, self.stds)

    def _class_probability(self, a, value, c):
        """Estimate P(class == c | attribute a == value); 0 for unseen values."""
        n_value = self.N_a_x[a][value]
        if n_value == 0:
            return 0.0
        return self.N_a_x_c[a][(value, c)] / n_value

    def metric(self, X1, X2):
        """Return the distance between the two rows ``X1`` and ``X2``."""
        result = []
        for a in range(len(self.N_a_x)):
            v1 = X1[a]
            v2 = X2[a]
            if a in self._categorical:
                vdm = 0
                for c in self.classes:
                    p1 = self._class_probability(a, v1, c)
                    p2 = self._class_probability(a, v2, c)
                    vdm += np.abs(p1 - p2)
                result.append(vdm)
            else:
                result.append(np.abs(v1 - v2) / self._scales[a] / 4)

        return sum(result)

In [10]:
# Collect, per dataset, the fraction of minority / majority examples that fall
# into each safety category (safe, borderline, rare, outlier).
result_dict = defaultdict(list)
safety_dict = defaultdict(list)

ds_names = dt_name_to_cols_to_encode.keys() if test_all_ds else [ds_name]

for dataset_name in ds_names:
    print(dataset_name)
    safety_dict["dataset"].append(dataset_name)

    dataset = datasets[dataset_name]
    X, y = dataset['data'], dataset['target']

    minority_classes = dt_name_minority_classes[dataset_name]
    number_of_minority_examples = sum(label in minority_classes for label in y)
    number_of_majority_examples = X.shape[0] - number_of_minority_examples

    # NOTE(review): `datasets` was passed through ex.one_hot_encode_all earlier,
    # while these column indices come from `experiment` -- confirm they still
    # point at the intended columns of the encoded matrix.
    hvdm = HVDM(X, y, dt_name_to_cols_to_encode[dataset_name])

    s_dict_minority, s_dict_majority = calc_safety_minority_majority_5(
        X, y, minority_classes, metric=hvdm.metric
    )

    # Turn absolute counts into fractions of the respective group size.
    for k, v in s_dict_minority.items():
        safety_dict[f"minority-{k}"].append(v / number_of_minority_examples)

    for k, v in s_dict_majority.items():
        safety_dict[f"majority-{k}"].append(v / number_of_majority_examples)

cmc
dermatology
hayes-roth
new_vehicle
new_yeast
1czysty-cut
2delikatne-cut
3mocniej-cut
4delikatne-bezover-cut
balance-scale
cleveland
cleveland_v2
glass
new_ecoli
new_led7digit
new_winequality-red
thyroid-newthyroid


In [11]:
# One row per analysed dataset: the "dataset" name plus one column per
# "<minority|majority>-<category>" fraction collected in `safety_dict`.
safety_df = pd.DataFrame.from_dict(safety_dict)

In [12]:
# Minority-class safety fractions per dataset, with the dominant category of
# each row highlighted.
minority_cols = sorted(safety_df.filter(regex=("dataset|minority")).columns)
# Restrict the highlight to the numeric columns: with axis=1 the row maximum
# would otherwise have to compare the string "dataset" column with floats,
# which pandas versions that do not auto-select numeric columns reject.
safety_df[minority_cols].style.highlight_max(
    color='lightgreen',
    axis=1,
    subset=[col for col in minority_cols if col != "dataset"],
)

Unnamed: 0,dataset,minority-borderline,minority-outlier,minority-rare,minority-safe
0,cmc,0.45045,0.204204,0.24024,0.105105
1,dermatology,0.0,0.0,0.0,1.0
2,hayes-roth,0.0,0.0,0.0,1.0
3,new_vehicle,0.093525,0.0,0.016787,0.889688
4,new_yeast,0.288889,0.327778,0.188889,0.194444
5,1czysty-cut,0.072222,0.0,0.002778,0.925
6,2delikatne-cut,0.330556,0.047222,0.125,0.497222
7,3mocniej-cut,0.363889,0.141667,0.294444,0.2
8,4delikatne-bezover-cut,0.230556,0.041667,0.083333,0.644444
9,balance-scale,0.0,0.734694,0.265306,0.0


In [13]:
# Majority-class safety fractions per dataset, with the dominant category of
# each row highlighted.
majority_cols = sorted(safety_df.filter(regex=("dataset|majority")).columns)
# Restrict the highlight to the numeric columns: with axis=1 the row maximum
# would otherwise have to compare the string "dataset" column with floats,
# which pandas versions that do not auto-select numeric columns reject.
safety_df[majority_cols].style.highlight_max(
    color='lightgreen',
    axis=1,
    subset=[col for col in majority_cols if col != "dataset"],
)

Unnamed: 0,dataset,majority-borderline,majority-outlier,majority-rare,majority-safe
0,cmc,0.474561,0.077193,0.225439,0.222807
1,dermatology,0.054913,0.0,0.0,0.945087
2,hayes-roth,0.27907,0.0,0.015504,0.705426
3,new_vehicle,0.104895,0.002331,0.025641,0.867133
4,new_yeast,0.419479,0.115798,0.172546,0.292178
5,1czysty-cut,0.046429,0.0,0.009524,0.944048
6,2delikatne-cut,0.115476,0.003571,0.021429,0.859524
7,3mocniej-cut,0.258333,0.00119,0.019048,0.721429
8,4delikatne-bezover-cut,0.12381,0.004762,0.032143,0.839286
9,balance-scale,0.197917,0.0,0.020833,0.78125
