# Here, we do analysis using the validation/test results

In [None]:
top = 1   # Number of top-ranked configurations kept per method on each dataset; with 60 datasets, top-1 is enough.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import kurtosis, skew
from itertools import combinations
import minepy
from collections import Counter
from sklearn.metrics import mutual_info_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state = 2)   # stratified 5-fold cross-validation splitter (shuffled, fixed seed)

In [None]:
import bayesiantests as bt
import matplotlib.pyplot as plt

#https://matplotlib.org/stable/gallery/lines_bars_and_markers/horizontal_barchart_distribution.html
def stacked_bar(results, category_names):
    """
    Draw a horizontal stacked bar chart, one bar per key of *results*.

    Parameters
    ----------
    results : dict
        Maps each bar label to a list with one value per category.
        All lists are expected to have the same length, equal to the
        length of *category_names*.
    category_names : list of str
        Legend labels, one per stacked segment.

    Returns
    -------
    fig, ax
        The figure and axes created by ``plt.subplots``.
    """
    row_labels = list(results.keys())
    values = np.array(list(results.values()))
    cumulative = values.cumsum(axis=1)
    n_categories = values.shape[1]
    # One colour per category, sampled from the middle of the RdYlGn map.
    palette = plt.colormaps['RdYlGn'](np.linspace(0.15, 0.85, n_categories))

    fig, ax = plt.subplots(figsize=(9.2, 5))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    longest_bar = np.sum(values, axis=1).max()
    ax.set_xlim(0, longest_bar)

    for idx, (colname, rgba) in enumerate(zip(category_names, palette)):
        seg_widths = values[:, idx]
        # Each segment starts where the previous categories end.
        seg_starts = cumulative[:, idx] - seg_widths
        bars = ax.barh(row_labels, seg_widths, left=seg_starts, height=0.5,
                       label=colname, color=rgba)

        # Pick a label colour that contrasts with the segment colour.
        red, green, blue, _ = rgba
        if red * green * blue < 0.5:
            label_color = 'white'
        else:
            label_color = 'darkgrey'
        ax.bar_label(bars, label_type='center', color=label_color)

    ax.legend(ncols=len(category_names), bbox_to_anchor=(0, 1),
              loc='upper left', fontsize='small')

    return fig, ax

In [None]:
def BST(rope, baselines, ours, dfs):
    """
    Run ``bt.signtest`` for every (baseline, ours) pair and tabulate the results.

    Parameters
    ----------
    rope : float
        Passed through unchanged as the ``rope`` argument of ``bt.signtest``.
    baselines : list of str
        Column names of the baseline methods.
    ours : list of str
        Column names of the proposed methods.
    dfs : sequence
        ``dfs[i]`` is the table used for ``ours[i]``; it must support
        ``dfs[i][[baseline, ours[i]]]`` column selection
        (presumably a pandas DataFrame -- TODO confirm against callers).

    Returns
    -------
    pandas.DataFrame
        One row per pair, ordered by ``ours`` then ``baselines``, with columns
        ``Baseline``, ``Ours``, ``Basewin_prob``, ``Draw_prob`` and
        ``Ourswin_prob`` holding the three values returned by ``bt.signtest``
        in that order.
    """
    rows = []
    for i, ours_name in enumerate(ours):
        for base_name in baselines:
            names = (base_name, ours_name)
            # Two-column array: baseline scores first, ours second.
            X = np.array(dfs[i][[base_name, ours_name]])
            left, within, right = bt.signtest(X, rope=rope, verbose=True, names=names)
            rows.append((base_name, ours_name, left, within, right))
    return pd.DataFrame(
        rows,
        columns=["Baseline", "Ours", "Basewin_prob", "Draw_prob", "Ourswin_prob"],
    )

In [None]:
# Identifiers of the 60 datasets kept for the analysis. Later cells match them
# against df_meta_loaded['data'] and, as strings, against the "Dataset" column
# of the result tables.
# NOTE(review): later cells slice the per-dataset rows by position
# (e.g. [:15], [15:30]), so the order of this list matters.
final_list = [18, 41, 14, 43, 53, 28, 20, 63, 69, 56, 19, 25, 6, 24, 80, 32, 22, 15, 27, 33, 58,
              46, 29, 64, 62, 17, 47, 13, 44, 9, 49, 55, 3, 35, 67, 54, 12, 7, 39, 36, 4, 79, 59, 52, 5, 57,
              21, 50, 45, 42, 11, 1, 51, 38, 34, 16, 10, 2, 26, 91]
print(len(final_list), final_list)

In [None]:
# Load the dataset meta table; the CSV's unnamed first column is used as the index.
df_meta_loaded = pd.read_csv("df_meta_new.csv", index_col='Unnamed: 0')
df_meta_loaded

In [None]:
# Keep only the meta rows whose 'data' id appears in final_list, preserving
# the original row order, then renumber the index from 0.
keep_mask = df_meta_loaded['data'].isin(final_list)
# Positional row numbers of the kept rows (name retained in case later cells use it).
new_index = [pos for pos, keep in enumerate(keep_mask) if keep]
df_meta = df_meta_loaded.iloc[new_index, :].reset_index(drop=True)
df_meta

# Analysis

### Find the best-performing configuration of each method in the (SMOTE & LLM) results and in the (LSH) results

In [None]:
import pandas as pd
import numpy as np
# Load the SMOTE/LLM result table and give the unnamed first column a name.
df = pd.read_csv('result.csv', low_memory=False).rename(
    columns={'Unnamed: 0': 'Metrics'}
)
df

In [None]:
# Load the LS result table and give the unnamed first column a name.
df_LS = pd.read_csv('result_LS.csv', low_memory=False).rename(
    columns={'Unnamed: 0': 'Metrics'}
)
df_LS

In [None]:
# F1
# Build df_res: for every dataset in final_list, pick the `top` original, SMOTE
# and LLM configurations ranked on row 12 of that dataset's slice of `df`
# (validation F1), and record rows 4 / 12 / 20 of the chosen columns in the
# *_tr / *_va / *_t columns.
# NOTE(review): presumably each dataset contributes 8 train, 8 validation and
# 8 test metric rows in the order of `metrics` -- confirm against result.csv.
base_ind = 2   # number of leading columns sliced off as df_i_base (not score columns)
# Score columns are laid out as [original | SMOTE x 5 strategies | LLM x 5 strategies],
# hence the (1+5+5) divisor; num_class is the width of the original group.
num_class = (df.shape[1]-base_ind)/(1+5+5)
org_ind = int(base_ind+num_class)                             # end of the original-results columns
sm_ind = int(org_ind+(df.shape[1]-base_ind-num_class)/2)      # end of the SMOTE columns; the rest is LLM
metrics = ['Acc', 'Pre', 'Rec', 'Spe', 'F1', 'GM', 'BA', 'AUC'] # 8:Acc, 9:Pre, 10:Rec, 11:Spe, 12:F1, 13:GM, 14:BA, 15:AUC 
# Placeholder table, one row per (dataset, rank); every cell is overwritten in the loop below.
# NOTE(review): columns start as integer zeros and later receive scores --
# newer pandas versions may warn about the dtype change; confirm.
df_res = pd.DataFrame({'DATA':[0 for i in range(top*len(final_list))],
                       'ORG_tr':[0 for i in range(top*len(final_list))],
                       'ORG_va':[0 for i in range(top*len(final_list))],
                       'ORG_t':[0 for i in range(top*len(final_list))],
                       'SMOTE_tr':[0 for i in range(top*len(final_list))],
                       'SMOTE_va':[0 for i in range(top*len(final_list))],
                       'SMOTE_t':[0 for i in range(top*len(final_list))],
                       'LLM_tr':[0 for i in range(top*len(final_list))],
                       'LLM_va':[0 for i in range(top*len(final_list))],
                       'LLM_t':[0 for i in range(top*len(final_list))]})

for i in range(len(final_list)):        # one dataset per iteration
    # NOTE(review): j runs over 16 row positions but only j == 12 does any work.
    for j in range(len(metrics)*2):       # candidate row position of the ranking metric
        if j == 12:                      # only choose one metric (12:validation_F1)
            df_i = df[df["Dataset"] == str(final_list[i])]   # i_th dataset
            df_i_base = df_i.iloc[:,:base_ind]               # i_th dataset base
            df_i_org = df_i.iloc[:,base_ind:org_ind]         # i_th dataset original results (12 classifiers)
            df_i_sm = df_i.iloc[:,org_ind:sm_ind]            # i_th dataset smote results (12 classifier X 5 Resam strategy)
            df_i_lm = df_i.iloc[:,sm_ind:]                   # i_th dataset llm results (12 classifier X 5 Resam strategy)  
            # Column labels of the `top` highest values in row j, per method group.
            best_org = pd.DataFrame(df_i_org.iloc[j,:]).iloc[:,0].sort_values(ascending=False)[:top].index
            df_i_org_best = df_i_org.loc[:,best_org] 
            best_sm = pd.DataFrame(df_i_sm.iloc[j,:]).iloc[:,0].sort_values(ascending=False)[:top].index
            df_i_sm_best = df_i_sm.loc[:,best_sm]
            best_lm = pd.DataFrame(df_i_lm.iloc[j,:]).iloc[:,0].sort_values(ascending=False)[:top].index
            df_i_lm_best = df_i_lm.loc[:,best_lm]
            # Resulting column layout: [base (2) | top ORG | top SMOTE | top LLM].
            df_i_best = pd.concat([df_i_base,df_i_org_best],axis=1)
            df_i_best = pd.concat([df_i_best,df_i_sm_best],axis=1)
            df_i_best = pd.concat([df_i_best,df_i_lm_best],axis=1)
            for k in range(top):
                # Row i*top+k of df_res holds the k-th best configuration of dataset i;
                # rows j-8 / j / j+8 of the slice feed the *_tr / *_va / *_t columns.
                df_res.iloc[i*top+k:i*top+k+1,0] = final_list[i]
                df_res.iloc[i*top+k:i*top+k+1,1] = list(df_i_best.iloc[[j-8],2+k])
                df_res.iloc[i*top+k:i*top+k+1,2] = list(df_i_best.iloc[[j],2+k])
                df_res.iloc[i*top+k:i*top+k+1,3] = list(df_i_best.iloc[[j+8],2+k])
                df_res.iloc[i*top+k:i*top+k+1,4] = list(df_i_best.iloc[[j-8],2+top+k])
                df_res.iloc[i*top+k:i*top+k+1,5] = list(df_i_best.iloc[[j],2+top+k])
                df_res.iloc[i*top+k:i*top+k+1,6] = list(df_i_best.iloc[[j+8],2+top+k])
                df_res.iloc[i*top+k:i*top+k+1,7] = list(df_i_best.iloc[[j-8],2+top+top+k])
                df_res.iloc[i*top+k:i*top+k+1,8] = list(df_i_best.iloc[[j],2+top+top+k])
                df_res.iloc[i*top+k:i*top+k+1,9] = list(df_i_best.iloc[[j+8],2+top+top+k])

In [None]:
# F1-LS
# Same selection for the LS results: rank the LS columns on row 12 of each
# dataset's slice of df_LS and record rows 4 / 12 / 20 of the `top` best
# columns as LS_tr / LS_va / LS_t.
# Relies on `base_ind`, `metrics`, `top` and `final_list` set in earlier cells.
df_res_LS = pd.DataFrame({'LS_tr':[0 for i in range(top*len(final_list))],
                          'LS_va':[0 for i in range(top*len(final_list))],
                          'LS_t':[0 for i in range(top*len(final_list))]})
for i in range(len(final_list)):        # one dataset per iteration
    # NOTE(review): j runs over 16 row positions but only j == 12 does any work.
    for j in range(len(metrics)*2):       # candidate row position of the ranking metric
        if j == 12:                      # only choose one metric (12:validation_F1)
            df_i = df_LS[df_LS["Dataset"] == str(final_list[i])]   # i_th dataset
            df_i_base = df_i.iloc[:,:base_ind]               # i_th dataset base
            df_i_LS = df_i.iloc[:,base_ind:]                 # i_th dataset LS results 
            # Column labels of the `top` highest values in row j.
            best_LS = pd.DataFrame(df_i_LS.iloc[j,:]).iloc[:,0].sort_values(ascending=False)[:top].index
            df_i_LS_best = df_i_LS.loc[:,best_LS]     
            df_i_best = pd.concat([df_i_base,df_i_LS_best],axis=1)
            for k in range(top):
                # The literal 2 skips the base columns (same value as base_ind).
                df_res_LS.iloc[i*top+k:i*top+k+1,0] = list(df_i_best.iloc[[j-8],2+k])
                df_res_LS.iloc[i*top+k:i*top+k+1,1] = list(df_i_best.iloc[[j],2+k])
                df_res_LS.iloc[i*top+k:i*top+k+1,2] = list(df_i_best.iloc[[j+8],2+k])

In [None]:
# Place the LS columns next to the ORG/SMOTE/LLM columns; rows are aligned on the index.
df_res_final = pd.concat([df_res, df_res_LS], axis=1)

In [None]:
# Alias, not a copy: df_best_scores and df_res_final refer to the same DataFrame.
df_best_scores = df_res_final
df_best_scores

In [None]:
# Averaged results: collapse each dataset's `top` consecutive rows of
# df_best_scores into one row by averaging every score column
# (with top == 1 this simply copies the values).
df_best_scores_avg = pd.DataFrame(final_list, columns=['DATA'])
n_datasets = len(df_best_scores_avg)
for col_pos in range(1, df_best_scores.shape[1]):
    score_column = df_best_scores.iloc[:, col_pos]
    column_name = f'{df_best_scores.columns[col_pos]}'
    df_best_scores_avg[column_name] = [
        np.average(score_column.iloc[start:start + top])
        for start in range(0, n_datasets * top, top)
    ]
df_best_scores_avg

## 1.Performance - comparison

In [None]:
# Table 1 (SMOTE/BSM/ADA/GPT/LSH-G)
# (short tag used in the new column names, test-score column of the competitor)
# NOTE(review): 'BSM_t' and 'ADA_t' are not among the ORG/SMOTE/LLM/LS columns
# built by the selection cells in this notebook -- confirm where they are added.
competitors = (("SM", "SMOTE_t"), ("BS", "BSM_t"), ("AD", "ADA_t"), ("LM", "LLM_t"))

# Performance Margin: LS test score minus the competitor's test score.
for tag, rival_col in competitors:
    ls_scores = df_best_scores_avg.loc[:, 'LS_t'].astype(float)
    rival_scores = df_best_scores_avg.loc[:, rival_col].astype(float)
    df_best_scores_avg[f'(LS-{tag})_t'] = ls_scores - rival_scores

# Winning Number: per-dataset 0/1 flags for LS better / equal / worse.
for tag, rival_col in competitors:
    score_pairs = list(zip(df_best_scores_avg["LS_t"], df_best_scores_avg[rival_col]))
    df_best_scores_avg[f'(LS>{tag})_t'] = [1 if ls > rival else 0 for ls, rival in score_pairs]
    df_best_scores_avg[f'(LS={tag})_t'] = [1 if ls == rival else 0 for ls, rival in score_pairs]
    df_best_scores_avg[f'(LS<{tag})_t'] = [1 if ls < rival else 0 for ls, rival in score_pairs]

df_best_scores_avg.iloc[:,16:]

In [None]:
# winning numbers: one line per competitor with the counts of LS wins, ties and losses
for tag in ("SM", "BS", "AD", "LM"):
    wins, ties, losses = (
        sum(df_best_scores_avg[f'(LS{sign}{tag})_t']) for sign in (">", "=", "<")
    )
    print(wins, ties, losses)

In [None]:
# average performance: mean test score of each method over all datasets
for label, score_col in (("SM_t", 'SMOTE_t'), ("BS_t", 'BSM_t'), ("AD_t", 'ADA_t'),
                         ("LM_t", 'LLM_t'), ("LS_t", 'LS_t')):
    print(label, np.average(df_best_scores_avg.loc[:, score_col]))

# performance margin: mean of (LS - competitor) over all datasets
for tag in ("SM", "BS", "AD", "LM"):
    margin_col = f'(LS-{tag})_t'
    print(f"avg. margin {margin_col}:", np.average(df_best_scores_avg.loc[:, margin_col]))

## 2. Performance by imbalance

In [None]:
# IMBALANCE GROUPS: number of datasets in each of the four positional row groups
for group_rows in (slice(None, 15), slice(15, 30), slice(30, 44), slice(44, None)):
    print(len(df_best_scores_avg.iloc[group_rows, :]))

In [None]:
# Relative Improvement (# Table 1 (SMOTE/BSM/ADA/GPT/LSH-G))
# For each competitor: margin (LS - competitor) as a percentage of the competitor's test score.
competitors = (("SM", "SMOTE_t"), ("BS", "BSM_t"), ("AD", "ADA_t"), ("LM", "LLM_t"))
selected_cols = [col for tag, rival_col in competitors
                 for col in (f"(LS-{tag})_t", rival_col)]
df_rel = df_best_scores_avg.loc[:, selected_cols]
for tag, rival_col in competitors:
    df_rel[f"{tag}_rel"] = df_rel[f"(LS-{tag})_t"] / df_rel[rival_col] * 100
df_rel

In [None]:
# A competitor score of 0 makes the relative improvement +inf; count it as 100 %.
# (Replacing with np.nan instead would drop those datasets from the group means.)
df_rel = df_rel.replace(np.inf, 100)

# Mean relative improvement per imbalance group, addressed by row position.
# NOTE(review): the original printed "less" for both the [44:] and the [:15]
# group, making them indistinguishable; the [:15] group is labelled "high"
# here on the assumption that it is the opposite extreme -- confirm.
imbalance_groups = (
    ("less", slice(44, None)),
    ("mod", slice(30, 44)),
    ("mid", slice(15, 30)),
    ("high", slice(None, 15)),
)
for group_label, group_rows in imbalance_groups:
    for tag in ("SM", "BS", "AD", "LM"):
        print(f"LS-{tag}", group_label, np.mean(df_rel[f"{tag}_rel"].iloc[group_rows]))
    print('*'*20)

## 3. Robustness - Achievement Rate

In [None]:
# Achievement Rate (AR_t/va) - Table 1: test score divided by validation score.
# Tuples are (label used in the new column name, prefix of the score columns).
for label, prefix in (("SM", "SMOTE"), ("LM", "LLM"), ("LS", "LS")):
    test_scores = df_best_scores_avg.loc[:, f'{prefix}_t'].astype(float)
    val_scores = df_best_scores_avg.loc[:, f'{prefix}_va'].astype(float)
    df_best_scores_avg[f'{label}_t/{label}_va'] = test_scores / val_scores

# Achievement Rate (AR_t/va) - Table 2
# NOTE(review): df_best_scores_avg_2 is not created by any cell shown before
# this point -- confirm it is defined before this cell runs.
for label, prefix in (("BSM", "BSM"), ("ADA", "ADA"), ("LM2", "DEV"), ("LS2", "LS2")):
    test_scores = df_best_scores_avg_2.loc[:, f'{prefix}_t'].astype(float)
    val_scores = df_best_scores_avg_2.loc[:, f'{prefix}_va'].astype(float)
    df_best_scores_avg_2[f'{label}_t/{label}_va'] = test_scores / val_scores

In [None]:
# Collect the achievement-rate columns by position: the last 3 columns of
# df_best_scores_avg and the last 4 of df_best_scores_avg_2, so this must run
# right after those columns are appended.
df_AR = pd.concat([df_best_scores_avg.iloc[:,-3:], df_best_scores_avg_2.iloc[:,-4:]], axis=1)
df_AR

In [None]:
# testing/validation: mean and standard deviation of each achievement-rate column
ar_labels = ("SM", "BSM", "ADA", "LM", "LS", "LM2", "LS2")
for label in ar_labels:
    ratios = df_AR.loc[:, f"{label}_t/{label}_va"]
    print(label, np.average(ratios), np.std(ratios))

# variance of each achievement-rate column
for label in ar_labels:
    print(label, np.var(df_AR.loc[:, f"{label}_t/{label}_va"]))