# Análise dos resultados (V2)

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as w
import colorcet as cc

Definições uteis para outras situações:

In [83]:
%load_ext autoreload
%autoreload 2
from utils import RESULTS_V2_PATH, DATASET_LIST, ABREV_DICT
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Coletando resultados

In [3]:
csv_files = !ls  $RESULTS_V2_PATH

In [4]:
def get_results_info(file_list: list) -> pd.DataFrame:
    
    results_info = pd.DataFrame(file_list, columns=["file"])
    results_info[["dataset", "split", "learner", "method"]] = results_info.file.str.split("_", n=3, expand=True)
    results_info['file'] = results_info.file.map(lambda x: os.path.join(RESULTS_V2_PATH, x))
    results_info['method'] = results_info.method.apply(lambda x: x.split('.')[0])
    
    return results_info

results_info = get_results_info(csv_files)
results_info.head()

Unnamed: 0,file,dataset,split,learner,method
0,../results/v2/abalone-3class_1x5_5NN_borderlin...,abalone-3class,1x5,5NN,borderline_points_sampling
1,../results/v2/abalone-3class_1x5_5NN_class_bal...,abalone-3class,1x5,5NN,class_balance_sampling
2,../results/v2/abalone-3class_1x5_5NN_class_lik...,abalone-3class,1x5,5NN,class_likelihood_sampling
3,../results/v2/abalone-3class_1x5_5NN_class_lik...,abalone-3class,1x5,5NN,class_likeliood_diff_sampling
4,../results/v2/abalone-3class_1x5_5NN_density_w...,abalone-3class,1x5,5NN,density_weighted_sampling


## Análise de curvas de aprendizado

In [78]:
import seaborn as sns

n_methods = results_info.method.nunique()
color_list = sns.color_palette(cc.glasbey_dark ,n_colors=n_methods)
COLOR_DICT = dict(zip(results_info.method.unique().tolist(), color_list))
COLOR_DICT["random_sampling"] = "firebrick"

In [100]:
def plot_learning_curve(dataset, learner):
    
    fig, ax = plt.subplots()
    ax.grid(True)
    ax.set_ylim(-1,1)
    ax.set_title(f"Average learning curves for AL methods")
    
    
    result_files = results_info[(results_info.learner == learner) & (results_info.dataset == dataset)].file.tolist()

    results_df = pd.concat(pd.read_csv(f) for f in result_files)

    auc_dict = {}
    for method, data in results_df.groupby("method"):
        avg_scores =  data.groupby("query").kappa.mean()
        auc = (np.trapz(avg_scores, avg_scores.index))
        auc_dict[method] = auc

    ranked_method_list = sorted(auc_dict.items(), key=lambda x: x[1],reverse=True)

    i=0
    for method, _ in ranked_method_list:
        avg_curve = results_df[results_df.method == method].groupby("query").kappa.mean()
        ax.plot(avg_curve.rolling(window=5, min_periods=1).mean(),
                utils.get_style(method), linewidth=0.5, color=COLOR_DICT[method], label=ABREV_DICT[method],
            markevery=(0.3, 0.2))
        i+=1
        
    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.02), ncols=1, prop={'size':8}, framealpha=1)

w.interact(plot_learning_curve,
          dataset=w.SelectionSlider(options=utils.DATASET_LIST),
          learner=w.ToggleButtons(options=utils.ABREV_MODEL.keys(), value="SVC"))

interactive(children=(SelectionSlider(description='dataset', options=('abalone-3class', 'artificial-characters…

<function __main__.plot_learning_curve(dataset, learner)>

## Coleta Resultados

In [132]:
def get_results_df(results_info) -> pd.DataFrame:
    df = pd.concat(pd.read_csv(f) for f in  results_info.file)
    
    return df
    

In [133]:
results_df = get_results_df(results_info)

In [136]:
results_df.head()

Unnamed: 0,time,dataset,classifier,method,run,fold,query,kappa
0,2025-03-23 18:41:24.629248,abalone-3class,5NN,borderline_points_sampling,0,0,0,0.0
1,2025-03-23 18:41:29.325717,abalone-3class,5NN,borderline_points_sampling,0,0,1,0.231167
2,2025-03-23 18:41:34.087002,abalone-3class,5NN,borderline_points_sampling,0,0,2,0.314495
3,2025-03-23 18:41:38.828586,abalone-3class,5NN,borderline_points_sampling,0,0,3,0.249614
4,2025-03-23 18:41:43.579852,abalone-3class,5NN,borderline_points_sampling,0,0,4,0.212956


## Plotando Resuldados

### Ranking Curves

In [145]:
def get_avg_ranks(results_df) -> pd.DataFrame:
    # Afere o kappa médio para cada query registrada
    avg_kappa = results_df.groupby(["classifier","method", "query", "dataset"]).apply(lambda x: x.kappa.mean())

    # Afere o rank de cada método para cada dataset
    ranks_df = avg_kappa.to_frame(name="kappa")
    ranks_df['rank'] = ranks_df.groupby(["query", "dataset"]).kappa.rank(ascending=False)

    # Calcula rank médio por query para cada 
    avg_ranks = ranks_df.groupby(["classifier", "method", "query"])["rank"].mean()
    return avg_ranks


In [146]:
avg_ranks_df = get_avg_ranks(results_df)

In [193]:
def plot_learning_curves(avg_ranks_df, classifier):
    fig, ax = plt.subplots()
    ax.grid(True)
    
    # Calcula rank médio por query para cada 
    clf_ranks = avg_ranks_df.loc[classifier]
    for method, data in clf_ranks.groupby("method"):

        # aplica média móvel nos dados:
        # smoothed_data = data.rolling(window=5, min_periods=1).mean()
        smoothed_data = data
        
        ax.plot(smoothed_data.values, utils.get_style(method),
                label=ABREV_DICT.get(method), color=COLOR_DICT[method], markevery=(0.3, 0.2),
                linewidth=(0.5 if utils.get_hm_type(method) != 'Classic' else 1.5) )
    
    ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.02), ncols=1, prop={'size': 8}, framealpha=1)
    ax.invert_yaxis()

w.interact(plot_learning_curves,
           avg_ranks_df=w.fixed(avg_ranks_df),
           classifier=w.ToggleButtons(options=utils.ABREV_MODEL.keys(), description="Leaner:"))


interactive(children=(ToggleButtons(description='Leaner:', options=('GaussianNB', 'SVC', 'DecisionTree', '5NN'…

<function __main__.plot_learning_curves(avg_ranks_df, classifier)>

## Group Bands

In [None]:
def plot_group_bands(avg_ranks):
    df = avg_ranks.reset_index()
    df['group'] = df.method.apply(utils.get_hm_type)

    fig, ax = plt.subplots()
    df_clf = df[df.classifier=="SVC"]

    for group, data in df_clf.groupby(["group"]):

        min_ranks = data.groupby("query")["rank"].min()
        max_ranks = data.groupby("query")["rank"].max()
        ax.fill_between(np.arange(len(min_ranks)), min_ranks, max_ranks, alpha=0.3)


plot_group_bands(avg_ranks_df)

In [210]:
mean_kappa = results_df.groupby(["classifier", "method", "dataset", "query"]).kappa.mean()

In [223]:
ranks_data = mean_kappa.groupby(["classifier",  "dataset", "query"]).rank(ascending=False)

In [249]:
mean_kappa.loc["DecisionTree", "class_balance_sampling", "banana"]

query
0      0.031483
1      0.014254
2      0.023462
3      0.047551
4      0.043539
         ...   
96     0.301779
97     0.290296
98     0.307134
99     0.306423
100    0.307134
Name: kappa, Length: 101, dtype: float64

In [250]:
ranks_data.loc["DecisionTree", "class_balance_sampling", "banana"]

query
0       2.0
1      21.5
2      20.5
3      15.5
4      17.5
       ... 
96     18.0
97     19.0
98     17.0
99     17.0
100    17.0
Name: kappa, Length: 101, dtype: float64

In [244]:
comparisson = (mean_kappa.loc[:, "minority_value_sampling", :] == mean_kappa.loc[:, "class_balance_sampling", :])

In [248]:
comparisson[comparisson==False].reset_index().classifier.unique()

array(['DecisionTree'], dtype=object)

In [191]:
avg_ranks_df.loc["SVC", "minority_value_sampling"]

query
0      34.750000
1      38.411111
2      38.122222
3      42.444444
4      42.022222
         ...    
96     46.183333
97     46.266667
98     46.194444
99     46.250000
100    46.294444
Name: rank, Length: 101, dtype: float64