In [1]:
# Notebook setup: numeric locale, library imports, pandas display options and
# matplotlib/seaborn styling used by every figure below.
import locale
# Switch the numeric locale to Brazilian Portuguese; paired with
# 'axes.formatter.use_locale' below so axis tick formatting follows the locale.
# NOTE(review): setlocale raises locale.Error when the "pt_BR" locale is not
# installed under exactly this name — confirm on the target machine.
locale.setlocale(locale.LC_NUMERIC, "pt_BR")

import os
import pandas as pd
import glob
import json
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display limits (None = no limit) for columns, cell width and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Select the PGF backend and configure LaTeX text rendering (pdflatex, serif
# font, matplotlib's own font rc settings not forwarded to pgf).
matplotlib.use("pgf")
matplotlib.rcParams.update({
    'axes.formatter.use_locale': True,
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})
# NOTE(review): this magic runs after matplotlib.use("pgf"); presumably it
# switches on-screen rendering to the inline backend while the .pgf files are
# still produced by savefig — confirm.
%matplotlib inline

# Seaborn look: plain white background, "paper" sizing at the default font scale.
sns.set_style('white')
sns.set_context("paper", font_scale = 1)

In [2]:
# Folder that holds the experiment outputs.
RESULTS_PATH = 'results/'

# Sample-count token embedded in the result folder names and figure titles.
samples = 3000

# Class-ratio tokens ('1to<k>') as they appear in the result folder names.
class_ratios = [
    f'1to{suffix}'
    for suffix in ('2.5', '3', '4', '5', '6', '7', '8', '9', '10', '20')
]

# Radius tokens '<r>, <r>' for r = 0.1 .. 0.5; the plotting cells only read the
# first three characters of each token.
class_1_radius = [', '.join([str(tenths / 10)] * 2) for tenths in range(1, 6)]

# Distance tokens as written in the folder names ('_' stands for the decimal point).
distances = '1_2 0_8 0_4 0_0'.split()

# Grid dimensions: one row per ratio, one column per radius.
total_ratios, total_radius = len(class_ratios), len(class_1_radius)

# 1. Média de desempenhos de teste

In [3]:
# Figure: grid with one row of panels per class ratio (labelled IR) and one
# panel per radius (labelled R_min). Each panel plots, against the distance D,
# the mean test AUPRC / AUROC / F1 over every result file of that configuration.
fig = plt.figure(figsize=(6.29707, 8))
fig.suptitle(
    f'Médias dos desempenhos por $D$, $R_{{min}}$ e $IR$, onde $N = {samples}$',
    y=1.04
)

subfigs = fig.subfigures(nrows=total_ratios, ncols=1)

# Legend/column name -> key of the metric in the result JSON files.
metric_columns = {'AUPRC': 'auprc', 'AUROC': 'roc_auc', 'f1': 'f1-score'}
# LaTeX legend entries, in the same order as metric_columns.
legend_labels = [
    '$\\overline{{AUPRC(E_{\\kappa}(\\tau))}}$',
    '$\\overline{{AUROC(E_{\\kappa}(\\tau))}}$',
    '$\\overline{{f_{1}(E_{\\kappa}(\\tau))}}$',
]
# x positions: the distance tokens with '_' turned back into a decimal point.
x_values = [float(distance.replace('_', '.')) for distance in distances]

last_row = total_ratios - 1
last_col = total_radius - 1
# Middle row / column carry the shared axis labels (row 4 and column 2 for the
# 10 x 5 grid).
label_row = (total_ratios - 1) // 2
label_col = (total_radius - 1) // 2

for i_ratios, ratio in enumerate(class_ratios):
    ratio_str = ratio.split('to')[1].replace('.', ',')
    subfigs[i_ratios].suptitle(f'$IR = {ratio_str}$', y=1.05, fontsize=10)
    axs = subfigs[i_ratios].subplots(nrows=1, ncols=total_radius)

    for i_radius, radius in enumerate(class_1_radius):
        radius_str = radius[:3]
        radius_filename = radius_str.replace('.', '_')
        ax = axs[i_radius]

        # One record of metric means per distance; the frame is built once.
        mean_rows = []
        for distance in distances:
            test_results_dir = os.path.join(
                RESULTS_PATH,
                f'spheres-2d-samples={samples}-radius={radius_filename}-ratio={ratio}-distance={distance}',
                'test_results',
            )
            json_paths = glob.glob(os.path.join(test_results_dir, '*.json'))
            if not json_paths:
                # Fail with the offending folder instead of an opaque KeyError.
                raise FileNotFoundError(f'No test result files found in {test_results_dir}')

            frames = []
            for file_path in json_paths:
                # Estimator label comes from the file name only, not the folder path.
                estimator_name = os.path.basename(file_path)[:-5].replace('__', ' | ')
                with open(file_path, 'r') as f:
                    data = json.load(f)
                estimator_test_df = pd.json_normalize(data)
                estimator_test_df['estimator'] = estimator_name
                frames.append(estimator_test_df[['estimator', 'auprc', 'roc_auc', 'f1-score']])
            test_df = pd.concat(frames, ignore_index=True)

            mean_rows.append(
                {label: test_df[column].mean() for label, column in metric_columns.items()}
            )

        plot_df = pd.DataFrame(mean_rows, index=x_values)

        sns.lineplot(data=plot_df, legend=True, ax=ax)
        ax.set_xlim(0, 1.2)
        ax.set_ylim(0, 1)

        ax.set(xlabel=None, ylabel=None)
        radius_str = radius_str.replace('.', ',')
        ax.text(0.36, 0.05, f'$R_{{min}} = {radius_str}$', fontsize=7)

        ax.tick_params(axis='both', which='major', labelsize=7)

        # y axis: ticks only on the first column, label only on the middle row.
        if i_radius != 0:
            ax.set_yticks([])
        elif i_ratios == label_row:
            ax.set(ylabel='Desempenho')

        # x axis: ticks only on the bottom row, label only on the middle column.
        if i_ratios == last_row:
            ax.set_xticks([1.2, 0.8, 0.4, 0.0])
            if i_radius == label_col:
                ax.set(xlabel='$D$')
        else:
            ax.set_xticks([])

        # Drop the per-panel legend; a single shared legend is anchored to the
        # bottom-right panel and pushed below the grid.
        ax.get_legend().remove()
        if i_ratios == last_row and i_radius == last_col:
            legend = ax.legend(loc='lower center', bbox_to_anchor=(-2, -1.4), ncol=3)
            for legend_text, label in zip(legend.get_texts(), legend_labels):
                legend_text.set_text(label)

for extension in ('pgf', 'png'):
    fig.savefig(
        os.path.join(RESULTS_PATH, f'samples={samples}_generated_data_performance.{extension}'),
        bbox_inches="tight"
    )
plt.show()
plt.close(fig)

# 2. Comparação geral do uso de pré-processamento de amostragem

In [None]:
# Figure: same grid layout (one row per class ratio, one panel per radius).
# Each panel plots, against the distance D, the difference between the mean
# test AUPRC of each sampling method and the mean test AUPRC of the runs
# without sampling.
fig = plt.figure(figsize=(6.29707, 8))
fig.suptitle(
    (
        'Diferenças nas médias de desempenho entre amostragem e não-amostragem\n'
        f'por $D$ e $IR$, para $N = {samples}$'
    ),
    y=1.06
)

subfigs = fig.subfigures(nrows=total_ratios, ncols=1)

# Estimator-name prefixes identifying each sampling method; also the legend
# entries, in plotting order.
sampling_methods = ['SMOTE', 'BorderlineSMOTE', 'ClusterCentroids', 'NearMiss']
# x positions: the distance tokens with '_' turned back into a decimal point.
x_values = [float(distance.replace('_', '.')) for distance in distances]

last_row = total_ratios - 1
last_col = total_radius - 1
# Middle row / column carry the shared axis labels (row 4 and column 2 for the
# 10 x 5 grid).
label_row = (total_ratios - 1) // 2
label_col = (total_radius - 1) // 2

for i_ratios, ratio in enumerate(class_ratios):
    # Decimal comma, matching the IR titles of the other figures.
    ratio_str = ratio.split('to')[1].replace('.', ',')
    subfigs[i_ratios].suptitle(f'$IR = {ratio_str}$', y=1.05, fontsize=10)
    axs = subfigs[i_ratios].subplots(nrows=1, ncols=total_radius)

    for i_radius, radius in enumerate(class_1_radius):
        radius_str = radius[:3]
        radius_filename = radius_str.replace('.', '_')
        ax = axs[i_radius]

        # One record of AUPRC deltas per distance; the frame is built once.
        delta_rows = []
        for distance in distances:
            test_results_dir = os.path.join(
                RESULTS_PATH,
                f'spheres-2d-samples={samples}-radius={radius_filename}-ratio={ratio}-distance={distance}',
                'test_results',
            )
            json_paths = glob.glob(os.path.join(test_results_dir, '*.json'))
            if not json_paths:
                # Fail with the offending folder instead of an opaque KeyError.
                raise FileNotFoundError(f'No test result files found in {test_results_dir}')

            frames = []
            for file_path in json_paths:
                file_name = os.path.basename(file_path)
                # '<sampler>__<estimator>.json' -> '<sampler> | <estimator>'
                estimator_name = file_name[:-5].replace('__', ' | ')
                with open(file_path, 'r') as f:
                    data = json.load(f)
                estimator_test_df = pd.json_normalize(data)
                estimator_test_df['estimator'] = estimator_name
                frames.append(estimator_test_df[['estimator', 'auprc', 'roc_auc', 'f1-score']])
            test_df = pd.concat(frames, ignore_index=True)

            # Baseline = runs whose name has no ' | ' separator (no sampler).
            is_baseline = ~test_df['estimator'].str.contains('|', regex=False)
            baseline_auprc = test_df.loc[is_baseline, 'auprc'].mean()

            # Only AUPRC is plotted, so only AUPRC means are computed.
            delta_rows.append({
                method: test_df.loc[
                    test_df['estimator'].str.startswith(method),
                    'auprc'
                ].mean() - baseline_auprc
                for method in sampling_methods
            })

        plot_df = pd.DataFrame(delta_rows, index=x_values)

        sns.lineplot(data=plot_df, legend=True, ax=ax)
        ax.set_xlim(0, 1.2)
        ax.set_ylim(-.5, .5)

        ax.set(xlabel=None, ylabel=None)
        radius_str = radius_str.replace('.', ',')
        ax.text(0.04, 0.36, f'$R_{{min}} = {radius_str}$', fontsize=7)

        ax.tick_params(axis='both', which='major', labelsize=7)

        # y axis: ticks only on the first column, label only on the middle row.
        if i_radius != 0:
            ax.set_yticks([])
        elif i_ratios == label_row:
            ax.set(ylabel='Diferença no desempenho')

        # x axis: ticks only on the bottom row, label only on the middle column.
        if i_ratios == last_row:
            ax.set_xticks([1.2, 0.8, 0.4, 0.0])
            if i_radius == label_col:
                ax.set(xlabel='$D$')
        else:
            ax.set_xticks([])

        # Drop the per-panel legend; a single shared legend is anchored to the
        # bottom-right panel and pushed below the grid.
        ax.get_legend().remove()
        if i_ratios == last_row and i_radius == last_col:
            ax.legend(title='$\\Delta AUPRC_{{amost.}}$', loc='lower center', bbox_to_anchor=(-2, -2), ncol=2)

for extension in ('pgf', 'png'):
    fig.savefig(
        os.path.join(RESULTS_PATH, f'samples={samples}_generated_data_delta_performance.{extension}'),
        bbox_inches="tight"
    )
plt.show()
plt.close(fig)

# 3. Desempenhos por grupo estimador

In [None]:
# One figure per estimator: same grid as the overall mean-performance figure,
# but the means only include result files whose name contains that estimator.
estimators = [
    'ExtraTreesClassifier',
    'RandomForestClassifier',
    'DecisionTreeClassifier',
    'AdaBoostClassifier',
    'GradientBoostingClassifier'
]

# Legend/column name -> key of the metric in the result JSON files.
metric_columns = {'AUPRC': 'auprc', 'AUROC': 'roc_auc', 'f1': 'f1-score'}
# LaTeX legend entries, in the same order as metric_columns.
legend_labels = [
    '$\\overline{{AUPRC(E_{\\kappa}(\\tau))}}$',
    '$\\overline{{AUROC(E_{\\kappa}(\\tau))}}$',
    '$\\overline{{f_{1}(E_{\\kappa}(\\tau))}}$',
]
# x positions: the distance tokens with '_' turned back into a decimal point.
x_values = [float(distance.replace('_', '.')) for distance in distances]

last_row = total_ratios - 1
last_col = total_radius - 1
# Middle row / column carry the shared axis labels (row 4 and column 2 for the
# 10 x 5 grid).
label_row = (total_ratios - 1) // 2
label_col = (total_radius - 1) // 2

for e in estimators:
    fig = plt.figure(figsize=(6.29707, 8))
    fig.suptitle(
        (
            f'Médias dos desempenhos para o estimador {e}\n'
            f'por $D$, $R_{{min}}$ e $IR$, onde $N = {samples}$'
        ),
        y=1.06
    )

    subfigs = fig.subfigures(nrows=total_ratios, ncols=1)

    for i_ratios, ratio in enumerate(class_ratios):
        ratio_str = ratio.split('to')[1].replace('.', ',')
        subfigs[i_ratios].suptitle(f'$IR = {ratio_str}$', y=1.05, fontsize=10)
        axs = subfigs[i_ratios].subplots(nrows=1, ncols=total_radius)

        for i_radius, radius in enumerate(class_1_radius):
            radius_str = radius[:3]
            radius_filename = radius_str.replace('.', '_')
            ax = axs[i_radius]

            # One record of metric means per distance; the frame is built once.
            mean_rows = []
            for distance in distances:
                test_results_dir = os.path.join(
                    RESULTS_PATH,
                    f'spheres-2d-samples={samples}-radius={radius_filename}-ratio={ratio}-distance={distance}',
                    'test_results',
                )
                json_paths = glob.glob(os.path.join(test_results_dir, '*.json'))
                if not json_paths:
                    # Fail with the offending folder instead of an opaque KeyError.
                    raise FileNotFoundError(f'No test result files found in {test_results_dir}')

                frames = []
                for file_path in json_paths:
                    # Estimator label comes from the file name only, not the folder path.
                    estimator_name = os.path.basename(file_path)[:-5].replace('__', ' | ')
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    estimator_test_df = pd.json_normalize(data)
                    estimator_test_df['estimator'] = estimator_name
                    frames.append(estimator_test_df[['estimator', 'auprc', 'roc_auc', 'f1-score']])
                test_df = pd.concat(frames, ignore_index=True)

                # Rows of the current estimator (literal match, computed once).
                uses_estimator = test_df['estimator'].str.contains(e, regex=False)
                mean_rows.append({
                    label: test_df.loc[uses_estimator, column].mean()
                    for label, column in metric_columns.items()
                })

            plot_df = pd.DataFrame(mean_rows, index=x_values)

            sns.lineplot(data=plot_df, legend=True, ax=ax)
            ax.set_xlim(0, 1.2)
            ax.set_ylim(0, 1)

            ax.set(xlabel=None, ylabel=None)
            radius_str = radius_str.replace('.', ',')
            ax.text(0.36, 0.05, f'$R_{{min}} = {radius_str}$', fontsize=7)

            ax.tick_params(axis='both', which='major', labelsize=7)

            # y axis: ticks only on the first column, label only on the middle row.
            if i_radius != 0:
                ax.set_yticks([])
            elif i_ratios == label_row:
                ax.set(ylabel='Desempenho')

            # x axis: ticks only on the bottom row, label only on the middle column.
            if i_ratios == last_row:
                ax.set_xticks([1.2, 0.8, 0.4, 0.0])
                if i_radius == label_col:
                    ax.set(xlabel='$D$')
            else:
                ax.set_xticks([])

            # Drop the per-panel legend; a single shared legend is anchored to
            # the bottom-right panel and pushed below the grid.
            ax.get_legend().remove()
            if i_ratios == last_row and i_radius == last_col:
                legend = ax.legend(loc='lower center', bbox_to_anchor=(-2, -1.4), ncol=3)
                for legend_text, label in zip(legend.get_texts(), legend_labels):
                    legend_text.set_text(label)

    for extension in ('pgf', 'png'):
        fig.savefig(
            os.path.join(
                RESULTS_PATH,
                f'samples={samples}-estimator={e}_generated_data_performance.{extension}'
            ),
            bbox_inches="tight"
        )
    plt.show()
    # Release the figure before building the next estimator's grid.
    plt.close(fig)

In [None]:
# One figure per estimator: same grid as the overall delta figure, restricted
# to result files whose name contains that estimator. Each panel plots, against
# the distance D, the mean test AUPRC of each sampling method minus the mean
# test AUPRC of the runs without sampling.
estimators = [
    'ExtraTreesClassifier',
    'RandomForestClassifier',
    'DecisionTreeClassifier',
    'AdaBoostClassifier',
    'GradientBoostingClassifier'
]

# Estimator-name prefixes identifying each sampling method; also the legend
# entries, in plotting order.
sampling_methods = ['SMOTE', 'BorderlineSMOTE', 'ClusterCentroids', 'NearMiss']
# x positions: the distance tokens with '_' turned back into a decimal point.
x_values = [float(distance.replace('_', '.')) for distance in distances]

last_row = total_ratios - 1
last_col = total_radius - 1
# Middle row / column carry the shared axis labels (row 4 and column 2 for the
# 10 x 5 grid).
label_row = (total_ratios - 1) // 2
label_col = (total_radius - 1) // 2

for e in estimators:
    fig = plt.figure(figsize=(6.29707, 8))
    fig.suptitle(
        (
            'Diferenças nas médias de desempenho entre amostragem e não-amostragem\n'
            f'para o estimador {e} por $D$ e $IR$, para $N = {samples}$'
        ),
        y=1.06
    )

    subfigs = fig.subfigures(nrows=total_ratios, ncols=1)

    for i_ratios, ratio in enumerate(class_ratios):
        ratio_str = ratio.split('to')[1].replace('.', ',')
        subfigs[i_ratios].suptitle(f'$IR = {ratio_str}$', y=1.05, fontsize=10)
        axs = subfigs[i_ratios].subplots(nrows=1, ncols=total_radius)

        for i_radius, radius in enumerate(class_1_radius):
            radius_str = radius[:3]
            radius_filename = radius_str.replace('.', '_')
            ax = axs[i_radius]

            # One record of AUPRC deltas per distance; the frame is built once.
            delta_rows = []
            for distance in distances:
                test_results_dir = os.path.join(
                    RESULTS_PATH,
                    f'spheres-2d-samples={samples}-radius={radius_filename}-ratio={ratio}-distance={distance}',
                    'test_results',
                )
                json_paths = glob.glob(os.path.join(test_results_dir, '*.json'))
                if not json_paths:
                    # Fail with the offending folder instead of an opaque KeyError.
                    raise FileNotFoundError(f'No test result files found in {test_results_dir}')

                frames = []
                for file_path in json_paths:
                    file_name = os.path.basename(file_path)
                    # '<sampler>__<estimator>.json' -> '<sampler> | <estimator>'
                    estimator_name = file_name[:-5].replace('__', ' | ')
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    estimator_test_df = pd.json_normalize(data)
                    estimator_test_df['estimator'] = estimator_name
                    frames.append(estimator_test_df[['estimator', 'auprc', 'roc_auc', 'f1-score']])
                all_results_df = pd.concat(frames, ignore_index=True)

                # Keep only the rows of the current estimator (literal match).
                test_df = all_results_df.loc[
                    all_results_df['estimator'].str.contains(e, regex=False),
                    :
                ]

                # Baseline = runs whose name has no ' | ' separator (no sampler).
                is_baseline = ~test_df['estimator'].str.contains('|', regex=False)
                baseline_auprc = test_df.loc[is_baseline, 'auprc'].mean()

                # Only AUPRC is plotted, so only AUPRC means are computed.
                delta_rows.append({
                    method: test_df.loc[
                        test_df['estimator'].str.startswith(method),
                        'auprc'
                    ].mean() - baseline_auprc
                    for method in sampling_methods
                })

            plot_df = pd.DataFrame(delta_rows, index=x_values)

            sns.lineplot(data=plot_df, legend=True, ax=ax)
            ax.set_xlim(0, 1.2)
            ax.set_ylim(-.5, .5)

            ax.set(xlabel=None, ylabel=None)
            radius_str = radius_str.replace('.', ',')
            ax.text(0.04, 0.36, f'$R_{{min}} = {radius_str}$', fontsize=7)

            ax.tick_params(axis='both', which='major', labelsize=7)

            # y axis: ticks only on the first column, label only on the middle row.
            if i_radius != 0:
                ax.set_yticks([])
            elif i_ratios == label_row:
                ax.set(ylabel='Diferença no desempenho')

            # x axis: ticks only on the bottom row, label only on the middle column.
            if i_ratios == last_row:
                ax.set_xticks([1.2, 0.8, 0.4, 0.0])
                if i_radius == label_col:
                    ax.set(xlabel='$D$')
            else:
                ax.set_xticks([])

            # Drop the per-panel legend; a single shared legend is anchored to
            # the bottom-right panel and pushed below the grid.
            ax.get_legend().remove()
            if i_ratios == last_row and i_radius == last_col:
                ax.legend(title='$\\Delta AUPRC_{{amost.}}$', loc='lower center', bbox_to_anchor=(-2, -2), ncol=2)

    for extension in ('pgf', 'png'):
        fig.savefig(
            os.path.join(
                RESULTS_PATH,
                f'samples={samples}-estimator={e}_generated_data_delta_performance.{extension}'
            ),
            bbox_inches="tight"
        )
    plt.show()
    # Release the figure before building the next estimator's grid.
    plt.close(fig)