# Results

## Imports and functions

In [1]:
import os
from enum import Enum

import pandas as pd
# `display` and `HTML` are part of the public `IPython.display` API;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Stretch the notebook container to the full browser width so the wide score tables fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Show values with five decimals and never truncate the rows of a displayed table.
pd.set_option("display.precision", 5)
pd.set_option('display.max_rows', None)

In [2]:
# Location of the pickled metadata DataFrame that save_results, remove_results and load_results read and write.
metadata_file = "./results/metadata.pkl"

class dataset_types(Enum):
    """Dataset splits a prediction can belong to.

    Members are ordered by their integer value (train < development < test),
    which makes them sortable.
    """
    train = 1
    development = 2
    test = 3

    def title(self):
        """Return the short display label of the split: 'Train', 'Dev' or 'Test'."""
        # Values start at 1, so shift by one to index the tuple.
        shorten_names = ("Train", "Dev", "Test")
        return shorten_names[self.value - 1]

    def __lt__(self, other):
        """Order members by value; defer to Python for objects of any other type."""
        if not isinstance(other, type(self)):
            # Lets Python try the reflected operation or raise a regular TypeError
            # instead of failing with an AttributeError on `other.value`.
            return NotImplemented
        return self.value < other.value

def save_results(y_pred, index, name, task, language, dataset_type, group=None, description=None, truth=False, filename=None):
    """Auxiliary function that saves a prediction with pickle and registers it in the metadata file.

    The prediction is written to
    './results/<task>/<language>/<dataset_type.name>[/<group>]/<filename or name>.pkl'
    as a DataFrame indexed by 'id' with a single 'y_pred' column.

    The metadata file (`metadata_file`) is a pickled DataFrame indexed by that path. It stores:
    - Path: where the prediction is stored (index).
    - Name: name of the model.
    - Description: extra information about the prediction. For example, 'the model parameters'.
    - Dataset type: member of `dataset_types`.
    - Groud Truth: True if the path contains the truth and not a prediction (column name kept as spelled).
    - Group: model group (used in path). Example, 'best_traditional'.
    - Task: name of the task (used in path). Example, 'hateval2019/task1'.
    - Language: language of the task (used in path).

    Parameters:
    - `y_pred`: values stored in the 'y_pred' column.
    - `index`: values stored in the 'id' index.
    - `name`, `task`, `language`, `dataset_type`, `group`, `description`, `truth`: metadata fields listed above.
    - `filename`: name of the file (used in path); defaults to `name`.

    An entry already registered under the same path is removed before the new one is written.
    """
    group_part = "" if group is None else "/" + group
    file_stem = name if filename is None else filename
    path = f"./results/{task}/{language}/{dataset_type.name}{group_part}/{file_stem}.pkl"

    # Create the target directory (and any missing parents) if needed.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    if os.path.exists(metadata_file):
        # NOTE: pickle files must come from a trusted source; unpickling can execute code.
        metadata = pd.read_pickle(metadata_file)
    else:
        # First save: start from an empty, typed metadata table.
        metadata = pd.DataFrame({
            "Path": pd.Series([], dtype=str),
            "Name": pd.Series([], dtype=str),
            "Description": pd.Series([], dtype=str),
            "Dataset type": pd.Categorical([], categories=dataset_types, ordered=False),
            "Groud Truth": pd.Series([], dtype=bool),
            "Group": pd.Series([], dtype=str),
            "Task": pd.Series([], dtype=str),
            "Language": pd.Series([], dtype=str),
        }).set_index("Path")

    # Replace any previous entry stored under the same path.
    if path in metadata.index:
        metadata = remove_results(path)

    # Fixed: the original referenced the undefined name `lenguage`, raising NameError on every call.
    metadata.loc[path] = {"Name": name, "Description": description, "Dataset type": dataset_type, "Groud Truth": truth, "Group": group, "Task": task, "Language": language}
    results = pd.DataFrame({"id": index, "y_pred": y_pred}).set_index("id")

    results.to_pickle(path)
    metadata.to_pickle(metadata_file)

    print("Results saved on: " + path)

def remove_results(path=None):
    """Removes the prediction stored at `path` from the disk and from the metadata.
    In case `path is None`, removes from the disk all files in './results' that aren't in the metadata
    (the metadata file itself is always kept) and prints each removed file.

    Returns the metadata DataFrame after the removal, or `None` when the metadata file does not exist
    (in that case nothing is removed).
    """
    # Without a metadata file there is nothing to compare against, so do nothing.
    if not os.path.exists(metadata_file):
        return None

    # NOTE: pickle files must come from a trusted source; unpickling can execute code.
    metadata = pd.read_pickle(metadata_file)

    if path is not None:
        # Drop the metadata row even when the file is already gone, so stale entries
        # (which would break loading every registered path) can be cleaned up.
        if path in metadata.index:
            metadata = metadata.drop(path)
            metadata.to_pickle(metadata_file)
        # Remove the file even when it was never registered, instead of raising KeyError.
        if os.path.exists(path):
            os.remove(path)
        return metadata

    # Files that must survive the clean-up: every registered prediction plus the
    # metadata file itself. The metadata file is excluded explicitly rather than by
    # assuming it is the first file yielded by os.walk.
    kept_files = {os.path.normpath(f) for f in metadata.index}
    kept_files.add(os.path.normpath(metadata_file))

    for dirpath, _dirnames, filenames in os.walk('./results'):
        for filename in filenames:
            candidate = os.path.normpath(os.path.join(dirpath, filename))
            if candidate not in kept_files:
                print(candidate)
                os.remove(candidate)
    return metadata

    
def load_results():
    """Loads the metadata DataFrame from disk and attaches every stored prediction to it.

    Each prediction is read from the path in the metadata index and placed, as loaded,
    in a new "Prediction" column. Returns `None` when no metadata file exists,
    i.e. when nothing has been stored yet.
    """
    if not os.path.exists(metadata_file):
        return None

    table = pd.read_pickle(metadata_file)

    # Read the pickled prediction registered under each path, in index order.
    predictions = []
    for stored_path in table.index:
        predictions.append(pd.read_pickle(stored_path))

    table["Prediction"] = predictions
    return table

In [3]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def score(y_true, y_pred, name, f1_average):
    """Computes F1, accuracy, precision and recall of a prediction.

    - `y_true`: gold labels, or `None` when no gold standard is available.
    - `y_pred`: predicted labels.
    - `name`: name of the model (currently unused, kept for interface compatibility).
    - `f1_average`: averaging mode passed as `average` to F1, precision and recall
      (for example 'macro' or 'binary').

    Returns a dictionary with the keys "F1", "Accuracy", "Precision" and "Recall" (in that order);
    every value is `None` when `y_true` is `None`.
    """
    if y_true is None:
        return {"F1": None, "Accuracy": None, "Precision": None, "Recall": None}

    # Precision and recall use the same averaging as F1. Previously they always used the
    # default binary averaging, so macro-F1 rows were paired with binary precision/recall.
    return {
        "F1": f1_score(y_true, y_pred, average=f1_average, zero_division=0),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average=f1_average, zero_division=0),
        "Recall": recall_score(y_true, y_pred, average=f1_average, zero_division=0),
    }

def score_hateval_task1(y_true, y_pred, name):
    """Scores a 'hateval2019/task1' prediction by calling `score` with 'macro' averaging."""
    averaging = 'macro'
    return score(y_true, y_pred, name, averaging)
    
def score_detoxis_task1(y_true, y_pred, name):
    """Scores a 'detoxis/task1' prediction by calling `score` with 'binary' averaging."""
    averaging = 'binary'
    return score(y_true, y_pred, name, averaging)

## Loads Results

In [4]:
# Load the metadata table together with every stored prediction
df_results = load_results()

# load_results returns None when nothing has been saved yet; fail with a clear message
# instead of an opaque "'NoneType' object is not subscriptable" on the next line
if df_results is None:
    raise FileNotFoundError("No results metadata found; store at least one prediction with save_results first.")

# Rows flagged as ground truth hold the gold standard of a dataset
mask = df_results["Groud Truth"] == True

# Split the table into gold standards and model predictions
df_truth = df_results[mask]
df_pred = df_results[~mask]

In [5]:
# Scoring function used for each supported task
task_scorers = {
    "hateval2019/task1": score_hateval_task1,
    "detoxis/task1": score_detoxis_task1,
}

# List of dictionaries (scores plus display fields), one per prediction
results = []

for _path, row in df_pred.iterrows():
    # Read the fields by column name so the loop does not depend on the column order
    name, desc = row["Name"], row["Description"]
    dataset_type, group = row["Dataset type"], row["Group"]
    task, language = row["Task"], row["Language"]
    y_pred = row["Prediction"]

    # Tries to find the gold standard corresponding to the current dataset
    gold = df_truth.loc[
        (df_truth["Dataset type"] == dataset_type)
        & (df_truth["Task"] == task)
        & (df_truth["Language"] == language),
        "Prediction",
    ]
    # The selection is indexed by path, so take the first match by position;
    # without a gold standard the scores are left empty (None)
    y_true = None if gold.empty else gold.iloc[0]

    # Checks the task type to find the corresponding scoring method
    scorer = task_scorers.get(task)
    if scorer is None:
        raise Exception("Unknown Task")
    result = scorer(y_true, y_pred, name)

    # Reformats features to better values; save_results allows a missing group,
    # which is shown as an empty label instead of crashing on `.split`
    result.update({"Dataset type": dataset_type.title(),
                   "Task": ' '.join(task.split('/')).title(),
                   "Name": name,
                   "Group": "" if pd.isna(group) else ' '.join(group.split('_')).title(),
                   "Language": language.title(),
                   "Description": desc})

    results.append(result)

# Generates base DataFrame model
df_results = pd.DataFrame(results).set_index(["Task", "Language", "Dataset type", "Group", "Name", "Description"]).sort_index(level=[0, 1, 2, 3, 4])

## Option 1

Scores are grouped by task, language and dataset type, and sorted by F1 (highest first). This seems to be the least useful view.

In [6]:
# Names of the metric columns (reused by the following options)
score_names = df_results.columns.to_list()

# Ascending on the three grouping levels, descending on every metric; the group level is hidden
df_results_1 = (
    df_results
    .sort_values(
        by=["Task", "Language", "Dataset type", *score_names],
        ascending=[True, True, True, *([False] * len(score_names))],
    )
    .droplevel("Group")
)
df_results_1.droplevel("Description")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,Accuracy,Precision,Recall
Task,Language,Dataset type,Name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Detoxis Task1,Spanish,Dev,Bert base (2 epochs),0.66368,0.78355,0.74,0.60163
Detoxis Task1,Spanish,Dev,RidgeClassifier SMOTE_IPF,0.6616,0.74315,0.62143,0.70732
Detoxis Task1,Spanish,Dev,RidgeClassifier SMOTE_TomekLinks,0.65909,0.74026,0.61702,0.70732
Detoxis Task1,Spanish,Dev,RidgeClassifier G_SMOTE,0.65774,0.7417,0.62094,0.69919
Detoxis Task1,Spanish,Dev,RidgeClassifier SMOBD,0.65774,0.7417,0.62094,0.69919
Detoxis Task1,Spanish,Dev,RidgeClassifier Assembled_SMOTE,0.65655,0.73882,0.61566,0.70325
Detoxis Task1,Spanish,Dev,Bert base (3 epochs),0.65471,0.77778,0.73,0.5935
Detoxis Task1,Spanish,Dev,RidgeClassifier polynom_fit_SMOTE,0.64916,0.73016,0.60279,0.70325
Detoxis Task1,Spanish,Dev,Bert Avarage,0.64819,0.7619,0.68161,0.61789
Detoxis Task1,Spanish,Dev,Bert Avarage (2 epochs| differnt lr),0.64775,0.78499,0.77401,0.55691


## Option 2

Scores are grouped by task, language and group, and sorted by test F1 (if it exists) or dev F1. This allows comparing models within the same group.

In [7]:
# Pivot the scores so that each (Task, Language, Group, Name) row gets one column per metric and dataset split.
# One aggregation function is built per `dataset_types` member; the outer lambda is called immediately with the
# split title, so every inner lambda keeps its own `t`. Each inner lambda selects `x[:, :, t]` when `t` appears
# in index level 2 ("Dataset type") and yields None otherwise.
# NOTE(review): presumably each group holds at most one row per split, so the selection is a single value - confirm.
df_results_2 = df_results.groupby(["Task", "Language",'Group','Name'], as_index=False).aggregate([(lambda t: lambda x: x[:, :, t] if x.index.isin([t], level=2).any() else None)(t.title()) for t in dataset_types])
# Relabel the aggregated columns as (metric, split) pairs; `score_names` comes from the Option 1 cell.
df_results_2.columns = pd.MultiIndex.from_product([score_names, [t.title() for t in dataset_types]])
# Order by task, language and group (ascending), then by Test F1 and Dev F1 (descending).
df_results_2 = df_results_2.sort_values(by=['Task', 'Language', 'Group', ("F1", dataset_types.test.title()), ("F1", dataset_types.development.title())], ascending=3*[True] + 2*[False])

df_results_2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Task,Language,Group,Name,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Detoxis Task1,Spanish,Deep Learning,Bert base (2 epochs),0.83636,0.66368,0.55882,0.89928,0.78355,0.76431,0.88682,0.74,0.56118,0.79134,0.60163,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert base (3 epochs),0.85899,0.65471,0.54286,0.91336,0.77778,0.7486,0.91261,0.73,0.52988,0.81132,0.5935,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert Avarage,0.98041,0.64819,0.53975,0.98736,0.7619,0.75309,0.98871,0.68161,0.53975,0.97225,0.61789,0.53975
Detoxis Task1,Spanish,Deep Learning,Bert base (4 epochs),0.95833,0.64069,0.51653,0.97329,0.76046,0.73737,0.97257,0.68519,0.5102,0.94451,0.60163,0.52301
Detoxis Task1,Spanish,Deep Learning,Bert Avarage (2 epochs| differnt lr),0.86458,0.64775,0.5157,0.91733,0.78499,0.75758,0.92532,0.77401,0.55556,0.81132,0.55691,0.48117
Detoxis Task1,Spanish,Deep Learning,Bert Avarage (3 epochs),0.94378,0.62882,0.49886,0.96426,0.75469,0.75421,0.96628,0.67925,0.55051,0.92231,0.58537,0.45607
Detoxis Task1,Spanish,Deep Learning,Atalaya,0.23953,0.06178,0.02439,0.71805,0.64935,0.73064,0.97619,0.61538,0.42857,0.13651,0.03252,0.01255
Detoxis Task1,Spanish,Sbert,Multi-layer Perceptron classifier,0.66381,0.6186,0.47332,0.80181,0.76335,0.74523,0.74044,0.72283,0.53125,0.60155,0.54065,0.42678
Detoxis Task1,Spanish,Sbert,Ridge Classifier,0.62394,0.58621,0.45113,0.79242,0.75758,0.75421,0.75955,0.74375,0.5625,0.52941,0.48374,0.37657
Detoxis Task1,Spanish,Sbert,Support Vector Classification,0.82534,0.56533,0.42017,0.90144,0.76479,0.76768,0.97432,0.82171,0.63559,0.71587,0.43089,0.31381


## Option 3
Scores are grouped by task and language, and sorted by test F1 (if it exists) or dev F1. This allows finding the best models.

In [8]:
# Ascending on task and language, then descending on Test F1 and Dev F1 (groups are mixed together)
df_results_3 = df_results_2.sort_values(
    by=[
        "Task",
        "Language",
        ("F1", dataset_types.test.title()),
        ("F1", dataset_types.development.title()),
    ],
    ascending=[True, True, False, False],
)
df_results_3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Task,Language,Group,Name,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Detoxis Task1,Spanish,Deep Learning,Bert base (2 epochs),0.83636,0.66368,0.55882,0.89928,0.78355,0.76431,0.88682,0.74,0.56118,0.79134,0.60163,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert base (3 epochs),0.85899,0.65471,0.54286,0.91336,0.77778,0.7486,0.91261,0.73,0.52988,0.81132,0.5935,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert Avarage,0.98041,0.64819,0.53975,0.98736,0.7619,0.75309,0.98871,0.68161,0.53975,0.97225,0.61789,0.53975
Detoxis Task1,Spanish,Sbert Oversampling,MLPClassifier G_SMOTE,0.73046,0.64552,0.53779,0.80578,0.72583,0.70483,0.66575,0.59655,0.46364,0.8091,0.70325,0.64017
Detoxis Task1,Spanish,Sbert Oversampling,MLPClassifier SMOTE_TomekLinks,0.93829,0.64167,0.53282,0.95921,0.7518,0.7284,0.92366,0.65812,0.49462,0.95339,0.62602,0.57741
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier SMOTE_TomekLinks,0.69412,0.65909,0.53054,0.78014,0.74026,0.69809,0.63394,0.61702,0.45509,0.76693,0.70732,0.63598
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier polynom_fit_SMOTE,0.69386,0.64916,0.52688,0.78051,0.73016,0.7037,0.63502,0.60279,0.46082,0.76471,0.70325,0.61506
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier SMOTE_IPF,0.69382,0.6616,0.52539,0.78014,0.74315,0.69585,0.63419,0.62143,0.45181,0.76582,0.70732,0.62762
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier G_SMOTE,0.6946,0.65774,0.52536,0.78159,0.7417,0.70595,0.63704,0.62094,0.46326,0.7636,0.69919,0.60669
Detoxis Task1,Spanish,Sbert Oversampling,MLPClassifier SMOBD,0.68076,0.63813,0.52269,0.77653,0.7316,0.70483,0.63584,0.61194,0.46154,0.73252,0.66667,0.60251


## Option 4
Same as option 2, but keeping only the top three models of each group.

In [9]:
# Sort inside every group by Test F1 and Dev F1 (descending), then keep the first three rows of each group
df_results_4 = (
    df_results_2
    .sort_values(
        by=[
            "Task",
            "Language",
            "Group",
            ("F1", dataset_types.test.title()),
            ("F1", dataset_types.development.title()),
        ],
        ascending=[True, True, True, False, False],
    )
    .groupby(by=["Task", "Language","Group"])
    .head(3)
)
df_results_4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Task,Language,Group,Name,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Detoxis Task1,Spanish,Deep Learning,Bert base (2 epochs),0.83636,0.66368,0.55882,0.89928,0.78355,0.76431,0.88682,0.74,0.56118,0.79134,0.60163,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert base (3 epochs),0.85899,0.65471,0.54286,0.91336,0.77778,0.7486,0.91261,0.73,0.52988,0.81132,0.5935,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert Avarage,0.98041,0.64819,0.53975,0.98736,0.7619,0.75309,0.98871,0.68161,0.53975,0.97225,0.61789,0.53975
Detoxis Task1,Spanish,Sbert,Multi-layer Perceptron classifier,0.66381,0.6186,0.47332,0.80181,0.76335,0.74523,0.74044,0.72283,0.53125,0.60155,0.54065,0.42678
Detoxis Task1,Spanish,Sbert,Ridge Classifier,0.62394,0.58621,0.45113,0.79242,0.75758,0.75421,0.75955,0.74375,0.5625,0.52941,0.48374,0.37657
Detoxis Task1,Spanish,Sbert,Support Vector Classification,0.82534,0.56533,0.42017,0.90144,0.76479,0.76768,0.97432,0.82171,0.63559,0.71587,0.43089,0.31381
Detoxis Task1,Spanish,Sbert Best,Multi-layer Perceptron classifier (best),0.61733,0.60849,0.48675,0.78159,0.76046,0.76094,0.71765,0.72472,0.57386,0.54162,0.52439,0.42259
Detoxis Task1,Spanish,Sbert Best,Ridge Classifier (best),0.64695,0.57757,0.44125,0.79747,0.74459,0.7385,0.74709,0.69942,0.51685,0.57048,0.49187,0.38494
Detoxis Task1,Spanish,Sbert Best,Support Vector Classification (best),0.99833,0.60325,0.43091,0.99892,0.75325,0.72727,1.0,0.7027,0.48936,0.99667,0.52846,0.38494
Detoxis Task1,Spanish,Sbert Oversampling,MLPClassifier G_SMOTE,0.73046,0.64552,0.53779,0.80578,0.72583,0.70483,0.66575,0.59655,0.46364,0.8091,0.70325,0.64017


## Option 5
Same as option 4, but the rows are ordered by F1 across all groups instead of being sorted group by group.

In [10]:
# Sort by Test F1 and Dev F1 (descending) without separating groups, then keep the first three rows of each group
df_results_5 = (
    df_results_2
    .sort_values(
        by=[
            "Task",
            "Language",
            ("F1", dataset_types.test.title()),
            ("F1", dataset_types.development.title()),
        ],
        ascending=[True, True, False, False],
    )
    .groupby(by=["Task", "Language","Group"])
    .head(3)
)
df_results_5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Task,Language,Group,Name,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Detoxis Task1,Spanish,Deep Learning,Bert base (2 epochs),0.83636,0.66368,0.55882,0.89928,0.78355,0.76431,0.88682,0.74,0.56118,0.79134,0.60163,0.55649
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier SMOTE_IPF,0.69382,0.6616,0.52539,0.78014,0.74315,0.69585,0.63419,0.62143,0.45181,0.76582,0.70732,0.62762
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier SMOTE_TomekLinks,0.69412,0.65909,0.53054,0.78014,0.74026,0.69809,0.63394,0.61702,0.45509,0.76693,0.70732,0.63598
Detoxis Task1,Spanish,Sbert Oversampling,RidgeClassifier G_SMOTE,0.6946,0.65774,0.52536,0.78159,0.7417,0.70595,0.63704,0.62094,0.46326,0.7636,0.69919,0.60669
Detoxis Task1,Spanish,Deep Learning,Bert base (3 epochs),0.85899,0.65471,0.54286,0.91336,0.77778,0.7486,0.91261,0.73,0.52988,0.81132,0.5935,0.55649
Detoxis Task1,Spanish,Deep Learning,Bert Avarage,0.98041,0.64819,0.53975,0.98736,0.7619,0.75309,0.98871,0.68161,0.53975,0.97225,0.61789,0.53975
Detoxis Task1,Spanish,Sbert,Multi-layer Perceptron classifier,0.66381,0.6186,0.47332,0.80181,0.76335,0.74523,0.74044,0.72283,0.53125,0.60155,0.54065,0.42678
Detoxis Task1,Spanish,Sbert Best,Multi-layer Perceptron classifier (best),0.61733,0.60849,0.48675,0.78159,0.76046,0.76094,0.71765,0.72472,0.57386,0.54162,0.52439,0.42259
Detoxis Task1,Spanish,Sbert Best,Support Vector Classification (best),0.99833,0.60325,0.43091,0.99892,0.75325,0.72727,1.0,0.7027,0.48936,0.99667,0.52846,0.38494
Detoxis Task1,Spanish,Sbert,Ridge Classifier,0.62394,0.58621,0.45113,0.79242,0.75758,0.75421,0.75955,0.74375,0.5625,0.52941,0.48374,0.37657
