In [1]:
# Stretch the notebook container to the full browser width so wide result
# tables are not clipped. `display` is imported from the public
# `IPython.display` module; importing it from `IPython.core.display` is
# deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import pandas as pd
import os
from enum import Enum

# Display settings for this notebook: 5 decimal places for floats and no
# limit on the number of rows rendered for a DataFrame.
for option_name, option_value in (("display.precision", 5), ("display.max_rows", None)):
    pd.set_option(option_name, option_value)

In [2]:
# Pickled metadata table (indexed by result-file path) that the save/remove/load
# helpers below read and write.
metadata_file = "./results/metadata.pkl"

class dataset_types(Enum):
    """Dataset splits a result can belong to, ordered train < development < test."""

    train = 1
    development = 2
    test = 3

    def title(self):
        """Return the short display label of this split ("Train", "Dev" or "Test")."""
        # Member values start at 1, the label list is 0-based.
        position = self.value - 1
        return type(self)._shorten_names[position]

    def __lt__(self, other):
        """Order members by their numeric value so they can be sorted."""
        return other.value > self.value


# Display labels, attached after class creation so they do not become enum members.
dataset_types._shorten_names = ["Train", "Dev", "Test"]

def save_results(y_pred, index, name, task, lenguage, dataset_type, group=None, description=None, truth=False, filename=None):
    """Pickle a set of predictions and register them in the metadata table.

    The predictions are stored as a DataFrame with a single ``y_pred`` column
    indexed by ``id``. The target path is built from ``task``, ``lenguage``,
    ``dataset_type.name``, the optional ``group`` and the file name. If the
    path is already registered, the previous entry is removed first.

    Args:
        y_pred: Predicted values, one per element of ``index``.
        index: Identifiers used as the ``id`` index of the stored frame.
        name: Display name stored in the metadata; also the file name when
            ``filename`` is None.
        task: Task identifier, used as a path component.
        lenguage: Language identifier, used as a path component.
        dataset_type: Object exposing ``.name`` (the metadata column is
            declared with the ``dataset_types`` members as categories).
        group: Optional extra sub-directory / grouping label.
        description: Optional free-text description.
        truth: Stored in the "Groud Truth" column.
        filename: Optional file name (without extension) overriding ``name``.

    Returns:
        None. The saved path is printed.
    """
    path = f"./results/{task}/{lenguage}/{dataset_type.name}{'/' + group if group is not None else ''}/{name if filename is None else filename}.pkl"

    # Everything before the last "/" is the directory that must exist.
    target_dir = path.rsplit("/", 1)[0]
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if not os.path.exists(metadata_file):
        # First save ever: start from an empty, typed metadata table.
        metadata = pd.DataFrame({
            "Path": pd.Series([], dtype=str),
            "Name": pd.Series([], dtype=str),
            "Description": pd.Series([], dtype=str),
            "Dataset type": pd.Categorical([], categories=dataset_types, ordered=False),
            "Groud Truth": pd.Series([], dtype=bool),
            "Group": pd.Series([], dtype=str),
            "Task": pd.Series([], dtype=str),
            "Lenguage": pd.Series([], dtype=str),
        }).set_index("Path")
    else:
        metadata = pd.read_pickle(metadata_file)

    # Replace any earlier result registered under the same path.
    if path in metadata.index:
        metadata = remove_results(path)

    entry = {"Name": name, "Description": description, "Dataset type": dataset_type, "Groud Truth": truth, "Group": group, "Task": task, "Lenguage": lenguage}
    metadata.loc[path] = entry

    predictions = pd.DataFrame({"id": index, "y_pred": y_pred}).set_index("id")
    predictions.to_pickle(path)
    metadata.to_pickle(metadata_file)

    print("Results saved on: " + path)

def remove_results(path=None):
    """Remove a stored result, or clean up unreferenced result files.

    With ``path`` given, the matching metadata entry is dropped, the result
    file is deleted if it is still on disk, and the metadata table is written
    back. The entry is dropped even when the file is already missing, so the
    table never keeps a row pointing at a file that no longer exists. A path
    that is not registered in the metadata is left untouched.

    With ``path`` omitted, every ``.pkl`` file under ``./results`` that is not
    referenced by the metadata table is deleted. The metadata file itself is
    always kept. Only ``.pkl`` files are considered because that is the only
    extension ``save_results`` writes.

    Args:
        path: Path of the result to remove (as stored in the metadata index),
            or None to delete unreferenced result files.

    Returns:
        The metadata DataFrame after the operation, or None when the metadata
        file does not exist.
    """
    if not os.path.exists(metadata_file):
        return None

    metadata = pd.read_pickle(metadata_file)

    if path is not None:
        if path in metadata.index:
            metadata = metadata.drop(path)
            if os.path.exists(path):
                os.remove(path)
            metadata.to_pickle(metadata_file)
        return metadata

    # Files that must survive the cleanup: every registered result plus the
    # metadata table itself (identified by path, not by its position in the
    # directory listing).
    referenced = {os.path.normpath(p) for p in metadata.index}
    referenced.add(os.path.normpath(metadata_file))

    for dir_path, _dir_names, file_names in os.walk('./results'):
        for file_name in file_names:
            candidate = os.path.normpath(os.path.join(dir_path, file_name))
            if candidate.endswith(".pkl") and candidate not in referenced:
                os.remove(candidate)

    return metadata

    
def load_results():
    """Load the metadata table together with every stored result.

    Returns:
        The metadata DataFrame with an extra "Results" column holding the
        unpickled object for each path in the index, or None when the
        metadata file does not exist.
    """
    if not os.path.exists(metadata_file):
        return None

    metadata = pd.read_pickle(metadata_file)

    loaded = []
    for result_path in metadata.index:
        loaded.append(pd.read_pickle(result_path))
    metadata["Results"] = loaded

    return metadata

In [3]:
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score

def print_score(y_true, y_pred, name, f1_average):
    """Print and return classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels, or None when no ground truth is
            available (all metrics are then reported as None).
        y_pred: Predicted labels.
        name: Heading printed before the metrics.
        f1_average: Value passed as ``average`` to ``f1_score``; it is also
            shown in the printed F1 label so the output states which F1
            variant was computed.

    Returns:
        dict with the keys "F1", "Accuracy", "Precision" and "Recall".
    """
    classification_report_results = acc = f1 = precision = recall = None
    if y_true is not None:
        classification_report_results = classification_report(y_true, y_pred)

        acc, f1 = accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average=f1_average)
        # NOTE(review): precision/recall are computed without `average`, so they
        # use scikit-learn's default rather than `f1_average` - confirm intended.
        precision, recall = precision_score(y_true, y_pred), recall_score(y_true, y_pred)

    print(name)
    # Label follows the averaging actually used (previously always "macro").
    print(f'F1 {f1_average}: ', f1)
    print('Accuracy: ', acc)

    print('\nClassification Report')
    print('======================================================')
    print('\n', classification_report_results)

    return {"F1": f1, "Accuracy": acc, "Precision": precision, "Recall": recall}

def print_score_hateval_task1(y_true, y_pred, name):
    """Score HatEval task 1 predictions with `print_score`, using macro-averaged F1."""
    return print_score(y_true, y_pred, name, f1_average='macro')
    
def print_score_detoxis_task1(y_true, y_pred, name):
    """Score DETOXIS task 1 predictions with `print_score`, using binary F1."""
    return print_score(y_true, y_pred, name, f1_average='binary')

In [4]:
# Load every stored result together with its metadata.
df_results = load_results()

# True for rows flagged as ground truth, False for model predictions.
mask = df_results["Groud Truth"].eq(True)

df_truth, df_pred = df_results.loc[mask], df_results.loc[~mask]

# Cell output: the available ground-truth sets, ordered by language and split.
df_truth.sort_values(by=["Lenguage", "Dataset type"])

Unnamed: 0_level_0,Name,Description,Dataset type,Groud Truth,Group,Task,Lenguage,Results
Path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
./results/hateval2019/task1/english/train/train_truth_task1.pkl,English Train,,dataset_types.train,True,,hateval2019/task1,english,y_pred id 201 1 202 ...
./results/hateval2019/task1/english/development/dev_truth_task1.pkl,English Development,,dataset_types.development,True,,hateval2019/task1,english,y_pred id 18201 0 1820...
./results/hateval2019/task1/english/test/test_truth_task1.pkl,English Test,,dataset_types.test,True,,hateval2019/task1,english,y_pred id 34243 0 3059...
./results/hateval2019/task1/spanish/train/train_truth_task1.pkl,Spanish Train,,dataset_types.train,True,,hateval2019/task1,spanish,y_pred id 20001 1 2000...
./results/detoxis/task1/spanish/train/train_truth_task1.pkl,Detoxis Train,,dataset_types.train,True,,detoxis/task1,spanish,y_pred id 0 0 1 ...
./results/hateval2019/task1/spanish/development/dev_truth_task1.pkl,Spanish Development,,dataset_types.development,True,,hateval2019/task1,spanish,y_pred id 20005 0 2000...
./results/detoxis/task1/spanish/development/dev_truth_task1.pkl,Detoxis Development,,dataset_types.development,True,,detoxis/task1,spanish,y_pred id 0 0 1 ...
./results/hateval2019/task1/spanish/test/test_truth_task1.pkl,Spanish Test,,dataset_types.test,True,,hateval2019/task1,spanish,y_pred id 31494 0 3246...


In [5]:
# Score every stored prediction against the ground truth registered for the
# same task, language and dataset split; one metrics dict is collected per
# prediction.
results = []

for path, row in df_pred.iterrows():
    # Read fields by column name so the loop does not depend on column order.
    name, desc = row["Name"], row["Description"]
    dataset_type, group = row["Dataset type"], row["Group"]
    task, lenguage, y_pred = row["Task"], row["Lenguage"], row["Results"]

    truth_match = df_truth.loc[
        (df_truth["Dataset type"] == dataset_type)
        & (df_truth["Task"] == task)
        & (df_truth["Lenguage"] == lenguage),
        "Results",
    ]
    # The Series is indexed by path strings, so the first match is taken by
    # position; None means no ground truth exists for this split.
    y_true = None if truth_match.empty else truth_match.iloc[0]

    result = {}
    if task == "hateval2019/task1":
        result = print_score_hateval_task1(y_true, y_pred, name)
    elif task == "detoxis/task1":
        result = print_score_detoxis_task1(y_true, y_pred, name)

    result.update({"Dataset type": dataset_type.title(),
                   "Task": task.replace('/', ' ').title(),
                   "Name": name,
                   # `group` is optional when saving results, so it may not be a string.
                   "Group": group.replace('_', ' ').title() if isinstance(group, str) else group,
                   "Lenguage": lenguage.title(),
                   "Description": desc})

    results.append(result)

Bert base
F1 macro:  0.9299943995519642
Accuracy:  0.9315555555555556

Classification Report

               precision    recall  f1-score   support

           0       0.95      0.93      0.94      5217
           1       0.91      0.93      0.92      3783

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000

Bert base
F1 macro:  0.7469041129594169
Accuracy:  0.749

Classification Report

               precision    recall  f1-score   support

           0       0.81      0.73      0.77       573
           1       0.68      0.77      0.72       427

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.76      0.75      0.75      1000

Bert base
F1 macro:  0.5941497231031642
Accuracy:  0.6053333333333333

Classification Report

               precision    recall  f1-score   support

           0       

Bert Avarage
F1 macro:  0.9804141018466703
Accuracy:  0.9873646209386282

Classification Report

               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1869
           1       0.99      0.97      0.98       901

    accuracy                           0.99      2770
   macro avg       0.99      0.98      0.99      2770
weighted avg       0.99      0.99      0.99      2770

Bert Avarage
F1 macro:  0.6481876332622601
Accuracy:  0.7619047619047619

Classification Report

               precision    recall  f1-score   support

           0       0.80      0.84      0.82       447
           1       0.68      0.62      0.65       246

    accuracy                           0.76       693
   macro avg       0.74      0.73      0.73       693
weighted avg       0.76      0.76      0.76       693

Bert Avarage
F1 macro:  None
Accuracy:  None

Classification Report

 None
Atalaya
F1 macro:  0.3424908424908425
Accuracy:  0.7407942238267148

Classi

Support Vector Classification
F1 macro:  0.5599999999999999
Accuracy:  0.7619047619047619

Classification Report

               precision    recall  f1-score   support

           0       0.75      0.95      0.84       447
           1       0.81      0.43      0.56       246

    accuracy                           0.76       693
   macro avg       0.78      0.69      0.70       693
weighted avg       0.77      0.76      0.74       693

Support Vector Classification (best)
F1 macro:  0.5972222222222222
Accuracy:  0.7489177489177489

Classification Report

               precision    recall  f1-score   support

           0       0.77      0.87      0.82       447
           1       0.69      0.52      0.60       246

    accuracy                           0.75       693
   macro avg       0.73      0.70      0.71       693
weighted avg       0.74      0.75      0.74       693

AdaBoost classifier
F1 macro:  0.5057471264367815
Accuracy:  0.6897546897546898

Classification Report

     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest classifier (best)
F1 macro:  0.9977190608856654
Accuracy:  0.9977777777777778

Classification Report

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5217
           1       1.00      1.00      1.00      3783

    accuracy                           1.00      9000
   macro avg       1.00      1.00      1.00      9000
weighted avg       1.00      1.00      1.00      9000

Random Forest classifier
F1 macro:  0.9989737648560049
Accuracy:  0.999

Classification Report

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5217
           1       1.00      1.00      1.00      3783

    accuracy                           1.00      9000
   macro avg       1.00      1.00      1.00      9000
weighted avg       1.00      1.00      1.00      9000

Support Vector Classification (best)
F1 macro:  0.9540676206890405
Accuracy:  0.9553333333333334

Classification Report

           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Ridge Classifier (best)
F1 macro:  0.7312939231087225
Accuracy:  0.74

Classification Report

               precision    recall  f1-score   support

           0       0.76      0.80      0.78       573
           1       0.71      0.66      0.68       427

    accuracy                           0.74      1000
   macro avg       0.74      0.73      0.73      1000
weighted avg       0.74      0.74      0.74      1000

Ridge Classifier
F1 macro:  0.7109660702864922
Accuracy:  0.718

Classification Report

               precision    recall  f1-score   support

           0       0.75      0.76      0.76       573
           1       0.67      0.66      0.67       427

    accuracy                           0.72      1000
   macro avg       0.71      0.71      0.71      1000
weighted avg       0.72      0.72      0.72      1000

Random Forest classifier (best)
F1 macro:  0.7421994884910486
Accuracy:  0.748

Classification Report

               precision    recall  f1-score   support

   

Support Vector Classification
F1 macro:  0.44542732228205584
Accuracy:  0.4846666666666667

Classification Report

               precision    recall  f1-score   support

           0       0.71      0.19      0.30      1740
           1       0.44      0.89      0.59      1260

    accuracy                           0.48      3000
   macro avg       0.58      0.54      0.45      3000
weighted avg       0.60      0.48      0.42      3000

AdaBoost classifier (best)
F1 macro:  0.4253963328793964
Accuracy:  0.4666666666666667

Classification Report

               precision    recall  f1-score   support

           0       0.65      0.17      0.27      1740
           1       0.43      0.87      0.58      1260

    accuracy                           0.47      3000
   macro avg       0.54      0.52      0.43      3000
weighted avg       0.56      0.47      0.40      3000

AdaBoost classifier
F1 macro:  0.42942942942942947
Accuracy:  0.468

Classification Report

               precision  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Support Vector Classification
F1 macro:  0.9756835918383328
Accuracy:  0.9764444444444444

Classification Report

               precision    recall  f1-score   support

           0       0.98      0.98      0.98      2643
           1       0.97      0.97      0.97      1857

    accuracy                           0.98      4500
   macro avg       0.98      0.98      0.98      4500
weighted avg       0.98      0.98      0.98      4500

AdaBoost classifier (best)
F1 macro:  0.8543402557677213
Accuracy:  0.8613333333333333

Classification Report

               precision    recall  f1-score   support

           0       0.86      0.92      0.89      2643
           1       0.87      0.78      0.82      1857

    accuracy                           0.86      4500
   macro avg       0.86      0.85      0.85      4500
weighted avg       0.86      0.86      0.86      4500

AdaBoost classifier
F1 macro:  0.7732658386776186
Accuracy:  0.7871111111111111

Classification Report

               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dummy Classifier
F1 macro:  0.3700787401574803
Accuracy:  0.5875

Classification Report

               precision    recall  f1-score   support

           0       0.59      1.00      0.74       940
           1       0.00      0.00      0.00       660

    accuracy                           0.59      1600
   macro avg       0.29      0.50      0.37      1600
weighted avg       0.35      0.59      0.43      1600

Multinomial Naive Bayes classifier (best)
F1 macro:  0.6931989718901717
Accuracy:  0.70625

Classification Report

               precision    recall  f1-score   support

           0       0.74      0.78      0.76       940
           1       0.66      0.61      0.63       660

    accuracy                           0.71      1600
   macro avg       0.70      0.69      0.69      1600
weighted avg       0.70      0.71      0.70      1600

Multinomial Naive Bayes classifier
F1 macro:  0.6850367540180241
Accuracy:  0.70375

Classification Report

               precision    reca

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Multinomial Naive Bayes classifier (best)
F1 macro:  0.7724867724867724
Accuracy:  0.8758122743682311

Classification Report

               precision    recall  f1-score   support

           0       0.85      0.99      0.91      1869
           1       0.96      0.65      0.77       901

    accuracy                           0.88      2770
   macro avg       0.90      0.82      0.84      2770
weighted avg       0.89      0.88      0.87      2770

Bernoulli Naive Bayes classifier
F1 macro:  0.6634549208534066
Accuracy:  0.8234657039711192

Classification Report

               precision    recall  f1-score   support

           0       0.81      0.96      0.88      1869
           1       0.87      0.53      0.66       901

    accuracy                           0.82      2770
   macro avg       0.84      0.75      0.77      2770
weighted avg       0.83      0.82      0.81      2770

Bernoulli Naive Bayes classifier (best)
F1 macro:  0.7614068441064639
Accuracy:  0.8187725631768953



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bernoulli Naive Bayes classifier
F1 macro:  0.21604938271604937
Accuracy:  0.6334776334776335

Classification Report

               precision    recall  f1-score   support

           0       0.66      0.90      0.76       447
           1       0.45      0.14      0.22       246

    accuracy                           0.63       693
   macro avg       0.55      0.52      0.49       693
weighted avg       0.58      0.63      0.57       693

Bernoulli Naive Bayes classifier (best)
F1 macro:  0.3614457831325302
Accuracy:  0.5411255411255411

Classification Report

               precision    recall  f1-score   support

           0       0.65      0.64      0.64       447
           1       0.36      0.37      0.36       246

    accuracy                           0.54       693
   macro avg       0.50      0.50      0.50       693
weighted avg       0.54      0.54      0.54       693

Ridge Classifier
F1 macro:  0.26038781163434904
Accuracy:  0.6147186147186147

Classification Report



MLP SMOBD
F1 macro:  0.9523287671232876
Accuracy:  0.9685920577617328

Classification Report

               precision    recall  f1-score   support

           0       0.98      0.97      0.98      1869
           1       0.94      0.96      0.95       901

    accuracy                           0.97      2770
   macro avg       0.96      0.97      0.96      2770
weighted avg       0.97      0.97      0.97      2770

MLP G_SMOTE
F1 macro:  0.9238521836506158
Accuracy:  0.9509025270758122

Classification Report

               precision    recall  f1-score   support

           0       0.96      0.97      0.96      1869
           1       0.93      0.92      0.92       901

    accuracy                           0.95      2770
   macro avg       0.95      0.94      0.94      2770
weighted avg       0.95      0.95      0.95      2770

MLP CCR
F1 macro:  0.11921891058581706
Accuracy:  0.6906137184115524

Classification Report

               precision    recall  f1-score   support

     

SVC SMOBD
F1 macro:  0.9052268811028144
Accuracy:  0.9404332129963899

Classification Report

               precision    recall  f1-score   support

           0       0.94      0.97      0.96      1869
           1       0.94      0.87      0.91       901

    accuracy                           0.94      2770
   macro avg       0.94      0.92      0.93      2770
weighted avg       0.94      0.94      0.94      2770

SVC G_SMOTE
F1 macro:  0.9073114565342544
Accuracy:  0.9418772563176895

Classification Report

               precision    recall  f1-score   support

           0       0.94      0.97      0.96      1869
           1       0.94      0.87      0.91       901

    accuracy                           0.94      2770
   macro avg       0.94      0.92      0.93      2770
weighted avg       0.94      0.94      0.94      2770

SVC CCR
F1 macro:  0.04125950054288817
Accuracy:  0.6812274368231047

Classification Report

               precision    recall  f1-score   support

     

Ridge G_SMOTE
F1 macro:  0.694331983805668
Accuracy:  0.7819494584837545

Classification Report

               precision    recall  f1-score   support

           0       0.87      0.79      0.83      1869
           1       0.64      0.76      0.69       901

    accuracy                           0.78      2770
   macro avg       0.76      0.78      0.76      2770
weighted avg       0.80      0.78      0.79      2770

Ridge CCR
F1 macro:  0.39089481946624804
Accuracy:  0.71985559566787

Classification Report

               precision    recall  f1-score   support

           0       0.73      0.93      0.82      1869
           1       0.67      0.28      0.39       901

    accuracy                           0.72      2770
   macro avg       0.70      0.61      0.60      2770
weighted avg       0.71      0.72      0.68      2770

Ridge LVQ_SMOTE
F1 macro:  0.6485849056603773
Accuracy:  0.7848375451263538

Classification Report

               precision    recall  f1-score   support

Bert Avarage (2 epochs| differnt lr)
F1 macro:  0.6325581395348837
Accuracy:  0.772005772005772

Classification Report

               precision    recall  f1-score   support

           0       0.78      0.89      0.83       447
           1       0.74      0.55      0.63       246

    accuracy                           0.77       693
   macro avg       0.76      0.72      0.73       693
weighted avg       0.77      0.77      0.76       693

Bert Avarage (2 epochs| differnt lr)
F1 macro:  None
Accuracy:  None

Classification Report

 None


In [6]:
# Turn the collected metric dicts into a table indexed by the descriptive
# fields; the remaining columns are the score columns.
index_levels = ["Task", "Lenguage", "Dataset type", "Group", "Name", "Description"]
df_results = (
    pd.DataFrame(results)
    .set_index(index_levels)
    .sort_index(level=[0, 1, 2, 3, 4])
)
scores = list(df_results.columns)

# Rank within each task / language / split: keys ascending, scores descending.
sort_keys = ["Task", "Lenguage", "Dataset type"] + scores
sort_ascending = [True] * 3 + [False] * len(scores)
df_results_index = df_results.sort_values(by=sort_keys, ascending=sort_ascending).droplevel("Group")

# Cell output: the ranking without the long description text.
df_results_index.droplevel("Description")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,Accuracy,Precision,Recall
Task,Lenguage,Dataset type,Name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Detoxis Task1,Spanish,Dev,Bert base (2 epochs),0.66368,0.78355,0.74,0.60163
Detoxis Task1,Spanish,Dev,Ridge SMOBD,0.66038,0.74026,0.6162,0.71138
Detoxis Task1,Spanish,Dev,MLP SMOTE_TomekLinks,0.66019,0.74747,0.63197,0.69106
Detoxis Task1,Spanish,Dev,Ridge G_SMOTE,0.65672,0.73449,0.6069,0.71545
Detoxis Task1,Spanish,Dev,Bert base (3 epochs),0.65471,0.77778,0.73,0.5935
Detoxis Task1,Spanish,Dev,Ridge SMOTE_TomekLinks,0.65291,0.73304,0.60627,0.70732
Detoxis Task1,Spanish,Dev,Ridge SMOTE_IPF,0.65028,0.73304,0.60777,0.69919
Detoxis Task1,Spanish,Dev,Bert Avarage,0.64819,0.7619,0.68161,0.61789
Detoxis Task1,Spanish,Dev,Ridge Assembled_SMOTE,0.6454,0.72727,0.5993,0.69919
Detoxis Task1,Spanish,Dev,Bert base (4 epochs),0.64069,0.76046,0.68519,0.60163


In [7]:
# Reshape the scores so each dataset type becomes a column: rows are grouped by
# (Task, Lenguage, Group, Name) and one aggregation function per dataset type is
# applied to every score column.
# The outer lambda is called immediately with t.title(), so each inner lambda
# captures its own dataset-type label instead of the loop variable.
# The inner lambda returns x[:, :, t] when the group has label t in index
# level 2, and None otherwise.
df_results_columns = df_results.groupby(["Task", "Lenguage",'Group','Name'], as_index=False).aggregate([(lambda t: lambda x: x[:, :, t] if x.index.isin([t], level=2).any() else None)(t.title()) for t in dataset_types])
# Relabel the aggregated columns as (score, dataset-type title) pairs.
df_results_columns.columns = pd.MultiIndex.from_product([scores, [t.title() for t in dataset_types]])
# Order by Task, Lenguage and Group ascending, then by Test F1 and Dev F1 descending.
df_results_columns = df_results_columns.sort_values(by=['Task', 'Lenguage', 'Group', ("F1", dataset_types.test.title()), ("F1", dataset_types.development.title())], ascending=[True, True, True, False, False])

# Cell output.
df_results_columns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Task,Lenguage,Group,Name,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Detoxis Task1,Spanish,Deep Learning,Bert base (2 epochs),0.83636,0.66368,,0.89928,0.78355,,0.88682,0.74,,0.79134,0.60163,
Detoxis Task1,Spanish,Deep Learning,Bert base (3 epochs),0.85899,0.65471,,0.91336,0.77778,,0.91261,0.73,,0.81132,0.5935,
Detoxis Task1,Spanish,Deep Learning,Bert Avarage,0.98041,0.64819,,0.98736,0.7619,,0.98871,0.68161,,0.97225,0.61789,
Detoxis Task1,Spanish,Deep Learning,Bert base (4 epochs),0.95833,0.64069,,0.97329,0.76046,,0.97257,0.68519,,0.94451,0.60163,
Detoxis Task1,Spanish,Deep Learning,Bert Avarage (2 epochs| differnt lr),0.86307,0.63256,,0.91661,0.77201,,0.92621,0.73913,,0.80799,0.55285,
Detoxis Task1,Spanish,Deep Learning,Bert Avarage (3 epochs),0.94378,0.62882,,0.96426,0.75469,,0.96628,0.67925,,0.92231,0.58537,
Detoxis Task1,Spanish,Deep Learning,Atalaya,0.34249,0.06792,,0.74079,0.64358,,0.97906,0.47368,,0.20755,0.03659,
Detoxis Task1,Spanish,Sbert,Multi-layer Perceptron classifier,0.65392,0.61072,,0.79747,0.75902,,0.73611,0.71585,,0.58824,0.53252,
Detoxis Task1,Spanish,Sbert,Ridge Classifier,0.62939,0.57985,,0.79422,0.75325,,0.75981,0.73292,,0.53718,0.47967,
Detoxis Task1,Spanish,Sbert,Support Vector Classification,0.82315,0.56,,0.90072,0.7619,,0.97859,0.81395,,0.71032,0.42683,


In [8]:
# Same table ranked across all groups: Task and Lenguage ascending, then
# Test F1 and Dev F1 descending.
df_results_columns2 = df_results_columns.sort_values(
    by=["Task", "Lenguage", ("F1", dataset_types.test.title()), ("F1", dataset_types.development.title())],
    ascending=[True, True, False, False],
)
df_results_columns2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Task,Lenguage,Group,Name,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Detoxis Task1,Spanish,Deep Learning,Bert base (2 epochs),0.83636,0.66368,,0.89928,0.78355,,0.88682,0.74,,0.79134,0.60163,
Detoxis Task1,Spanish,Sbert Oversampling,Ridge SMOBD,0.69273,0.66038,,0.7787,0.74026,,0.63163,0.6162,,0.76693,0.71138,
Detoxis Task1,Spanish,Sbert Oversampling,MLP SMOTE_TomekLinks,0.74348,0.66019,,0.82238,0.74747,,0.70108,0.63197,,0.79134,0.69106,
Detoxis Task1,Spanish,Sbert Oversampling,Ridge G_SMOTE,0.69433,0.65672,,0.78195,0.73449,,0.63814,0.6069,,0.76138,0.71545,
Detoxis Task1,Spanish,Deep Learning,Bert base (3 epochs),0.85899,0.65471,,0.91336,0.77778,,0.91261,0.73,,0.81132,0.5935,
Detoxis Task1,Spanish,Sbert Oversampling,Ridge SMOTE_TomekLinks,0.68725,0.65291,,0.7769,0.73304,,0.63163,0.60627,,0.75361,0.70732,
Detoxis Task1,Spanish,Sbert Oversampling,Ridge SMOTE_IPF,0.68886,0.65028,,0.77726,0.73304,,0.63124,0.60777,,0.75805,0.69919,
Detoxis Task1,Spanish,Deep Learning,Bert Avarage,0.98041,0.64819,,0.98736,0.7619,,0.98871,0.68161,,0.97225,0.61789,
Detoxis Task1,Spanish,Sbert Oversampling,Ridge Assembled_SMOTE,0.6983,0.6454,,0.78195,0.72727,,0.63488,0.5993,,0.7758,0.69919,
Detoxis Task1,Spanish,Deep Learning,Bert base (4 epochs),0.95833,0.64069,,0.97329,0.76046,,0.97257,0.68519,,0.94451,0.60163,


In [9]:
# For the Detoxis task, move index level 2 into a regular column and keep the
# first entry of every Group.
# NOTE(review): given the sort applied when df_results_columns was built, this
# is presumably the top-ranked model of each group - confirm.
(
    df_results_columns
    .loc["Detoxis Task1"]
    .reset_index(level=2)
    .groupby(by=["Group"])
    .first()
)

Unnamed: 0_level_0,Name,F1,F1,F1,Accuracy,Accuracy,Accuracy,Precision,Precision,Precision,Recall,Recall,Recall
Unnamed: 0_level_1,Unnamed: 1_level_1,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test,Train,Dev,Test
Group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Deep Learning,Bert base (2 epochs),0.83636,0.66368,,0.89928,0.78355,,0.88682,0.74,,0.79134,0.60163,
Sbert,Multi-layer Perceptron classifier,0.65392,0.61072,,0.79747,0.75902,,0.73611,0.71585,,0.58824,0.53252,
Sbert Best,Support Vector Classification (best),0.99778,0.59722,,0.99856,0.74892,,1.0,0.69355,,0.99556,0.52439,
Sbert Oversampling,Ridge SMOBD,0.69273,0.66038,,0.7787,0.74026,,0.63163,0.6162,,0.76693,0.71138,
Traditional,Ridge Classifier,0.82495,0.26039,,0.90072,0.61472,,0.96716,0.4087,,0.7192,0.19106,
Traditional Best,Dummy Classifier (best),0.49087,0.52396,,0.32527,0.35498,,0.32527,0.35498,,1.0,1.0,


In [10]:
# Select every row whose fifth index level equals "Bert Avarage", across all
# values of the first four levels (presumably the "Name" level, given the
# set_index call that builds df_results - confirm).
df_results.loc[:, :, :, :, "Bert Avarage"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,F1,Accuracy,Precision,Recall
Task,Lenguage,Dataset type,Group,Description,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Detoxis Task1,Spanish,Dev,Deep Learning,"Pretrained model with bert-base-multilingual-uncased checkpoint (12 layers). Trained 4 epochs with a batch size of 4 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement().\nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.64819,0.7619,0.68161,0.61789
Detoxis Task1,Spanish,Test,Deep Learning,"Pretrained model with bert-base-multilingual-uncased checkpoint (12 layers). Trained 4 epochs with a batch size of 4 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement().\nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",,,,
Detoxis Task1,Spanish,Train,Deep Learning,"Pretrained model with bert-base-multilingual-uncased checkpoint (12 layers). Trained 4 epochs with a batch size of 4 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement().\nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.98041,0.98736,0.98871,0.97225
Hateval2019 Task1,English,Dev,Deep Learning,"Pretrained model with bert-base-uncased checkpoint (12 layers). Trained 3 epochs with a batch size of 32 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement().\nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.75085,0.753,0.6875,0.77283
Hateval2019 Task1,English,Test,Deep Learning,"Pretrained model with bert-base-uncased checkpoint (12 layers). Trained 3 epochs with a batch size of 32 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement().\nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.59399,0.606,0.51729,0.92619
Hateval2019 Task1,English,Train,Deep Learning,"Pretrained model with bert-base-uncased checkpoint (12 layers). Trained 3 epochs with a batch size of 32 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement().\nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.93938,0.94078,0.92186,0.93867
Hateval2019 Task1,Spanish,Dev,Deep Learning,"Pretrained model with bert-base-multilingual-uncased checkpoint (12 layers). Trained 4 epochs with a batch size of 16 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement(). \nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.83122,0.832,0.78279,0.86036
Hateval2019 Task1,Spanish,Test,Deep Learning,"Pretrained model with bert-base-multilingual-uncased checkpoint (12 layers). Trained 4 epochs with a batch size of 16 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement(). \nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.74574,0.74687,0.65306,0.82424
Hateval2019 Task1,Spanish,Train,Deep Learning,"Pretrained model with bert-base-multilingual-uncased checkpoint (12 layers). Trained 4 epochs with a batch size of 16 and using AdamW optimizer with lr=2e-5 and eps=1e-8.\nHastags and user have been removed from input text, followed by the same procedure as in method direct_replacement(). \nBert Pooler layer is modified to use the avarage of the last 2 layers instead of just [CLS].",0.98353,0.984,0.97347,0.98815


Improvements:
- Allow removal of all files matching specific features or a condition (not just a path)