In [1]:
import pandas as pd
import os
from enum import Enum

# Pickled index describing every stored result file.
metadata_file = "./results/metadata.pkl"


class dataset_types(Enum):
    """Dataset splits, numbered in declaration order."""

    train = 1
    development = 2
    test = 3

    def __lt__(self, other):
        # Added to allow sorting: members compare by their numeric value.
        return self.value < other.value

def save_results(y_pred, index, name, task, lenguage, dataset_type, group=None, description=None, truth=False, filename=None):
    """Pickle one set of predictions and register it in the metadata index.

    The predictions are written to
    ``./results/<task>/<lenguage>/<dataset_type.name>[/<group>]/<name or filename>.pkl``
    as a DataFrame indexed by ``id`` with a single ``y_pred`` column, and a row
    describing them is added to the metadata frame stored at ``metadata_file``.
    If the same path is already registered, the previous entry is removed first.

    Parameters:
        y_pred: values stored in the ``y_pred`` column.
        index: identifiers stored as the ``id`` index, one per value of ``y_pred``.
        name: display name stored in metadata; also the file stem unless
            ``filename`` is given.
        task, lenguage: path components, also stored in metadata.
        dataset_type: object whose ``.name`` is used as a path component; the
            object itself is stored in the "Dataset type" column.
        group: optional extra path component, also stored in metadata.
        description: optional free text stored in metadata.
        truth: stored in the "Groud Truth" column.
        filename: optional file stem overriding ``name``.

    Returns:
        None. The destination path is printed.
    """
    group_part = '/' + group if group is not None else ''
    stem = name if filename is None else filename
    path = f"./results/{task}/{lenguage}/{dataset_type.name}{group_part}/{stem}.pkl"

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    if os.path.exists(metadata_file):
        metadata = pd.read_pickle(metadata_file)
    else:
        # Column names (including their spelling) are the stored schema; keep them stable.
        metadata = pd.DataFrame({
            "Path": pd.Series([], dtype=str),
            "Name": pd.Series([], dtype=str),
            "Description": pd.Series([], dtype=str),
            "Dataset type": pd.Categorical([], categories=dataset_types, ordered=False),
            "Groud Truth": pd.Series([], dtype=bool),
            "Group": pd.Series([], dtype=str),
            "Task": pd.Series([], dtype=str),
            "Lenguage": pd.Series([], dtype=str),
        }).set_index("Path")

    # Overwriting an existing entry: drop the old row/file before registering the new one.
    if path in metadata.index:
        metadata = remove_results(path)

    metadata.loc[path] = {"Name": name, "Description": description, "Dataset type": dataset_type, "Groud Truth": truth, "Group": group, "Task": task, "Lenguage": lenguage}
    results = pd.DataFrame({"id": index, "y_pred": y_pred}).set_index("id")

    results.to_pickle(path)
    metadata.to_pickle(metadata_file)

    print("Results saved on: " + path)

def remove_results(path=None):
    """Remove stored results and return the metadata frame.

    With ``path``: drop that path's row from the metadata (if present), delete
    the file (if present) and persist the metadata when a row was dropped.
    Without ``path``: delete every file under ``./results`` that is not listed
    in the metadata index. The metadata file itself is never deleted.

    Parameters:
        path: path of a single result file as stored in the metadata index, or
            None to clean up untracked files.

    Returns:
        The metadata DataFrame after the operation, or None when the metadata
        file does not exist (in which case nothing is removed).
    """
    if not os.path.exists(metadata_file):
        return None

    metadata = pd.read_pickle(metadata_file)

    if path is not None:
        # Handle the row and the file independently so that a stale row (file
        # already gone) or an untracked file does not raise.
        tracked = path in metadata.index
        if tracked:
            metadata = metadata.drop(path)
        if os.path.exists(path):
            os.remove(path)
        if tracked:
            metadata.to_pickle(metadata_file)
        return metadata

    # Files to keep: everything the metadata references, plus the metadata file.
    keep = {os.path.normpath(f) for f in metadata.index}
    keep.add(os.path.normpath(metadata_file))

    for dirpath, _dirnames, filenames in os.walk('./results'):
        for filename in filenames:
            candidate = os.path.normpath(os.path.join(dirpath, filename))
            if candidate not in keep:
                os.remove(candidate)
    return metadata

    
def load_results():
    """Return the metadata frame with every stored result attached.

    Returns None when the metadata file does not exist. Otherwise each path in
    the metadata index is unpickled and the loaded objects are placed, in index
    order, in a new "Results" column.
    """
    if not os.path.exists(metadata_file):
        return None

    metadata = pd.read_pickle(metadata_file)

    loaded = []
    for path in metadata.index:
        loaded.append(pd.read_pickle(path))
    metadata["Results"] = loaded

    return metadata

In [2]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

def print_score(y_true, y_pred, name):
    """Print and return the macro F1 and accuracy of one set of predictions.

    Parameters:
        y_true: reference labels, passed unchanged to the sklearn metrics.
        y_pred: predicted labels, passed unchanged to the sklearn metrics.
        name: heading printed above the scores.

    Returns:
        dict with the keys "F1 macro" and "Accuracy".
    """
    # The report is computed before the scalar metrics, as in the original order.
    report_text = classification_report(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')

    scores = {"F1 macro": f1, "Accuracy": acc}

    print(name)
    print('F1 macro: ', scores["F1 macro"])
    print('Accuracy: ', scores["Accuracy"])

    print('\nClassification Report')
    print('======================================================')
    print('\n', report_text)

    return scores

In [3]:
# Load every stored result together with its metadata row.
df_results = load_results()

# Rows flagged as ground truth hold the reference labels; the rest are predictions.
mask = df_results["Groud Truth"] == True
df_truth, df_pred = df_results.loc[mask], df_results.loc[~mask]

df_truth.sort_values(by=["Lenguage", "Dataset type"])

Unnamed: 0_level_0,Name,Description,Dataset type,Groud Truth,Group,Task,Lenguage,Results
Path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
./results/hateval2019/task1/english/train/train_truth_task1.pkl,English Train,,dataset_types.train,True,,hateval2019/task1,english,y_pred id 201 1 202 ...
./results/hateval2019/task1/english/development/dev_truth_task1.pkl,English Development,,dataset_types.development,True,,hateval2019/task1,english,y_pred id 18201 0 1820...
./results/hateval2019/task1/english/test/test_truth_task1.pkl,English Test,,dataset_types.test,True,,hateval2019/task1,english,y_pred id 34243 0 3059...
./results/hateval2019/task1/spanish/train/train_truth_task1.pkl,Spanish Train,,dataset_types.train,True,,hateval2019/task1,spanish,y_pred id 20001 1 2000...
./results/hateval2019/task1/spanish/development/dev_truth_task1.pkl,Spanish Development,,dataset_types.development,True,,hateval2019/task1,spanish,y_pred id 20005 0 2000...
./results/hateval2019/task1/spanish/test/test_truth_task1.pkl,Spanish Test,,dataset_types.test,True,,hateval2019/task1,spanish,y_pred id 31494 0 3246...


In [4]:
results = []

# Score each stored prediction against the ground truth saved for the same
# dataset type, task and language.
for path, (name, desc, dataset_type, truth, group, task, lenguage, y_pred) in df_pred.iterrows():
    same_split = (
        (df_truth["Dataset type"] == dataset_type)
        & (df_truth["Task"] == task)
        & (df_truth["Lenguage"] == lenguage)
    )
    # The Series is indexed by path strings, so a bare [0] only worked through
    # the integer-as-position fallback that newer pandas deprecates.
    y_true = df_truth.loc[same_split, "Results"].iloc[0]

    # NOTE(review): the two frames are handed to the metrics as-is; this assumes
    # both list the same ids in the same order - confirm.
    result = print_score(y_true, y_pred, name)
    result.update({"Dataset type": dataset_type, "Name": name, "Group": group, "Lenguage": lenguage, "Description": desc})

    results.append(result)

Bert base
F1 macro:  0.9299943995519642
Accuracy:  0.9315555555555556

Classification Report

               precision    recall  f1-score   support

           0       0.95      0.93      0.94      5217
           1       0.91      0.93      0.92      3783

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000

Bert base
F1 macro:  0.7469041129594169
Accuracy:  0.749

Classification Report

               precision    recall  f1-score   support

           0       0.81      0.73      0.77       573
           1       0.68      0.77      0.72       427

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.76      0.75      0.75      1000

Bert base
F1 macro:  0.5941497231031642
Accuracy:  0.6053333333333333

Classification Report

               precision    recall  f1-score   support

           0       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best Ridge Classifier
F1 macro:  0.8271567865321129
Accuracy:  0.8356666666666667

Classification Report

               precision    recall  f1-score   support

           0       0.82      0.91      0.87      5217
           1       0.86      0.73      0.79      3783

    accuracy                           0.84      9000
   macro avg       0.84      0.82      0.83      9000
weighted avg       0.84      0.84      0.83      9000

Random Forest classifier
F1 macro:  0.9989737648560049
Accuracy:  0.999

Classification Report

               precision    recall  f1-score   support

           0       1.00      1.00      1.00      5217
           1       1.00      1.00      1.00      3783

    accuracy                           1.00      9000
   macro avg       1.00      1.00      1.00      9000
weighted avg       1.00      1.00      1.00      9000

Best Random Forest classifier
F1 macro:  0.9965763219746517
Accuracy:  0.9966666666666667

Classification Report

               precision    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best Support Vector Classification
F1 macro:  0.7069521297684922
Accuracy:  0.728

Classification Report

               precision    recall  f1-score   support

           0       0.72      0.87      0.79       573
           1       0.75      0.54      0.63       427

    accuracy                           0.73      1000
   macro avg       0.74      0.70      0.71      1000
weighted avg       0.73      0.73      0.72      1000

AdaBoost classifier
F1 macro:  0.7258515670051333
Accuracy:  0.739

Classification Report

               precision    recall  f1-score   support

           0       0.74      0.84      0.79       573
           1       0.73      0.61      0.67       427

    accuracy                           0.74      1000
   macro avg       0.74      0.72      0.73      1000
weighted avg       0.74      0.74      0.73      1000

Best AdaBoost classifier
F1 macro:  0.7046539875871478
Accuracy:  0.716

Classification Report

               precision    recall  f1-score   supp

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bernoulli Naive Bayes classifier
F1 macro:  0.8776877260140155
Accuracy:  0.8808888888888889

Classification Report

               precision    recall  f1-score   support

           0       0.91      0.89      0.90      2643
           1       0.84      0.87      0.86      1857

    accuracy                           0.88      4500
   macro avg       0.88      0.88      0.88      4500
weighted avg       0.88      0.88      0.88      4500

Best Bernoulli Naive Bayes classifier
F1 macro:  0.871494135645079
Accuracy:  0.8755555555555555

Classification Report

               precision    recall  f1-score   support

           0       0.89      0.90      0.89      2643
           1       0.85      0.85      0.85      1857

    accuracy                           0.88      4500
   macro avg       0.87      0.87      0.87      4500
weighted avg       0.88      0.88      0.88      4500

Ridge Classifier
F1 macro:  0.9337121832097052
Accuracy:  0.936

Classification Report

               pre

Best Support Vector Classification
F1 macro:  0.7173983389062669
Accuracy:  0.738

Classification Report

               precision    recall  f1-score   support

           0       0.71      0.91      0.79       278
           1       0.82      0.53      0.64       222

    accuracy                           0.74       500
   macro avg       0.76      0.72      0.72       500
weighted avg       0.76      0.74      0.73       500

AdaBoost classifier
F1 macro:  0.741819819149947
Accuracy:  0.75

Classification Report

               precision    recall  f1-score   support

           0       0.75      0.83      0.79       278
           1       0.76      0.64      0.70       222

    accuracy                           0.75       500
   macro avg       0.75      0.74      0.74       500
weighted avg       0.75      0.75      0.75       500

Best AdaBoost classifier
F1 macro:  0.74581413273422
Accuracy:  0.752

Classification Report

               precision    recall  f1-score   support


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bert Avarage
F1 macro:  0.5939885574846252
Accuracy:  0.606

Classification Report

               precision    recall  f1-score   support

           0       0.88      0.37      0.52      1740
           1       0.52      0.93      0.66      1260

    accuracy                           0.61      3000
   macro avg       0.70      0.65      0.59      3000
weighted avg       0.72      0.61      0.58      3000

Bert Avarage
F1 macro:  0.9835327664403473
Accuracy:  0.984

Classification Report

               precision    recall  f1-score   support

           0       0.99      0.98      0.99      2643
           1       0.97      0.99      0.98      1857

    accuracy                           0.98      4500
   macro avg       0.98      0.98      0.98      4500
weighted avg       0.98      0.98      0.98      4500

Bert Avarage
F1 macro:  0.8312195592419347
Accuracy:  0.832

Classification Report

               precision    recall  f1-score   support

           0       0.88      0.81   

In [5]:
# One row per (language, dataset type, group, model name) holding the two metrics.
# The metric columns are selected explicitly: older pandas silently dropped
# non-numeric columns such as Description during sum(), newer versions do not.
# NOTE(review): sum() adds scores together if the same key was scored more than
# once; presumably each key occurs once - confirm.
df_results = (
    pd.DataFrame(results)
    .set_index(["Lenguage", "Dataset type", "Group", "Name"])[["F1 macro", "Accuracy"]]
    .groupby(level=[0, 1, 2, 3])
    .sum()
)

# Best models first within each language and dataset type.
df_results_index = df_results.sort_values(by=["Lenguage", "Dataset type", "F1 macro", "Accuracy"], ascending=[True, True, False, False]).droplevel("Group")
df_results_index

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F1 macro,Accuracy
Lenguage,Dataset type,Name,Unnamed: 3_level_1,Unnamed: 4_level_1
english,dataset_types.train,Random Forest classifier,0.998974,0.999000
english,dataset_types.train,Best Random Forest classifier,0.996576,0.996667
english,dataset_types.train,Support Vector Classification,0.956309,0.957556
english,dataset_types.train,Entire layer,0.940645,0.942000
english,dataset_types.train,Bert Avarage,0.939383,0.940778
...,...,...,...,...
spanish,dataset_types.test,Best Support Vector Classification,0.682732,0.725000
spanish,dataset_types.test,Multinomial Naive Bayes classifier,0.670608,0.700625
spanish,dataset_types.test,AdaBoost classifier,0.659715,0.667500
spanish,dataset_types.test,Best Multinomial Naive Bayes classifier,0.655271,0.697500


In [6]:
# Demonstration of how lambdas created in a comprehension capture the loop variable.
a = [lambda x: 1, lambda x: 2, lambda x: 3] # Objective: three functions returning 1, 2 and 3
b = [(lambda x: i+1) for i in range(3)] # Problem: every lambda looks up `i` when called, after the loop ended, so b[0] returns 3
c = [(lambda i: lambda x: i+1)(i) for i in range(3)] # First solution: the outer lambda is called immediately, fixing `i` for each inner lambda

# Prints the result of the first function of each list.
print(a[0](1), b[0](1), c[0](1))

1 3 1


In [7]:
# Pivot the dataset types into columns: one row per (language, group, model) and
# one column per (metric, dataset type). unstack replaces a per-split lambda
# aggregation; a model without scores for some split now gets NaN there instead
# of raising, and the metric columns keep a numeric dtype.
metric_names = ['F1 macro', 'Accuracy']
split_order = list(dataset_types)

df_results_columns = (
    df_results[metric_names]
    .unstack("Dataset type")
    # Fix the column order explicitly rather than relying on how the splits sort.
    .reindex(columns=pd.MultiIndex.from_product([metric_names, split_order]))
)
df_results_columns.columns = pd.MultiIndex.from_product([metric_names, [t.name.capitalize() for t in split_order]])
df_results_columns = df_results_columns.sort_values(by=['Lenguage', 'Group', ("F1 macro", dataset_types.test.name.capitalize())], ascending=[True, False, False])
# Display-friendly language labels, e.g. "english" -> "English".
df_results_columns.index = df_results_columns.index.set_levels([' '.join(i.split('_')).title() for i in df_results_columns.index.levels[0]], level=0)

df_results_columns

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F1 macro,F1 macro,F1 macro,Accuracy,Accuracy,Accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Train,Development,Test,Train,Development,Test
Lenguage,Group,Name,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
English,traditional,Ridge Classifier,0.877431,0.709848,0.487733,0.881667,0.717,0.513667
English,traditional,Multi-layer Perceptron classifier,0.888015,0.693095,0.483704,0.891444,0.7,0.511333
English,traditional,Multinomial Naive Bayes classifier,0.81263,0.730203,0.478476,0.820444,0.738,0.505667
English,traditional,Support Vector Classification,0.956309,0.734485,0.447326,0.957556,0.745,0.486333
English,traditional,AdaBoost classifier,0.767939,0.725852,0.419558,0.781667,0.739,0.463667
English,traditional,Bernoulli Naive Bayes classifier,0.821029,0.73455,0.418188,0.826,0.738,0.472667
English,traditional,Random Forest classifier,0.998974,0.731493,0.393599,0.999,0.74,0.451
English,traditional,Dummy Classifier,0.366955,0.364272,0.367089,0.579667,0.573,0.58
English,deep_learning,Bert base,0.929994,0.746904,0.59415,0.931556,0.749,0.605333
English,deep_learning,Bert Avarage,0.939383,0.750845,0.593989,0.940778,0.753,0.606


In [8]:
# Same table without the Group level, ranked per language by test F1 macro.
df_results_columns2 = (
    df_results_columns
    .droplevel('Group')
    .sort_values(
        by=["Lenguage", ("F1 macro", dataset_types.test.name.capitalize())],
        ascending=[True, False],
    )
)
df_results_columns2

Unnamed: 0_level_0,Unnamed: 1_level_0,F1 macro,F1 macro,F1 macro,Accuracy,Accuracy,Accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,Train,Development,Test,Train,Development,Test
Lenguage,Name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
English,Bert base,0.929994,0.746904,0.59415,0.931556,0.749,0.605333
English,Bert Avarage,0.939383,0.750845,0.593989,0.940778,0.753,0.606
English,CLS layers,0.931599,0.750351,0.589766,0.933111,0.753,0.602667
English,Entire layer,0.940645,0.757246,0.586625,0.942,0.759,0.599333
English,Tokens,0.936114,0.748223,0.58653,0.937444,0.75,0.599
English,Best Multi-layer Perceptron classifier,0.817371,0.732399,0.488366,0.821778,0.736,0.518
English,Ridge Classifier,0.877431,0.709848,0.487733,0.881667,0.717,0.513667
English,Best Multinomial Naive Bayes classifier,0.807131,0.732959,0.483788,0.816222,0.742,0.509
English,Multi-layer Perceptron classifier,0.888015,0.693095,0.483704,0.891444,0.7,0.511333
English,Best Ridge Classifier,0.827157,0.733554,0.480252,0.835667,0.744,0.507667


Improvements:
- Allow removing all result files that match specific metadata features (e.g. task, language or group), not just a single path