In [None]:
# Colab-only setup: mount Google Drive at /content/drive.
# NOTE(review): the read_csv calls in this notebook use relative paths rather
# than /content/drive -- confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import pandas as pd

In [3]:
def get_metrics(df):
    """Compute accuracy plus macro/weighted F1, precision and recall for a results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"Emotion"`` (used as the true labels),
        ``"Prediction"`` (used as the predicted labels; missing values are
        scored as the placeholder label ``"MISSING"``) and ``"Evaluation"``
        (its mean is reported as the accuracy -- presumably a per-row 0/1
        correctness flag; verify against whatever produced the CSV).

    Returns
    -------
    dict[str, float]
        Keys, in order: ``eval_accuracy``, ``eval_f1_macro``,
        ``eval_f1_weighted``, ``eval_precision_macro``,
        ``eval_precision_weighted``, ``eval_recall_macro``,
        ``eval_recall_weighted``.

    Raises
    ------
    KeyError
        If any of the three required columns is absent.

    Notes
    -----
    The input frame is left untouched: label columns are filled/cast on
    derived Series, so calling this function does not change what later
    notebook cells see in ``df``.
    """
    # Derive the label vectors without assigning back into the caller's frame.
    y_true = df["Emotion"].astype("category")
    y_pred = df["Prediction"].fillna("MISSING").astype("category")

    # Restrict scoring to labels that occur in the ground truth, so the
    # "MISSING" placeholder (or any other label that appears only in the
    # predictions) is not averaged in as a class of its own.
    labels = sorted(y_true.unique())

    metrics = {"eval_accuracy": float(df["Evaluation"].mean())}

    # Same three scorers, each under both averaging modes; insertion order
    # reproduces the key order of the returned dictionary documented above.
    scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in scorers:
        for average in ("macro", "weighted"):
            score = scorer(y_true, y_pred, average=average, labels=labels)
            metrics[f"eval_{metric_name}_{average}"] = float(score)
    return metrics


In [8]:
# Results table for the 0-shot, no-noise run (per the file name).
df = pd.read_csv("0_shot_no_noise_results.csv")

In [4]:
# Results table for the 0-shot run with "mash" noise, setting 30_2 (per the file name).
# NOTE(review): the meaning of the 30_2 suffix is not shown in this notebook -- document it.
df_mash_30_2 = pd.read_csv("0_shot_noise_mash_30_2_results.csv")

In [5]:
# Results table for the 0-shot run with "mash" noise, setting 10_1 (per the file name).
df_mash_10_1 = pd.read_csv("0_shot_noise_mash_10_1_results.csv")

In [9]:
# Results table for the 0-shot run with "pseudo" noise, setting 10_1 (per the file name).
df_pseudo_10_1 = pd.read_csv("0_shot_noise_pseudo_10_1_results.csv")

In [11]:
# Results table for the 0-shot run with "pseudo" noise, setting 30_2 (per the file name).
df_pseudo_30_2 = pd.read_csv("0_shot_noise_pseudo_30_2_results.csv")

In [13]:
# Results table for the 25-shot, no-noise run (per the file name).
df_25_shot_no_noise = pd.read_csv("25_shot_no_noise_results.csv")

In [9]:
# Metrics for the 0-shot, no-noise results (df is loaded from 0_shot_no_noise_results.csv).
print(get_metrics(df))

{'eval_accuracy': 0.32, 'eval_f1_macro': 0.31568216895747725, 'eval_f1_weighted': 0.33945251701667817, 'eval_precision_macro': 0.3574328736539826, 'eval_precision_weighted': 0.4307661891926589, 'eval_recall_macro': 0.3698208129384578, 'eval_recall_weighted': 0.32}


In [10]:
# Metrics for the 0-shot run with "pseudo" noise, setting 10_1.
print(get_metrics(df_pseudo_10_1))

{'eval_accuracy': 0.335, 'eval_f1_macro': 0.32081084182079345, 'eval_f1_weighted': 0.35661533504646364, 'eval_precision_macro': 0.37033706553577844, 'eval_precision_weighted': 0.4620509995785676, 'eval_recall_macro': 0.3676587261830643, 'eval_recall_weighted': 0.335}


In [12]:
# Metrics for the 0-shot run with "pseudo" noise, setting 30_2.
print(get_metrics(df_pseudo_30_2))

{'eval_accuracy': 0.305, 'eval_f1_macro': 0.2904734571266038, 'eval_f1_weighted': 0.33125423521296665, 'eval_precision_macro': 0.36483565763751524, 'eval_precision_weighted': 0.46336406591176965, 'eval_recall_macro': 0.3142181905861221, 'eval_recall_weighted': 0.305}


In [6]:
# Metrics for the 0-shot run with "mash" noise, setting 10_1.
print(get_metrics(df_mash_10_1))

{'eval_accuracy': 0.3416666666666667, 'eval_f1_macro': 0.33931177389414596, 'eval_f1_weighted': 0.360128340680298, 'eval_precision_macro': 0.4128099132306317, 'eval_precision_weighted': 0.46452644098327656, 'eval_recall_macro': 0.36046190864094857, 'eval_recall_weighted': 0.3416666666666667}


In [7]:
# Metrics for the 0-shot run with "mash" noise, setting 30_2.
print(get_metrics(df_mash_30_2))

{'eval_accuracy': 0.33166666666666667, 'eval_f1_macro': 0.32925761121399033, 'eval_f1_weighted': 0.34456649998649164, 'eval_precision_macro': 0.3879047558632456, 'eval_precision_weighted': 0.4560171287511226, 'eval_recall_macro': 0.37276781905932543, 'eval_recall_weighted': 0.33166666666666667}


## Few-shot results (25-shot, no noise)

In [14]:
# Metrics for the 25-shot, no-noise run.
print(get_metrics(df_25_shot_no_noise))

{'eval_accuracy': 0.39, 'eval_f1_macro': 0.3817334761880771, 'eval_f1_weighted': 0.40628138694511384, 'eval_precision_macro': 0.4144733148473079, 'eval_precision_weighted': 0.46775688908942736, 'eval_recall_macro': 0.41450678266430185, 'eval_recall_weighted': 0.39}
