In [1]:
!ls

DownloadHateSpeech.ipynb  data-collated        huggingface.txt	t5
README.md		  gpt-neo	       human-evals	yi-ai
bart			  hate-speech-dataset  llama


In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('data-collated/collective-data.csv')

reference_models = ["BART (Conditional)", "T5 (Conditional)"]
datasets = ["CARER", "HATE", "SST2"]

display(df.head())

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1-Score,Support,Filtered
0,BART (Conditional),CARER,92.83,0.93,0.85,0.88,2000,0.0
1,BART (Conditional),HATE,90.94,0.79,0.74,0.76,1071,0.0
2,BART (Conditional),SST2,93.01,0.93,0.93,0.93,872,0.0
3,T5 (Conditional),CARER,93.0,0.9,0.86,0.88,2000,0.0
4,T5 (Conditional),HATE,92.2,0.81,0.76,0.79,1071,0.0


In [4]:
def format_diff(value):
    if value < 0:
        text = f"\\red{{{value}}}"
    elif value == 0:
        text = f"{value}"
    else:
        text = f"\\green{{{value}}}"
    return text

In [5]:
formatted_results = []
for i, dataset in enumerate(datasets):
    bart_scores = dict(df[(df["Model"] == "BART (Conditional)") & (df["Dataset"] == dataset)].iloc[0])
    t5_scores = dict(df[(df["Model"] == "T5 (Conditional)") & (df["Dataset"] == dataset)].iloc[0])
    
    print(bart_scores)
    print(t5_scores)

    # [2:] to skip the baseline statistics
    for _, row in df[df["Dataset"] == dataset].iloc[2:].iterrows():
        model_scores = dict(row)

        print(model_scores)

        if model_scores["Model"] == "Human Evaluation":
            formatted_row = {"Model": model_scores["Model"], "Type": "N/A"}
        else:
            formatted_row = {"Model": model_scores["Model"].split(" ")[0], "Type": " ".join(model_scores["Model"].split(" ")[1:])}

        formatted_row["Dataset"] = dataset
        for metric in ["Accuracy", "Precision", "Recall", "F1-Score"]:
            bart_diff = round(model_scores[metric] - bart_scores[metric], 2)
            t5_diff = round(model_scores[metric] - t5_scores[metric], 2)

            formatted_row[metric] = f"{model_scores[metric]:.2f} ({format_diff(bart_diff)}, {format_diff(t5_diff)})"

        formatted_row["Support"] = model_scores["Support"]
        formatted_row["Filtered"] = model_scores["Filtered"]
        
        formatted_results.append(formatted_row)

{'Model': 'BART (Conditional)', 'Dataset': 'CARER', 'Accuracy': 92.83, 'Precision': 0.93, 'Recall': 0.85, 'F1-Score': 0.88, 'Support': 2000, 'Filtered': 0.0}
{'Model': 'T5 (Conditional)', 'Dataset': 'CARER', 'Accuracy': 93.0, 'Precision': 0.9, 'Recall': 0.86, 'F1-Score': 0.88, 'Support': 2000, 'Filtered': 0.0}
{'Model': 'GPT-Neo Zero-Shot', 'Dataset': 'CARER', 'Accuracy': 35.35, 'Precision': 0.61, 'Recall': 0.23, 'F1-Score': 0.18, 'Support': 2000, 'Filtered': nan}
{'Model': 'GPT-Neo Few-Shot', 'Dataset': 'CARER', 'Accuracy': 35.25, 'Precision': 0.48, 'Recall': 0.27, 'F1-Score': 0.2, 'Support': 2000, 'Filtered': nan}
{'Model': 'GPT-Neo Few-Shot CoT', 'Dataset': 'CARER', 'Accuracy': 55.98, 'Precision': 0.52, 'Recall': 0.5, 'F1-Score': 0.5, 'Support': 2000, 'Filtered': 1214.0}
{'Model': 'LLAMA Zero-Shot', 'Dataset': 'CARER', 'Accuracy': 57.105, 'Precision': 0.51, 'Recall': 0.44, 'F1-Score': 0.46, 'Support': 1886, 'Filtered': 114.0}
{'Model': 'LLAMA Few-Shot', 'Dataset': 'CARER', 'Accuracy

In [6]:
final_df = pd.DataFrame(formatted_results)
sorted_df = final_df.sort_values(by="Model")
display(sorted_df)

Unnamed: 0,Model,Type,Dataset,Accuracy,Precision,Recall,F1-Score,Support,Filtered
0,GPT-Neo,Zero-Shot,CARER,"35.35 (\red{-57.48}, \red{-57.65})","0.61 (\red{-0.32}, \red{-0.29})","0.23 (\red{-0.62}, \red{-0.63})","0.18 (\red{-0.7}, \red{-0.7})",2000,
1,GPT-Neo,Few-Shot,CARER,"35.25 (\red{-57.58}, \red{-57.75})","0.48 (\red{-0.45}, \red{-0.42})","0.27 (\red{-0.58}, \red{-0.59})","0.20 (\red{-0.68}, \red{-0.68})",2000,
2,GPT-Neo,Few-Shot CoT,CARER,"55.98 (\red{-36.85}, \red{-37.02})","0.52 (\red{-0.41}, \red{-0.38})","0.50 (\red{-0.35}, \red{-0.36})","0.50 (\red{-0.38}, \red{-0.38})",2000,1214.0
22,GPT-Neo,Few-Shot CoT,SST2,"65.08 (\red{-27.93}, \red{-28.42})","0.69 (\red{-0.24}, \red{-0.25})","0.66 (\red{-0.27}, \red{-0.28})","0.64 (\red{-0.29}, \red{-0.3})",872,
21,GPT-Neo,Few-Shot,SST2,"73.17 (\red{-19.84}, \red{-20.33})","0.81 (\red{-0.12}, \red{-0.13})","0.74 (\red{-0.19}, \red{-0.2})","0.72 (\red{-0.21}, \red{-0.22})",872,
20,GPT-Neo,Zero-Shot,SST2,"52.06 (\red{-40.95}, \red{-41.44})","0.76 (\red{-0.17}, \red{-0.18})","0.51 (\red{-0.42}, \red{-0.43})","0.36 (\red{-0.57}, \red{-0.58})",872,
12,GPT-Neo,Few-Shot CoT,HATE,"80.78 (\red{-10.16}, \red{-11.42})","0.44 (\red{-0.35}, \red{-0.37})","0.45 (\red{-0.29}, \red{-0.31})","0.45 (\red{-0.31}, \red{-0.34})",1071,
11,GPT-Neo,Few-Shot,HATE,"88.80 (\red{-2.14}, \red{-3.4})","0.44 (\red{-0.35}, \red{-0.37})","0.50 (\red{-0.24}, \red{-0.26})","0.47 (\red{-0.29}, \red{-0.32})",1071,
10,GPT-Neo,Zero-Shot,HATE,"88.70 (\red{-2.24}, \red{-3.5})","0.44 (\red{-0.35}, \red{-0.37})","0.50 (\red{-0.24}, \red{-0.26})","0.47 (\red{-0.29}, \red{-0.32})",1071,
19,Human Evaluation,,HATE,"73.00 (\red{-17.94}, \red{-19.2})","0.60 (\red{-0.19}, \red{-0.21})","0.73 (\red{-0.01}, \red{-0.03})","0.60 (\red{-0.16}, \red{-0.19})",400,0.0


In [7]:
output_path = "data-collated/relative-scores.csv"
final_df.to_csv(output_path, index=False)