In [1]:
!ls

DownloadHateSpeech.ipynb  data-collated        huggingface.txt	t5
README.md		  gpt-neo	       human-evals	yi-ai
bart			  hate-speech-dataset  llama


In [2]:
import pandas as pd
import math

# Render every column when a DataFrame is displayed (None lifts the column cap).
pd.options.display.max_columns = None

In [3]:
# Datasets to report on, and the models whose rows serve as comparison baselines.
datasets = ["CARER", "HATE", "SST2"]
reference_models = ["BART (Conditional)", "T5 (Conditional)"]

# Load the collated evaluation scores and preview the first rows.
df = pd.read_csv('data-collated/collective-data.csv')
display(df.head())

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1-Score,Support,Filtered
0,BART (Conditional),CARER,92.83,0.93,0.85,0.88,2000,0.0
1,BART (Conditional),HATE,90.94,0.79,0.74,0.76,1071,0.0
2,BART (Conditional),SST2,93.01,0.93,0.93,0.93,872,0.0
3,T5 (Conditional),CARER,93.0,0.9,0.86,0.88,2000,0.0
4,T5 (Conditional),HATE,92.2,0.81,0.76,0.79,1071,0.0


In [4]:
def format_diff(value):
    """Render a score difference as a LaTeX snippet with a fixed two-decimal format.

    Args:
        value: Numeric difference between a model's score and a baseline score.

    Returns:
        str: ``\\textcolor{Red}{-x.xx}`` for a negative difference,
        ``\\textcolor{Green}{+x.xx}`` for a positive one, ``0.00`` when the
        difference is zero at two-decimal precision, and ``N/A`` when the
        difference is NaN.
    """
    import math  # local import keeps the helper self-contained

    # A NaN difference is neither a gain nor a loss; do not colour it green.
    if isinstance(value, float) and math.isnan(value):
        return "N/A"

    # Always print two decimals so "-0.7" and "-0.68" line up in the table.
    rendered = f"{value:.2f}"

    # Compare the *rendered* number so -0.0 and tiny residues such as -0.001
    # show up as a plain, sign-less zero instead of a coloured "-0.00".
    if float(rendered) == 0:
        return "0.00"
    if value < 0:
        return f"\\textcolor{{Red}}{{{rendered}}}"
    return f"\\textcolor{{Green}}{{+{rendered}}}"

# Suffix appended to a model's short name when building its display name
# (presumably the version and/or parameter count -- confirm against the paper).
model_metadata = dict(
    [
        ("BART", "(139M)"),
        ("T5", "(223M)"),
        ("GPT-Neo", "(2.7B)"),
        ("LLAMA", "3.1 (8B)"),
        ("YI", "1.5 (6B)"),
    ]
)

In [5]:
# Build one display-ready row per (non-baseline model, dataset) pair. Each metric
# cell shows the model's score followed by its difference from the BART and T5
# baselines on the same dataset: "score (diff vs BART, diff vs T5)".
metric_columns = ["Accuracy", "Precision", "Recall", "F1-Score"]

formatted_results = []
for dataset in datasets:
    dataset_rows = df[df["Dataset"] == dataset]

    # Baseline scores for this dataset (first matching row for each baseline).
    bart_scores = dict(dataset_rows[dataset_rows["Model"] == "BART (Conditional)"].iloc[0])
    t5_scores = dict(dataset_rows[dataset_rows["Model"] == "T5 (Conditional)"].iloc[0])

    print(bart_scores)
    print(t5_scores)

    # Exclude the baselines by name rather than by position, so the result does
    # not depend on the baselines being the first two rows of each dataset.
    comparison_rows = dataset_rows[~dataset_rows["Model"].isin(reference_models)]

    for _, row in comparison_rows.iterrows():
        model_scores = dict(row)

        if model_scores["Model"] == "Human Evaluation":
            formatted_row = {"Model": model_scores["Model"], "Type": "N/A"}
        else:
            # "<name> <prompting type...>" -> short name + remaining words as the type.
            name_parts = model_scores["Model"].split(" ")
            model_name = name_parts[0]
            model_version = model_metadata.get(model_name, "")
            full_model_name = f"{model_name} {model_version}" if model_version else model_name
            formatted_row = {"Model": full_model_name, "Type": " ".join(name_parts[1:])}

        formatted_row["Dataset"] = dataset
        for metric in metric_columns:
            bart_diff = round(model_scores[metric] - bart_scores[metric], 2)
            t5_diff = round(model_scores[metric] - t5_scores[metric], 2)

            formatted_row[metric] = f"{model_scores[metric]:.2f} ({format_diff(bart_diff)}, {format_diff(t5_diff)})"

        formatted_row["Support"] = model_scores["Support"]
        # A missing (NaN) filtered count is reported as "N/A" instead of a number.
        formatted_row["Filtered"] = "N/A" if math.isnan(model_scores["Filtered"]) else int(model_scores["Filtered"])

        formatted_results.append(formatted_row)

{'Model': 'BART (Conditional)', 'Dataset': 'CARER', 'Accuracy': 92.83, 'Precision': 0.93, 'Recall': 0.85, 'F1-Score': 0.88, 'Support': 2000, 'Filtered': 0.0}
{'Model': 'T5 (Conditional)', 'Dataset': 'CARER', 'Accuracy': 93.0, 'Precision': 0.9, 'Recall': 0.86, 'F1-Score': 0.88, 'Support': 2000, 'Filtered': 0.0}
{'Model': 'BART (Conditional)', 'Dataset': 'HATE', 'Accuracy': 90.94, 'Precision': 0.79, 'Recall': 0.74, 'F1-Score': 0.76, 'Support': 1071, 'Filtered': 0.0}
{'Model': 'T5 (Conditional)', 'Dataset': 'HATE', 'Accuracy': 92.2, 'Precision': 0.81, 'Recall': 0.76, 'F1-Score': 0.79, 'Support': 1071, 'Filtered': 0.0}
{'Model': 'BART (Conditional)', 'Dataset': 'SST2', 'Accuracy': 93.01, 'Precision': 0.93, 'Recall': 0.93, 'F1-Score': 0.93, 'Support': 872, 'Filtered': 0.0}
{'Model': 'T5 (Conditional)', 'Dataset': 'SST2', 'Accuracy': 93.5, 'Precision': 0.94, 'Recall': 0.94, 'F1-Score': 0.94, 'Support': 872, 'Filtered': 0.0}


In [6]:
# Turn the list of per-row dicts into a table (one column per dict key) and show it.
final_df = pd.DataFrame(formatted_results)
display(final_df)

Unnamed: 0,Model,Type,Dataset,Accuracy,Precision,Recall,F1-Score,Support,Filtered
0,GPT-Neo (2.7B),Zero-Shot,CARER,"35.35 (\textcolor{Red}{-57.48}, \textcolor{Red...","0.61 (\textcolor{Red}{-0.32}, \textcolor{Red}{...","0.23 (\textcolor{Red}{-0.62}, \textcolor{Red}{...","0.18 (\textcolor{Red}{-0.7}, \textcolor{Red}{-...",2000,
1,GPT-Neo (2.7B),Few-Shot,CARER,"35.25 (\textcolor{Red}{-57.58}, \textcolor{Red...","0.48 (\textcolor{Red}{-0.45}, \textcolor{Red}{...","0.27 (\textcolor{Red}{-0.58}, \textcolor{Red}{...","0.20 (\textcolor{Red}{-0.68}, \textcolor{Red}{...",2000,
2,GPT-Neo (2.7B),Few-Shot CoT,CARER,"55.98 (\textcolor{Red}{-36.85}, \textcolor{Red...","0.52 (\textcolor{Red}{-0.41}, \textcolor{Red}{...","0.50 (\textcolor{Red}{-0.35}, \textcolor{Red}{...","0.50 (\textcolor{Red}{-0.38}, \textcolor{Red}{...",2000,1214.0
3,LLAMA 3.1 (8B),Zero-Shot,CARER,"57.10 (\textcolor{Red}{-35.72}, \textcolor{Red...","0.51 (\textcolor{Red}{-0.42}, \textcolor{Red}{...","0.44 (\textcolor{Red}{-0.41}, \textcolor{Red}{...","0.46 (\textcolor{Red}{-0.42}, \textcolor{Red}{...",1886,114.0
4,LLAMA 3.1 (8B),Few-Shot,CARER,"56.00 (\textcolor{Red}{-36.83}, \textcolor{Red...","0.49 (\textcolor{Red}{-0.44}, \textcolor{Red}{...","0.45 (\textcolor{Red}{-0.4}, \textcolor{Red}{-...","0.46 (\textcolor{Red}{-0.42}, \textcolor{Red}{...",1916,84.0
5,LLAMA 3.1 (8B),Few-Shot CoT,CARER,"56.36 (\textcolor{Red}{-36.47}, \textcolor{Red...","0.50 (\textcolor{Red}{-0.43}, \textcolor{Red}{...","0.46 (\textcolor{Red}{-0.39}, \textcolor{Red}{...","0.47 (\textcolor{Red}{-0.41}, \textcolor{Red}{...",1909,91.0
6,YI 1.5 (6B),Zero-Shot,CARER,"44.56 (\textcolor{Red}{-48.27}, \textcolor{Red...","0.42 (\textcolor{Red}{-0.51}, \textcolor{Red}{...","0.42 (\textcolor{Red}{-0.43}, \textcolor{Red}{...","0.38 (\textcolor{Red}{-0.5}, \textcolor{Red}{-...",1957,43.0
7,YI 1.5 (6B),Few-Shot,CARER,"52.35 (\textcolor{Red}{-40.48}, \textcolor{Red...","0.48 (\textcolor{Red}{-0.45}, \textcolor{Red}{...","0.45 (\textcolor{Red}{-0.4}, \textcolor{Red}{-...","0.43 (\textcolor{Red}{-0.45}, \textcolor{Red}{...",1912,88.0
8,YI 1.5 (6B),Few-Shot CoT,CARER,"55.09 (\textcolor{Red}{-37.74}, \textcolor{Red...","0.47 (\textcolor{Red}{-0.46}, \textcolor{Red}{...","0.45 (\textcolor{Red}{-0.4}, \textcolor{Red}{-...","0.44 (\textcolor{Red}{-0.44}, \textcolor{Red}{...",1886,114.0
9,Human Evaluation,,CARER,"46.00 (\textcolor{Red}{-46.83}, \textcolor{Red...","0.40 (\textcolor{Red}{-0.53}, \textcolor{Red}{...","0.42 (\textcolor{Red}{-0.43}, \textcolor{Red}{...","0.39 (\textcolor{Red}{-0.49}, \textcolor{Red}{...",400,0.0


In [7]:
# Emit each result as a LaTeX table row, preceded by a "-- model type dataset --"
# marker line that identifies which row follows.
for _, record in final_df.iterrows():
    marker = "-- {} {} {} --".format(record["Model"], record["Type"], record["Dataset"])

    # First cell is the small-caps model name; the rest follow in table column order.
    cells = ["\\textsc{" + str(record["Model"]) + "}"]
    for column in ("Accuracy", "Precision", "Recall", "F1-Score", "Support", "Filtered"):
        cells.append(str(record[column]))

    print(marker)
    print("\t & " + " & ".join(cells) + " \\\\")

-- GPT-Neo (2.7B) Zero-Shot CARER --
	 & \textsc{GPT-Neo (2.7B)} & 35.35 (\textcolor{Red}{-57.48}, \textcolor{Red}{-57.65}) & 0.61 (\textcolor{Red}{-0.32}, \textcolor{Red}{-0.29}) & 0.23 (\textcolor{Red}{-0.62}, \textcolor{Red}{-0.63}) & 0.18 (\textcolor{Red}{-0.7}, \textcolor{Red}{-0.7}) & 2000 & N/A \\
-- GPT-Neo (2.7B) Few-Shot CARER --
	 & \textsc{GPT-Neo (2.7B)} & 35.25 (\textcolor{Red}{-57.58}, \textcolor{Red}{-57.75}) & 0.48 (\textcolor{Red}{-0.45}, \textcolor{Red}{-0.42}) & 0.27 (\textcolor{Red}{-0.58}, \textcolor{Red}{-0.59}) & 0.20 (\textcolor{Red}{-0.68}, \textcolor{Red}{-0.68}) & 2000 & N/A \\
-- GPT-Neo (2.7B) Few-Shot CoT CARER --
	 & \textsc{GPT-Neo (2.7B)} & 55.98 (\textcolor{Red}{-36.85}, \textcolor{Red}{-37.02}) & 0.52 (\textcolor{Red}{-0.41}, \textcolor{Red}{-0.38}) & 0.50 (\textcolor{Red}{-0.35}, \textcolor{Red}{-0.36}) & 0.50 (\textcolor{Red}{-0.38}, \textcolor{Red}{-0.38}) & 2000 & 1214 \\
-- LLAMA 3.1 (8B) Zero-Shot CARER --
	 & \textsc{LLAMA 3.1 (8B)} & 57.10 (\

In [8]:
# Persist the formatted comparison table; index=False leaves the row index out of the file.
# NOTE(review): "N/A" cells are parsed back as NaN by pandas.read_csv with default
# settings -- pass keep_default_na=False when re-loading if the literal text matters.
output_path = "data-collated/relative-scores.csv"
final_df.to_csv(output_path, index=False)