In [1]:
!ls

calculate-relatives.ipynb  collective-data.csv	relative-scores.csv


In [2]:
import pandas as pd
import math

pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('collective-data.csv')

reference_models = ["BART (Conditional)", "T5 (Conditional)"]
datasets = ["CARER", "HATE", "SST2"]

display(df.head())

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1-Score,Support,Filtered
0,BART (Conditional),CARER,92.83,0.93,0.85,0.88,2000,0.0
1,BART (Conditional),HATE,90.94,0.79,0.74,0.76,1071,0.0
2,BART (Conditional),SST2,93.01,0.93,0.93,0.93,872,0.0
3,T5 (Conditional),CARER,93.0,0.9,0.86,0.88,2000,0.0
4,T5 (Conditional),HATE,92.2,0.81,0.76,0.79,1071,0.0


In [4]:
def format_diff(value):
    if value < 0:
        text = f"\\textcolor{{Red}}{{{value}}}"
    elif value == 0:
        text = f"{value}"
    else:
        text = f"\\textcolor{{Green}}{{+{value}}}"
    return text

def get_full_model_name(model):
    model_name = model["Model"].split(" ")[0]
    model_version = model_metadata.get(model_name, "")
    return f"{model_name} {model_version}" if model_version else model_name

model_metadata = {
    "BART": "(139M)",
    "T5": "(223M)",
    "GPT-Neo": "(2.7B)",
    "LLAMA": "3.1 (8B)",
    "YI": "1.5 (6B)",
}

In [5]:
formatted_results = []
for i, dataset in enumerate(datasets):
    bart_scores = dict(df[(df["Model"] == "BART (Conditional)") & (df["Dataset"] == dataset)].iloc[0])
    t5_scores = dict(df[(df["Model"] == "T5 (Conditional)") & (df["Dataset"] == dataset)].iloc[0])

    # To add BART and T5 scores first
    formatted_row_bart = {
        "Model": get_full_model_name(bart_scores),
        "Type": " ".join(bart_scores["Model"].split(" ")[1:]),
        "Dataset": dataset,
        "Accuracy": bart_scores["Accuracy"],
        "Precision": bart_scores["Precision"],
        "Recall": bart_scores["Recall"],
        "F1-Score": bart_scores["F1-Score"],
        "Support": bart_scores["Support"],
        "Filtered": 0
    }

    formatted_row_t5 = {
        "Model": get_full_model_name(t5_scores),
        "Type": " ".join(t5_scores["Model"].split(" ")[1:]),
        "Dataset": dataset,
        "Accuracy": t5_scores["Accuracy"],
        "Precision": t5_scores["Precision"],
        "Recall": t5_scores["Recall"],
        "F1-Score": t5_scores["F1-Score"],
        "Support": t5_scores["Support"],
        "Filtered": 0
    }
    
    formatted_results.append(formatted_row_bart)
    formatted_results.append(formatted_row_t5)
    
    # [2:] to skip the baseline statistics
    for _, row in df[df["Dataset"] == dataset].iloc[2:].iterrows():
        model_scores = dict(row)

        if model_scores["Model"] == "Human Evaluation":
            formatted_row = {"Model": model_scores["Model"], "Type": "N/A"}
        else:
            formatted_row = {"Model": get_full_model_name(model_scores), "Type": " ".join(model_scores["Model"].split(" ")[1:])}

        formatted_row["Dataset"] = dataset
        for metric in ["Accuracy", "Precision", "Recall", "F1-Score"]:
            bart_diff = round(model_scores[metric] - bart_scores[metric], 2)
            t5_diff = round(model_scores[metric] - t5_scores[metric], 2)

            formatted_row[metric] = f"{model_scores[metric]:.2f} ({format_diff(bart_diff)}, {format_diff(t5_diff)})"

        formatted_row["Support"] = model_scores["Support"]
        formatted_row["Filtered"] = "N/A" if math.isnan(model_scores["Filtered"]) else int(model_scores["Filtered"])
        
        formatted_results.append(formatted_row)

print(formatted_results)

[{'Model': 'BART (139M)', 'Type': '(Conditional)', 'Dataset': 'CARER', 'Accuracy': 92.83, 'Precision': 0.93, 'Recall': 0.85, 'F1-Score': 0.88, 'Support': 2000, 'Filtered': 0}, {'Model': 'T5 (223M)', 'Type': '(Conditional)', 'Dataset': 'CARER', 'Accuracy': 93.0, 'Precision': 0.9, 'Recall': 0.86, 'F1-Score': 0.88, 'Support': 2000, 'Filtered': 0}, {'Model': 'GPT-Neo (2.7B)', 'Type': 'Zero-Shot', 'Dataset': 'CARER', 'Accuracy': '35.35 (\\textcolor{Red}{-57.48}, \\textcolor{Red}{-57.65})', 'Precision': '0.61 (\\textcolor{Red}{-0.32}, \\textcolor{Red}{-0.29})', 'Recall': '0.23 (\\textcolor{Red}{-0.62}, \\textcolor{Red}{-0.63})', 'F1-Score': '0.18 (\\textcolor{Red}{-0.7}, \\textcolor{Red}{-0.7})', 'Support': 2000, 'Filtered': 'N/A'}, {'Model': 'GPT-Neo (2.7B)', 'Type': 'Few-Shot', 'Dataset': 'CARER', 'Accuracy': '35.25 (\\textcolor{Red}{-57.58}, \\textcolor{Red}{-57.75})', 'Precision': '0.48 (\\textcolor{Red}{-0.45}, \\textcolor{Red}{-0.42})', 'Recall': '0.27 (\\textcolor{Red}{-0.58}, \\textc

In [6]:
final_df = pd.DataFrame(formatted_results)
display(final_df)

Unnamed: 0,Model,Type,Dataset,Accuracy,Precision,Recall,F1-Score,Support,Filtered
0,BART (139M),(Conditional),CARER,92.83,0.93,0.85,0.88,2000,0.0
1,T5 (223M),(Conditional),CARER,93.0,0.9,0.86,0.88,2000,0.0
2,GPT-Neo (2.7B),Zero-Shot,CARER,"35.35 (\textcolor{Red}{-57.48}, \textcolor{Red...","0.61 (\textcolor{Red}{-0.32}, \textcolor{Red}{...","0.23 (\textcolor{Red}{-0.62}, \textcolor{Red}{...","0.18 (\textcolor{Red}{-0.7}, \textcolor{Red}{-...",2000,
3,GPT-Neo (2.7B),Few-Shot,CARER,"35.25 (\textcolor{Red}{-57.58}, \textcolor{Red...","0.48 (\textcolor{Red}{-0.45}, \textcolor{Red}{...","0.27 (\textcolor{Red}{-0.58}, \textcolor{Red}{...","0.20 (\textcolor{Red}{-0.68}, \textcolor{Red}{...",2000,
4,GPT-Neo (2.7B),Few-Shot CoT,CARER,"55.98 (\textcolor{Red}{-36.85}, \textcolor{Red...","0.52 (\textcolor{Red}{-0.41}, \textcolor{Red}{...","0.50 (\textcolor{Red}{-0.35}, \textcolor{Red}{...","0.50 (\textcolor{Red}{-0.38}, \textcolor{Red}{...",786,1214.0
5,LLAMA 3.1 (8B),Zero-Shot,CARER,"57.10 (\textcolor{Red}{-35.72}, \textcolor{Red...","0.51 (\textcolor{Red}{-0.42}, \textcolor{Red}{...","0.44 (\textcolor{Red}{-0.41}, \textcolor{Red}{...","0.46 (\textcolor{Red}{-0.42}, \textcolor{Red}{...",1886,114.0
6,LLAMA 3.1 (8B),Few-Shot,CARER,"56.00 (\textcolor{Red}{-36.83}, \textcolor{Red...","0.49 (\textcolor{Red}{-0.44}, \textcolor{Red}{...","0.45 (\textcolor{Red}{-0.4}, \textcolor{Red}{-...","0.46 (\textcolor{Red}{-0.42}, \textcolor{Red}{...",1916,84.0
7,LLAMA 3.1 (8B),Few-Shot CoT,CARER,"56.36 (\textcolor{Red}{-36.47}, \textcolor{Red...","0.50 (\textcolor{Red}{-0.43}, \textcolor{Red}{...","0.46 (\textcolor{Red}{-0.39}, \textcolor{Red}{...","0.47 (\textcolor{Red}{-0.41}, \textcolor{Red}{...",1909,91.0
8,YI 1.5 (6B),Zero-Shot,CARER,"44.56 (\textcolor{Red}{-48.27}, \textcolor{Red...","0.42 (\textcolor{Red}{-0.51}, \textcolor{Red}{...","0.42 (\textcolor{Red}{-0.43}, \textcolor{Red}{...","0.38 (\textcolor{Red}{-0.5}, \textcolor{Red}{-...",1957,43.0
9,YI 1.5 (6B),Few-Shot,CARER,"52.35 (\textcolor{Red}{-40.48}, \textcolor{Red...","0.48 (\textcolor{Red}{-0.45}, \textcolor{Red}{...","0.45 (\textcolor{Red}{-0.4}, \textcolor{Red}{-...","0.43 (\textcolor{Red}{-0.45}, \textcolor{Red}{...",1912,88.0


In [7]:
for _, row in final_df.iterrows():
    data = row.to_dict()

    print(f'-- {data["Model"]} {data["Type"]} {data["Dataset"]} --')
    print(f'\t & \\textsc{{{data["Model"]}}} & {data["Accuracy"]} & {data["Precision"]} & {data["Recall"]} & {data["F1-Score"]} & {data["Support"]} & {data["Filtered"]} \\\\')

-- BART (139M) (Conditional) CARER --
	 & \textsc{BART (139M)} & 92.83 & 0.93 & 0.85 & 0.88 & 2000 & 0 \\
-- T5 (223M) (Conditional) CARER --
	 & \textsc{T5 (223M)} & 93.0 & 0.9 & 0.86 & 0.88 & 2000 & 0 \\
-- GPT-Neo (2.7B) Zero-Shot CARER --
	 & \textsc{GPT-Neo (2.7B)} & 35.35 (\textcolor{Red}{-57.48}, \textcolor{Red}{-57.65}) & 0.61 (\textcolor{Red}{-0.32}, \textcolor{Red}{-0.29}) & 0.23 (\textcolor{Red}{-0.62}, \textcolor{Red}{-0.63}) & 0.18 (\textcolor{Red}{-0.7}, \textcolor{Red}{-0.7}) & 2000 & N/A \\
-- GPT-Neo (2.7B) Few-Shot CARER --
	 & \textsc{GPT-Neo (2.7B)} & 35.25 (\textcolor{Red}{-57.58}, \textcolor{Red}{-57.75}) & 0.48 (\textcolor{Red}{-0.45}, \textcolor{Red}{-0.42}) & 0.27 (\textcolor{Red}{-0.58}, \textcolor{Red}{-0.59}) & 0.20 (\textcolor{Red}{-0.68}, \textcolor{Red}{-0.68}) & 2000 & N/A \\
-- GPT-Neo (2.7B) Few-Shot CoT CARER --
	 & \textsc{GPT-Neo (2.7B)} & 55.98 (\textcolor{Red}{-36.85}, \textcolor{Red}{-37.02}) & 0.52 (\textcolor{Red}{-0.41}, \textcolor{Red}{-0.38}

In [8]:
output_path = "relative-scores.csv"
final_df.to_csv(output_path, index=False)