# Evaluate the models

In [180]:
# imports

import pandas as pd
import os
import glob
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### Evaluation functions

In [185]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def get_metrics_per_category(dataset,experiment,model,print_res=False,include_inconclusive=False):
    """Print per-class and averaged precision, recall and f1 for one prediction run.

    The predictions are read from
    ``../final_sets/{dataset}_{split}_{experiment}_{model}_predictions.csv``
    for the ``positive`` and ``negative`` splits, plus the ``inconclusive``
    split when ``include_inconclusive`` is set. Labels are compared as
    0 (negative), 1 (positive) and 2 (inconclusive).

    Args:
        dataset: Dataset name used as the file-name prefix.
        experiment: Experiment name used in the file name.
        model: ``'anthroscore'`` (aggregate is the mean over classes 0, 1, 2)
            or ``'AtypicalAnimacy'`` (aggregate is the mean over classes 0, 1).
        print_res: When true, also print the header and the per-class metrics.
            The aggregate metrics are printed regardless of this flag.
        include_inconclusive: When true, rows from the inconclusive split are
            added to the evaluated data.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``model`` is not one of the two supported models, or if
            the loaded data lacks an 'expectation' or 'prediction' column.
    """
    # Classes that enter the aggregate (unweighted mean) for each model.
    aggregate_labels_by_model = {
        'anthroscore': (0, 1, 2),
        'AtypicalAnimacy': (0, 1),
    }
    if model not in aggregate_labels_by_model:
        # Previously an unknown model only failed at the final print with an
        # UnboundLocalError; fail early with an explicit message instead.
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(aggregate_labels_by_model)}."
        )

    directory = "../final_sets"

    # Only read the splits that are actually evaluated.
    splits = ["positive", "negative"]
    if include_inconclusive:
        splits.append("inconclusive")
    frames = [
        pd.read_csv(os.path.join(directory, f"{dataset}_{split}_{experiment}_{model}_predictions.csv"))
        for split in splits
    ]
    df = pd.concat(frames, ignore_index=True)

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    labels = (0, 1, 2)
    expected = df['expectation']
    predicted = df['prediction']

    # confusion[e][p] = number of rows with expectation e and prediction p.
    confusion = {
        exp: {pred: int(((expected == exp) & (predicted == pred)).sum()) for pred in labels}
        for exp in labels
    }

    # metrics[label] = (precision, recall, f1); every ratio falls back to 0.0
    # when its denominator is zero (class never predicted / never expected).
    metrics = {}
    for label in labels:
        true_positives = confusion[label][label]
        times_predicted = sum(confusion[exp][label] for exp in labels)  # TP + FP
        times_expected = sum(confusion[label][pred] for pred in labels)  # TP + FN
        precision = _safe_ratio(true_positives, times_predicted)
        recall = _safe_ratio(true_positives, times_expected)
        f1 = _safe_ratio(2 * precision * recall, precision + recall)
        metrics[label] = (precision, recall, f1)

    if print_res:
        print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
        for label, name in ((1, 'positive'), (0, 'negative'), (2, 'inconclusive')):
            precision, recall, f1 = metrics[label]
            print(f"Precision for {name} cases: {precision:.3f}")
            print(f"Recall for {name} cases: {recall:.3f}")
            print(f"f1-score for {name} cases: {f1:.3f}")
        print()

    # sanity check: unweighted mean over the classes relevant to the model
    included = aggregate_labels_by_model[model]
    aggregate_precision = sum(metrics[label][0] for label in included) / len(included)
    aggregate_recall = sum(metrics[label][1] for label in included) / len(included)
    aggregate_f1 = sum(metrics[label][2] for label in included) / len(included)

    print(f"Precision for all cases: {aggregate_precision:.3f}")
    print(f"Recall for all cases: {aggregate_recall:.3f}")
    print(f"f1-score for all cases: {aggregate_f1:.3f}")
    print()

def get_precision_recall_f1_and_accuracy(dataset,experiment,model,print_res=False,map_pred=False,include_inconclusive=False):
    """Score one prediction run with scikit-learn and return the four metrics.

    The positive, negative and inconclusive prediction CSVs for the given
    dataset / experiment / model are read from ``../final_sets``. The
    inconclusive rows are only evaluated when ``include_inconclusive`` is
    True; in that case precision, recall and f1 use ``average='weighted'``,
    otherwise ``average='macro'``.

    Args:
        dataset: Dataset name used as the file-name prefix.
        experiment: Experiment name used in the file name.
        model: Model name used in the file name.
        print_res: When True, print a summary of the scores.
        map_pred: When True, predictions equal to 2 are replaced by 0 before
            scoring.
        include_inconclusive: When True, include the inconclusive split.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.

    Raises:
        ValueError: If the loaded data lacks an 'expectation' or 'prediction'
            column.
    """
    directory = "../final_sets"

    # All three files are read, in this order, even if the last one is not evaluated.
    frames = {}
    for split in ("positive", "negative", "inconclusive"):
        csv_path = os.path.join(directory, f"{dataset}_{split}_{experiment}_{model}_predictions.csv")
        frames[split] = pd.read_csv(csv_path)

    if include_inconclusive == True:
        evaluated_splits = ("positive", "negative", "inconclusive")
        average_ = 'weighted'
        incl_or_excl = 'including'
    else:
        evaluated_splits = ("positive", "negative")
        average_ = 'macro'
        incl_or_excl = 'excluding'
    df = pd.concat([frames[split] for split in evaluated_splits], ignore_index=True)

    if not {'expectation', 'prediction'}.issubset(df.columns):
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    y_true = df['expectation'].astype(int)
    y_pred = df['prediction'].astype(int)
    if map_pred == True:
        # Collapse "inconclusive" predictions (2) onto the negative label (0).
        y_pred = [0 if label == 2 else label for label in y_pred]

    averaging = {'average': average_, 'zero_division': 0}
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **averaging)
    recall = recall_score(y_true, y_pred, **averaging)
    f1 = f1_score(y_true, y_pred, **averaging)

    if print_res == True:
        correct = (y_true == y_pred).sum()
        total = len(df)
        print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
        print(f"average: {average_} (we are {incl_or_excl} inconclusive cases from expectations)")
        print(f"Correct predictions (accuracy): {correct} / {total}")
        print(f"Accuracy: {accuracy:.3f}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall:    {recall:.3f}")
        print(f"F1 Score:  {f1:.3f}")
        print()

    return accuracy, precision, recall, f1

def get_accuracy(dataset,experiment,model):
    """Print the accuracy of a model on the positive split of a dataset.

    Reads ``../final_sets/{dataset}_positive_{experiment}_{model}_predictions.csv``,
    compares the integer-cast 'expectation' and 'prediction' columns and
    prints the number of matches and the accuracy. Returns None.

    Raises:
        ValueError: If the CSV lacks an 'expectation' or 'prediction' column.
    """
    # Only the positive split is evaluated here.
    csv_path = os.path.join(
        "../final_sets",
        f"{dataset}_positive_{experiment}_{model}_predictions.csv",
    )
    df = pd.read_csv(csv_path)

    for required in ('expectation', 'prediction'):
        if required not in df.columns:
            raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    # Compare labels as integers.
    gold = df['expectation'].astype(int)
    guessed = df['prediction'].astype(int)

    accuracy = accuracy_score(gold, guessed)
    correct = (gold == guessed).sum()
    total = len(df)

    print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
    print(f"Correct predictions: {correct} / {total}")
    print(f"Accuracy: {accuracy:.3f}")
    print()

def get_inconclusive_trends(dataset,experiment,model):
    """Print how a model's predictions are distributed on an inconclusive set.

    Every file matching
    ``../final_sets/{dataset}_inconclusive_{experiment}_{model}_predictions.csv``
    (the path is passed through ``glob``) is loaded and concatenated, then the
    number of predictions equal to 1 (positive), 0 (negative) and
    2 (inconclusive) is printed. Returns None.

    Args:
        dataset: Dataset name used as the file-name prefix.
        experiment: Experiment name used in the file name.
        model: Model name used in the file name. For ``'AtypicalAnimacy'`` a
            note is appended to the inconclusive count.

    Raises:
        FileNotFoundError: If no file matches the pattern.
        ValueError: If the loaded data lacks an 'expectation' or 'prediction'
            column.
    """
    directory = "../final_sets"

    pattern = os.path.join(directory, f"{dataset}_inconclusive_{experiment}_{model}_predictions.csv")
    # Sort so that the row order does not depend on the file-system listing order.
    file_list = sorted(glob.glob(pattern))
    if not file_list:
        # pd.concat([]) would otherwise fail with an unhelpful
        # "No objects to concatenate" error.
        raise FileNotFoundError(f"No prediction files match '{pattern}'.")

    df = pd.concat([pd.read_csv(path) for path in file_list], ignore_index=True)

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    # convert to integers
    y_true = df['expectation'].astype(int)
    y_pred = df['prediction'].astype(int)

    num_positive_pred = int((y_pred == 1).sum())
    num_negative_pred = int((y_pred == 0).sum())
    num_inconclusive_pred = int((y_pred == 2).sum())

    if model == 'AtypicalAnimacy':
        note = " (AtypicalAnimacy does not have inconclusive predictions)"
    else:
        note = ""

    print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}_inconclusive")

    print(f"Number of inconclusive cases in the dataset: {len(y_true)}")
    print(f"Number total predictions: {len(y_pred)}")
    print(f"Number of positive predictions: {num_positive_pred}")
    print(f"Number of negative predictions: {num_negative_pred}")
    # The label spelling and the separate `note` argument are kept as-is so the
    # printed text stays identical to earlier runs.
    print(f"Number of inconlcusive predictions: {num_inconclusive_pred}", note)
    print()

def compare_anthroscore_scenarios(dataset):
    """Print anthroscore metrics for both experiments, with and without mapping.

    For each of the four scenarios (experiment 1 / 2, ``map_pred`` off / on)
    the scores are obtained from ``get_precision_recall_f1_and_accuracy`` and
    printed as accuracy, precision, recall and f1. Returns None.
    """
    # (experiment, map_pred, label shown in the scenario line), in print order.
    scenarios = (
        ('experiment_1', False, "experiment 1 no mapping"),
        ('experiment_1', True, "experiment 1 with mapping"),
        ('experiment_2', False, "experiment 2 no mapping"),
        ('experiment_2', True, "experiment 2 with mapping"),
    )

    for experiment, mapped, description in scenarios:
        accuracy, precision, recall, f1 = get_precision_recall_f1_and_accuracy(
            dataset, experiment, 'anthroscore', print_res=False, map_pred=mapped
        )
        print(f"scenario: {dataset} anthroscore {description}")
        print(f"Accuracy: {accuracy:.3f}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall:    {recall:.3f}")
        print(f"F1 Score:  {f1:.3f}")
        print()

### Obtain precision, recall and f1-scores per class

In [186]:
# Per-class report: for every experiment and multi-label dataset, evaluate
# anthroscore first and AtypicalAnimacy second.
experiments = ['experiment_1', 'experiment_2']
multi_label_datasets = ["adjective_phrases", "verb_objects", "verb_subjects"]

for experiment in experiments:
    for dataset in multi_label_datasets:
        for model in ('anthroscore', 'AtypicalAnimacy'):
            get_metrics_per_category(dataset, experiment, model, print_res=True)

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases
Precision for positive cases: 1.000
Recall for positive cases: 0.114
f1-score for positive cases: 0.204
Precision for negative cases: 0.544
Recall for negative cases: 0.956
f1-score for negative cases: 0.694
Precision for inconclusive cases: 0.000
Recall for inconclusive cases: 0.000
f1-score for inconclusive cases: 0.000

Precision for all cases: 0.515
Recall for all cases: 0.356
f1-score for all cases: 0.299

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: adjective_phrases
Precision for positive cases: 0.905
Recall for positive cases: 0.432
f1-score for positive cases: 0.585
Precision for negative cases: 0.632
Recall for negative cases: 0.956
f1-score for negative cases: 0.761
Precision for inconclusive cases: 0.000
Recall for inconclusive cases: 0.000
f1-score for inconclusive cases: 0.000

Precision for all cases: 0.769
Recall for all cases: 0.694
f1-score for all cases: 0.673

Model: anthroscore Exper

### Comparison between anthroscore and AtypicalAnimacy on the binary cases only

Get accuracy, precision, recall and f1-score (for multilabel evaluation sets adjective_phrases, verb_subjects and verb_objects), and accuracy for adjective_phrases, verb_subjects, verb_objects, noun_phrases and possessives.

(For the last two, accuracy is the same as recall, since there is only one expected label: positive.)

In [187]:
# Multi-label sets have positive and negative gold labels, so precision and
# recall both matter; single-label sets only have positive gold labels, so
# only recall (i.e. accuracy) matters.
multi_label_datasets = ["adjective_phrases", "verb_objects", "verb_subjects"]
single_label_datasets = ['noun_phrases', 'possessives']

experiments = ['experiment_1', 'experiment_2']

# Full metric report on the multi-label sets (anthroscore, then AtypicalAnimacy).
for experiment in experiments:
    for dataset in multi_label_datasets:
        for model in ('anthroscore', 'AtypicalAnimacy'):
            get_precision_recall_f1_and_accuracy(dataset, experiment, model, print_res=True, map_pred=False)

# Accuracy only on the single-label sets (anthroscore, then AtypicalAnimacy).
for experiment in experiments:
    for dataset in single_label_datasets:
        for model in ('anthroscore', 'AtypicalAnimacy'):
            get_accuracy(dataset, experiment, model)

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases
average: macro (we are excluding inconclusive cases from expectations)
Correct predictions (accuracy): 48 / 89
Accuracy: 0.539
Precision: 0.515
Recall:    0.356
F1 Score:  0.299

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: adjective_phrases
average: macro (we are excluding inconclusive cases from expectations)
Correct predictions (accuracy): 62 / 89
Accuracy: 0.697
Precision: 0.769
Recall:    0.694
F1 Score:  0.673

Model: anthroscore Experiment: experiment_1 Dataset: verb_objects
average: macro (we are excluding inconclusive cases from expectations)
Correct predictions (accuracy): 67 / 117
Accuracy: 0.573
Precision: 0.548
Recall:    0.370
F1 Score:  0.334

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: verb_objects
average: macro (we are excluding inconclusive cases from expectations)
Correct predictions (accuracy): 94 / 117
Accuracy: 0.803
Precision: 0.803
Recall:    0.803
F1 Score:  0.803



### Get results for anthroscore including inconclusive expectations

Get accuracy, precision, recall and f1-score for anthroscore when the inconclusive cases are not excluded. They are excluded in the comparison between the two models because AtypicalAnimacy only provides positive and negative scores.

In [188]:
# Per-class metrics for anthroscore with the inconclusive gold cases included.
# (They are left out of the model comparison because AtypicalAnimacy only
# provides positive and negative scores.)
# All datasets are reported for experiment_1 first, then for experiment_2.
# get_precision_recall_f1_and_accuracy(..., include_inconclusive=True) can be
# called here instead to get the scikit-learn scores for the same data.
for experiment in ('experiment_1', 'experiment_2'):
    for dataset in multi_label_datasets:
        get_metrics_per_category(
            dataset, experiment, 'anthroscore', print_res=True, include_inconclusive=True
        )

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases
Precision for positive cases: 0.833
Recall for positive cases: 0.114
f1-score for positive cases: 0.200
Precision for negative cases: 0.457
Recall for negative cases: 0.956
f1-score for negative cases: 0.619
Precision for inconclusive cases: 0.167
Recall for inconclusive cases: 0.059
f1-score for inconclusive cases: 0.087

Precision for all cases: 0.486
Recall for all cases: 0.376
f1-score for all cases: 0.302

Model: anthroscore Experiment: experiment_1 Dataset: verb_objects
Precision for positive cases: 0.700
Recall for positive cases: 0.125
f1-score for positive cases: 0.212
Precision for negative cases: 0.545
Recall for negative cases: 0.984
f1-score for negative cases: 0.702
Precision for inconclusive cases: 0.292
Recall for inconclusive cases: 0.259
f1-score for inconclusive cases: 0.275

Precision for all cases: 0.512
Recall for all cases: 0.456
f1-score for all cases: 0.396

Model: anthroscore Experiment: ex

### Get the trends for the inconclusive sets only

For each inconclusive set, count how many positive, negative and inconclusive (anthroscore only) predictions were given.

In [179]:
# Datasets that have an inconclusive split.
inconclusives = ["adjective_phrases", "verb_objects", "verb_subjects", "comparisons"]

# For each dataset: both experiments for anthroscore, then both for AtypicalAnimacy.
for dataset in inconclusives:
    for model in ('anthroscore', 'AtypicalAnimacy'):
        for experiment in ('experiment_1', 'experiment_2'):
            get_inconclusive_trends(dataset, experiment, model)

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the dataset: 17
Number total predictions: 17
Number of positive predictions: 1
Number of negative predictions: 15
Number of inconlcusive predictions: 1 

Model: anthroscore Experiment: experiment_2 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the dataset: 21
Number total predictions: 21
Number of positive predictions: 2
Number of negative predictions: 15
Number of inconlcusive predictions: 4 

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the dataset: 17
Number total predictions: 17
Number of positive predictions: 2
Number of negative predictions: 15
Number of inconlcusive predictions: 0  (AtypicalAnimacy does not have inconclusive predictions)

Model: AtypicalAnimacy Experiment: experiment_2 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the datas