# Evaluate the models

In [17]:
# imports

import pandas as pd

### Evaluation functions

In [22]:
import pandas as pd
import os
import glob
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_precision_recall_f1_and_accuracy(dataset,experiment,model,print_res=False,map_pred=False,include_inconclusive=False):

    directory = "../final_sets"
    
    pattern = os.path.join(directory, f"{dataset}*_{experiment}_{model}_predictions.csv")
    file_list = glob.glob(pattern)
        
    df_list = [pd.read_csv(file) for file in file_list]
    df = pd.concat(df_list, ignore_index=True)

    average_ = 'macro'

    if include_inconclusive == True:
        df = df[df['expectation'] != '2']
        average_ = 'weighted'

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    y_true = df['expectation'].astype(int)
    y_pred = df['prediction'].astype(int)
    if map_pred == True:
        y_pred = [0 if pred == 2 else pred for pred in y_pred]

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=average_, zero_division=0)
    recall = recall_score(y_true, y_pred, average=average_, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average_, zero_division=0)
    
    correct = (y_true == y_pred).sum()
    total = len(df)

    if print_res == True:
        print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
        print(f"Correct predictions (accuracy): {correct} / {total}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1 Score:  {f1:.4f}")
        print()

    return accuracy, precision, recall, f1

def get_accuracy(dataset,experiment,model):

    directory = "../final_sets"
    
    pattern = os.path.join(directory, f"{dataset}*_{experiment}_{model}_predictions.csv")
    file_list = glob.glob(pattern)
        
    df_list = [pd.read_csv(file) for file in file_list]
    df = pd.concat(df_list, ignore_index=True)

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    # convert to integers
    df['expectation'] = df['expectation'].astype(int)
    df['prediction'] = df['prediction'].astype(int)

    #print("Unique values in expectation:", df['expectation'].unique())
    #print("Unique values in prediction:", df['prediction'].unique())

    accuracy = accuracy_score(df['expectation'], df['prediction'])    
    correct = (df['expectation'] == df['prediction']).sum()
    total = len(df)

    print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
    print(f"Correct predictions: {correct} / {total}")
    print(f"Accuracy: {accuracy:.4f}")
    print()

def get_inconclusive_trends(dataset,experiment,model):

    directory = "../final_sets"
    
    pattern = os.path.join(directory, f"{dataset}_inconclusive_{experiment}_{model}_predictions.csv")
    file_list = glob.glob(pattern)
        
    df_list = [pd.read_csv(file) for file in file_list]
    df = pd.concat(df_list, ignore_index=True)

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    # convert to integers
    y_true = df['expectation'].astype(int)
    y_pred = df['prediction'].astype(int)

    num_positive_pred = len([pred for pred in y_pred if pred == 1])
    num_negative_pred = len([pred for pred in y_pred if pred == 0])
    num_inconclusive_pred = len([pred for pred in y_pred if pred == 2])

    if model == 'AtypicalAnimacy':
        note = " (AtypicalAnimacy does not have inconclusive predictions)"
    else:
        note = ""

    print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}_inconclusive")
    
    print(f"Number of inconclusive cases in the dataset: {len(y_true)}")
    print(f"Number total predictions: {len(y_pred)}")
    print(f"Number of positive predictions: {num_positive_pred}")
    print(f"Number of negative predictions: {num_negative_pred}")
    print(f"Number of inconlcusive predictions: {num_inconclusive_pred}", note)
    print()

def compare_anthroscore_scenarios(dataset):

    exp_1_no_map = get_precision_recall_f1_and_accuracy(dataset,'experiment_1','anthroscore',print_res=False,map_pred=False)
    print(f"scenario: {dataset} anthroscore experiment 1 no mapping")
    print(f"Accuracy: {exp_1_no_map[0]:.4f}")
    print(f"Precision: {exp_1_no_map[1]:.4f}")
    print(f"Recall:    {exp_1_no_map[2]:.4f}")
    print(f"F1 Score:  {exp_1_no_map[3]:.4f}")
    print()
    exp_1_map = get_precision_recall_f1_and_accuracy(dataset,'experiment_1','anthroscore',print_res=False,map_pred=True)
    print(f"scenario: {dataset} anthroscore experiment 1 with mapping")
    print(f"Accuracy: {exp_1_map[0]:.4f}")
    print(f"Precision: {exp_1_map[1]:.4f}")
    print(f"Recall:    {exp_1_map[2]:.4f}")
    print(f"F1 Score:  {exp_1_map[3]:.4f}")
    print()
    exp_2_no_map = get_precision_recall_f1_and_accuracy(dataset,'experiment_2','anthroscore',print_res=False,map_pred=False)
    print(f"scenario: {dataset} anthroscore experiment 2 no mapping")
    print(f"Accuracy: {exp_2_no_map[0]:.4f}")
    print(f"Precision: {exp_2_no_map[1]:.4f}")
    print(f"Recall:    {exp_2_no_map[2]:.4f}")
    print(f"F1 Score:  {exp_2_no_map[3]:.4f}")
    print()
    exp_2_map = get_precision_recall_f1_and_accuracy(dataset,'experiment_2','anthroscore',print_res=False,map_pred=True)
    print(f"scenario: {dataset} anthroscore experiment 2 with mapping")
    print(f"Accuracy: {exp_2_map[0]:.4f}")
    print(f"Precision: {exp_2_map[1]:.4f}")
    print(f"Recall:    {exp_2_map[2]:.4f}")
    print(f"F1 Score:  {exp_2_map[3]:.4f}")
    print()

### Comparison between anthroscore and AtypicalAnimacy on the binary cases only

Get accuracy, precision, recall and f1-score (for multilabel evaluation sets adjective_phrases, verb_subjects and verb_objects), and accuracy for adjective_phrases, verb_subjects, verb_objects, noun_phrases and possessives.

(for the last two accuracy is the same as recall, since there is only 1 expectation - positive).

In [8]:
multi_label_datasets = ["adjective_phrases", "verb_objects", "verb_subjects"] # recall and precision matters
single_label_datasets = ['noun_phrases', 'possessives'] # only recall i.e. accuracy matters

for dataset in multi_label_datasets:
    get_precision_recall_f1_and_accuracy(dataset,'experiment_1','anthroscore',print_res=True,map_pred=False,include_inconclusive=False)
    get_precision_recall_f1_and_accuracy(dataset,'experiment_1','AtypicalAnimacy',print_res=True,map_pred=False,include_inconclusive=False)
    get_precision_recall_f1_and_accuracy(dataset,'experiment_2','anthroscore',print_res=True,map_pred=False,include_inconclusive=False)
    get_precision_recall_f1_and_accuracy(dataset,'experiment_2','AtypicalAnimacy',print_res=True,map_pred=False,include_inconclusive=False)

for dataset in single_label_datasets:
    get_accuracy(dataset,'experiment_1','anthroscore')
for dataset in single_label_datasets:
    get_accuracy(dataset,'experiment_2','AtypicalAnimacy')

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases
Correct predictions (accuracy): 50 / 107
Accuracy: 0.4673
Precision: 0.4877
Recall:    0.3763
F1 Score:  0.3037

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: adjective_phrases
Correct predictions (accuracy): 62 / 106
Accuracy: 0.5849
Precision: 0.4481
Recall:    0.4625
F1 Score:  0.4130

Model: anthroscore Experiment: experiment_2 Dataset: adjective_phrases
Correct predictions (accuracy): 39 / 120
Accuracy: 0.3250
Precision: 0.3338
Recall:    0.3063
F1 Score:  0.2804

Model: AtypicalAnimacy Experiment: experiment_2 Dataset: adjective_phrases
Correct predictions (accuracy): 71 / 120
Accuracy: 0.5917
Precision: 0.4481
Recall:    0.4865
F1 Score:  0.4271

Model: anthroscore Experiment: experiment_1 Dataset: verb_objects
Correct predictions (accuracy): 75 / 147
Accuracy: 0.5102
Precision: 0.5186
Recall:    0.4608
F1 Score:  0.3996

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: verb_objects
Correct

### compare scenarios for handling anthroscore inconclusive predictions

compare accuracy, precision, recall and f1-score for two anthroscore scenarios: 

1. keeping inconclusive predictions (anthroscore between 1 and -1)
2. mapping inconclusive predictions to negative

In [5]:
multi_label_datasets = ["adjective_phrases", "verb_objects", "verb_subjects"] # recall and precision matters
single_label_datasets = ['noun_phrases', 'possessives'] # only recall i.e. accuracy matters

for dataset in multi_label_datasets:
    compare_anthroscore_scenarios(dataset)
for dataset in single_label_datasets:
    compare_anthroscore_scenarios(dataset)

scenario: adjective_phrases anthroscore experiment 1 no mapping
Accuracy: 0.4673
Precision: 0.4877
Recall:    0.3763
F1 Score:  0.3037

scenario: adjective_phrases anthroscore experiment 1 with mapping
Accuracy: 0.4766
Precision: 0.4296
Recall:    0.3712
F1 Score:  0.2753

scenario: adjective_phrases anthroscore experiment 2 no mapping
Accuracy: 0.3250
Precision: 0.3338
Recall:    0.3063
F1 Score:  0.2804

scenario: adjective_phrases anthroscore experiment 2 with mapping
Accuracy: 0.4083
Precision: 0.2981
Recall:    0.3421
F1 Score:  0.2594

scenario: verb_objects anthroscore experiment 1 no mapping
Accuracy: 0.5102
Precision: 0.5186
Recall:    0.4608
F1 Score:  0.3996

scenario: verb_objects anthroscore experiment 1 with mapping
Accuracy: 0.4626
Precision: 0.3818
Recall:    0.3743
F1 Score:  0.2750

scenario: verb_objects anthroscore experiment 2 no mapping
Accuracy: 0.3733
Precision: 0.3700
Recall:    0.3608
F1 Score:  0.3470

scenario: verb_objects anthroscore experiment 2 with mapp

### Get results for anthroscore including inconclusive expectations

Get accuracy, precision, recall and accuracy for anthroscore when the inconclusive cases are not excluded. They are excluded in the comparison between the two models because AtypicalAnimacy only provides positive and negative scores

In [16]:
# get accuracy, precision, recall and accuracy for anthroscore when the inconclusive cases are not excluded 
# they are excluded in the comparison between the two models because AtypicalAnimacy only provides positive and negative scores

for dataset in multi_label_datasets:
    get_precision_recall_f1_and_accuracy(dataset,'experiment_1','anthroscore',print_res=True,map_pred=False,include_inconclusive=True)

for dataset in multi_label_datasets:
    get_precision_recall_f1_and_accuracy(dataset,'experiment_2','anthroscore',print_res=True,map_pred=False,include_inconclusive=True)

evaluation_inconclusives()

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases
Correct predictions (accuracy): 50 / 107
Accuracy: 0.4673
Precision: 0.5683
Recall:    0.4673
F1 Score:  0.3644

Model: anthroscore Experiment: experiment_1 Dataset: verb_objects
Correct predictions (accuracy): 75 / 147
Accuracy: 0.5102
Precision: 0.5569
Recall:    0.5102
F1 Score:  0.4273

Model: anthroscore Experiment: experiment_1 Dataset: verb_subjects
Correct predictions (accuracy): 68 / 145
Accuracy: 0.4690
Precision: 0.5684
Recall:    0.4690
F1 Score:  0.4077

Model: anthroscore Experiment: experiment_2 Dataset: adjective_phrases
Correct predictions (accuracy): 39 / 120
Accuracy: 0.3250
Precision: 0.3868
Recall:    0.3250
F1 Score:  0.3071

Model: anthroscore Experiment: experiment_2 Dataset: verb_objects
Correct predictions (accuracy): 56 / 150
Accuracy: 0.3733
Precision: 0.4143
Recall:    0.3733
F1 Score:  0.3772

Model: anthroscore Experiment: experiment_2 Dataset: verb_subjects
Correct predictions (accura

### Get the trends for the inconclusive sets only

Count for each inconclusive set, how many positive, negative and inconclusive (in the case of anthroscore only) predictions were given.

In [23]:
inconclusives = ["adjective_phrases", "verb_objects", "verb_subjects","comparisons"] 

for dataset in inconclusives:
    get_inconclusive_trends(dataset,'experiment_1','anthroscore')
    get_inconclusive_trends(dataset,'experiment_2','anthroscore')
    get_inconclusive_trends(dataset,'experiment_1','AtypicalAnimacy')
    get_inconclusive_trends(dataset,'experiment_2','AtypicalAnimacy')

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the dataset: 17
Number total predictions: 17
Number of positive predictions: 1
Number of negative predictions: 15
Number of inconlcusive predictions: 1 

Model: anthroscore Experiment: experiment_2 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the dataset: 21
Number total predictions: 21
Number of positive predictions: 2
Number of negative predictions: 15
Number of inconlcusive predictions: 4 

Model: AtypicalAnimacy Experiment: experiment_1 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the dataset: 17
Number total predictions: 17
Number of positive predictions: 2
Number of negative predictions: 15
Number of inconlcusive predictions: 0  (AtypicalAnimacy does not have inconclusive predictions)

Model: AtypicalAnimacy Experiment: experiment_2 Dataset: adjective_phrases_inconclusive
Number of inconclusive cases in the datas