# Create final evaluation sets and evaluate the models

In [6]:
# imports

import csv
import re
import pandas as pd

In [7]:
files = ["adjective_phrases_inconclusive",
         "adjective_phrases_negative",
         "adjective_phrases_positive",
         "comparisons_inconclusive",
         "noun_phrases_positive",
         "possessives_positive",
         "verb_objects_inconclusive",
         "verb_objects_negative",
         "verb_objects_positive",
         "verb_subjects_inconclusive",
         "verb_subjects_negative",
         "verb_subjects_positive"
        ]

In [5]:
def normalized(string):
    return re.sub(r'\s+', ' ', string.strip())

def get_final_prediction(score,model):
    """
     This function converts the anthroscore/AtypicalAnimacy scores to a single numerical value in {0,1,2}
    """     
    AtypicalAnimacy_threshold = 0.3 # this was calculated during the experiment.
    score = float(score)

    if model == 'anthroscore':
        if score > 1.0:
            pred = '1'
        elif score < -1.0:
            pred = '0'
        else:
            pred = '2'
            
    elif model == 'AtypicalAnimacy':
        if score > AtypicalAnimacy_threshold:
            pred = '1'
        else:
            pred = '0'

    return pred

def create_final_eval_file(filename,experiment,model,all_indices_dict):
    """
    this function reads info from csv file and writes it to a file with uniform structure to facilitate evaluation.

    :param filename (str): name of the file 
    :param experiment (str): specify the experiment - used in input and output paths, and for obtaining correct indices
    :param experiment (str): specify the model - used to obtain final prediction {0,1,2} based on the anthro/AtypicalAnimacy score
    :param experiment (dict): pre-defined dictionary containing experiment+model string as key and index dict as value
    
    """ 
    with open(f"../final_sets/{filename}_{experiment}_{model}_predictions.csv","w") as outfile:
        
        writer = csv.writer(outfile)
        new_header = ['id','sentence','masked_sentence','AI_phrase','mask','component','expectation','model_score','prediction']
        writer.writerow(new_header)
        infile = open(f"../{experiment}/{model}/predictions/csv/{filename}.csv","r")
        header = infile.readline()
        reader = csv.reader(infile)

        eval_set = f"{experiment}_{model}"
        
        for row in reader:

            indices = all_indices_dict[eval_set]
            
            sentence_id = normalized(row[indices['id']])
            sentence = normalized(row[indices['sent']])
            masked_sent = normalized(row[indices['masked_sent']])
            AI_phrase = normalized(row[indices['phrase']])
            mask = normalized(row[indices['mask']])
            component = normalized(row[indices['comp']])
            expectation = (normalized(row[indices['exp']])) # should be numerical value {0,1,2}
            expectation = int(float(expectation))
            prediction = normalized(row[indices['pred']])


            final_pred = get_final_prediction(prediction,model)
            
            write_to_file = [sentence_id,sentence,masked_sent,AI_phrase,mask,component,expectation,prediction,final_pred]
            
            writer.writerow(write_to_file)
        
        print(f"Created {filename}_{experiment}_{model}_predictions.csv in ../final_sets/")

all_indices_dict = {'experiment_1_anthroscore':{'id':0,'sent':1,'masked_sent':2,'phrase':3,'mask':4,'comp':6,'exp':7,'pred':8},
              'experiment_1_AtypicalAnimacy':{'id':0,'sent':2,'masked_sent':3,'phrase':5,'mask':6,'comp':8,'exp':10,'pred':15},
              'experiment_2_anthroscore':{'id':0,'sent':1,'masked_sent':2,'phrase':3,'mask':4,'comp':6,'exp':7,'pred':8},
              'experiment_2_AtypicalAnimacy':{'id':0,'sent':2,'masked_sent':3,'phrase':5,'mask':6,'comp':8,'exp':9,'pred':10}
             }

#for file in files:
    #create_final_eval_file(file,'experiment_2','anthroscore',all_indices_dict)

### Get evaluation metrics

In [4]:
import pandas as pd
import os
import glob
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def get_multiclass_evaluation_metrics(dataset,experiment,model,include_inconclusive=False):

    directory = "../final_sets"
    
    pattern = os.path.join(directory, f"{dataset}*_{experiment}_{model}_predictions.csv")
    file_list = glob.glob(pattern)
        
    df_list = [pd.read_csv(file) for file in file_list]
    df = pd.concat(df_list, ignore_index=True)

    average_ = 'macro'

    if include_inconclusive == True:
        df = df[df['expectation'] != '2']
        average_ = 'weighted'

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    # convert to integers
    #df['expectation'] = df['expectation'].astype(int)
    #df['prediction'] = df['prediction'].astype(int)

    y_true = df['expectation'].astype(int)
    y_pred = df['prediction'].astype(int)
    mapped_pred = [0 if pred == 2 else pred for pred in df['prediction']]

    y_pred_to_check = y_pred

    #print("Unique values in expectation:", df['expectation'].unique())
    #print("Unique values in prediction:", df['prediction'].unique())

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=average_, zero_division=0)
    recall = recall_score(y_true, y_pred_to_check, average=average_, zero_division=0)
    f1 = f1_score(y_true, y_pred_to_check, average=average_, zero_division=0)

    #from sklearn.metrics import confusion_matrix
    #print(confusion_matrix(df['expectation'], df['prediction']))

    print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
    
    correct = (y_true == y_pred_to_check).sum()
    total = len(df)
    print(f"Correct predictions: {correct} / {total}")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print()

def get_single_class_evaluation_metrics(dataset,experiment,model,include_inconclusive=False):

    directory = "../final_sets"
    
    pattern = os.path.join(directory, f"{dataset}*_{experiment}_{model}_predictions.csv")
    file_list = glob.glob(pattern)
        
    df_list = [pd.read_csv(file) for file in file_list]
    df = pd.concat(df_list, ignore_index=True)

    if not include_inconclusive:
        df = df[df['expectation'] != '2']

    if 'expectation' not in df.columns or 'prediction' not in df.columns:
        raise ValueError("CSV must contain 'expectation' and 'prediction' columns.")

    # convert to integers
    df['expectation'] = df['expectation'].astype(int)
    df['prediction'] = df['prediction'].astype(int)

    #print("Unique values in expectation:", df['expectation'].unique())
    #print("Unique values in prediction:", df['prediction'].unique())

    accuracy = accuracy_score(df['expectation'], df['prediction'])

    print(f"Model: {model} Experiment: {experiment} Dataset: {dataset}")
    
    correct = (df['expectation'] == df['prediction']).sum()
    total = len(df)
    print(f"Correct predictions: {correct} / {total}")
    print(f"Accuracy: {accuracy:.4f}")
    print()
    

multi_label_datasets = ["adjective_phrases", "verb_objects", "verb_subjects"]
single_label_datasets = ['noun_phrases', 'possessives']
comparisons = ['comparisons']

for dataset in multi_label_datasets:
    get_multiclass_evaluation_metrics(dataset,'experiment_1','anthroscore',include_inconclusive=False)

for dataset in single_label_datasets:
    get_single_class_evaluation_metrics(dataset,'experiment_1','anthroscore')

Model: anthroscore Experiment: experiment_1 Dataset: adjective_phrases
Correct predictions: 50 / 107
Accuracy: 0.4673
Precision: 0.4877
Recall:    0.3763
F1 Score:  0.3037

Model: anthroscore Experiment: experiment_1 Dataset: verb_objects
Correct predictions: 75 / 147
Accuracy: 0.5102
Precision: 0.5186
Recall:    0.4608
F1 Score:  0.3996

Model: anthroscore Experiment: experiment_1 Dataset: verb_subjects
Correct predictions: 68 / 145
Accuracy: 0.4690
Precision: 0.5415
Recall:    0.4419
F1 Score:  0.3946

Model: anthroscore Experiment: experiment_1 Dataset: noun_phrases
Correct predictions: 7 / 66
Accuracy: 0.1061

Model: anthroscore Experiment: experiment_1 Dataset: possessives
Correct predictions: 1 / 57
Accuracy: 0.0175



In [14]:
def temp_check(filename,experiment,all_indices_dict):
    """
    this function reads info from csv file and writes it to a file with uniform structure to facilitate evaluation.

    :param filename (str): name of the file 
    :param experiment (str): specify the experiment - used in input and output paths, and for obtaining correct indices
    :param experiment (str): specify the model - used to obtain final prediction {0,1,2} based on the anthro/AtypicalAnimacy score
    :param experiment (dict): pre-defined dictionary containing experiment+model string as key and index dict as value
    
    """ 
    with open(f"../{experiment}/AtypicalAnimacy/19thcentury/expectations/csv/{filename}.csv","w") as outfile:
        
        writer = csv.writer(outfile)
        new_header = ['id','Previous Sentence','Current Sentence','Masked Sentence','Next Sentence','AI Phrase','Suggested Mask','AI Entity',
                      'Anthropomorphic Component','Target Expression', 'Animated']
        writer.writerow(new_header)

        infile = open(f"../{experiment}/AtypicalAnimacy/predictions/txt/{filename}.txt","r")
        reader = csv.reader(infile)
        
        for row in reader:

            write_to_file = row[:11]            
            writer.writerow(write_to_file)
        
        print(f"Created ../{experiment}/AtypicalAnimacy_19thBERT/expectations/csv/{filename}.csv")


for file in files:
    temp_check(file,'experiment_2',AA_dict)

Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/adjective_phrases_inconclusive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/adjective_phrases_negative.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/adjective_phrases_positive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/comparisons_inconclusive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/noun_phrases_positive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/possessives_positive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/verb_objects_inconclusive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/verb_objects_negative.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/verb_objects_positive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/expectations/csv/verb_subjects_inconclusive.csv
Created ../experiment_2/AtypicalAnimacy_19thBERT/e