# Result Generation

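This notebook abstracts the core of the former test_results.py script. For every prediction file, it outputs scores for the four fluency metrics (BLEU, chrF++, TER, and METEOR) and for our ontology detection metrics (recall, precision, and F1).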

In [2]:
import os
import evaluate
import pandas as pd

#Load the four fluency metrics, in a fixed order, and bind each to its own name
bleu, chrf, ter, meteor = [
    evaluate.load(metric_name)
    for metric_name in ("sacrebleu", "chrf", "ter", "meteor")
]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
#Gather every prediction file collected so far - 32 of them across both models
from os import listdir
from os.path import isfile, join

base_path = "predictions/opus_en_fr_base/"
big_path = "predictions/opus_en_fr_big/"

def _files_in(directory):
    """Return the joined paths of the regular files located directly in ``directory``."""
    found = []
    for entry in listdir(directory):
        candidate = join(directory, entry)
        if isfile(candidate): #Skip sub-directories and anything else that is not a file
            found.append(candidate)
    return found

base_files = _files_in(base_path) #16 files, each comprising all our term predictions
big_files = _files_in(big_path)
all_filenames = base_files + big_files

In [4]:
def getSentences(filename):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list containing one string per line of the file, in file order.
        Lines that are blank after stripping are kept as empty strings, so
        the list length always equals the number of lines in the file.
    """
    #The context manager closes the handle even when reading or decoding raises
    with open(filename, "r", encoding = "utf8") as f:
        return [line.strip() for line in f]

In [5]:
#Reference sentences, read one per line and scored against every prediction file
REFERENCE_FILE = "wmt22gold.txt"
#Tab-separated table of reference ontology concepts; joined to the predicted table on sent_ID and concept
ONTO_FILE = "wmt22gold_onto_concepts.txt"

In [6]:
def assign_tag(pred_counts, ref_counts): #Helps us to compute summary statistics later
    """Classify each (predicted, reference) concept count pair and split it into TP/FP/FN.

    The two sequences are walked in lockstep by position, so the result does
    not depend on how the inputs are indexed (e.g. a pandas Series whose index
    is not 0..n-1).

    Args:
        pred_counts: Sequence of predicted counts, one entry per row.
        ref_counts: Sequence of reference counts, aligned with ``pred_counts``.

    Returns:
        A tuple ``(true_positives, false_positives, false_negatives, tags)`` of
        four lists, each with one entry per input row. ``tags`` holds
        "MISSED" (nothing predicted), "EXTRANEOUS" (predicted but absent from
        the reference) or "RECOVERED" (present on both sides).

    Raises:
        ValueError: If the two sequences have different lengths.
    """
    if len(pred_counts) != len(ref_counts):
        raise ValueError("pred_counts and ref_counts must have the same length")

    tags = []
    true_positives = []
    false_positives = []
    false_negatives = []
    for (pred, ref) in zip(pred_counts, ref_counts):
        if (pred == 0):
            #Nothing was predicted: every reference occurrence is a miss
            tag, tp, fp, fn = "MISSED", 0, 0, ref
        elif (ref == 0):
            #Nothing was expected: every predicted occurrence is spurious
            tag, tp, fp, fn = "EXTRANEOUS", 0, pred, 0
        else:
            tag = "RECOVERED"
            tp = min(pred, ref) #Only the overlapping occurrences count as hits
            fp = max(pred - ref, 0) #Surplus predictions when we overpredict
            fn = max(ref - pred, 0) #Shortfall when we underpredict
        tags.append(tag)
        true_positives.append(tp)
        false_positives.append(fp)
        false_negatives.append(fn)
    return (true_positives, false_positives, false_negatives, tags)

In [65]:
from tqdm import tqdm

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

#The references are the same for every prediction file, so read them once up front
refs = getSentences(REFERENCE_FILE)
onto_refs = pd.read_csv(ONTO_FILE, sep = "\t")

#For each prediction file: tag its ontology concepts against the reference, log the
#per-concept table, then write the fluency and ontology summary scores to a results file
for filename in tqdm(all_filenames):

    #Compose filenames
    destination_metrics = filename.replace("predictions", "results").replace("_pred", "_results")
    destination_onto_table = filename.replace("predictions", "results").replace("_pred", "_onto_output")
    predicted_onto = filename.replace("predictions", "results/ontology_annotations").replace("_pred", "_onto_concepts")

    #Read in data
    preds = getSentences(filename)
    onto_preds = pd.read_csv(predicted_onto, sep = "\t")

    #Scan for duplicates and handle them if they made it through our initial filter - rare edge case where the first annotated occurrence is not detected by a subsequent ontology
    #but subsequent occurrences are detected
    #In this case, we know that the duplicated row has fewer counts than the first occurrence, so we can drop it safely. The fact of the matter is that "Poids" was detected twice.
    #onto_preds = onto_preds[["sent_ID", "concept", "count"]]
    #onto_preds = onto_preds.drop_duplicates(subset=["sent_ID", "concept"], ignore_index = True).reset_index(drop=True)
    #onto_preds.to_csv(predicted_onto, sep = "\t", header = True, index = False) #Write back to source

    #Full outer join enables easy detection of matches, extraneous predictions, and misses
    aggregated_onto = onto_preds.merge(onto_refs, how = "outer", on = ["sent_ID", "concept"],
                                       suffixes = ["_preds", "_refs"])
    aggregated_onto = aggregated_onto.fillna(0).astype({"count_preds": int, "count_refs": int}) #NaNs just mean either extraneous or missed
    tp_column, fp_column, fn_column, remarks = assign_tag(aggregated_onto["count_preds"], aggregated_onto["count_refs"])
    aggregated_onto["true_positives"] = tp_column
    aggregated_onto["false_positives"] = fp_column
    aggregated_onto["false_negatives"] = fn_column
    aggregated_onto["remark"] = remarks

    #Log table for human interpretation
    aggregated_onto.to_csv(destination_onto_table, sep = "\t", index = False)

    #Compute summary statistics for ontology prediction
    total_ref_concepts = sum(aggregated_onto["count_refs"])
    total_pred_concepts = sum(aggregated_onto["count_preds"])
    TP = sum(aggregated_onto["true_positives"])
    FP = sum(aggregated_onto["false_positives"])
    FN = sum(aggregated_onto["false_negatives"])
    #Guard the denominators: with no reference or no predicted concepts the ratios are undefined, so report 0.0
    recall = _safe_ratio(TP, TP + FN)
    precision = _safe_ratio(TP, TP + FP)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    #Compute fluency metrics before touching the output file, so a failure here never leaves a half-written report
    bleu_score = bleu.compute(predictions = preds, references = refs)["score"] #BLEU score for provided input and references
    chrf_score = chrf.compute(predictions = preds, references = refs, word_order = 2)["score"] #Include word bigrams for CHRF++
    ter_score = ter.compute(predictions = preds, references = refs, case_sensitive = True)["score"] #Casing is important - treat as an edit error
    meteor_score = meteor.compute(predictions = preds, references = refs)["meteor"]

    #Write everything out; the context manager closes the file even if a write fails
    with open(destination_metrics, "w", encoding = "utf8") as output:
        output.write("Fluency Metrics\n\n")
        output.write("BLEU: " + str(bleu_score) + "\n")
        output.write("CHRF++: " + str(chrf_score) + "\n")
        output.write("TER: " + str(ter_score) + "\n")
        output.write("METEOR: " + str(meteor_score) + "\n")
        output.write("\nOntology Prediction Metrics\n\n")
        output.write("Total Concepts Present: " + str(total_ref_concepts) + "\n")
        output.write("Total Concepts Predicted: " + str(total_pred_concepts) + "\n")
        output.write("Recall: " + str(recall) + "\n")
        output.write("Precision: " + str(precision) + "\n")
        output.write("F1 Score: " + str(f1) + "\n")

100%|██████████| 32/32 [02:54<00:00,  5.44s/it]
