# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [1]:
from dataset_processing import any_word_match, embedding_for_sentence, \
    cosine_distance, ids_questions_answers_from_log_file
import os
import platform
import datetime
import pandas as pd
from tqdm import tqdm

In [6]:
# Number of the experiment whose testing/log/result folders are used below.
experiment_number = 14

# Root of the shared work directory, keyed by the value platform.system() reports.
_base_directory_by_os = {
    'Windows': "G:\\My Drive\\Shared with Julia\\Education\\Kent University\\PhD\\work\\annabell",
    'Linux': "/home/chris/gdrive/work/annabell",
    # macOS
    'Darwin': "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/",
}

operating_system = platform.system()
if operating_system not in _base_directory_by_os:
    # Stop here rather than build paths under an unknown root.
    raise Exception("unsupported OS")
base_directory = _base_directory_by_os[operating_system]

# --- Run-specific settings: these must be set to match the files being used ---
# Timestamp embedded in the names of the command and dataset files read below.
referenced_files_timestamp = "20251022_085512"
# Log file that is parsed for the (id, question, answer) triples.
test_log_filename = "logfile_nyc_squad_pretraining_validation_testing_commands_2025-10-22_16-38-14.txt"
test_input_filename = "nyc_squad_pretraining_validation_testing_commands_" + referenced_files_timestamp + ".txt"
# Alternative test input file; swap with the line above to use it instead.
#test_input_filename = "nyc_squad_testing_commands_" + referenced_files_timestamp + ".txt"

# Suffix taken from the current time so each run writes a distinct results file.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Build every path component-wise so the separator always comes from os.path.
experiments_dir = os.path.join(base_directory, "experiments")
experiment_dir = os.path.join(experiments_dir, str(experiment_number))

test_input_dir = os.path.join(experiment_dir, "testing")
test_log_dir = os.path.join(experiment_dir, "logs")
data_dir = os.path.join(experiments_dir, "data")
dataframe_directory = os.path.join(experiments_dir, "dataframes")
test_results_dir = os.path.join(experiment_dir, "results")
# exist_ok avoids the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(test_results_dir, exist_ok=True)
pretraining_dir = os.path.join(experiment_dir, "pre_training")

dataset_filename = "nyc_squad_with_pretraining_commands_" + referenced_files_timestamp + ".jsonl"
pretraining_filename = "nyc_squad_pretraining_commands_" + referenced_files_timestamp + ".txt"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(dataframe_directory, dataset_filename)
pretraining_filepath = os.path.join(pretraining_dir, pretraining_filename)

# Destination of the per-answer frequency table written later in the notebook.
test_answer_summary_filename = "test_nyc_answer_summary" + timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

# Parse (id, question, answer) triples from the test log and replace missing or
# empty answers with an explicit placeholder so they survive the dropna() below.
# NOTE(review): the recorded output contains a row whose test_answer is
# "Exploitation number of updates >= 4000", which looks like a status line rather
# than an answer -- worth checking the log parser.
ids_questions_answers = [
    (the_id, question, "NO ANSWER GIVEN" if answer is None or answer == "" else answer)
    for the_id, question, answer in ids_questions_answers_from_log_file(test_log_filepath)
]
print("length of log file questions and answers: " + str(len(ids_questions_answers)))

with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()
with open(pretraining_filepath, 'r') as pretraining_file:
    pretraining_lines = pretraining_file.readlines()

# The two files are counted with different id-marker spellings ("#id:" vs "# ID:").
total_number_of_test_samples = sum(line.startswith("#id:") for line in test_input_lines)
print(f"total number of test samples in input file: {total_number_of_test_samples}")
total_number_of_pretraining_samples = sum(line.startswith("# ID:") for line in pretraining_lines)
print(f"total number of pretraining samples in input file: {total_number_of_pretraining_samples}")

dataset_df = pd.read_json(dataset_filepath, lines=True)

# Attach the logged answers with a single id -> answer lookup instead of one
# masked .loc assignment per log entry. If an id is logged more than once the
# last answer wins, exactly as repeated assignment would.
answers_by_id = {the_id: answer for the_id, _question, answer in ids_questions_answers}
known_ids = set(dataset_df["id"])
questions_not_found = [
    question
    for the_id, question, _answer in ids_questions_answers
    if the_id not in known_ids
]
print(f"number of test samples not found in training data: {len(questions_not_found)}")
print("test samples not found in training data: " + str(questions_not_found[:5]) + " ...")

# Keep only the dataset rows that received a test answer. The "test_answer"
# column is always created here, so dropna() cannot raise KeyError even when no
# logged id matches the dataset. reset_index() deliberately keeps the previous
# row labels as an "index" column.
response_formatted_df = (
    dataset_df
    .assign(test_answer=dataset_df["id"].map(answers_by_id))
    .dropna(subset=["test_answer"])
    .reset_index()
)
response_formatted_df

length of log file questions and answers: 211
total number of test samples in input file: 211
total number of pretraining samples in input file: 211
number of test samples not found in training data: 0
test samples not found in training data: [] ...


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer
0,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89130, the major gat...",city
1,5,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United-Stat...,New-York-City,the most populate -d city in the United-States...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89131, the most popu...",New-York-City
2,8,56ce3124aab44d1400b8852c,"In 2014, what did the census estimate the popu...","In 2014, what did the cense –us estimate the p...",8491079,"In 2014, the cense –us estimate –d the populat...",8491079,? in 2014 what did the cense -us estimate the ...,8491079,in 2014 the cense -us estimate -d the populate...,True,Wh-Object/Complement Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce3124aab44d1400b8852c, in 2014 the c...",8491079
3,13,56cf9d81234ae51400d9be1d,All five boroughs of New York City formed into...,All five borough –s of New York City form –ed ...,1898,All five borough –s of New York City form –ed ...,1898,? all five borough -s of New-York-City form -e...,1898,all five borough -s of New-York-City form -ed ...,True,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cf9d81234ae51400d9be1d, all five boro...",date
4,16,56ce31baaab44d1400b8853a,What was the trading post that preceded New Yo...,What was the trade –ing post that precede –d N...,New Amsterdam,The trade –ing post that precede –d New York C...,New Amsterdam,? what was the trade -ing post that precede -d...,New-Amsterdam,the trade -ing post that precede -d New-York-C...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56ce31baaab44d1400b8853a, the trade -in...",New-Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,696,56d1196917492d1400aab93f,What industry is Broadway associated with?,What industry is Broadway associate –d with?,the theater,Broadway is associate –d with the theater indu...,the theater,? what industry is Broadway associate -d with,the theater,Broadway is associate -d with the theater indu...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56d1196917492d1400aab93f, Broadway is a...",the theater
207,702,56d119ec17492d1400aab949,What part of New Jersey can be reached from Ne...,What part of New Jersey can be reach –ed from ...,northern,The northern part of New Jersey can be reach –...,northern,? what part of New-Jersey can be reach -ed fro...,northern,the northern part of New-Jersey can be reach -...,True,Wh-Subject Question,Passive Construction,"[# ID: 56d119ec17492d1400aab949, the northern ...",northern
208,711,56d11d3317492d1400aab9c4,What is the name of a notable green office bui...,What is the name of a notable green office bui...,Hearst Tower,The Hearst Tower is the name of a notable gree...,Hearst Tower,? what is the name of a notable green office b...,Hearst-Tower,the Hearst-Tower is the name of a notable gree...,True,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56d11d3317492d1400aab9c4, the Hearst-To...",Exploitation number of updates >= 4000
209,722,56d1204617492d1400aab9fb,What square is home to the US Court of Interna...,What square is home to the US Court of Interna...,Foley Square,Foley Square is home to the US Court of Intern...,Foley Square,? what square is home to the US-Court of Inter...,Foley-Square,Foley-Square is home to the US-Court of Intern...,True,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fb, Foley-Square ...",Foley-Square


In [7]:
# Embed the model's answer and the expected (formatted) answer for every row;
# the two embedding columns are compared by cosine distance in the next cell.
def _embedding_or_none(sentence):
    """Return embedding_for_sentence(sentence), or None when the value is null."""
    return embedding_for_sentence(sentence) if pd.notnull(sentence) else None


_embedding_jobs = (
    # (source column, destination column, progress-bar label)
    ("test_answer", "test_answer_embedding", "Generating test answer embeddings"),
    ("response_answer_formatted", "response_answer_formatted_embedding",
     "Generating response answer embeddings"),
)
for _source_column, _embedding_column, _progress_label in _embedding_jobs:
    # tqdm.pandas() must be re-registered to change the label shown by progress_apply.
    tqdm.pandas(desc=_progress_label)
    response_formatted_df[_embedding_column] = (
        response_formatted_df[_source_column].progress_apply(_embedding_or_none)
    )

Generating test answer embeddings: 100%|██████████| 211/211 [00:18<00:00, 11.51it/s]
Generating response answer embeddings: 100%|██████████| 211/211 [00:18<00:00, 11.53it/s]


In [8]:
# cosine_distance is applied once per row (axis=1); presumably it reads the two
# embedding columns created in the previous cell -- confirm in dataset_processing.
row_cosine_distances = response_formatted_df.apply(cosine_distance, axis=1)
response_formatted_df["test_answer_cosine_distance"] = row_cosine_distances
response_formatted_df

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
0,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89130, the major gat...",city,"[-0.21468619, 0.0059239822, 0.028324073, 0.005...","[-0.19555481, 0.0066168252, 0.03951459, -0.000...",0.111184
1,5,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United-Stat...,New-York-City,the most populate -d city in the United-States...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89131, the most popu...",New-York-City,"[-0.19555481, 0.0066168252, 0.03951459, -0.000...","[-0.19555481, 0.0066168252, 0.03951459, -0.000...",0.000000
2,8,56ce3124aab44d1400b8852c,"In 2014, what did the census estimate the popu...","In 2014, what did the cense –us estimate the p...",8491079,"In 2014, the cense –us estimate –d the populat...",8491079,? in 2014 what did the cense -us estimate the ...,8491079,in 2014 the cense -us estimate -d the populate...,True,Wh-Object/Complement Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce3124aab44d1400b8852c, in 2014 the c...",8491079,"[-0.16186388, 0.0097914515, 0.026937574, 0.010...","[-0.16186388, 0.0097914515, 0.026937574, 0.010...",0.000000
3,13,56cf9d81234ae51400d9be1d,All five boroughs of New York City formed into...,All five borough –s of New York City form –ed ...,1898,All five borough –s of New York City form –ed ...,1898,? all five borough -s of New-York-City form -e...,1898,all five borough -s of New-York-City form -ed ...,True,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cf9d81234ae51400d9be1d, all five boro...",date,"[-0.21165806, -0.002644378, 0.010959443, -0.00...","[-0.1872016, 0.0097118905, 0.052524198, -0.017...",0.171242
4,16,56ce31baaab44d1400b8853a,What was the trading post that preceded New Yo...,What was the trade –ing post that precede –d N...,New Amsterdam,The trade –ing post that precede –d New York C...,New Amsterdam,? what was the trade -ing post that precede -d...,New-Amsterdam,the trade -ing post that precede -d New-York-C...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56ce31baaab44d1400b8853a, the trade -in...",New-Amsterdam,"[-0.19081914, -0.0004763055, 0.032385446, 0.00...","[-0.19081914, -0.0004763055, 0.032385446, 0.00...",0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,696,56d1196917492d1400aab93f,What industry is Broadway associated with?,What industry is Broadway associate –d with?,the theater,Broadway is associate –d with the theater indu...,the theater,? what industry is Broadway associate -d with,the theater,Broadway is associate -d with the theater indu...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56d1196917492d1400aab93f, Broadway is a...",the theater,"[-0.18867414, 0.013204971, 0.011697494, 0.0165...","[-0.18867414, 0.013204971, 0.011697494, 0.0165...",0.000000
207,702,56d119ec17492d1400aab949,What part of New Jersey can be reached from Ne...,What part of New Jersey can be reach –ed from ...,northern,The northern part of New Jersey can be reach –...,northern,? what part of New-Jersey can be reach -ed fro...,northern,the northern part of New-Jersey can be reach -...,True,Wh-Subject Question,Passive Construction,"[# ID: 56d119ec17492d1400aab949, the northern ...",northern,"[-0.20853904, -0.0014738719, 0.023412427, 0.00...","[-0.20853904, -0.0014738719, 0.023412427, 0.00...",0.000000
208,711,56d11d3317492d1400aab9c4,What is the name of a notable green office bui...,What is the name of a notable green office bui...,Hearst Tower,The Hearst Tower is the name of a notable gree...,Hearst Tower,? what is the name of a notable green office b...,Hearst-Tower,the Hearst-Tower is the name of a notable gree...,True,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56d11d3317492d1400aab9c4, the Hearst-To...",Exploitation number of updates >= 4000,"[-0.15511338, 0.028740732, 0.008751592, 0.0579...","[-0.20579295, -0.007590037, 0.021350298, 0.014...",0.315318
209,722,56d1204617492d1400aab9fb,What square is home to the US Court of Interna...,What square is home to the US Court of Interna...,Foley Square,Foley Square is home to the US Court of Intern...,Foley Square,? what square is home to the US-Court of Inter...,Foley-Square,Foley-Square is home to the US-Court of Intern...,True,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fb, Foley-Square ...",Foley-Square,"[-0.19269383, -0.00928507, -0.012149676, 0.003...","[-0.19269383, -0.00928507, -0.012149676, 0.003...",0.000000


In [9]:
# Frequency table of the distinct answers the model gave: one row per unique
# 'test_answer' with its number of occurrences, ordered from most to least frequent.
test_answer_summary = (
    response_formatted_df['test_answer']
    .value_counts()
    .rename_axis('test_answer')      # label the answers column
    .reset_index(name='count')       # label the occurrences column
    .sort_values(by='count', ascending=False)
)
test_answer_summary

Unnamed: 0,test_answer,count
0,city,3
2,New-York-City,3
3,Manhattan,3
4,date,3
1,NYC,3
...,...,...
76,20,1
77,250,1
78,PATCO-Speedline,1
79,the theater,1


In [10]:
# Save the answer frequency table as a tab-separated file.
test_answer_summary.to_csv(tsv_results_filepath, sep="\t", index=False)


def _has_more_than_20_words(answer_text):
    """Return True when a non-null answer splits into more than 20 whitespace-separated words."""
    if pd.notnull(answer_text):
        return len(answer_text.split()) > 20
    return False


# Count the test answers that are longer than 20 words.
test_answers = response_formatted_df["test_answer"]
num_long_answers = test_answers.apply(_has_more_than_20_words).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")

# An answer counts as correct only when it equals the expected formatted answer exactly.
expected_answers = response_formatted_df["response_answer_formatted"]
response_formatted_df["test_answer_correct"] = test_answers == expected_answers

total_rows = len(response_formatted_df)
number_correct = response_formatted_df["test_answer_correct"].sum()
print(f"number correct = {number_correct} out of {total_rows}")
percentage_correct = response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

# Flag each row with the result of any_word_match (applied row-wise), then
# report how many rows were flagged and what share of all rows that is.
any_word_flags = response_formatted_df.apply(any_word_match, axis=1)
response_formatted_df["test_answer_any_matching_word"] = any_word_flags

scored_row_count = len(response_formatted_df)
percentage_any_word_matches = response_formatted_df["test_answer_any_matching_word"].mean() * 100
number_any_word_matches = response_formatted_df["test_answer_any_matching_word"].sum()
print(f"number any word matches = {number_any_word_matches} out of {scored_row_count}")
print(f"percentage any word matches = {percentage_any_word_matches} %")

# Rows whose cosine distance is strictly below this value are treated as close answers.
cosine_distance_threshold = 0.1


def _percentage_of_total(subset_df):
    """Return len(subset_df) as a percentage of all rows in response_formatted_df.

    Returns NaN when response_formatted_df is empty instead of raising
    ZeroDivisionError, matching what the mean()-based percentages above print.
    """
    total_rows = len(response_formatted_df)
    if total_rows == 0:
        return float("nan")
    return len(subset_df) / total_rows * 100


# Rows where the cosine distance is less than the threshold.
close_cosine_distance_df = response_formatted_df[
    response_formatted_df["test_answer_cosine_distance"] < cosine_distance_threshold]
print(f"number of rows with cosine distance less than {cosine_distance_threshold}: {len(close_cosine_distance_df)}")
print("percentage of total: " + str(_percentage_of_total(close_cosine_distance_df)) + " %")

# Rows that are within the threshold AND were flagged by the any-word match.
close_cosine_distance_correct_df = close_cosine_distance_df[
    close_cosine_distance_df["test_answer_any_matching_word"]]
print(
    f"number of rows with cosine distance less than {cosine_distance_threshold} and any matching answer correct: {len(close_cosine_distance_correct_df)}")
print("percentage of total: " + str(_percentage_of_total(close_cosine_distance_correct_df)) + " %")

close_cosine_distance_correct_df

number of test answers longer than 20 words: 0
number correct = 173 out of 211
percentage correct = 81.99052132701422 %
number any word matches = 177 out of 211
percentage any word matches = 83.88625592417061 %
number of rows with cosine distance less than 0.1: 190
percentage of total: 90.04739336492891 %
number of rows with cosine distance less than 0.1 and any matching answer correct: 175
percentage of total: 82.93838862559242 %


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
1,5,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United-Stat...,New-York-City,the most populate -d city in the United-States...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89131, the most popu...",New-York-City,"[-0.19555481, 0.0066168252, 0.03951459, -0.000...","[-0.19555481, 0.0066168252, 0.03951459, -0.000...",0.0,True,True
2,8,56ce3124aab44d1400b8852c,"In 2014, what did the census estimate the popu...","In 2014, what did the cense –us estimate the p...",8491079,"In 2014, the cense –us estimate –d the populat...",8491079,? in 2014 what did the cense -us estimate the ...,8491079,in 2014 the cense -us estimate -d the populate...,True,Wh-Object/Complement Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce3124aab44d1400b8852c, in 2014 the c...",8491079,"[-0.16186388, 0.0097914515, 0.026937574, 0.010...","[-0.16186388, 0.0097914515, 0.026937574, 0.010...",0.0,True,True
4,16,56ce31baaab44d1400b8853a,What was the trading post that preceded New Yo...,What was the trade –ing post that precede –d N...,New Amsterdam,The trade –ing post that precede –d New York C...,New Amsterdam,? what was the trade -ing post that precede -d...,New-Amsterdam,the trade -ing post that precede -d New-York-C...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56ce31baaab44d1400b8853a, the trade -in...",New-Amsterdam,"[-0.19081914, -0.0004763055, 0.032385446, 0.00...","[-0.19081914, -0.0004763055, 0.032385446, 0.00...",0.0,True,True
5,20,56ce31baaab44d1400b8853e,In what year did New York cease the be the cap...,In what year did New York cease the be the cap...,1790,New York cease –d to be the capital of the Uni...,1790,? in what year did New-York cease the be the c...,1790,New-York cease -d to be the capital of the Uni...,True,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce31baaab44d1400b8853e, New-York ceas...",1790,"[-0.17822242, -0.003509889, 0.058659993, 0.012...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.0,True,True
7,26,56ce32e7aab44d1400b88551,How many stations are operated by the New York...,How many station –s are operate –d by the New ...,469,There are 469 station –s operate –d by the New...,469,? how many station -s are operate -d by the Ne...,469,there are 469 station -s operate -d by the New...,True,Quantitative Wh-Question,Existential Clause,"[# ID: 56ce32e7aab44d1400b88551, there are 469...",469,"[-0.19621344, 0.028691122, 0.032106396, -0.000...","[-0.19621344, 0.028691122, 0.032106396, -0.000...",0.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,691,56d1170417492d1400aab8fd,What 24-hour rapid transit system is in Philad...,What 24-hour rapid transit system is in Philad...,PATCO Speedline,The PATCO Speedline is a 24-hour rapid transit...,PATCO Speedline,? what 24-hour rapid transit system is in Phil...,PATCO-Speedline,the PATCO-Speedline is a 24-hour rapid transit...,True,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56d1170417492d1400aab8fd, the PATCO-Spe...",PATCO-Speedline,"[-0.1767848, -0.018809885, 0.027706172, 0.0169...","[-0.1767848, -0.018809885, 0.027706172, 0.0169...",0.0,True,True
205,695,56d1191d17492d1400aab933,Where does the aerial tramway that starts on R...,Where does the aerial tramway that start –s on...,Manhattan Island,The aerial tramway that start –s on Roosevelt ...,Manhattan Island,? where does the aerial tramway that start -s ...,Manhattan-Island,the aerial tramway that start -s on Roosevelt-...,True,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56d1191d17492d1400aab933, the aerial tr...",Manhattan-Island,"[-0.20197348, 0.0015129941, 0.034708988, -0.00...","[-0.20197348, 0.0015129941, 0.034708988, -0.00...",0.0,True,True
206,696,56d1196917492d1400aab93f,What industry is Broadway associated with?,What industry is Broadway associate –d with?,the theater,Broadway is associate –d with the theater indu...,the theater,? what industry is Broadway associate -d with,the theater,Broadway is associate -d with the theater indu...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56d1196917492d1400aab93f, Broadway is a...",the theater,"[-0.18867414, 0.013204971, 0.011697494, 0.0165...","[-0.18867414, 0.013204971, 0.011697494, 0.0165...",0.0,True,True
207,702,56d119ec17492d1400aab949,What part of New Jersey can be reached from Ne...,What part of New Jersey can be reach –ed from ...,northern,The northern part of New Jersey can be reach –...,northern,? what part of New-Jersey can be reach -ed fro...,northern,the northern part of New-Jersey can be reach -...,True,Wh-Subject Question,Passive Construction,"[# ID: 56d119ec17492d1400aab949, the northern ...",northern,"[-0.20853904, -0.0014738719, 0.023412427, 0.00...","[-0.20853904, -0.0014738719, 0.023412427, 0.00...",0.0,True,True


In [11]:
# Rows where the test answer equals the expected formatted answer exactly.
is_exact_match = response_formatted_df["test_answer_correct"]
correct_matches = response_formatted_df.loc[is_exact_match]
correct_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
1,5,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United-Stat...,New-York-City,the most populate -d city in the United-States...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89131, the most popu...",New-York-City,"[-0.19555481, 0.0066168252, 0.03951459, -0.000...","[-0.19555481, 0.0066168252, 0.03951459, -0.000...",0.0,True,True
2,8,56ce3124aab44d1400b8852c,"In 2014, what did the census estimate the popu...","In 2014, what did the cense –us estimate the p...",8491079,"In 2014, the cense –us estimate –d the populat...",8491079,? in 2014 what did the cense -us estimate the ...,8491079,in 2014 the cense -us estimate -d the populate...,True,Wh-Object/Complement Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce3124aab44d1400b8852c, in 2014 the c...",8491079,"[-0.16186388, 0.0097914515, 0.026937574, 0.010...","[-0.16186388, 0.0097914515, 0.026937574, 0.010...",0.0,True,True
4,16,56ce31baaab44d1400b8853a,What was the trading post that preceded New Yo...,What was the trade –ing post that precede –d N...,New Amsterdam,The trade –ing post that precede –d New York C...,New Amsterdam,? what was the trade -ing post that precede -d...,New-Amsterdam,the trade -ing post that precede -d New-York-C...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56ce31baaab44d1400b8853a, the trade -in...",New-Amsterdam,"[-0.19081914, -0.0004763055, 0.032385446, 0.00...","[-0.19081914, -0.0004763055, 0.032385446, 0.00...",0.0,True,True
5,20,56ce31baaab44d1400b8853e,In what year did New York cease the be the cap...,In what year did New York cease the be the cap...,1790,New York cease –d to be the capital of the Uni...,1790,? in what year did New-York cease the be the c...,1790,New-York cease -d to be the capital of the Uni...,True,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce31baaab44d1400b8853e, New-York ceas...",1790,"[-0.17822242, -0.003509889, 0.058659993, 0.012...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.0,True,True
7,26,56ce32e7aab44d1400b88551,How many stations are operated by the New York...,How many station –s are operate –d by the New ...,469,There are 469 station –s operate –d by the New...,469,? how many station -s are operate -d by the Ne...,469,there are 469 station -s operate -d by the New...,True,Quantitative Wh-Question,Existential Clause,"[# ID: 56ce32e7aab44d1400b88551, there are 469...",469,"[-0.19621344, 0.028691122, 0.032106396, -0.000...","[-0.19621344, 0.028691122, 0.032106396, -0.000...",0.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,691,56d1170417492d1400aab8fd,What 24-hour rapid transit system is in Philad...,What 24-hour rapid transit system is in Philad...,PATCO Speedline,The PATCO Speedline is a 24-hour rapid transit...,PATCO Speedline,? what 24-hour rapid transit system is in Phil...,PATCO-Speedline,the PATCO-Speedline is a 24-hour rapid transit...,True,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56d1170417492d1400aab8fd, the PATCO-Spe...",PATCO-Speedline,"[-0.1767848, -0.018809885, 0.027706172, 0.0169...","[-0.1767848, -0.018809885, 0.027706172, 0.0169...",0.0,True,True
205,695,56d1191d17492d1400aab933,Where does the aerial tramway that starts on R...,Where does the aerial tramway that start –s on...,Manhattan Island,The aerial tramway that start –s on Roosevelt ...,Manhattan Island,? where does the aerial tramway that start -s ...,Manhattan-Island,the aerial tramway that start -s on Roosevelt-...,True,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56d1191d17492d1400aab933, the aerial tr...",Manhattan-Island,"[-0.20197348, 0.0015129941, 0.034708988, -0.00...","[-0.20197348, 0.0015129941, 0.034708988, -0.00...",0.0,True,True
206,696,56d1196917492d1400aab93f,What industry is Broadway associated with?,What industry is Broadway associate –d with?,the theater,Broadway is associate –d with the theater indu...,the theater,? what industry is Broadway associate -d with,the theater,Broadway is associate -d with the theater indu...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56d1196917492d1400aab93f, Broadway is a...",the theater,"[-0.18867414, 0.013204971, 0.011697494, 0.0165...","[-0.18867414, 0.013204971, 0.011697494, 0.0165...",0.0,True,True
207,702,56d119ec17492d1400aab949,What part of New Jersey can be reached from Ne...,What part of New Jersey can be reach –ed from ...,northern,The northern part of New Jersey can be reach –...,northern,? what part of New-Jersey can be reach -ed fro...,northern,the northern part of New-Jersey can be reach -...,True,Wh-Subject Question,Passive Construction,"[# ID: 56d119ec17492d1400aab949, the northern ...",northern,"[-0.20853904, -0.0014738719, 0.023412427, 0.00...","[-0.20853904, -0.0014738719, 0.023412427, 0.00...",0.0,True,True


In [12]:
# Rows flagged by the any-word match against the expected answer.
has_any_matching_word = response_formatted_df["test_answer_any_matching_word"]
any_matches = response_formatted_df.loc[has_any_matching_word]
any_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
1,5,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United-Stat...,New-York-City,the most populate -d city in the United-States...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89131, the most popu...",New-York-City,"[-0.19555481, 0.0066168252, 0.03951459, -0.000...","[-0.19555481, 0.0066168252, 0.03951459, -0.000...",0.0,True,True
2,8,56ce3124aab44d1400b8852c,"In 2014, what did the census estimate the popu...","In 2014, what did the cense –us estimate the p...",8491079,"In 2014, the cense –us estimate –d the populat...",8491079,? in 2014 what did the cense -us estimate the ...,8491079,in 2014 the cense -us estimate -d the populate...,True,Wh-Object/Complement Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce3124aab44d1400b8852c, in 2014 the c...",8491079,"[-0.16186388, 0.0097914515, 0.026937574, 0.010...","[-0.16186388, 0.0097914515, 0.026937574, 0.010...",0.0,True,True
4,16,56ce31baaab44d1400b8853a,What was the trading post that preceded New Yo...,What was the trade –ing post that precede –d N...,New Amsterdam,The trade –ing post that precede –d New York C...,New Amsterdam,? what was the trade -ing post that precede -d...,New-Amsterdam,the trade -ing post that precede -d New-York-C...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56ce31baaab44d1400b8853a, the trade -in...",New-Amsterdam,"[-0.19081914, -0.0004763055, 0.032385446, 0.00...","[-0.19081914, -0.0004763055, 0.032385446, 0.00...",0.0,True,True
5,20,56ce31baaab44d1400b8853e,In what year did New York cease the be the cap...,In what year did New York cease the be the cap...,1790,New York cease –d to be the capital of the Uni...,1790,? in what year did New-York cease the be the c...,1790,New-York cease -d to be the capital of the Uni...,True,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce31baaab44d1400b8853e, New-York ceas...",1790,"[-0.17822242, -0.003509889, 0.058659993, 0.012...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.0,True,True
7,26,56ce32e7aab44d1400b88551,How many stations are operated by the New York...,How many station –s are operate –d by the New ...,469,There are 469 station –s operate –d by the New...,469,? how many station -s are operate -d by the Ne...,469,there are 469 station -s operate -d by the New...,True,Quantitative Wh-Question,Existential Clause,"[# ID: 56ce32e7aab44d1400b88551, there are 469...",469,"[-0.19621344, 0.028691122, 0.032106396, -0.000...","[-0.19621344, 0.028691122, 0.032106396, -0.000...",0.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,691,56d1170417492d1400aab8fd,What 24-hour rapid transit system is in Philad...,What 24-hour rapid transit system is in Philad...,PATCO Speedline,The PATCO Speedline is a 24-hour rapid transit...,PATCO Speedline,? what 24-hour rapid transit system is in Phil...,PATCO-Speedline,the PATCO-Speedline is a 24-hour rapid transit...,True,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56d1170417492d1400aab8fd, the PATCO-Spe...",PATCO-Speedline,"[-0.1767848, -0.018809885, 0.027706172, 0.0169...","[-0.1767848, -0.018809885, 0.027706172, 0.0169...",0.0,True,True
205,695,56d1191d17492d1400aab933,Where does the aerial tramway that starts on R...,Where does the aerial tramway that start –s on...,Manhattan Island,The aerial tramway that start –s on Roosevelt ...,Manhattan Island,? where does the aerial tramway that start -s ...,Manhattan-Island,the aerial tramway that start -s on Roosevelt-...,True,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56d1191d17492d1400aab933, the aerial tr...",Manhattan-Island,"[-0.20197348, 0.0015129941, 0.034708988, -0.00...","[-0.20197348, 0.0015129941, 0.034708988, -0.00...",0.0,True,True
206,696,56d1196917492d1400aab93f,What industry is Broadway associated with?,What industry is Broadway associate –d with?,the theater,Broadway is associate –d with the theater indu...,the theater,? what industry is Broadway associate -d with,the theater,Broadway is associate -d with the theater indu...,True,Wh-Object/Complement Question,Passive Construction,"[# ID: 56d1196917492d1400aab93f, Broadway is a...",the theater,"[-0.18867414, 0.013204971, 0.011697494, 0.0165...","[-0.18867414, 0.013204971, 0.011697494, 0.0165...",0.0,True,True
207,702,56d119ec17492d1400aab949,What part of New Jersey can be reached from Ne...,What part of New Jersey can be reach –ed from ...,northern,The northern part of New Jersey can be reach –...,northern,? what part of New-Jersey can be reach -ed fro...,northern,the northern part of New-Jersey can be reach -...,True,Wh-Subject Question,Passive Construction,"[# ID: 56d119ec17492d1400aab949, the northern ...",northern,"[-0.20853904, -0.0014738719, 0.023412427, 0.00...","[-0.20853904, -0.0014738719, 0.023412427, 0.00...",0.0,True,True


In [13]:
# Keep only the rows whose "test_answer_correct" flag is not set, then display them.
incorrect_matches = response_formatted_df.loc[
    lambda frame: ~frame["test_answer_correct"]
]
incorrect_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
0,4,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New-York-City,the major gateway for immigrate -ion has been ...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cf5d41aab44d1400b89130, the major gat...",city,"[-0.21468619, 0.0059239822, 0.028324073, 0.005...","[-0.19555481, 0.0066168252, 0.03951459, -0.000...",0.111184,False,False
3,13,56cf9d81234ae51400d9be1d,All five boroughs of New York City formed into...,All five borough –s of New York City form –ed ...,1898,All five borough –s of New York City form –ed ...,1898,? all five borough -s of New-York-City form -e...,1898,all five borough -s of New-York-City form -ed ...,True,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cf9d81234ae51400d9be1d, all five boro...",date,"[-0.21165806, -0.002644378, 0.010959443, -0.00...","[-0.1872016, 0.0097118905, 0.052524198, -0.017...",0.171242,False,False
6,24,56cf9df0234ae51400d9be29,New York City is the biggest city in the Unite...,New York City is the big –gest city in the Uni...,1790,New York City has been the big –gest city in t...,1790,? New-York-City is the big -gest city in the U...,1790,New-York-City has been the big -gest city in t...,True,Wh-Adverbial Question,Subject-Verb-Complement (SVC),"[# ID: 56cf9df0234ae51400d9be29, New-York-City...",date,"[-0.21165806, -0.002644378, 0.010959443, -0.00...","[-0.17822242, -0.003509889, 0.058659993, 0.012...",0.169862,False,False
11,39,56cfa06a234ae51400d9be3b,Giovani da Verrazzano was an explorer from whi...,Giovani da Verrazzano was an explore –er from ...,France,Giovani da Verrazzano was an explore –er from ...,France,? Giovani da Verrazzano was an explore -er fro...,France,Giovani da Verrazzano was an explore -er from ...,True,Wh-in-situ Question,Subject-Verb-Complement (SVC),"[# ID: 56cfa06a234ae51400d9be3b, Giovani da Ve...",country,"[-0.2165549, -0.0008460405, 0.025422502, 0.009...","[-0.21558376, -0.0055248025, 0.034814924, 0.00...",0.04457,False,False
17,60,56cfabed234ae51400d9be4b,Which street in NYC today is now named after J...,Which street in NYC today is now name –d after...,"Broadway, from 159th Street to 218th Street",The street in NYC today that is now name –d af...,"Broadway, from 159th Street to 218th Street",? which street in NYC today is now name -d aft...,Broadway from 159th Street to 218th Street,the street in NYC today that is now name -d af...,True,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cfabed234ae51400d9be4b, the street in...",...,"[-0.19871223, -0.014779658, 0.0133242775, -0.0...","[-0.17046607, -0.0011177479, 0.04031374, -0.00...",0.247959,False,False
21,69,56cedc10aab44d1400b88b19,What did the English call New Amsterdam after ...,What did the English call New Amsterdam after ...,New York,The English call –ed New Amsterdam New York af...,New York,? what did the English call New-Amsterdam afte...,New-York,the English call -ed New-Amsterdam-New-York af...,True,Wh-Object/Complement Question,Subject-Verb-Object-Complement (SVOC),"[# ID: 56cedc10aab44d1400b88b19, the English c...",capture,"[-0.2194085, -0.0031048143, 0.013731885, -0.00...","[-0.1972834, 0.00014738683, 0.026876723, 0.000...",0.100256,False,False
27,77,56cedc8eaab44d1400b88b25,What did Colve call New York after he captured...,What did Colve call New York after he capture ...,New Orange,Colve call –ed New York New Orange after he ca...,New Orange,? what did Colve call New-York after he captur...,New-Orange,Colve call -ed New-York-New-Orange after he ca...,True,Wh-Object/Complement Question,Subject-Verb-Object-Complement (SVOC),"[# ID: 56cedc8eaab44d1400b88b25, Colve call -e...",capture,"[-0.2194085, -0.0031048143, 0.013731885, -0.00...","[-0.18589011, 0.010557237, -0.0034622413, -0.0...",0.095947,False,False
30,84,56cfb502234ae51400d9beb1,"In 1700, the Lenape Native American population...","In 1700, the Lenape Native American populate –...",200,"In 1700, the Lenape Native American populate –...",200,? in 1700 the Lenape-Native-American populate ...,200,in 1700 the Lenape-Native-American populate -i...,True,Quantitative Wh-Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cfb502234ae51400d9beb1, in 1700 the L...",many,"[-0.21317689, -0.0027783406, 0.020835098, 0.00...","[-0.17841734, 0.015454305, 0.0016121272, 0.010...",0.161082,False,False
31,90,56cfb5cb234ae51400d9beb6,Which city in North America held the most slav...,Which city in North America held the most slav...,"Charleston, South Carolina","Charleston, South Carolina held the most slave...","Charleston, South Carolina",? which city in North-America held the most sl...,Charleston-South-Carolina,Charleston-South-Carolina held the most slave ...,True,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cfb5cb234ae51400d9beb6, Charleston-So...",1700s,"[-0.18142706, -0.001205836, 0.046104997, 0.007...","[-0.2112118, -0.0225463, 0.015903791, 0.009704...",0.233699,False,False
44,142,56cee5a1aab44d1400b88c27,What events provoked the immigration of people...,What event –s provoke –d the immigrate –ion of...,revolutions,Revolution –s provoke –d the immigrate –ion of...,revolutions,? what event -s provoke -d the immigrate -ion ...,revolutions,Revolution -s provoke -d the immigrate -ion of...,True,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cee5a1aab44d1400b88c27, Revolution -s...",Germany,"[-0.2064394, -0.0018712592, 0.026567126, -0.00...","[-0.20433809, -0.010912872, 0.014363855, 0.003...",0.07652,False,False


In [14]:
# Export the full results dataframe to a TSV file and write a plain-text summary
# (headline metrics followed by markdown tables of the matching rows).

# Take the timestamp once so the TSV and the summary always share the same suffix;
# two separate now() calls can straddle a second boundary and give mismatched names.
results_timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

tsv_results_filename = "test_nyc_results" + results_timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + results_timestamp + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)


def _write_table_section(results_file, heading, rows, columns):
    """Write one summary section: a heading line followed by a markdown table.

    Args:
        results_file: open, writable text file handle.
        heading: text of the heading line (written with a newline before and after).
        rows: dataframe holding the rows to report.
        columns: list of column names from ``rows`` to include in the table.
    """
    results_file.write(f"\n{heading}\n")
    results_file.write(rows[columns].to_markdown(index=False))


# Columns shown in every table; the distance column is added for the cosine sections.
summary_columns = ["response_question", "response_answer", "test_answer"]
summary_columns_with_distance = summary_columns + ["test_answer_cosine_distance"]

# Headline metrics, written one per line as "<label>\t<value>".
number_of_test_answers = len(response_formatted_df)
summary_metrics = [
    ("total number of samples", total_number_of_test_samples),
    ("number_of_test_answers", number_of_test_answers),
    ("total_number_of_pretraining_samples", total_number_of_pretraining_samples),
    ("percentage_correct", percentage_correct),
    ("percentage_any_word_matches", percentage_any_word_matches),
    ("percentage_close_cosine_distance",
     len(close_cosine_distance_df) / number_of_test_answers * 100),
    ("percentage_close_cosine_distance_and_any_word_match",
     len(close_cosine_distance_correct_df) / number_of_test_answers * 100),
    ("number of test answers longer than 20 words (removed)", num_long_answers),
]

# Explicit UTF-8: the tables contain non-ASCII characters and the platform default
# encoding differs between operating systems; this also matches to_csv's default.
with open(results_summary_filepath, 'w', encoding="utf-8") as results_file:
    for metric_label, metric_value in summary_metrics:
        results_file.write(f"{metric_label}\t{metric_value}\n")

    # rows counted as exact matches
    _write_table_section(results_file, "Rows with exact matches:",
                         correct_matches, summary_columns)
    # rows in any_matches
    _write_table_section(results_file, "Rows with any word matches:",
                         any_matches, summary_columns)
    # rows in close_cosine_distance_df
    _write_table_section(results_file,
                         f"Rows with cosine distance less than {cosine_distance_threshold}:",
                         close_cosine_distance_df, summary_columns_with_distance)
    # rows in close_cosine_distance_correct_df
    _write_table_section(results_file,
                         f"Rows with cosine distance less than {cosine_distance_threshold} and any word match:",
                         close_cosine_distance_correct_df, summary_columns_with_distance)
    # exact-match rows again, this time including the cosine distance column
    # NOTE(review): correct_matches is written here without any cosine-distance filter,
    # although the heading mentions the threshold — presumably every exact match has a
    # distance below it; confirm against where correct_matches is built.
    _write_table_section(results_file,
                         f"Rows with cosine distance less than {cosine_distance_threshold} and exact match:",
                         correct_matches, summary_columns_with_distance)
print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")

results written to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/results/test_nyc_results_20251023_074417.tsv and /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/14/results/test_nyc_results_summary_20251023_074417.txt
