# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [2]:
from dataset_processing import any_word_match, embedding_for_sentence, \
    cosine_distance, ids_questions_answers_from_log_file
import os
import platform
import datetime
import pandas as pd
from tqdm import tqdm

In [25]:
experiment_number = 12
operating_system = platform.system()
if operating_system == 'Windows':
	base_directory = "G:\\My Drive\\Shared with Julia\\Education\\Kent University\\PhD\\work\\annabell"
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell"
elif operating_system == 'Darwin':  #macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
else:
    raise Exception("unsupported OS")

timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")
referenced_files_timestamp = "20251014_063732"

test_input_dir = os.path.join(base_directory, "experiments", str(experiment_number), "testing")
test_log_dir = os.path.join(base_directory, "experiments", str(experiment_number), "logs")
data_dir = os.path.join(base_directory, "experiments/data")
test_results_dir = os.path.join(base_directory, "experiments", str(experiment_number), "results")
if not os.path.exists(test_results_dir):
    os.makedirs(test_results_dir)
pretraining_dir = os.path.join(base_directory, "experiments", str(experiment_number), "pre_training")

test_log_filename = "logfile_nyc_squad_testing_commands_2025-10-15_08-31-02.txt"
test_input_filename = "nyc_squad_testing_commands_" + referenced_files_timestamp + ".txt"
dataset_filename = "nyc_squad_with_pretraining_commands_20251014_063732.jsonl"
pretraining_filename ="nyc_squad_pretraining_commands_" + referenced_files_timestamp + ".txt"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(data_dir, dataset_filename)
pretraining_filepath =  os.path.join(pretraining_dir, pretraining_filename)

test_answer_summary_filename = "test_nyc_answer_summary" + timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

ids_questions_answers = ids_questions_answers_from_log_file(test_log_filepath)
for index, each_tuple in enumerate(ids_questions_answers):
    if each_tuple[-1] == "" or each_tuple[-1] == None:
        ids_questions_answers[index] = (each_tuple[0], each_tuple[1], "NO ANSWER GIVEN")

print("length of log file questions and answers: " + str(len(ids_questions_answers)))
with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()
total_number_of_test_samples = len([id_line for id_line in test_input_lines if id_line.startswith("#id:")])
with open(pretraining_filepath, 'r') as pretraining_file:
	pretraining_lines = pretraining_file.readlines()
print(f"total number of test samples in input file: {total_number_of_test_samples}")
total_number_of_pretraining_samples = len([id_line for id_line in pretraining_lines if id_line.startswith("# ID:")])
print(f"total number of pretraining samples in input file: {total_number_of_pretraining_samples}")
response_formatted_df = pd.read_json(dataset_filepath, lines=True)
#add the test questions to the dataframe
questions_not_found = []
for the_id, question, answer, in ids_questions_answers:
    if the_id in response_formatted_df["id"].values:
        response_formatted_df.loc[response_formatted_df["id"] == the_id, "test_answer"] = answer
    else:
        questions_not_found.append(question)
print(f"number of test samples not found in training data: {len(questions_not_found)}")
print("test samples not found in training data: " + str(questions_not_found[:5]) + " ...")
#drop any rows that are not in the test samples
response_formatted_df.dropna(subset=["test_answer"], inplace=True)
response_formatted_df.reset_index(inplace=True)
response_formatted_df

length of log file questions and answers: 601
total number of test samples in input file: 601
total number of pretraining samples in input file: 128
number of test samples not found in training data: 0
test samples not found in training data: [] ...


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",1700s
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",Exploitation number of updates >= 4000
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",One-World-Trade-Center
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",Exploitation number of updates >= 4000
4,6,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New-York-City,five,New-York-City is comprise -d of five borough -s,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852a, New-York-City...",NO ANSWER GIVEN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596,723,56d1204617492d1400aab9fc,What federal district court has jurisdiction o...,What federal district court has jurisdiction o...,the District Court for the Eastern District of...,The District Court for the Eastern District of...,the District Court for the Eastern District of...,? what federal district court has jurisdiction...,the District-Court for the Eastern-District of...,the District-Court for the Eastern-District of...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fc, the District-...",NO ANSWER GIVEN
597,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",in
598,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",two
599,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",1783


In [5]:
#generate embeddings for the test answer and the response_answer_formatted columns and compare them using cosine distance
tqdm.pandas(desc="Generating test answer embeddings")
response_formatted_df["test_answer_embedding"] = response_formatted_df["test_answer"].progress_apply(
    lambda x: embedding_for_sentence(x) if pd.notnull(x) else None)
tqdm.pandas(desc="Generating response answer embeddings")
response_formatted_df["response_answer_formatted_embedding"] = response_formatted_df[
    "response_answer_formatted"].progress_apply(
    lambda x: embedding_for_sentence(x) if pd.notnull(x) else None)

Generating test answer embeddings: 100%|██████████| 601/601 [03:11<00:00,  3.14it/s]
Generating response answer embeddings: 100%|██████████| 601/601 [03:18<00:00,  3.02it/s]


In [6]:
response_formatted_df["test_answer_cosine_distance"] = response_formatted_df.apply(cosine_distance, axis=1)
response_formatted_df

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",1700s,"[-0.18147065, -0.0011770469, 0.046052348, 0.00...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.208907
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",Exploitation number of updates >= 4000,"[-0.15523376, 0.028737156, 0.009077885, 0.0581...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.306210
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",One-World-Trade-Center,"[-0.16677752, 0.012732404, 0.05187011, 0.00256...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.207320
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",Exploitation number of updates >= 4000,"[-0.15523376, 0.028737156, 0.009077885, 0.0581...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.306210
4,6,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New-York-City,five,New-York-City is comprise -d of five borough -s,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852a, New-York-City...",NO ANSWER GIVEN,"[-0.17671387, 0.017996417, 0.028865896, 0.0410...","[-0.21587223, 0.0065979995, 0.021248076, -0.00...",0.146315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596,723,56d1204617492d1400aab9fc,What federal district court has jurisdiction o...,What federal district court has jurisdiction o...,the District Court for the Eastern District of...,The District Court for the Eastern District of...,the District Court for the Eastern District of...,? what federal district court has jurisdiction...,the District-Court for the Eastern-District of...,the District-Court for the Eastern-District of...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fc, the District-...",NO ANSWER GIVEN,"[-0.17671387, 0.017996417, 0.028865896, 0.0410...","[-0.14033447, 0.00794487, 0.021146098, 0.00230...",0.356770
597,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",in,"[-0.2097755, -0.0062216036, 0.035323795, -0.00...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.085570
598,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",two,"[-0.20569867, 0.0018786144, 0.028507987, 0.026...","[-0.19078965, -0.013918617, -0.0024308818, 0.0...",0.189569
599,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",1783,"[-0.18825383, -0.010496782, 0.055308685, 0.007...","[-0.13434438, -0.012159896, 0.04242972, 0.0124...",0.302956


In [7]:
# Get the counts for each unique value in the 'test_answer' column
test_answer_summary = response_formatted_df['test_answer'].value_counts().reset_index()
# Rename the columns for clarity
test_answer_summary.columns = ['test_answer', 'count']
# Sort the results by count in descending order
test_answer_summary.sort_values(by='count', ascending=False, inplace=True)
test_answer_summary

Unnamed: 0,test_answer,count
0,NO ANSWER GIVEN,193
1,Exploitation number of updates >= 4000,125
2,Manhattan,10
3,the,9
4,1700,8
...,...,...
127,George-Washington-Bridge,1
128,populate,1
129,United-States,1
130,5937,1


In [8]:
#write the results dataframe to a tsv file
test_answer_summary.to_csv(tsv_results_filepath, sep="\t", index=False)
#count the number of results where the test answer is > 20 words
num_long_answers = response_formatted_df["test_answer"].apply(
    lambda x: len(x.split()) > 20 if pd.notnull(x) else False).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")
response_formatted_df["test_answer_correct"] = response_formatted_df["test_answer"] == response_formatted_df[
    "response_answer_formatted"]
number_correct = response_formatted_df["test_answer_correct"].sum()
print(f"number correct = {number_correct} out of {len(response_formatted_df)}")
percentage_correct = response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

response_formatted_df["test_answer_any_matching_word"] = response_formatted_df.apply(any_word_match, axis=1)
percentage_any_word_matches = response_formatted_df["test_answer_any_matching_word"].mean() * 100
number_any_word_matches = response_formatted_df["test_answer_any_matching_word"].sum()
print(f"number any word matches = {number_any_word_matches} out of {len(response_formatted_df)}")
print(f"percentage any word matches = {percentage_any_word_matches} %")

cosine_distance_threshold = 0.1
#create a dataframe with the rows where the cosine distance is less than the threshold
close_cosine_distance_df = response_formatted_df[
    response_formatted_df["test_answer_cosine_distance"] < cosine_distance_threshold]
print(f"number of rows with cosine distance less than {cosine_distance_threshold}: {len(close_cosine_distance_df)}")
print("percentage of total: " + str(len(close_cosine_distance_df) / len(response_formatted_df) * 100) + " %")

#create a dataframe with the rows where the cosine distance is less than the threshold and any matching answer is correct
close_cosine_distance_correct_df = close_cosine_distance_df[
    close_cosine_distance_df["test_answer_any_matching_word"]]
print(
    f"number of rows with cosine distance less than {cosine_distance_threshold} and any matching answer correct: {len(close_cosine_distance_correct_df)}")
print("percentage of total: " + str(len(close_cosine_distance_correct_df) / len
(response_formatted_df) * 100) + " %")

close_cosine_distance_correct_df

number of test answers longer than 20 words: 0
number correct = 19 out of 601
percentage correct = 3.1613976705490847 %
number any word matches = 24 out of 601
percentage any word matches = 3.9933444259567388 %
number of rows with cosine distance less than 0.1: 68
percentage of total: 11.314475873544092 %
number of rows with cosine distance less than 0.1 and any matching answer correct: 21
percentage of total: 3.494176372712146 %


Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
53,66,56cfb206234ae51400d9be8f,New Netherland established a permanent Europea...,New Netherland establish –ed a permanent Europ...,1624,New Netherland establish –ed a permanent Europ...,1624,? New-Netherland establish -ed a permanent Eur...,1624,New-Netherland establish -ed a permanent Europ...,False,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cfb206234ae51400d9be8f, New-Netherlan...",1624,"[-0.18507972, 0.007633989, 0.024701936, -0.009...","[-0.18507972, 0.007633989, 0.024701936, -0.009...",0.0,True,True
56,70,56cedc10aab44d1400b88b1a,What was the regnal name of the Duke of York?,What was the regnal name of the Duke of York?,James II,The regnal name of the Duke of York was James II.,James II,? what was the regnal name of the Duke of York,James-II,the regnal name of the Duke of York was James-II,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cedc10aab44d1400b88b1a, the regnal na...",James-II,"[-0.20683673, -0.024920216, 0.020619316, 0.011...","[-0.20683673, -0.024920216, 0.020619316, 0.011...",0.0,True,True
65,82,56cedd1caab44d1400b88b42,How many Lenape lived in the area in 1700?,How many Lenape live –d in the area in 1700?,200,200 Lenape live –d in the area in 1700.,200,? how many Lenape live -d in the area in 1700,200,200 Lenape live -d in the area in 1700,False,Quantitative Wh-Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cedd1caab44d1400b88b42, 200 Lenape li...",200,"[-0.17850119, 0.015458436, 0.0018362452, 0.010...","[-0.17850119, 0.015458436, 0.0018362452, 0.010...",0.0,True,True
73,92,56cede40aab44d1400b88b71,In what year was John Peter Zenger tried?,In what year was John Peter Zenger try –ed?,1735,John Peter Zenger was try –ed in 1735.,1735,? in what year was John-Peter-Zenger try -ed,1735,John-Peter-Zenger was try -ed in 1735,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b71, John-Peter-Ze...",1735,"[-0.18994845, -0.008756593, 0.054180417, 0.010...","[-0.18994845, -0.008756593, 0.054180417, 0.010...",0.0,True,True
74,93,56cede40aab44d1400b88b72,On what island did Zenger's trial occur?,On what island did Zenger's trial occur?,Manhattan,Zenger's trial occur –ed on Manhattan.,Manhattan,? on what island did Zengers trial occur,Manhattan,Zengers trial occur -ed on Manhattan,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cede40aab44d1400b88b72, Zengers trial...",Manhattan,"[-0.20694321, -0.0033040794, 0.038179982, -0.0...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.0,True,True
75,94,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia-University charter...,1754,Columbia-University was charter -ed in 1754,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b73, Columbia-Univ...",1754,"[-0.1863562, -0.0045614582, 0.049112216, 0.002...","[-0.1863562, -0.0045614582, 0.049112216, 0.002...",0.0,True,True
77,96,56cede40aab44d1400b88b75,What was the original name of Columbia Univers...,What was the origin –al name of Columbia Unive...,King's College,The origin –al name of Columbia University was...,King's College,? what was the origin -al name of Columbia-Uni...,Kings-College,the origin -al name of Columbia-University was...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cede40aab44d1400b88b75, the origin -a...",was Kings-College,"[-0.18255968, -0.017762745, 0.014112608, -0.00...","[-0.19066477, -0.008647565, 0.016890023, 0.004...",0.04005,False,True
88,109,56cee30faab44d1400b88bf2,Who was the British representative at the Conf...,Who was the British representative at the Conf...,Lord Howe,The British representative at the Conference H...,Lord Howe,? who was the British representative at the Co...,Lord-Howe,the British representative at the Conference-H...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cee30faab44d1400b88bf2, the British r...",Lord-Howe,"[-0.16061674, -0.011461509, 0.016838027, 0.016...","[-0.16061674, -0.011461509, 0.016838027, 0.016...",0.0,True,True
89,111,56cee30faab44d1400b88bf4,In what modern-day borough did the Great Fire ...,In what modern-day borough did the Great Fire ...,Manhattan,The Great Fire happen –ed in the modern-day bo...,Manhattan,? in what modern-day borough did the Great-Fir...,Manhattan,the Great-Fire happen -ed in the modern-day bo...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee30faab44d1400b88bf4, the Great-Fir...",Manhattan,"[-0.20694321, -0.0033040794, 0.038179982, -0.0...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.0,True,True
97,119,56cee398aab44d1400b88bff,What was the second largest city in the United...,What was the second large –st city in the Unit...,Philadelphia,The second large –st city in the United States...,Philadelphia,? what was the second large -st city in the Un...,Philadelphia,the second large -st city in the United-States...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cee398aab44d1400b88bff, the second la...",Philadelphia,"[-0.22285119, -0.0072190664, 0.023087623, -0.0...","[-0.22285119, -0.0072190664, 0.023087623, -0.0...",0.0,True,True


In [9]:
correct_matches = response_formatted_df[response_formatted_df["test_answer_correct"]]
correct_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
53,66,56cfb206234ae51400d9be8f,New Netherland established a permanent Europea...,New Netherland establish –ed a permanent Europ...,1624,New Netherland establish –ed a permanent Europ...,1624,? New-Netherland establish -ed a permanent Eur...,1624,New-Netherland establish -ed a permanent Europ...,False,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cfb206234ae51400d9be8f, New-Netherlan...",1624,"[-0.18507972, 0.007633989, 0.024701936, -0.009...","[-0.18507972, 0.007633989, 0.024701936, -0.009...",0.0,True,True
56,70,56cedc10aab44d1400b88b1a,What was the regnal name of the Duke of York?,What was the regnal name of the Duke of York?,James II,The regnal name of the Duke of York was James II.,James II,? what was the regnal name of the Duke of York,James-II,the regnal name of the Duke of York was James-II,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cedc10aab44d1400b88b1a, the regnal na...",James-II,"[-0.20683673, -0.024920216, 0.020619316, 0.011...","[-0.20683673, -0.024920216, 0.020619316, 0.011...",0.0,True,True
65,82,56cedd1caab44d1400b88b42,How many Lenape lived in the area in 1700?,How many Lenape live –d in the area in 1700?,200,200 Lenape live –d in the area in 1700.,200,? how many Lenape live -d in the area in 1700,200,200 Lenape live -d in the area in 1700,False,Quantitative Wh-Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cedd1caab44d1400b88b42, 200 Lenape li...",200,"[-0.17850119, 0.015458436, 0.0018362452, 0.010...","[-0.17850119, 0.015458436, 0.0018362452, 0.010...",0.0,True,True
73,92,56cede40aab44d1400b88b71,In what year was John Peter Zenger tried?,In what year was John Peter Zenger try –ed?,1735,John Peter Zenger was try –ed in 1735.,1735,? in what year was John-Peter-Zenger try -ed,1735,John-Peter-Zenger was try -ed in 1735,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b71, John-Peter-Ze...",1735,"[-0.18994845, -0.008756593, 0.054180417, 0.010...","[-0.18994845, -0.008756593, 0.054180417, 0.010...",0.0,True,True
74,93,56cede40aab44d1400b88b72,On what island did Zenger's trial occur?,On what island did Zenger's trial occur?,Manhattan,Zenger's trial occur –ed on Manhattan.,Manhattan,? on what island did Zengers trial occur,Manhattan,Zengers trial occur -ed on Manhattan,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cede40aab44d1400b88b72, Zengers trial...",Manhattan,"[-0.20694321, -0.0033040794, 0.038179982, -0.0...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.0,True,True
75,94,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia-University charter...,1754,Columbia-University was charter -ed in 1754,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b73, Columbia-Univ...",1754,"[-0.1863562, -0.0045614582, 0.049112216, 0.002...","[-0.1863562, -0.0045614582, 0.049112216, 0.002...",0.0,True,True
88,109,56cee30faab44d1400b88bf2,Who was the British representative at the Conf...,Who was the British representative at the Conf...,Lord Howe,The British representative at the Conference H...,Lord Howe,? who was the British representative at the Co...,Lord-Howe,the British representative at the Conference-H...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cee30faab44d1400b88bf2, the British r...",Lord-Howe,"[-0.16061674, -0.011461509, 0.016838027, 0.016...","[-0.16061674, -0.011461509, 0.016838027, 0.016...",0.0,True,True
89,111,56cee30faab44d1400b88bf4,In what modern-day borough did the Great Fire ...,In what modern-day borough did the Great Fire ...,Manhattan,The Great Fire happen –ed in the modern-day bo...,Manhattan,? in what modern-day borough did the Great-Fir...,Manhattan,the Great-Fire happen -ed in the modern-day bo...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee30faab44d1400b88bf4, the Great-Fir...",Manhattan,"[-0.20694321, -0.0033040794, 0.038179982, -0.0...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.0,True,True
97,119,56cee398aab44d1400b88bff,What was the second largest city in the United...,What was the second large –st city in the Unit...,Philadelphia,The second large –st city in the United States...,Philadelphia,? what was the second large -st city in the Un...,Philadelphia,the second large -st city in the United-States...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cee398aab44d1400b88bff, the second la...",Philadelphia,"[-0.22285119, -0.0072190664, 0.023087623, -0.0...","[-0.22285119, -0.0072190664, 0.023087623, -0.0...",0.0,True,True
142,177,56cfdceb234ae51400d9bf95,Which decade did massive job losses happen in ...,Which decade did massive job loss –es happen i...,1970s,Massive job loss –es happen –ed in NYC during ...,1970s,? which decade did massive job loss -es happen...,1970s,Massive job loss -es happen -ed in NYC during ...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cfdceb234ae51400d9bf95, Massive job l...",1970s,"[-0.18953218, 0.009121594, 0.041348565, 0.0259...","[-0.18953218, 0.009121594, 0.041348565, 0.0259...",0.0,True,True


In [10]:
any_matches = response_formatted_df[response_formatted_df["test_answer_any_matching_word"]]
any_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
53,66,56cfb206234ae51400d9be8f,New Netherland established a permanent Europea...,New Netherland establish –ed a permanent Europ...,1624,New Netherland establish –ed a permanent Europ...,1624,? New-Netherland establish -ed a permanent Eur...,1624,New-Netherland establish -ed a permanent Europ...,False,Wh-Adverbial Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56cfb206234ae51400d9be8f, New-Netherlan...",1624,"[-0.18507972, 0.007633989, 0.024701936, -0.009...","[-0.18507972, 0.007633989, 0.024701936, -0.009...",0.0,True,True
56,70,56cedc10aab44d1400b88b1a,What was the regnal name of the Duke of York?,What was the regnal name of the Duke of York?,James II,The regnal name of the Duke of York was James II.,James II,? what was the regnal name of the Duke of York,James-II,the regnal name of the Duke of York was James-II,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cedc10aab44d1400b88b1a, the regnal na...",James-II,"[-0.20683673, -0.024920216, 0.020619316, 0.011...","[-0.20683673, -0.024920216, 0.020619316, 0.011...",0.0,True,True
65,82,56cedd1caab44d1400b88b42,How many Lenape lived in the area in 1700?,How many Lenape live –d in the area in 1700?,200,200 Lenape live –d in the area in 1700.,200,? how many Lenape live -d in the area in 1700,200,200 Lenape live -d in the area in 1700,False,Quantitative Wh-Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cedd1caab44d1400b88b42, 200 Lenape li...",200,"[-0.17850119, 0.015458436, 0.0018362452, 0.010...","[-0.17850119, 0.015458436, 0.0018362452, 0.010...",0.0,True,True
73,92,56cede40aab44d1400b88b71,In what year was John Peter Zenger tried?,In what year was John Peter Zenger try –ed?,1735,John Peter Zenger was try –ed in 1735.,1735,? in what year was John-Peter-Zenger try -ed,1735,John-Peter-Zenger was try -ed in 1735,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b71, John-Peter-Ze...",1735,"[-0.18994845, -0.008756593, 0.054180417, 0.010...","[-0.18994845, -0.008756593, 0.054180417, 0.010...",0.0,True,True
74,93,56cede40aab44d1400b88b72,On what island did Zenger's trial occur?,On what island did Zenger's trial occur?,Manhattan,Zenger's trial occur –ed on Manhattan.,Manhattan,? on what island did Zengers trial occur,Manhattan,Zengers trial occur -ed on Manhattan,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cede40aab44d1400b88b72, Zengers trial...",Manhattan,"[-0.20694321, -0.0033040794, 0.038179982, -0.0...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.0,True,True
75,94,56cede40aab44d1400b88b73,In what year was Columbia University chartered?,In what year was Columbia University charter –ed?,1754,Columbia University was charter –ed in 1754.,1754,? in what year was Columbia-University charter...,1754,Columbia-University was charter -ed in 1754,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56cede40aab44d1400b88b73, Columbia-Univ...",1754,"[-0.1863562, -0.0045614582, 0.049112216, 0.002...","[-0.1863562, -0.0045614582, 0.049112216, 0.002...",0.0,True,True
77,96,56cede40aab44d1400b88b75,What was the original name of Columbia Univers...,What was the origin –al name of Columbia Unive...,King's College,The origin –al name of Columbia University was...,King's College,? what was the origin -al name of Columbia-Uni...,Kings-College,the origin -al name of Columbia-University was...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cede40aab44d1400b88b75, the origin -a...",was Kings-College,"[-0.18255968, -0.017762745, 0.014112608, -0.00...","[-0.19066477, -0.008647565, 0.016890023, 0.004...",0.04005,False,True
88,109,56cee30faab44d1400b88bf2,Who was the British representative at the Conf...,Who was the British representative at the Conf...,Lord Howe,The British representative at the Conference H...,Lord Howe,? who was the British representative at the Co...,Lord-Howe,the British representative at the Conference-H...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56cee30faab44d1400b88bf2, the British r...",Lord-Howe,"[-0.16061674, -0.011461509, 0.016838027, 0.016...","[-0.16061674, -0.011461509, 0.016838027, 0.016...",0.0,True,True
89,111,56cee30faab44d1400b88bf4,In what modern-day borough did the Great Fire ...,In what modern-day borough did the Great Fire ...,Manhattan,The Great Fire happen –ed in the modern-day bo...,Manhattan,? in what modern-day borough did the Great-Fir...,Manhattan,the Great-Fire happen -ed in the modern-day bo...,False,Wh-Adverbial Question,Subject-Verb-Adverbial (SVA),"[# ID: 56cee30faab44d1400b88bf4, the Great-Fir...",Manhattan,"[-0.20694321, -0.0033040794, 0.038179982, -0.0...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.0,True,True
97,119,56cee398aab44d1400b88bff,What was the second largest city in the United...,What was the second large –st city in the Unit...,Philadelphia,The second large –st city in the United States...,Philadelphia,? what was the second large -st city in the Un...,Philadelphia,the second large -st city in the United-States...,False,Wh-Object/Complement Question,Subject-Verb-Complement (SVC),"[# ID: 56cee398aab44d1400b88bff, the second la...",Philadelphia,"[-0.22285119, -0.0072190664, 0.023087623, -0.0...","[-0.22285119, -0.0072190664, 0.023087623, -0.0...",0.0,True,True


In [11]:
incorrect_matches = response_formatted_df[~response_formatted_df["test_answer_correct"]]
incorrect_matches

Unnamed: 0,index,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,question_category,sentence_category,created_commands,test_answer,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance,test_answer_correct,test_answer_any_matching_word
0,0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United-States has the high ...,New-York,the city in the United-States with the high -e...,False,Wh-Subject Question,Subject-Verb-Complement (SVC),"[# ID: 56ce304daab44d1400b8850e, the city in t...",1700s,"[-0.18147065, -0.0011770469, 0.046052348, 0.00...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.208907,False,False
1,1,56ce304daab44d1400b8850f,In what city is the United Nations based?,In what city is the United Nations base –d?,New York,The United Nations is base –d in New York.,New York,? in what city is the United-Nations base -d,New-York,the United-Nations is base -d in New-York,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56ce304daab44d1400b8850f, the United-Na...",Exploitation number of updates >= 4000,"[-0.15523376, 0.028737156, 0.009077885, 0.0581...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.306210,False,False
2,2,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New-York,New-York has been call -ed the culture -al cap...,False,Wh-Subject Question,Passive Construction,"[# ID: 56ce304daab44d1400b88510, New-York has ...",One-World-Trade-Center,"[-0.16677752, 0.012732404, 0.05187011, 0.00256...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.207320,False,False
3,3,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New-York,New-York is the American city that welcome -s ...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56ce304daab44d1400b88511, New-York is t...",Exploitation number of updates >= 4000,"[-0.15523376, 0.028737156, 0.009077885, 0.0581...","[-0.19721355, 0.00012896695, 0.026783189, 0.00...",0.306210,False,False
4,6,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New-York-City,five,New-York-City is comprise -d of five borough -s,False,Quantitative Wh-Question,Passive Construction,"[# ID: 56ce3124aab44d1400b8852a, New-York-City...",NO ANSWER GIVEN,"[-0.17671387, 0.017996417, 0.028865896, 0.0410...","[-0.21587223, 0.0065979995, 0.021248076, -0.00...",0.146315,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596,723,56d1204617492d1400aab9fc,What federal district court has jurisdiction o...,What federal district court has jurisdiction o...,the District Court for the Eastern District of...,The District Court for the Eastern District of...,the District Court for the Eastern District of...,? what federal district court has jurisdiction...,the District-Court for the Eastern-District of...,the District-Court for the Eastern-District of...,False,Wh-Subject Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1204617492d1400aab9fc, the District-...",NO ANSWER GIVEN,"[-0.17671387, 0.017996417, 0.028865896, 0.0410...","[-0.14033447, 0.00794487, 0.021146098, 0.00230...",0.356770,False,False
597,724,56d1204617492d1400aab9fd,In what borough is the New York City Hall found?,In what borough is the New York City Hall found?,Manhattan,The New York City Hall is found in the borough...,Manhattan,? in what borough is the New-York-City-Hall found,Manhattan,the New-York-City-Hall is found in the borough...,False,Wh-Adverbial Question,Passive Construction,"[# ID: 56d1204617492d1400aab9fd, the New-York-...",in,"[-0.2097755, -0.0062216036, 0.035323795, -0.00...","[-0.20694321, -0.0033040794, 0.038179982, -0.0...",0.085570,False,False
598,725,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New-York-City...,83,New-York-City receive -s 83 cent -s for every ...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba1f, New-York-City...",two,"[-0.20569867, 0.0018786144, 0.028507987, 0.026...","[-0.19078965, -0.013918617, -0.0024308818, 0.0...",0.189569,False,False
599,727,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New-York-...,11 point 4 billion,New-York-City give -s 11 point 4 billion more...,False,Quantitative Wh-Question,Subject-Verb-Object (SVO/SVOA),"[# ID: 56d1218c17492d1400aaba21, New-York-City...",1783,"[-0.18825383, -0.010496782, 0.055308685, 0.007...","[-0.13434438, -0.012159896, 0.04242972, 0.0124...",0.302956,False,False


In [27]:
#write the results to a file and export the results dataframe to a tsv file
import datetime
tsv_results_filename = "test_nyc_results" + datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

with open(results_summary_filepath, 'w') as results_file:
    #write the number of samples tested
    results_file.write(f"total number of samples\t{total_number_of_test_samples}\n")
    results_file.write(f"number_of_test_answers\t{len(response_formatted_df)}\n")
    results_file.write(f"total_number_of_pretraining_samples\t{total_number_of_pretraining_samples}\n")
    results_file.write(f"percentage_correct\t{percentage_correct}\n")
    results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
    results_file.write(f"percentage_close_cosine_distance\t{len(close_cosine_distance_df) / len(response_formatted_df) * 100}\n")
    results_file.write(f"percentage_close_cosine_distance_and_any_word_match\t{len(close_cosine_distance_correct_df) / len(response_formatted_df) * 100}\n")
    results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")
    # write the rows that had exact word matches to the file
    results_file.write("\nRows with exact matches:\n")
    results_file.write(
        correct_matches[["response_question", "response_answer", "test_answer"]].to_markdown(index=False))
    #write the rows in any_matches to the file
    results_file.write("\nRows with any word matches:\n")
    results_file.write(any_matches[["response_question", "response_answer", "test_answer"]].to_markdown(index=False))
    # write the rows that had a close cosine distance to the file
    results_file.write(f"\nRows with cosine distance less than {cosine_distance_threshold}:\n")
    results_file.write(
        close_cosine_distance_df[
            ["response_question", "response_answer", "test_answer", "test_answer_cosine_distance"]].to_markdown(
            index=False))
    # write the rows that had a close cosine distance and any word match to the file
    results_file.write(f"\nRows with cosine distance less than {cosine_distance_threshold} and any word match:\n")
    results_file.write(close_cosine_distance_correct_df[
                           ["response_question", "response_answer", "test_answer",
                            "test_answer_cosine_distance"]].to_markdown(index=False))
    #write the rows that had any matches and with a close cosine distance to the file
    results_file.write(f"\nRows with cosine distance less than {cosine_distance_threshold} and exact match:\n")
    results_file.write(correct_matches[
                           ["response_question", "response_answer", "test_answer",
                            "test_answer_cosine_distance"]].to_markdown(index=False))
print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")

results written to G:\My Drive\Shared with Julia\Education\Kent University\PhD\work\annabell\experiments\12\results\test_nyc_results_20251021_110149.tsv and G:\My Drive\Shared with Julia\Education\Kent University\PhD\work\annabell\experiments\12\results\test_nyc_results_summary_20251021_110150.txt
