# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [8]:
from dataset_processing import question_and_answer_pairs_from_log_file, any_word_match, embedding_for_sentence, \
    cosine_distance
import os
import platform
import datetime
import pandas as pd
from tqdm import tqdm

In [4]:
# --- Experiment configuration -------------------------------------------------
experiment_number = 8

# Locate the root of the ANNABELL work directory. An optional ANNABELL_BASE_DIR
# environment variable takes precedence so the notebook can run on other machines;
# when it is unset, the original per-OS defaults below are used unchanged.
operating_system = platform.system()
base_directory = os.environ.get("ANNABELL_BASE_DIR")
if not base_directory:
    if operating_system == 'Windows':
        raise Exception("not used on Windows yet")
    elif operating_system == 'Linux':
        base_directory = "/home/chris/gdrive/work/annabell"
    elif operating_system == 'Darwin':  # macOS
        base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
    else:
        raise Exception("unsupported OS")

# Suffix for output filenames created from this configuration.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Directory layout: <base>/experiments/<n>/{testing,logs,results} and <base>/experiments/data.
experiment_dir = os.path.join(base_directory, "experiments", str(experiment_number))
test_input_dir = os.path.join(experiment_dir, "testing")
test_log_dir = os.path.join(experiment_dir, "logs")
data_dir = os.path.join(base_directory, "experiments", "data")
test_results_dir = os.path.join(experiment_dir, "results")

test_log_filename = "test_nyc_samples_log_2025-09-24_05-28-54.txt"
test_input_filename = "nyc_declarative_sentences_testing_20250924_062527.txt"
dataset_filename = "response_formatted_20250924_174653.jsonl"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
dataset_filepath = os.path.join(data_dir, dataset_filename)

test_answer_summary_filename = "test_nyc_answer_summary" + timestamp + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)

# --- Load the model's log, the test input file and the reference dataset -------
question_and_answer_pairs = question_and_answer_pairs_from_log_file(test_log_filepath)
print("length of log file questions and answers: " + str(len(question_and_answer_pairs)))
with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()
print("length of test file questions: " + str(len(test_input_lines)))

# Keep only the rows that are not flagged as pre-training samples.
response_formatted_df = (
    pd.read_json(dataset_filepath, lines=True)
    .loc[lambda df: df["is_pretraining"].eq(False)]
    .reset_index(drop=True)
)

# --- Attach the model's answers to the reference questions ---------------------
# Collect the answers in a dict keyed by the formatted question and assign them in
# one vectorised .map() call. This avoids a full-column scan per log entry and
# guarantees that the "test_answer" column exists even when nothing matches.
# If a question occurs more than once in the log, the last answer wins.
known_questions = set(response_formatted_df["response_question_formatted"])
answers_by_question = {}
questions_not_found = []
for question, answer in question_and_answer_pairs:
    if question not in known_questions:
        questions_not_found.append(question)
        continue
    # An empty answer string is recorded with an explicit placeholder.
    answers_by_question[question] = "NO ANSWER GIVEN" if answer == "" else answer

# Questions with no logged answer are left as NaN.
response_formatted_df["test_answer"] = response_formatted_df["response_question_formatted"].map(
    answers_by_question)

print(f"number of questions not found in training data: {len(questions_not_found)}")
print("questions not found in training data: " + str(questions_not_found[:5]) + " ...")

length of log file questions and answers: 830
length of test file questions: 1568
number of questions not found in training data: 46
questions not found in training data: ['? when', '? the', '? on', '? New Netherland', '?'] ...


In [12]:
# Embed the model's answer and the reference answer for every row, then score
# each row with the cosine_distance helper.


def _embed_if_present(sentence):
    """Return embedding_for_sentence(sentence), or None when the cell is null."""
    if pd.isnull(sentence):
        return None
    return embedding_for_sentence(sentence)


# (source column, destination column, progress-bar label)
_embedding_jobs = (
    ("test_answer", "test_answer_embedding", "Generating test answer embeddings"),
    ("response_answer_formatted", "response_answer_formatted_embedding", "Generating response answer embeddings"),
)
for _source_column, _embedding_column, _progress_label in _embedding_jobs:
    # tqdm.pandas() must be re-registered to change the bar description.
    tqdm.pandas(desc=_progress_label)
    response_formatted_df[_embedding_column] = response_formatted_df[_source_column].progress_apply(
        _embed_if_present)

# cosine_distance is applied row-wise (axis=1) and receives the whole row.
response_formatted_df["test_answer_cosine_distance"] = response_formatted_df.apply(cosine_distance, axis=1)
response_formatted_df

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,test_answer,test_answer_correct,test_answer_any_matching_word,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
0,56ce304daab44d1400b8850e,What city in the United States has the highest...,What city in the United States has the high –e...,New York,The city in the United States with the high –e...,New York,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,False,Hill,False,False,"[-0.2109014, -0.010532444, 0.017017616, 0.0056...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.084467
1,56ce304daab44d1400b88510,What city has been called the cultural capital...,What city has been call –ed the culture –al ca...,New York,New York has been call –ed the culture –al cap...,New York,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,False,Exploitation number of updates >= 4000,False,False,"[-0.15502132, 0.028481387, 0.009052947, 0.0581...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.309620
2,56ce304daab44d1400b88511,What American city welcomes the largest number...,What American city welcome –s the large –st nu...,New York,New York is the American city that welcome –s ...,New York,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,False,the Bronx,False,False,"[-0.1964538, -0.009560595, 0.018535579, -0.006...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.102214
3,56cf5d41aab44d1400b89130,The major gateway for immigration has been whi...,The major gateway for immigrate –ion has been ...,New York City,The major gateway for immigrate –ion has been ...,New York City,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,False,Exploitation number of updates >= 4000,False,False,"[-0.15502132, 0.028481387, 0.009052947, 0.0581...","[-0.1971769, 0.014245291, 0.030869555, 0.00048...",0.321828
4,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United Stat...,New York City,the most populate -d city in the United States...,False,New York become the,False,True,"[-0.20136768, 0.021992227, 0.025505755, 0.0067...","[-0.1971769, 0.014245291, 0.030869555, 0.00048...",0.045506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,56d1218c17492d1400aaba1f,How much money in cents does New York City rec...,How much money in cent –s does New York City r...,83,New York City receive –s 83 cent –s for every ...,83,? how much money in cent -s does New York City...,83,New York City receive -s 83 cent -s for every ...,False,Pelham Bay Park,False,False,"[-0.20642424, -0.023026522, 0.021445049, 0.027...","[-0.19078888, -0.014057412, -0.002494617, 0.01...",0.230405
780,56d1218c17492d1400aaba20,How much more money does the city give to the ...,How much more money does the city give to the ...,$ 11 billion,The city give –s $ 11 billion more money to th...,$11 billion,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,False,more than 200,False,False,"[-0.16952485, 0.0033991009, 0.0044423803, -0.0...","[-0.16274187, -0.02008049, 0.05266985, 0.00129...",0.300898
781,56d1218c17492d1400aaba21,"Each year, how much more money does New York C...","Each year, how much more money does New York C...",$ 11.4 billion,New York City give –s $ 11.4 billion more mone...,$11.4 billion,? each year how much more money does New York ...,11 point 4 billion,New York City give -s 11 point 4 billion more...,False,Pelham Bay Park,False,False,"[-0.20642424, -0.023026522, 0.021445049, 0.027...","[-0.13401222, -0.0122689605, 0.04238571, 0.012...",0.335063
782,56d121d817492d1400aaba2d,What is the new name of the Sister City Progra...,What is the new name of the Sister City Progra...,New York City Global Partners,The new name of the Sister City Program of the...,New York City Global Partners,? what is the new name of the Sister City Prog...,New York City Global Partners,the new name of the Sister City Program of the...,False,NO ANSWER GIVEN,False,False,"[-0.17647709, 0.017767908, 0.028887603, 0.0407...","[-0.18002905, 0.03261394, 0.030344076, 0.00359...",0.206474


In [5]:
# Frequency table of the distinct model answers: one row per unique 'test_answer'
# value with its occurrence 'count', ordered from most to least frequent.
test_answer_summary = (
    response_formatted_df['test_answer']
    .value_counts()
    .rename_axis('test_answer')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
test_answer_summary

Unnamed: 0,test_answer,count
0,NO ANSWER GIVEN,267
1,Exploitation number of updates >= 4000,89
2,.,45
4,NYCTV,12
3,more than 200,12
...,...,...
105,system,1
104,Great Fire of New York,1
103,the full,1
102,the populate,1


In [14]:
# Save the answer-frequency table (test_answer_summary) as a TSV file.
test_answer_summary.to_csv(tsv_results_filepath, sep="\t", index=False)


def _exceeds_20_words(answer):
    """Return True when a non-null answer has more than 20 whitespace-separated words."""
    if pd.isnull(answer):
        return False
    return len(answer.split()) > 20


# Count the answers that run past 20 words.
num_long_answers = response_formatted_df["test_answer"].apply(_exceeds_20_words).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")

# Exact-match metric: the model's answer equals the formatted reference answer.
response_formatted_df["test_answer_correct"] = response_formatted_df["test_answer"].eq(
    response_formatted_df["response_answer_formatted"])
number_correct = response_formatted_df["test_answer_correct"].sum()
print(f"number correct = {number_correct} out of {len(response_formatted_df)}")
percentage_correct = response_formatted_df["test_answer_correct"].mean() * 100
print(f"percentage correct = {percentage_correct} %")

# Any-word metric: delegated row-wise to the any_word_match helper.
response_formatted_df["test_answer_any_matching_word"] = response_formatted_df.apply(any_word_match, axis=1)
number_any_word_matches = response_formatted_df["test_answer_any_matching_word"].sum()
percentage_any_word_matches = response_formatted_df["test_answer_any_matching_word"].mean() * 100
print(f"number any word matches = {number_any_word_matches} out of {len(response_formatted_df)}")
print(f"percentage any word matches = {percentage_any_word_matches} %")

# Rows whose answer embedding lies within the distance threshold of the reference.
cosine_distance_threshold = 0.1
_within_threshold = response_formatted_df["test_answer_cosine_distance"].lt(cosine_distance_threshold)
close_cosine_distance_df = response_formatted_df.loc[_within_threshold]
print(f"number of rows with cosine distance less than {cosine_distance_threshold}: {len(close_cosine_distance_df)}")
print("percentage of total: " + str(len(close_cosine_distance_df) / len(response_formatted_df) * 100) + " %")

# Of those close rows, keep the ones that also share at least one word with the reference.
close_cosine_distance_correct_df = close_cosine_distance_df.loc[
    close_cosine_distance_df["test_answer_any_matching_word"]]
print(
    f"number of rows with cosine distance less than {cosine_distance_threshold} and any matching answer correct: {len(close_cosine_distance_correct_df)}")
print("percentage of total: " + str(len(close_cosine_distance_correct_df) / len(response_formatted_df) * 100) + " %")

close_cosine_distance_correct_df

number of test answers longer than 20 words: 0
number correct = 19 out of 784
percentage correct = 2.423469387755102 %
number any word matches = 59 out of 784
percentage any word matches = 7.525510204081633 %
number of rows with cosine distance less than 0.1: 138
percentage of total: 17.602040816326532 %
number of rows with cosine distance less than 0.1 and any matching answer correct: 41
percentage of total: 5.229591836734694 %


Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,test_answer,test_answer_correct,test_answer_any_matching_word,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
4,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United Stat...,New York City,the most populate -d city in the United States...,False,New York become the,False,True,"[-0.20136768, 0.021992227, 0.025505755, 0.0067...","[-0.1971769, 0.014245291, 0.030869555, 0.00048...",0.045506
5,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New York City,five,New York City is comprise -d of five borough -s,False,five,True,True,"[-0.21606758, 0.0066151097, 0.021371702, -0.00...","[-0.21606758, 0.0066151097, 0.021371702, -0.00...",0.0
10,56cf9d81234ae51400d9be1b,How man boroughs does New York City contain?,How many borough –s does New York City contain?,five,New York City contain –s five borough –s.,five,? how many borough -s does New York City contain,five,New York City contain -s five borough -s,False,five,True,True,"[-0.21606758, 0.0066151097, 0.021371702, -0.00...","[-0.21606758, 0.0066151097, 0.021371702, -0.00...",0.0
16,56ce31baaab44d1400b8853b,What nation founded New Amsterdam?,What nation found –ed New Amsterdam?,the Dutch Republic,The Dutch Republic found –ed New Amsterdam.,the Dutch Republic,? what nation found -ed New Amsterdam,the Dutch Republic,the Dutch Republic found -ed New Amsterdam,False,the Dutch,False,True,"[-0.19508469, 0.006579182, 0.028311893, 0.0073...","[-0.18797612, 0.01739416, 0.035622064, -0.0006...",0.02927
18,56ce31baaab44d1400b8853d,When did the English take over the area from t...,When did the English take over the area from t...,1664,The English took over the area from the Dutch ...,1664,? when did the English take over the area from...,1664,the English took over the area from the Dutch ...,False,in 1664,False,True,"[-0.14499317, -0.0015699557, 0.041724227, -0.0...","[-0.18192707, 0.0006892185, 0.036272198, 0.000...",0.09078
26,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,Manhattan,True,True,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0
40,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,Portuguese,True,True,"[-0.19723552, -0.017073331, 0.018096691, 0.000...","[-0.19723552, -0.017073331, 0.018096691, 0.000...",0.0
42,56ce34c7aab44d1400b88598,What was the name of Estêvão Gomes's ship?,What was the name of Estêvão Gomes's ship?,La Anunciada,The name of Estêvão Gomes's ship was La Anunci...,La Anunciada,? what was the name of Estevao Gomess ship,La Anunciada,the name of Estevao Gomess ship was La Anunciada,False,La,False,True,"[-0.21685533, -0.0018898161, 0.016474504, -0.0...","[-0.2023023, 0.0072115106, 0.01968599, -0.0017...",0.063791
54,56ce362aaab44d1400b885bd,What did the Dutch call Juan Rodriguez?,What did the Dutch call Juan Rodriguez?,Jan Rodrigues,The Dutch call –ed Juan Rodriguez Jan Rodrigues.,Jan Rodrigues,? what did the Dutch call Juan Rodriguez,Jan Rodrigues,the Dutch call -ed Juan Rodriguez Jan Rodrigues,False,Rodrigues,False,True,"[-0.20646654, -0.0369005, 0.014651128, -0.0019...","[-0.20713769, -0.014252867, 0.020896228, -0.00...",0.050122
87,56ceddd9aab44d1400b88b61,Near what square was the African Burial Ground...,Near what square was the African Burial Ground...,Foley Square,The African Burial Ground was unearth –ed near...,Foley Square,? Near what square was the African Burial Grou...,Foley Square,the African Burial Ground was unearth -ed near...,False,near Foley,False,True,"[-0.2192533, -0.0012577282, 0.022020834, 0.024...","[-0.2034413, -0.011547362, -0.0044129845, 0.01...",0.083724


In [15]:
# Rows where the model's answer is an exact match for the reference answer.
_is_exact_match = response_formatted_df["test_answer_correct"]
correct_matches = response_formatted_df.loc[_is_exact_match]
correct_matches

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,test_answer,test_answer_correct,test_answer_any_matching_word,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
5,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New York City,five,New York City is comprise -d of five borough -s,False,five,True,True,"[-0.21606758, 0.0066151097, 0.021371702, -0.00...","[-0.21606758, 0.0066151097, 0.021371702, -0.00...",0.0
10,56cf9d81234ae51400d9be1b,How man boroughs does New York City contain?,How many borough –s does New York City contain?,five,New York City contain –s five borough –s.,five,? how many borough -s does New York City contain,five,New York City contain -s five borough -s,False,five,True,True,"[-0.21606758, 0.0066151097, 0.021371702, -0.00...","[-0.21606758, 0.0066151097, 0.021371702, -0.00...",0.0
26,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,Manhattan,True,True,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0
40,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,Portuguese,True,True,"[-0.19723552, -0.017073331, 0.018096691, 0.000...","[-0.19723552, -0.017073331, 0.018096691, 0.000...",0.0
97,56cfb691234ae51400d9bec5,The Stamp Act Congress had a meeting in 1765 i...,The Stamp Act Congress had a meet –ing in 1765...,New York,The Stamp Act Congress had a meet –ing in 1765...,New York,? the Stamp Act Congress had a meet -ing in 17...,New York,the Stamp Act Congress had a meet -ing in 1765...,False,New York,True,True,"[-0.2009252, 0.0065351515, 0.024541078, -0.001...","[-0.2009252, 0.0065351515, 0.024541078, -0.001...",0.0
153,56cee70daab44d1400b88c51,In what year did the General Slocum disaster o...,In what year did the General Slocum disaster o...,1904,The General Slocum disaster occur –ed in 1904.,1904,? in what year did the General Slocum disaster...,1904,the General Slocum disaster occur -ed in 1904,False,1904,True,True,"[-0.18448333, -0.008389682, 0.037368163, 0.012...","[-0.18448333, -0.008389682, 0.037368163, 0.012...",0.0
187,56cfdde6234ae51400d9bfa6,The Hudson Terminal which was also demolished ...,The Hudson Terminal which was also demolish –e...,1909,"The Hudson Terminal, which was also demolish –...",1909,? the Hudson Terminal which was also demolish ...,1909,the Hudson Terminal which was also demolish -e...,False,1909,True,True,"[-0.17983598, 0.005019228, 0.036290534, 0.0007...","[-0.17983598, 0.005019228, 0.036290534, 0.0007...",0.0
195,56cef532aab44d1400b88d1a,In what year was the National Review founded?,In what year was the National Review found –ed?,1955,The National Review was found –ed in 1955.,1955,? in what year was the National Review found -ed,1955,the National Review was found -ed in 1955,False,1955,True,True,"[-0.19271274, 0.0036500944, 0.045841116, 0.023...","[-0.19271274, 0.0036500944, 0.045841116, 0.023...",0.0
205,56cef613aab44d1400b88d2b,The Hudson River serves as a dividing line bet...,The Hudson River serve –s as a divide –ing lin...,New Jersey,The Hudson River serve –s as a divide –ing lin...,New Jersey,? the Hudson River serve -s as a divide -ing l...,New Jersey,the Hudson River serve -s as a divide -ing lin...,False,New Jersey,True,True,"[-0.20539168, 0.0017549262, 0.0106013315, -0.0...","[-0.20539168, 0.0017549262, 0.0106013315, -0.0...",0.0
231,56cf1a05aab44d1400b88d7e,In what year was the Woolworth Building comple...,In what year was the Woolworth Building comple...,1913,The Woolworth Building was complete –d in 1913.,1913,? in what year was the Woolworth Building comp...,1913,the Woolworth Building was complete -d in 1913,False,1913,True,True,"[-0.18806027, 0.006733494, 0.030512037, 0.0229...","[-0.18806027, 0.006733494, 0.030512037, 0.0229...",0.0


In [16]:
# Rows flagged by the any-word metric (test_answer_any_matching_word is True).
_has_word_overlap = response_formatted_df["test_answer_any_matching_word"]
any_matches = response_formatted_df.loc[_has_word_overlap]
any_matches

Unnamed: 0,id,question,response_question,response_answer,response_declarative_sentence,answer,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,is_pretraining,test_answer,test_answer_correct,test_answer_any_matching_word,test_answer_embedding,response_answer_formatted_embedding,test_answer_cosine_distance
4,56cf5d41aab44d1400b89131,The most populated city in the United States i...,The most populate –d city in the United States...,New York City,The most populate –d city in the United States...,New York City,? the most populate -d city in the United Stat...,New York City,the most populate -d city in the United States...,False,New York become the,False,True,"[-0.20136768, 0.021992227, 0.025505755, 0.0067...","[-0.1971769, 0.014245291, 0.030869555, 0.00048...",0.045506
5,56ce3124aab44d1400b8852a,How many boroughs comprise New York City?,How many borough –s comprise New York City?,five,New York City is comprise –d of five borough –s.,five,? how many borough -s comprise New York City,five,New York City is comprise -d of five borough -s,False,five,True,True,"[-0.21606758, 0.0066151097, 0.021371702, -0.00...","[-0.21606758, 0.0066151097, 0.021371702, -0.00...",0.0
10,56cf9d81234ae51400d9be1b,How man boroughs does New York City contain?,How many borough –s does New York City contain?,five,New York City contain –s five borough –s.,five,? how many borough -s does New York City contain,five,New York City contain -s five borough -s,False,five,True,True,"[-0.21606758, 0.0066151097, 0.021371702, -0.00...","[-0.21606758, 0.0066151097, 0.021371702, -0.00...",0.0
13,56cf9d81234ae51400d9be1e,What is the population of New York City as of ...,What is the populate –ion of New York City as ...,8491079,The populate –ion of New York City as of 2014 ...,8491079,? what is the populate -ion of New York City a...,8491079,the populate -ion of New York City as of 2014 ...,False,2014 is 8491079,False,True,"[-0.11828961, -0.029860718, 0.006400737, 0.029...","[-0.16185391, 0.009814939, 0.027025407, 0.0109...",0.19295
16,56ce31baaab44d1400b8853b,What nation founded New Amsterdam?,What nation found –ed New Amsterdam?,the Dutch Republic,The Dutch Republic found –ed New Amsterdam.,the Dutch Republic,? what nation found -ed New Amsterdam,the Dutch Republic,the Dutch Republic found -ed New Amsterdam,False,the Dutch,False,True,"[-0.19508469, 0.006579182, 0.028311893, 0.0073...","[-0.18797612, 0.01739416, 0.035622064, -0.0006...",0.02927
18,56ce31baaab44d1400b8853d,When did the English take over the area from t...,When did the English take over the area from t...,1664,The English took over the area from the Dutch ...,1664,? when did the English take over the area from...,1664,the English took over the area from the Dutch ...,False,in 1664,False,True,"[-0.14499317, -0.0015699557, 0.041724227, -0.0...","[-0.18192707, 0.0006892185, 0.036272198, 0.000...",0.09078
26,56ce32e7aab44d1400b88552,In what borough is Wall Street located?,In what borough is Wall Street locate –d?,Manhattan,Wall Street is locate –d in the borough of Man...,Manhattan,? in what borough is Wall Street locate -d,Manhattan,Wall Street is locate -d in the borough of Man...,False,Manhattan,True,True,"[-0.20684884, -0.0033799286, 0.038709026, -0.0...","[-0.20684884, -0.0033799286, 0.038709026, -0.0...",0.0
40,56ce34c7aab44d1400b88596,What was the nationality of Estêvão Gomes?,What was the national –ity of Estêvão Gomes?,Portuguese,The national –ity of Estêvão Gomes was Portugu...,Portuguese,? what was the national -ity of Estevao Gomes,Portuguese,the national -ity of Estevao Gomes was Portuguese,False,Portuguese,True,True,"[-0.19723552, -0.017073331, 0.018096691, 0.000...","[-0.19723552, -0.017073331, 0.018096691, 0.000...",0.0
42,56ce34c7aab44d1400b88598,What was the name of Estêvão Gomes's ship?,What was the name of Estêvão Gomes's ship?,La Anunciada,The name of Estêvão Gomes's ship was La Anunci...,La Anunciada,? what was the name of Estevao Gomess ship,La Anunciada,the name of Estevao Gomess ship was La Anunciada,False,La,False,True,"[-0.21685533, -0.0018898161, 0.016474504, -0.0...","[-0.2023023, 0.0072115106, 0.01968599, -0.0017...",0.063791
54,56ce362aaab44d1400b885bd,What did the Dutch call Juan Rodriguez?,What did the Dutch call Juan Rodriguez?,Jan Rodrigues,The Dutch call –ed Juan Rodriguez Jan Rodrigues.,Jan Rodrigues,? what did the Dutch call Juan Rodriguez,Jan Rodrigues,the Dutch call -ed Juan Rodriguez Jan Rodrigues,False,Rodrigues,False,True,"[-0.20646654, -0.0369005, 0.014651128, -0.0019...","[-0.20713769, -0.014252867, 0.020896228, -0.00...",0.050122


In [20]:
# Export the full per-question results as a TSV and write a plain-text summary report.


def _write_markdown_section(report_file, heading, rows, columns):
    """Write `heading`, then the selected `columns` of `rows` as a markdown table."""
    report_file.write(heading)
    report_file.write(rows[columns].to_markdown(index=False))


# Take the timestamp once so both output files carry the same suffix; two separate
# now() calls can straddle a second boundary and produce mismatched names.
results_timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Use a dedicated variable for this path. `tsv_results_filepath` is the target of the
# answer-summary export in the metrics cell; rebinding it here meant that re-running
# that cell afterwards overwrote this results file with the summary table.
tsv_results_filename = "test_nyc_results" + results_timestamp + ".tsv"
results_tsv_filepath = os.path.join(test_results_dir, tsv_results_filename)
response_formatted_df.to_csv(results_tsv_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + results_timestamp + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

answer_columns = ["response_question", "response_answer", "test_answer"]
answer_and_distance_columns = answer_columns + ["test_answer_cosine_distance"]

# The header of the last section promises rows that are exact matches AND within
# the distance threshold, so apply the threshold instead of dumping every exact match.
close_exact_matches = correct_matches[
    correct_matches["test_answer_cosine_distance"] < cosine_distance_threshold]

# The question/answer columns contain non-ASCII text, so write the report as UTF-8
# explicitly rather than relying on the platform default encoding.
with open(results_summary_filepath, 'w', encoding="utf-8") as results_file:
    # Headline numbers, one tab-separated key/value pair per line.
    # Integer division keeps the sample count an integer (e.g. "784", not "784.0").
    # NOTE(review): assumes the test input file holds two lines per sample - confirm.
    results_file.write(f"total number of samples\t{len(test_input_lines) // 2}\n")
    results_file.write(f"number_of_test_answers\t{len(response_formatted_df)}\n")
    results_file.write(f"percentage_correct\t{percentage_correct}\n")
    results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
    # NOTE(review): the label says "(removed)", but no visible cell drops these rows
    # from response_formatted_df - confirm whether the label or the pipeline is wrong.
    results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")

    # Rows where the answer matched the reference exactly.
    _write_markdown_section(results_file, "\nRows with exact matches:\n", correct_matches, answer_columns)
    # Rows flagged by the any-word metric.
    _write_markdown_section(results_file, "\nRows with any word matches:\n", any_matches, answer_columns)
    # Rows within the cosine-distance threshold.
    _write_markdown_section(
        results_file,
        f"\nRows with cosine distance less than {cosine_distance_threshold}:\n",
        close_cosine_distance_df,
        answer_and_distance_columns)
    # Rows within the threshold that are also flagged by the any-word metric.
    _write_markdown_section(
        results_file,
        f"\nRows with cosine distance less than {cosine_distance_threshold} and any word match:\n",
        close_cosine_distance_correct_df,
        answer_and_distance_columns)
    # Rows within the threshold that are also exact matches.
    _write_markdown_section(
        results_file,
        f"\nRows with cosine distance less than {cosine_distance_threshold} and exact match:\n",
        close_exact_matches,
        answer_and_distance_columns)
print(f"results written to {results_tsv_filepath} and {results_summary_filepath}")

results written to /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/8/results/test_nyc_results_20250927_180159.tsv and /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/8/results/test_nyc_results_summary_20250927_180159.txt
