In [40]:
from dataset_processing import question_and_answer_pairs_from_log_file, remove_quotes_from_file, filter_by_max_words, clean_text
import pandas as pd
import nltk
from nltk.corpus import stopwords
import os
import platform
import datetime

# This notebook is used to test the ANNABELL model on the SQuAD dataset.


In [41]:
experiment_number = 6
operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")
elif operating_system == 'Linux':
    base_directory = "/home/chris/gdrive/work/annabell"
elif operating_system == 'Darwin': #macOS
    base_directory = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/"
else:
    raise Exception("unsupported OS")

test_input_dir = os.path.join(base_directory, "experiments", str(experiment_number), "testing")
test_log_dir = os.path.join(base_directory, "experiments", str(experiment_number), "logs")
train_dir = os.path.join(base_directory, "training")
test_results_dir = os.path.join(base_directory, "experiments", str(experiment_number), "results")

test_log_filename = "test_nyc_log_2025-09-14_16-14-53.txt"
test_input_filename = "test_nyc_questions_without_pretrain.txt"
train_filename = "declarative_sentences_train_gemma3:4b_20250617_201822.tsv"

test_log_filepath = os.path.join(test_log_dir, test_log_filename)
test_input_filepath = os.path.join(test_input_dir, test_input_filename)
train_filepath = os.path.join(train_dir, train_filename)

In [44]:
question_and_answer_pairs = question_and_answer_pairs_from_log_file(test_log_filepath)

#print("length of log file questions: " + str(len(questions)))
#print("length of log file answers: " + str(len(answers)))
print("length of log file questions and answers: " + str(len(question_and_answer_pairs)))

with open(test_input_filepath, 'r') as test_input_file:
    test_input_lines = test_input_file.readlines()
print("length of test file questions: " + str(len(test_input_lines)))

length of log file questions and answers: 343
length of log file questions and answers: 343
length of test file questions: 1550


In [45]:
#Load and clean the training data derived from the SQuAD dataset
train_filepath = remove_quotes_from_file(train_filepath)
train_df = pd.read_csv(train_filepath, sep="\t")
train_df = train_df.dropna()

filtered_train_df = train_df[train_df["title"] == "New_York_City"]
filtered_train_df = filter_by_max_words(filtered_train_df, max_words=20)
filtered_train_df["response_question"] = clean_text(filtered_train_df["response_question"], True)
filtered_train_df["response_answer"] = clean_text(filtered_train_df["response_answer"], False)
filtered_train_df["statement"] = clean_text(filtered_train_df["statement"], False)
filtered_train_df.reset_index(drop=True, inplace=True)
filtered_train_df

Cleaned data saved to /home/chris/gdrive/work/annabell/training/declarative_sentences_train_gemma3:4b_20250617_201822_cleaned.tsv


Unnamed: 0,id,title,question,answer,response_question,response_answer,statement
0,56ce304daab44d1400b8850e,New_York_City,What city in the United States has the highest...,New York,? What city in the United States has the highe...,new york,new york city has the highest population in th...
1,56ce304daab44d1400b8850f,New_York_City,In what city is the United Nations based?,New York,? in what city is the united nations based,new york,the united nations is based in New York
2,56ce304daab44d1400b88510,New_York_City,What city has been called the cultural capital...,New York,? what city has been called the cultural capit...,new york,new york has been called the cultural capital ...
3,56ce304daab44d1400b88511,New_York_City,What American city welcomes the largest number...,New York,? What American city welcomes the largest numb...,new york,new york welcomes the largest number of legal ...
4,56cf5d41aab44d1400b89130,New_York_City,The major gateway for immigration has been whi...,New York City,? the major gateway for immigration has been w...,new york city,the major gateway for immigration has been in ...
...,...,...,...,...,...,...,...
801,56d1218c17492d1400aaba1e,New_York_City,What ZIP code was responsible for the greatest...,10021,? what ZIP code was responsible for the greate...,10021,the ZIP code 10021 was responsible for the gre...
802,56d1218c17492d1400aaba1f,New_York_City,How much money in cents does New York City rec...,83,? how much money in cents does New York City r...,83,New York City receives 83 cents for every doll...
803,56d1218c17492d1400aaba20,New_York_City,How much more money does the city give to the ...,$11 billion,? how much more money does the city give to th...,11 billion,the city gives 11 billion to the state of new ...
804,56d1218c17492d1400aaba21,New_York_City,"Each year, how much more money does New York C...",$11.4 billion,? each year how much more money does New York ...,11 point 4 billion,new york city gives 11 point 4 billion more to...


In [46]:
#add the test questions to the dataframe
for question, answer in question_and_answer_pairs:
    if question in filtered_train_df["response_question"].values:
        filtered_train_df.loc[filtered_train_df["response_question"] == question, "test_answer"] = answer
    else:
        print(f"Question not found in training data: {question}")
filtered_train_df

Question not found in training data: ? what was the myth that Manhattan was bought for by General Peter Minuit
Question not found in training data: ? for
Question not found in training data: ? the
Question not found in training data: ? a
Question not found in training data: ? in
Question not found in training data: ? what was
Question not found in training data: ? how


Unnamed: 0,id,title,question,answer,response_question,response_answer,statement,test_answer
0,56ce304daab44d1400b8850e,New_York_City,What city in the United States has the highest...,New York,? What city in the United States has the highe...,new york,new york city has the highest population in th...,Exploitation number of updates >= 4000
1,56ce304daab44d1400b8850f,New_York_City,In what city is the United Nations based?,New York,? in what city is the united nations based,new york,the united nations is based in New York,
2,56ce304daab44d1400b88510,New_York_City,What city has been called the cultural capital...,New York,? what city has been called the cultural capit...,new york,new york has been called the cultural capital ...,Exploitation number of updates >= 4000
3,56ce304daab44d1400b88511,New_York_City,What American city welcomes the largest number...,New York,? What American city welcomes the largest numb...,new york,new york welcomes the largest number of legal ...,
4,56cf5d41aab44d1400b89130,New_York_City,The major gateway for immigration has been whi...,New York City,? the major gateway for immigration has been w...,new york city,the major gateway for immigration has been in ...,Exploitation number of updates >= 4000
...,...,...,...,...,...,...,...,...
801,56d1218c17492d1400aaba1e,New_York_City,What ZIP code was responsible for the greatest...,10021,? what ZIP code was responsible for the greate...,10021,the ZIP code 10021 was responsible for the gre...,
802,56d1218c17492d1400aaba1f,New_York_City,How much money in cents does New York City rec...,83,? how much money in cents does New York City r...,83,New York City receives 83 cents for every doll...,
803,56d1218c17492d1400aaba20,New_York_City,How much more money does the city give to the ...,$11 billion,? how much more money does the city give to th...,11 billion,the city gives 11 billion to the state of new ...,
804,56d1218c17492d1400aaba21,New_York_City,"Each year, how much more money does New York C...",$11.4 billion,? each year how much more money does New York ...,11 point 4 billion,new york city gives 11 point 4 billion more to...,


In [47]:
# Get the counts for each unique value in the 'test_answer' column
test_answer_summary = filtered_train_df['test_answer'].value_counts().reset_index()
# Rename the columns for clarity
test_answer_summary.columns = ['test_answer', 'count']
# Sort the results by count in descending order
test_answer_summary.sort_values(by='count', ascending=False, inplace=True)
test_answer_summary

Unnamed: 0,test_answer,count
0,,104
1,Exploitation number of updates >= 4000,55
2,...,21
3,... Exploitation number of updates >= 4000,9
4,Northeastern,6
...,...,...
119,as anthonio,1
120,52,1
121,she personal pronoun for Laura is,1
122,approximately president of,1


In [57]:
#write the results to a file and export the results dataframe to a tsv file
test_answer_summary_filename = "test_nyc_answer_summary" + datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, test_answer_summary_filename)
test_answer_summary.to_csv(tsv_results_filepath, sep="\t", index=False)

In [49]:
#count the number of results where the test answer is > 20 words
num_long_answers = filtered_train_df["test_answer"].apply(lambda x: len(x.split()) > 20 if pd.notnull(x) else False).sum()
print(f"number of test answers longer than 20 words: {num_long_answers}")

number of test answers longer than 20 words: 5


In [50]:
#truncate the test answers to 20 words
def truncate_to_max_words(text, max_words=20):
	words = text.split()
	if len(words) > max_words:
		return ' '.join(words[:max_words])
	else:
		return text
#filtered_train_df["test_answer"] = filtered_train_df["test_answer"].apply(lambda x: truncate_to_max_words(x, max_words=20) if pd.notnull(x) else x)
#filtered_train_df

In [51]:
#remove rows where the test answer is longer than 20 words
filtered_train_df.dropna(subset=["test_answer"], inplace=True)
filtered_train_df = filtered_train_df[filtered_train_df.apply(lambda row: len(row["test_answer"].split()) <= 20 , axis=1)]
filtered_train_df.reset_index(drop=True, inplace=True)
filtered_train_df

Unnamed: 0,id,title,question,answer,response_question,response_answer,statement,test_answer
0,56ce304daab44d1400b8850e,New_York_City,What city in the United States has the highest...,New York,? What city in the United States has the highe...,new york,new york city has the highest population in th...,Exploitation number of updates >= 4000
1,56ce304daab44d1400b88510,New_York_City,What city has been called the cultural capital...,New York,? what city has been called the cultural capit...,new york,new york has been called the cultural capital ...,Exploitation number of updates >= 4000
2,56ce304daab44d1400b88511,New_York_City,What American city welcomes the largest number...,New York,? What American city welcomes the largest numb...,new york,new york welcomes the largest number of legal ...,
3,56cf5d41aab44d1400b89130,New_York_City,The major gateway for immigration has been whi...,New York City,? the major gateway for immigration has been w...,new york city,the major gateway for immigration has been in ...,Exploitation number of updates >= 4000
4,56cf5d41aab44d1400b89131,New_York_City,The most populated city in the United States i...,New York City,? the most populated city in the united states...,new york city,the most populated city in the united states i...,the most populated city
...,...,...,...,...,...,...,...,...
328,56cf3862aab44d1400b88e8c,New_York_City,"In 2013, how many people of Puerto Rican ances...",1.3 million,? in what year were the five borough -s combin...,1 point 3 million,the number of people of Puerto Rican ancestry ...,
329,56cf39b4aab44d1400b88eb3,New_York_City,"In 2014, millionaires made up what percentage ...",4.6%,? in what year were the five borough -s combin...,4 point 6,millionaires made up 4 point 6 of New York Cit...,
330,56d1062317492d1400aab76e,New_York_City,"In the first half of 2010, what percentage of ...",95.9%,? in what year were the five borough -s combin...,1898,the five borough -s were combined into one cit...,
331,56d10ab417492d1400aab7e8,New_York_City,"In 2012-3, what number of people saw a show on...",11.57 million,? in what year were the five borough -s combin...,11 point 57 million,the number of people who saw a show on broadwa...,


In [52]:
filtered_train_df["test_answer_correct"] = filtered_train_df["test_answer"] == filtered_train_df["response_answer"]
percentage_correct =  filtered_train_df["test_answer_correct"].mean() * 100
percentage_correct

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df["test_answer_correct"] = filtered_train_df["test_answer"] == filtered_train_df["response_answer"]


np.float64(0.9009009009009009)

In [53]:
#filtered_train_df.dropna(subset=["test_answer"], inplace=True)

def any_word_match(row):
    # Split both strings into sets of words for efficient lookup
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    test_words = set(row["test_answer"].split())
    response_words = set(row["response_answer"].split())
    intersecting_words = test_words.intersection(response_words)
    open_class_intersecting_words = intersecting_words - stop_words

    return open_class_intersecting_words != set()

filtered_train_df["test_answer_any_matching_word"] = filtered_train_df.apply(any_word_match, axis=1)

percentage_any_word_matches =  filtered_train_df["test_answer_any_matching_word"].mean() * 100
print(f"percentage any word matches = {percentage_any_word_matches} %")

percentage any word matches = 3.303303303303303 %


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_train_df["test_answer_any_matching_word"] = filtered_train_df.apply(any_word_match, axis=1)


In [54]:
correct_matches = filtered_train_df[filtered_train_df["test_answer_correct"]]
correct_matches

Unnamed: 0,id,title,question,answer,response_question,response_answer,statement,test_answer,test_answer_correct,test_answer_any_matching_word
98,56cedf11aab44d1400b88b97,New_York_City,In what year did the American Revolutionary Wa...,1783,? in what year did the american revolutionary ...,1783,the american revolutionary war ended in 1783,1783,True,True
147,56cee70daab44d1400b88c51,New_York_City,In what year did the General Slocum disaster o...,1904,? in what year did the general slocum disaster...,1904,the general slocum disaster occurred in 1904,1904,True,True
199,56cef613aab44d1400b88d2b,New_York_City,The Hudson River serves as a dividing line bet...,New Jersey,? the Hudson River serves as a dividing line b...,New Jersey,the Hudson River serves as a dividing line bet...,New Jersey,True,True


In [55]:
any_matches = filtered_train_df[filtered_train_df["test_answer_any_matching_word"]]
any_matches

Unnamed: 0,id,title,question,answer,response_question,response_answer,statement,test_answer,test_answer_correct,test_answer_any_matching_word
4,56cf5d41aab44d1400b89131,New_York_City,The most populated city in the United States i...,New York City,? the most populated city in the united states...,new york city,the most populated city in the united states i...,the most populated city,False,True
16,56ce31baaab44d1400b8853b,New_York_City,What nation founded New Amsterdam?,the Dutch Republic,? What nation founded New Amsterdam,the Dutch Republic,the Dutch Republic founded New Amsterdam,the Dutch,False,True
50,56cfab96234ae51400d9be45,New_York_City,Which explorer sailed his ship into New York h...,Henry Hudson,? which explorer sailed his ship into New York...,henry hudson,henry hudson sailed his ship into New York har...,henry hudson sailed,False,True
70,56cedc8eaab44d1400b88b23,New_York_City,What was the regnal name of the Prince of Orange?,William III,? What was the regnal name of the Prince of Or...,william III,the regnal name of the Prince of Orange was wi...,william,False,True
98,56cedf11aab44d1400b88b97,New_York_City,In what year did the American Revolutionary Wa...,1783,? in what year did the american revolutionary ...,1783,the american revolutionary war ended in 1783,1783,True,True
111,56cee398aab44d1400b88bfe,New_York_City,On what street did the writing of the Bill of ...,Wall Street,? on what street did the writing of the bill o...,wall Street,the writing of the bill of rights occurred on ...,on wall Street,False,True
143,56cee6a3aab44d1400b88c3b,New_York_City,In what year was the modern City of New York f...,1898,? in what year was the modern City of New York...,1898,the modern City of New York was founded in 1898,in 1898,False,True
147,56cee70daab44d1400b88c51,New_York_City,In what year did the General Slocum disaster o...,1904,? in what year did the general slocum disaster...,1904,the general slocum disaster occurred in 1904,1904,True,True
169,56cfdc71234ae51400d9bf88,New_York_City,Where did the Stonewall riots happen?,Stonewall Inn in the Greenwich Village neighbo...,? where did the stonewall riots happen,stonewall inn in the greenwich village neighbo...,the stonewall riots happened at stonewall inn ...,the stonewall,False,True
199,56cef613aab44d1400b88d2b,New_York_City,The Hudson River serves as a dividing line bet...,New Jersey,? the Hudson River serves as a dividing line b...,New Jersey,the Hudson River serves as a dividing line bet...,New Jersey,True,True


In [59]:
#write the results to a file and export the results dataframe to a tsv file
tsv_results_filename = "test_nyc_results" + datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + ".tsv"
tsv_results_filepath = os.path.join(test_results_dir, tsv_results_filename)
filtered_train_df.to_csv(tsv_results_filepath, sep="\t", index=False)

results_summary_filename = "test_nyc_results_summary" + datetime.datetime.now().strftime("_%Y%m%d_%H%M%S") + ".txt"
results_summary_filepath = os.path.join(test_results_dir, results_summary_filename)

with open(results_summary_filepath, 'w') as results_file:
	#write the number of samples tested
	results_file.write(f"total number of samples\t{len(test_input_lines)/2}\n")
	results_file.write(f"number_of_test_answers\t{len(filtered_train_df)}\n")
	results_file.write(f"percentage_correct\t{percentage_correct}\n")
	results_file.write(f"percentage_any_word_matches\t{percentage_any_word_matches}\n")
	results_file.write(f"number of test answers longer than 20 words (removed)\t{num_long_answers}\n")
	# write the rows that had exact word matches to the file
	results_file.write("\nRows with exact matches:\n")
	results_file.write(correct_matches[["response_question", "response_answer", "test_answer"]].to_markdown(index=False))
	#write the rows in any_matches to the file
	results_file.write("\nRows with any word matches:\n")
	results_file.write(any_matches[["response_question", "response_answer", "test_answer"]].to_markdown(index=False))
print(f"results written to {tsv_results_filepath} and {results_summary_filepath}")




results written to /home/chris/gdrive/work/annabell/experiments/6/results/test_nyc_results_20250916_140722.tsv and /home/chris/gdrive/work/annabell/experiments/6/results/test_nyc_results_summary_20250916_140725.txt
