# This notebook prepares a prompt that can be used to generate declarative sentences from questions and answers
# The model used is Gemini 2.5 Pro

1. Load the squad dataset
2. Filter the dataset by the category "New_York_City"
3. Write the dataset rows to a dataframe
4. Create a prompt that, for each row, uses the question/answer pair and statement to generate a declarative sentence
5. The response will be a JSON object with the following fields:
	* "declarative_sentence": The declarative sentence generated from the question and answer
	* "original_question": The original question
	* "original_answer": The original answer

In [None]:
from datasets import load_dataset, load_from_disk
import os
import json
from dataset_processing import load_squad_dataset

In [None]:
# Path configuration for this notebook.
# NOTE(review): base_dir is an absolute, machine-specific path — it must be
# edited before running on another machine.
base_dir = "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work"
# Location of the SQuAD dataset, relative to base_dir.
dataset_dir = "datasets/squad_dataset"
dataset_path = os.path.join(base_dir, dataset_dir)
# Directory and file that receive the filtered NYC training rows as JSON Lines.
nyc_train_json_l_dir = os.path.join(base_dir, "annabell/experiments/data")
nyc_train_json_l_filename = "nyc_squad_train.jsonl"
nyc_train_json_l_filepath = os.path.join(nyc_train_json_l_dir, nyc_train_json_l_filename)
# The prompt file is written relative to the current working directory,
# not under base_dir.
prompt_directory = "prompts"
prompt_filename = "declarative_sentences_prompt_JSON.jsonl"
prompt_path = os.path.join(prompt_directory, prompt_filename)
# File for the model's responses; it sits next to the NYC training file.
response_filename = "responses.jsonl"
response_filepath = os.path.join(nyc_train_json_l_dir, response_filename)

In [None]:
# Build the NYC subset of the SQuAD training split, save it as JSON Lines, and
# write one prompt entry per NYC question/answer pair.
dataset = load_squad_dataset(dataset_path)
train_df = dataset["train"].to_pandas()
# .copy() makes the subset an independent frame, so the column assignments
# below do not write into a slice of train_df (avoids SettingWithCopyWarning).
nyc_train_df = train_df[train_df["title"] == "New_York_City"].copy()
# Placeholder columns to be filled in later from the model's responses.
nyc_train_df["response_question"] = "<INSERT RESPONSE QUESTION HERE>"
nyc_train_df["response_answer"] = "<INSERT RESPONSE ANSWER HERE>"
nyc_train_df["response_declarative_sentence"] = "<INSERT RESPONSE SENTENCE HERE>"
# The context and title columns are not needed downstream.
nyc_train_df = nyc_train_df.drop(columns=["context", "title"])
# Keep only the first answer text, or an empty string when there is none.
# len() is used instead of truthiness because the "text" value may be an
# array after to_pandas(), and an array's truth value is not a reliable
# emptiness test.
nyc_train_df["answer"] = nyc_train_df["answers"].apply(
    lambda answers: answers["text"][0] if len(answers["text"]) > 0 else ""
)
nyc_train_df = nyc_train_df.drop(columns=["answers"])
nyc_train_df.reset_index(drop=True, inplace=True)
# Write the dataframe to a jsonl file.
nyc_train_df.to_json(nyc_train_json_l_filepath, orient="records", lines=True)
print("NYC train dataframe written to: " + nyc_train_json_l_filepath)

# One JSON entry per NYC row. The entries are built from the filtered frame
# (not the full training split) so the prompt file matches the rows written
# to the NYC jsonl file above.
prompt_entries = [
    {
        "declarative_sentence": "<INSERT DECLARATIVE SENTENCE HERE>",
        "original_question": question,
        "original_answer": answer,
    }
    for question, answer in zip(nyc_train_df["question"], nyc_train_df["answer"])
]
# Write the prompt entries to a jsonl file, creating the directory on first run.
os.makedirs(prompt_directory, exist_ok=True)
with open(prompt_path, "w", encoding="utf-8") as prompt_file:
    prompt_file.writelines(json.dumps(entry) + "\n" for entry in prompt_entries)
print("Prompt written to: " + prompt_path)

### Use the prompt with the Gemini 2.5 Pro model, paste the response into the responses.jsonl file
Note that the input file that goes with the prompt had to be split in two, as Gemini in Chrome kept aborting the processing with the full file.