In [1]:
from config.config import PlatformConfig
from dataset_processing import load_squad_dataset
from generate_declarative_sentences import generated_model_from_prompt
import json

# All file locations used by this notebook are obtained from PlatformConfig.
config = PlatformConfig()
# JSONL file of prompt inputs: written by prepare_nyc_train_df and read back
# by the prompt loop.
nyc_train_json_l_filepath = config.prompt_inputs_jsonl_filepath()
# JSONL file that model responses are appended to.
response_filepath = config.responses_jsonl_filepath()
# File holding the base prompt text that is prepended to every input line.
base_prompt_filepath = config.base_prompt_filepath()

2025-11-01 16:30:43,997 - root - INFO - Logging initialized. Log file: /Users/chris/logs/cognitive_language_model_logs/run_20251101_163043.log


In [2]:
# Load the dataset from the configured directory and convert its "train"
# split to a pandas DataFrame for filtering.
dataset = load_squad_dataset(config.dataset_directory())
train_df = dataset["train"].to_pandas()


def prepare_nyc_train_df(the_train_df, the_json_l_filepath, the_title):
    """Build the prompt-input rows for one title and write them as JSON Lines.

    Rows of ``the_train_df`` whose ``title`` equals ``the_title`` are kept and
    reduced to the columns ``id``, ``question``, ``answer`` and
    ``response_declarative_sentence`` (in that order). ``answer`` is the first
    entry of ``answers["text"]`` (or ``""`` when there is none) and
    ``response_declarative_sentence`` is a fixed placeholder string.

    Args:
        the_train_df: DataFrame with at least the columns ``id``, ``title``,
            ``question`` and ``answers``; each ``answers`` value must support
            ``["text"]`` and yield a sequence. The frame is not modified.
        the_json_l_filepath: Destination of the JSON Lines file (one record
            per line). Anything accepted by ``DataFrame.to_json`` works.
        the_title: Value of the ``title`` column to select.

    Returns:
        The filtered DataFrame, re-indexed from 0.
    """

    def _first_answer_text(answers):
        texts = answers["text"]
        # Use len() rather than truthiness: after to_pandas() the texts are a
        # NumPy array, and bool(array) raises ValueError for more than one
        # element (and is deprecated for empty arrays).
        return texts[0] if len(texts) > 0 else ""

    filtered_train_df = the_train_df[the_train_df["title"] == the_title].copy()

    # Create the 'answer' column from the 'answers' dictionary
    filtered_train_df["answer"] = filtered_train_df["answers"].apply(_first_answer_text)

    # Placeholder that the language model is asked to replace
    filtered_train_df["response_declarative_sentence"] = (
        "<INSERT RESPONSE SENTENCE HERE>"
    )

    # Selecting the final columns both fixes their order and discards every
    # other column, so no separate drop() is needed.
    final_columns = [
        "id",
        "question",
        "answer",
        "response_declarative_sentence",
    ]
    filtered_train_df = filtered_train_df[final_columns].reset_index(drop=True)

    # write the dataframe to a jsonl file
    filtered_train_df.to_json(the_json_l_filepath, orient="records", lines=True)
    # f-string so that pathlib.Path destinations are reported as well as str
    print(f"NYC train dataframe written to: {the_json_l_filepath}")
    return filtered_train_df


# Build the prompt inputs for rows whose title is "New_York_City" and write
# them to the prompt-inputs JSONL file.
nyc_train_df = prepare_nyc_train_df(train_df, nyc_train_json_l_filepath, "New_York_City")

Loading dataset from: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/datasets/squad_dataset
NYC train dataframe written to: /Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/experiments/data/prompts/prompt_inputs.jsonl


In [3]:
def process_prompt(the_base_prompt, the_line, the_id):
    """Send one input line, prefixed by the base prompt, to the generator.

    The base prompt and the input line are joined with a single newline, the
    combined prompt is echoed to stdout, and the result of
    ``generated_model_from_prompt`` for that prompt and ``the_id`` is returned
    unchanged.
    """
    full_prompt = "\n".join((the_base_prompt, the_line))
    print("Prompt:\n", full_prompt)
    return generated_model_from_prompt(full_prompt, the_id)

In [4]:
# Read the base prompt and the prepared prompt-input rows. The encoding is
# given explicitly so the text read and written here does not depend on the
# platform's default locale encoding.
with open(base_prompt_filepath, "r", encoding="utf-8") as base_prompt_file:
    base_prompt = base_prompt_file.read()
with open(nyc_train_json_l_filepath, "r", encoding="utf-8") as nyc_train_json_l_file:
    nyc_train_json_l = nyc_train_json_l_file.readlines()
# Last configured model and the generation options from the platform config.
model_string = config.ollama_models()[-1]
options = config.ollama_options_dict()

# Only the first row is processed ([:1]); widen the slice for a full run.
for line in nyc_train_json_l[:1]:
    # A blank line is not a JSON record; skip it instead of failing in json.loads
    if not line.strip():
        continue
    # convert the json line to a dict to get the id
    line_dict = json.loads(line)
    the_id = line_dict["id"]
    # The prompt carries only these fields; the id is passed separately
    prompt_dict = {
        field: line_dict[field]
        for field in ("question", "answer", "response_declarative_sentence")
    }
    # Convert the dictionary to a JSON string
    line_json = json.dumps(prompt_dict)
    response_model = process_prompt(base_prompt, line_json, the_id)
    print("Response:\n", response_model)
    # Append one JSON record per response; the file is closed (and therefore
    # flushed) at the end of every iteration.
    with open(response_filepath, "a", encoding="utf-8") as response_file:
        response_file.write(response_model.model_dump_json() + "\n")

Prompt:
 ROLE:  You are a writing assistant.

TASK:  create a declarative statement using the associated question and answer from the JSON input provided .

TASK REQUIREMENTS:
1. Proper nouns must begin with capital letters

FORMAT:
For each row always answer in the following JSON format:

{"response_declarative_sentence":"<INSERT RESPONSE SENTENCE HERE>",}

Only this JSON format is allowed as an answer. No explanation or other text is allowed.

For example, for the following JSON row:

{"question":"What city in the United States has the highest population?", "answer":"New York"}

Your response would be:

{"response_declarative_sentence":"the city with the highest population in the United States is New York"}

Provide the specified response for the following json line:
{"question": "What city in the United States has the highest population?", "answer": "New York", "response_declarative_sentence": "<INSERT RESPONSE SENTENCE HERE>"}
2025-11-01 16:30:46,735 - generate_declarative_sentence

In [8]:
# Hand-written prompt for a direct model call. Unlike the prompt printed by
# process_prompt, this variant carries an "id" field in the format template,
# in the worked example and in the final input line.
# NOTE(review): the literal starts with a "Prompt:" label line, which is then
# sent to the model as part of the prompt - confirm that is intended.
# NOTE(review): the format template ends with ",}" (trailing comma), which is
# not valid JSON - confirm that is intended.
sample_prompt = """Prompt:
 ROLE:  You are a writing assistant.

TASK:  create a declarative statement using the associated question and answer from the JSON input provided .

TASK REQUIREMENTS:
1 Proper nouns must begin with capital letters

FORMAT:
For each row always answer in the following JSON format:

{"id":"<INSERT COPY OF ORIGINAL ID HERE>","response_declarative_sentence":"<INSERT RESPONSE SENTENCE HERE>",}

Only this JSON format is allowed as an answer. No explanation or other text is allowed.

For example, for the following JSON row:

{"id":"56ce304daab44d1400b8850f","question":"What city in the United States has the highest population?", "answer":"New York"}

Your response would be:

{"id":"56ce304daab44d1400b8850f", "response_declarative_sentence":"the city with the highest population in the United States is New York"}

Provide the specified response for the following json line:
{"id":"56ce304daab44d1400b8850e","question":"What city in the United States has the highest population?","answer":"New York","response_declarative_sentence":"<INSERT RESPONSE SENTENCE HERE>"}"""

import ollama
from generate_declarative_sentences import DeclarativeStatement

# Call ollama directly with the sample prompt. The JSON schema of
# DeclarativeStatement is passed as `format`; streaming and thinking are
# switched off. `model_string` and `options` are defined in the cell that
# reads them from `config`, so that cell must have been run first.
response = ollama.generate(
    model=model_string,
    prompt=sample_prompt,
    format=DeclarativeStatement.model_json_schema(),
    stream=False,
    think=False,
    options=options)

In [10]:
# Show the raw text returned by the model.
# NOTE(review): in the recorded run the returned key is "declarative_statement",
# not the "response_declarative_sentence" requested in sample_prompt, and the
# id is not a verbatim copy of the input id - verify the schema and prompt agree.
response.response

'{"id":"56ce304daab44d1:56ce304daab44d1400b8850e","declarative_statement":"the city with the highest population in the United States is New York"}\n'