# This notebook uses an LLM to generate a set of commands that can be used to train the ANNABELL model.
The aim is to enable the model to extract information from a declarative sentence to answer a question.
A prompt is used that includes an example of a declarative sentence, a question and an answer along with the appropriate training commands.

In [3]:
import pandas as pd
import os
from pydantic import BaseModel
from ollama import generate
import platform
import datetime

# Choose the project root that matches the machine running the notebook.
# NOTE(review): the roots below are absolute, machine-specific paths.
operating_system = platform.system()
if operating_system == 'Windows':
    raise Exception("not used on Windows yet")

# Known project roots, keyed by the value platform.system() reports.
_base_directory_by_os = {
    'Linux': "/home/chris/gdrive/work/annabell",
    # macOS
    'Darwin': "/Users/chris/Library/CloudStorage/GoogleDrive-cjameswalmsley@gmail.com/My Drive/Shared with Julia/Education/Kent University/PhD/work/annabell/",
}
if operating_system not in _base_directory_by_os:
    raise Exception("unsupported OS")
base_directory = _base_directory_by_os[operating_system]

# How many dataset rows are sampled and sent to the model.
number_of_pretraining_samples = 5

# Suffix of the form "_YYYYMMDD_HHMMSS" built from the current local time.
timestamp = datetime.datetime.now().strftime("_%Y%m%d_%H%M%S")

# Input locations, all resolved beneath base_directory.
data_directory = os.path.join(base_directory, "experiments/data")
dataset_filename = "response_formatted_20250924_174653.jsonl"
dataset_filepath = os.path.join(data_directory, dataset_filename)
base_prompt_directory = os.path.join(data_directory, "prompts")
base_prompt_filename = "prompt_for_annabell_pretraining_command_generation.txt"
base_prompt_filepath = os.path.join(base_prompt_directory, base_prompt_filename)

# Load the JSON-lines dataset and keep only the id plus the formatted
# question, answer and declarative-sentence columns.
nyc_squad_df = pd.read_json(dataset_filepath, lines=True)
_kept_columns = [
    "id",
    "response_question_formatted",
    "response_answer_formatted",
    "response_declarative_sentence_formatted",
]
nyc_squad_df_trimmed = nyc_squad_df[_kept_columns]

# Draw a reproducible (fixed seed) random sample of
# number_of_pretraining_samples rows and renumber it from zero.
nyc_squad_sample_df = (
    nyc_squad_df_trimmed
    .sample(n=number_of_pretraining_samples, random_state=42)
    .reset_index(drop=True)
)

# Serialise each sampled row to one JSON string; these are the prompt inputs.
prompt_inputs = [sample_row.to_json() for _, sample_row in nyc_squad_sample_df.iterrows()]

# Read the shared prompt template that every sample is appended to.
# The encoding is given explicitly so the text decodes the same way whatever
# the platform's locale default is.
# NOTE(review): assumes the prompt file is stored as UTF-8 — confirm.
with open(base_prompt_filepath, "r", encoding="utf-8") as base_prompt_file:
    base_prompt = base_prompt_file.read()
# Display the first five characters as a quick check that the prompt loaded.
base_prompt[:5]

In [4]:
# Structured form of a single model reply. generated_json_from_prompt passes
# this model's JSON schema to the generator as the requested output format and
# validates the reply text against it.
# NOTE: documented with comments rather than a class docstring on purpose;
# pydantic copies a model's docstring into model_json_schema() as
# "description", which would alter the schema sent to the generator.
class PretrainingCommand(BaseModel):
    # presumably the id of the dataset row the commands belong to — TODO confirm
    id: str
    # the commands produced for that row, one string per command
    commands: list[str]


def generated_text_from_prompt(the_model_string, the_prompt, the_format=None, the_options=None):
    """Run one generation request and return the raw text of the reply.

    Args:
        the_model_string: Model name, forwarded to ``generate`` as ``model``.
        the_prompt: Prompt text, forwarded to ``generate`` as ``prompt``.
        the_format: Optional output format, forwarded as ``format``.
        the_options: Optional generation options, forwarded as ``options``.

    Returns:
        The ``response`` attribute of the object returned by ``generate``.
    """
    request = {
        "model": the_model_string,
        "prompt": the_prompt,
        "format": the_format,
        "options": the_options,
    }
    reply = generate(**request)
    return reply.response


def generated_json_from_prompt(the_model_string, the_prompt, the_options=None):
    """Generate a reply constrained to the PretrainingCommand schema and parse it.

    Args:
        the_model_string: Model name passed through to generated_text_from_prompt.
        the_prompt: Prompt text passed through to generated_text_from_prompt.
        the_options: Optional generation options passed through unchanged.

    Returns:
        The result of ``PretrainingCommand.model_validate_json`` applied to the
        reply text. Any error raised by that validation propagates to the caller.
    """
    # The model's JSON schema is used as the requested output format.
    schema = PretrainingCommand.model_json_schema()
    raw_reply = generated_text_from_prompt(
        the_model_string,
        the_prompt,
        the_format=schema,
        the_options=the_options,
    )
    return PretrainingCommand.model_validate_json(raw_reply)

In [1]:
# Model used for every generation request in this cell.
model_string = "mistral-nemo"

# One free-text generation per sampled row: the row's JSON is appended to the
# shared base prompt. An explicit loop (rather than a comprehension) is kept so
# the replies gathered so far remain in `results` if the run is interrupted.
results = []
for sample_json in prompt_inputs:
    results.append(generated_text_from_prompt(model_string, base_prompt + sample_json))
results

NameError: name 'prompt_inputs' is not defined

In [6]:
# Save the raw model replies as a JSON lines file, one reply per line.
# NOTE(review): this assumes each reply is a single-line JSON object carrying
# "id" and "training_commands" keys — confirm against the prompt.
results_filepath = os.path.join(data_directory, "pretraining_commands_responses.jsonl")
# utf-8 is given explicitly because pd.read_json decodes files as utf-8 by
# default, whereas open() would otherwise fall back to the platform locale.
with open(results_filepath, "w", encoding="utf-8") as results_file:
    for result in results:
        results_file.write(result + "\n")

# Load the file back in as a dataframe.
results_df = pd.read_json(results_filepath, lines=True)

# Attach the replies to the trimmed dataset by id. The left join keeps every
# dataset row, including those that were never sent to the model.
final_df = pd.merge(nyc_squad_df_trimmed, results_df, on="id", how="left")
# Any value that is not a list (such as the missing values the left join
# leaves on unmatched rows) is replaced with an empty list.
final_df["training_commands"] = final_df["training_commands"].apply(lambda x: x if isinstance(x, list) else [])
final_df

Unnamed: 0,id,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,training_commands
0,56ce304daab44d1400b8850e,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,[]
1,56ce304daab44d1400b8850f,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,[]
2,56ce304daab44d1400b88510,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,[]
3,56ce304daab44d1400b88511,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,[]
4,56cf5d41aab44d1400b89130,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,[]
...,...,...,...,...,...
812,56d1218c17492d1400aaba1f,? how much money in cent -s does New York City...,83,New York City receive -s 83 cent -s for every ...,[]
813,56d1218c17492d1400aaba20,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,[]
814,56d1218c17492d1400aaba21,? each year how much more money does New York ...,11 point 4 billion,New York City give -s 11 point 4 billion more...,[]
815,56d121d817492d1400aaba2d,? what is the new name of the Sister City Prog...,New York City Global Partners,the new name of the Sister City Program of the...,[]
