In [52]:
import os

import pandas as pd
from ollama import generate
from pydantic import BaseModel, Field
# --- Locations of the formatted dataset and of the base prompt template ---
base_directory = "/home/chris/gdrive"
data_directory = os.path.join(base_directory, "work/annabell/experiments/data")
dataset_filename = "response_formatted_20250924_174653.jsonl"
dataset_filepath = os.path.join(data_directory, dataset_filename)
base_prompt_filepath = "/home/chris/PycharmProjects/dataset/prompts/prompt_for_annabell_pretraining_command_generation.txt"

# Read the JSON Lines dataset (one record per line).
nyc_squad_df = pd.read_json(dataset_filepath, lines=True)

# Keep only the id and the formatted question / answer / declarative-sentence columns.
nyc_squad_df_trimmed = nyc_squad_df[
    [
        "id",
        "response_question_formatted",
        "response_answer_formatted",
        "response_declarative_sentence_formatted",
    ]
]

# Serialise each remaining row to its own JSON string, preserving row order.
prompt_inputs = [row.to_json() for _, row in nyc_squad_df_trimmed.iterrows()]

# Load the prompt template text that later cells concatenate with each serialised row.
with open(base_prompt_filepath, "r") as prompt_template_file:
    base_prompt = prompt_template_file.read()

In [53]:
class PretrainingCommand(BaseModel):
    # Structured reply expected from the model: the id copied from the input
    # record plus the list of training commands generated for it.
    #
    # NOTE: documented with comments rather than a class docstring on purpose --
    # pydantic copies a class docstring into the generated JSON schema as
    # "description", which would change the schema sent as the response format.

    # Accept the Python field name (`commands`) as well as the JSON alias when
    # validating, so code that builds the model by field name keeps working.
    model_config = {"populate_by_name": True}

    id: str
    # The prompt, the model replies and the downstream dataframe all use the key
    # "training_commands"; the alias makes the JSON schema and the parser use
    # that key while the attribute keeps its original name.
    commands: list[str] = Field(alias="training_commands")

def generated_text_from_prompt(the_model_string, the_prompt, the_format=None, the_options=None):
    """Issue a single `generate` call and return the `response` attribute of its result.

    Args:
        the_model_string: forwarded to `generate` as `model`.
        the_prompt: forwarded to `generate` as `prompt`.
        the_format: forwarded to `generate` as `format` (None by default).
        the_options: forwarded to `generate` as `options` (None by default).

    Returns:
        The `.response` attribute of the object returned by `generate`.
    """
    request_arguments = {
        "model": the_model_string,
        "prompt": the_prompt,
        "format": the_format,
        "options": the_options,
    }
    completion = generate(**request_arguments)
    return completion.response

def generated_json_from_prompt(the_model_string, the_prompt, the_options=None):
    """Generate a reply constrained to the `PretrainingCommand` schema and parse it.

    Args:
        the_model_string: model identifier handed to `generated_text_from_prompt`.
        the_prompt: prompt text handed to `generated_text_from_prompt`.
        the_options: options handed to `generated_text_from_prompt` (None by default).

    Returns:
        The `PretrainingCommand` instance produced by
        `PretrainingCommand.model_validate_json` from the reply text.
    """
    # The model's JSON schema is used as the requested response format.
    response_schema = PretrainingCommand.model_json_schema()
    raw_reply = generated_text_from_prompt(
        the_model_string=the_model_string,
        the_prompt=the_prompt,
        the_format=response_schema,
        the_options=the_options,
    )
    parsed_reply = PretrainingCommand.model_validate_json(raw_reply)
    return parsed_reply

In [54]:
# How many serialised rows to send to the model, and which model to ask.
number_of_samples = 1
model_string = "mistral-nemo"

# Take the first `number_of_samples` rows; each one is appended to the base prompt
# and the raw text reply is collected in order.
prompt_input_samples = prompt_inputs[:number_of_samples]
results = []
for prompt_input_sample in prompt_input_samples:
    results.append(
        generated_text_from_prompt(model_string, base_prompt + prompt_input_sample)
    )
results

['{"id":"56ce304daab44d1400b8850e","training_commands": ["the city in the United States with the highest population is New York", "? what city in the United States has the highest population", ".wg city", ".wg United States", ".wg high -est", ".wg populate -ion", ".ph the city in the United States with the highest population is New York", ".wg New York", ".rw"]}']

In [57]:
# Preview the first fully assembled prompt (base prompt + first serialised row).
#
# This cell is deliberately read-only: it must not rebind `results`,
# `number_of_samples` or `prompt_input_samples`. It sits between the generation
# cell and the cell that saves `results`, so resetting `results` here would wipe
# the model replies when the notebook is run top to bottom.
for preview_input in prompt_inputs[:1]:
    print("Prompt:")
    print(base_prompt + preview_input)

Prompt:
ROLE:  You are training a cognitive language model.

TASK:  create a set of commands for every JSON item in the provided file using the corresponding question, answer and declarative sentence.

TASK REQUIREMENTS:
1. create a sequence of commands in the following order:
a. the declarative sentence
b. the question
c. a number of .wg prefixed commands , one for each keyword
d. the declarative sentence prefixed with a .ph command
e. the answer prefixed with a .wg command
f. a .rw command
2. include an exact copy of the id provided in the response

FORMAT:
For each row always answer in the following JSON format:

{"id":"<INSERT COPY OF ORIGINAL ID HERE>","training_commands":"<INSERT LIST OF TRAINING COMMANDS HERE>"}

Only JSON is allowed as an answer. No explanation or other text is allowed.

For example, for the following JSON example:

{"id":"56ce304daab44d1400b8850f","question":"? in what borough did the Stonewall riots happen","answer":"",declarative_sentence":"the Stonewall rio

In [55]:
# Save the raw model replies as a JSON Lines file (one reply per line).
responses_filepath = os.path.join(data_directory, "pretraining_commands_responses.jsonl")
# utf-8 is written explicitly so the file matches what pd.read_json decodes by default.
with open(responses_filepath, "w", encoding="utf-8") as results_file:
    for result in results:
        results_file.write(result + "\n")

# Load the replies back in as a dataframe.
results_df = pd.read_json(responses_filepath, lines=True)

# Attach the training_commands column to the original rows, matching on id.
# A left join keeps every original row, including those without a reply.
final_df = pd.merge(nyc_squad_df_trimmed, results_df, on="id", how="left")
# Rows without a matching reply hold a non-list placeholder after the join;
# normalise those to an empty list so every row carries a list of commands.
final_df["training_commands"] = final_df["training_commands"].apply(lambda x: x if isinstance(x, list) else [])

# Save the combined dataframe as a JSON Lines file.
combined_filepath = os.path.join(data_directory, "nyc_squad_with_pretraining_commands.jsonl")
final_df.to_json(combined_filepath, orient="records", lines=True)
final_df

Unnamed: 0,id,response_question_formatted,response_answer_formatted,response_declarative_sentence_formatted,training_commands
0,56ce304daab44d1400b8850e,? what city in the United States has the high ...,New York,the city in the United States with the high -e...,[the city in the United States with the highes...
1,56ce304daab44d1400b8850f,? in what city is the United Nations base -d,New York,the United Nations is base -d in New York,[]
2,56ce304daab44d1400b88510,? what city has been call -ed the culture -al ...,New York,New York has been call -ed the culture -al cap...,[]
3,56ce304daab44d1400b88511,? what American city welcome -s the large -st ...,New York,New York is the American city that welcome -s ...,[]
4,56cf5d41aab44d1400b89130,? the major gateway for immigrate -ion has bee...,New York City,the major gateway for immigrate -ion has been ...,[]
...,...,...,...,...,...
812,56d1218c17492d1400aaba1f,? how much money in cent -s does New York City...,83,New York City receive -s 83 cent -s for every ...,[]
813,56d1218c17492d1400aaba20,? how much more money does the city give to th...,11 billion,the city give -s 11 billion more money to the...,[]
814,56d1218c17492d1400aaba21,? each year how much more money does New York ...,11 point 4 billion,New York City give -s 11 point 4 billion more...,[]
815,56d121d817492d1400aaba2d,? what is the new name of the Sister City Prog...,New York City Global Partners,the new name of the Sister City Program of the...,[]


In [56]:
# Write a training file with one command per line, then read it back for a quick check.
commands_filepath = os.path.join(data_directory, "nyc_squad_pretraining_commands_automatic.txt")
# Upper bound on how many commands the preview prints.
preview_count = 10

# The same explicit encoding is used for writing and reading so the round trip
# does not depend on the platform default.
with open(commands_filepath, "w", encoding="utf-8") as commands_file:
    for commands in final_df["training_commands"]:
        for command in commands:
            commands_file.write(command + "\n")

with open(commands_filepath, "r", encoding="utf-8") as commands_file:
    lines = commands_file.readlines()

print(f"Number of commands: {len(lines)}")
# The label is derived from what is actually printed, so it cannot disagree with
# the slice (it previously announced 5 commands while printing up to 10).
preview_lines = lines[:preview_count]
print(f"First {len(preview_lines)} commands:")
for line in preview_lines:
    print(line.strip())

Number of commands: 9
First 5 commands:
the city in the United States with the highest population is New York
? what city in the United States has the highest population
.wg city
.wg United States
.wg high -est
.wg populate -ion
.ph the city in the United States with the highest population is New York
.wg New York
.rw
