### Fine-tune a chatbot with recipes. Work in progress. Dataset taken from:
### https://github.com/Glorf/recipenlg

In [5]:
import json
import openai
import pandas as pd, os
from pprint import pprint
from dotenv import load_dotenv
from openai import OpenAI

In [6]:
load_dotenv()  # Load all environment variables from '.env' file

api_key = os.getenv('OPENAI_API_KEY')
if api_key is None:
    print("API key is not set.")
else:
    print("API key is correctly set.")

API key is correctly set.


In [7]:
client = OpenAI()

In [9]:
## Testing the API, making sure I can use my API key and getting a valid return
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
        {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='In the realm of code, a dance so refined,\nRecursion whispers of a loop redefined.\nA function calls itself, a cycle unending,\nLike a mirror reflecting, infinitely extending.\n\nThrough levels of stacks, it travels with grace,\nEach iteration a new chapter to embrace.\nBreaking problems into parts, it does unravel,\nWith elegance and beauty that truly marvel.\n\nLike a Russian doll nested within another,\nRecursion dives deep, exploring like a brother.\nWith recursive might, problems are tamed,\nInto elegant solutions, they are framed.\n\nSo let your functions loop within their own space,\nEmbrace the recursive, its enchanting grace.\nIn the world of programming, a poetic rhyme,\nRecursion weaves magic, one call at a time.', role='assistant', function_call=None, tool_calls=None)


### Dataset originated at cookbooks.com. We will bring the data in:

In [10]:
df = pd.read_csv('data/full_dataset.csv', index_col=0)
df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2231142 entries, 0 to 2231141
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   title        object
 1   ingredients  object
 2   directions   object
 3   link         object
 4   source       object
 5   NER          object
dtypes: object(6)
memory usage: 119.2+ MB


In [13]:
# Subset the dataframe for only recipes the team gathered:
df = df[df.source=='Gathered']
df.head()

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1643098 entries, 0 to 1643097
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   title        1643097 non-null  object
 1   ingredients  1643098 non-null  object
 2   directions   1643098 non-null  object
 3   link         1643098 non-null  object
 4   source       1643098 non-null  object
 5   NER          1643098 non-null  object
dtypes: object(6)
memory usage: 87.8+ MB


In [15]:
# Drop the null value (only one, this is rare):
df = df.dropna()
len(df)

1643097

### Data Preparation
### Fine-tuning with the ChatCompletion API format--each training example is a simple list of messages. For example, an entry could look like:
[{'role': 'system',
  'content': 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'},

 {'role': 'user',
  'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients: '},

 {'role': 'assistant',
  'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}]

In [17]:
training_data = [] # Create an empty list for our training data to go into

# Prompt the LLM to give it instruction on how to respond
system_message = "You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided."

def create_user_message(row):
    return f"""Title: {row['title']}\n\nIngredients: {row['ingredients']}\n\nGeneric ingredients: """

def prepare_example_conversation(row):
    messages = []
    messages.append({"role": "system", "content": system_message})

    user_message = create_user_message(row)
    messages.append({"role": "user", "content": user_message})

    messages.append({"role": "assistant", "content": row["NER"]})

    return {"messages": messages}

pprint(prepare_example_conversation(df.iloc[0]))

{'messages': [{'content': 'You are a helpful recipe assistant. You are to '
                          'extract the generic ingredients from each of the '
                          'recipes provided.',
               'role': 'system'},
              {'content': 'Title: No-Bake Nut Cookies\n'
                          '\n'
                          'Ingredients: ["1 c. firmly packed brown sugar", '
                          '"1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 '
                          'c. broken nuts (pecans)", "2 Tbsp. butter or '
                          'margarine", "3 1/2 c. bite size shredded rice '
                          'biscuits"]\n'
                          '\n'
                          'Generic ingredients: ',
               'role': 'user'},
              {'content': '["brown sugar", "milk", "vanilla", "nuts", '
                          '"butter", "bite size shredded rice biscuits"]',
               'role': 'assistant'}]}


### Even though we have 1.65 million rows, for this project I am only going to use 1000 rows of training data. This will keep the cost low and the training time low. 

In [19]:
# Use the first 1000 rows of the dataset for training:
train_df = df.loc[0:1000]

# Apply the prepare_example_conversation function to each row of the training dataframe:
training_data = train_df.apply(prepare_example_conversation, axis=1).tolist()

for example in training_data[:5]:
    print(example)

{'messages': [{'role': 'system', 'content': 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'}, {'role': 'user', 'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients: '}, {'role': 'assistant', 'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}]}
{'messages': [{'role': 'system', 'content': 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'}, {'role': 'user', 'content': 'Title: Jewell Ball\'S Chicken\n\nIngredients: ["1 small jar chipped beef, cut up", "4 boned chicken breasts", "1 can cream of mushroom soup", "1 carton sour cream"]\n\nGeneric ingredients: '}, {'role': 'assistant', 'content': '["bee

### First 5 recipes: no-bake nut cookies, jewell ball's chicken, creamy corn, chicken funny and Reeses Cups :)

_____

### We then need to save our data as .jsonl files, with each line being one training example conversation.

In [21]:
def write_jsonl(data_list: list, filename: str) -> None:
    with open(filename, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict) + "\n"
            out.write(jout)

In [22]:
training_file_name = "tmp_recipe_finetune_training.jsonl"
write_jsonl(training_data, training_file_name)

In [23]:
# print the first 5 lines of the training file
!head -n 5 tmp_recipe_finetune_training.jsonl

{"messages": [{"role": "system", "content": "You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided."}, {"role": "user", "content": "Title: No-Bake Nut Cookies\n\nIngredients: [\"1 c. firmly packed brown sugar\", \"1/2 c. evaporated milk\", \"1/2 tsp. vanilla\", \"1/2 c. broken nuts (pecans)\", \"2 Tbsp. butter or margarine\", \"3 1/2 c. bite size shredded rice biscuits\"]\n\nGeneric ingredients: "}, {"role": "assistant", "content": "[\"brown sugar\", \"milk\", \"vanilla\", \"nuts\", \"butter\", \"bite size shredded rice biscuits\"]"}]}
{"messages": [{"role": "system", "content": "You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided."}, {"role": "user", "content": "Title: Jewell Ball'S Chicken\n\nIngredients: [\"1 small jar chipped beef, cut up\", \"4 boned chicken breasts\", \"1 can cream of mushroom soup\", \"1 carton sour cream\"]\n\nGeneric ingredients: "}, {"role":

In [24]:
#Upload training data to OpenAI Servers:

response = client.files.create(
    file=open("tmp_recipe_finetune_training.jsonl", "rb"),
    purpose='fine-tune'
)

print(response)

FileObject(id='file-XBQoCFLONT3bkIrhRHMs4SNi', bytes=610999, created_at=1713965463, filename='tmp_recipe_finetune_training.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [32]:
#Using response file id to train:  'file-XBQoCFLONT3bkIrhRHMs4SNi'

# Directly assign the file ID to a variable
file_id = 'file-XBQoCFLONT3bkIrhRHMs4SNi'

# Start the fine-tuning process using the file ID
try:
    fine_tune_response = client.fine_tuning.jobs.create(
        training_file=file_id,
        model="gpt-3.5-turbo",
    )
    print(fine_tune_response)
except Exception as e:
    print(f"An error occurred during fine-tuning: {str(e)}")

FineTuningJob(id='ftjob-J8DwdRvbbUIk2KEVlXomufjM', created_at=1713965936, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-9M2kSzSr6rF5nWWogudGNLgy', result_files=[], seed=860681095, status='validating_files', trained_tokens=None, training_file='file-XBQoCFLONT3bkIrhRHMs4SNi', validation_file=None, integrations=[], user_provided_suffix=None)


In [33]:
# Check the status
client.fine_tuning.jobs.retrieve("ftjob-J8DwdRvbbUIk2KEVlXomufjM")

FineTuningJob(id='ftjob-J8DwdRvbbUIk2KEVlXomufjM', created_at=1713965936, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=2, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-9M2kSzSr6rF5nWWogudGNLgy', result_files=[], seed=860681095, status='queued', trained_tokens=None, training_file='file-XBQoCFLONT3bkIrhRHMs4SNi', validation_file=None, integrations=[], user_provided_suffix=None)

### I realize I forgot to include hyperparameters to reduce training time. This could get costly :)

In [34]:
# Check the status
client.fine_tuning.jobs.retrieve("ftjob-J8DwdRvbbUIk2KEVlXomufjM")

FineTuningJob(id='ftjob-J8DwdRvbbUIk2KEVlXomufjM', created_at=1713965936, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=2, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-9M2kSzSr6rF5nWWogudGNLgy', result_files=[], seed=860681095, status='running', trained_tokens=None, training_file='file-XBQoCFLONT3bkIrhRHMs4SNi', validation_file=None, integrations=[], user_provided_suffix=None)

In [35]:
job_id = "ftjob-J8DwdRvbbUIk2KEVlXomufjM"

In [39]:
response = client.fine_tuning.jobs.list_events(job_id)

events = response.data
events.reverse()

for event in events:
    print(event.message)

Step 1033/1502: training loss=0.00
Step 1034/1502: training loss=0.04
Step 1035/1502: training loss=0.00
Step 1036/1502: training loss=0.03
Step 1037/1502: training loss=0.00
Step 1038/1502: training loss=0.05
Step 1039/1502: training loss=0.00
Step 1040/1502: training loss=0.00
Step 1041/1502: training loss=0.00
Step 1042/1502: training loss=0.03
Step 1043/1502: training loss=0.00
Step 1044/1502: training loss=0.01
Step 1045/1502: training loss=0.01
Step 1046/1502: training loss=0.00
Step 1047/1502: training loss=0.01
Step 1048/1502: training loss=0.06
Step 1049/1502: training loss=0.37
Step 1050/1502: training loss=0.23
Step 1051/1502: training loss=0.01
Step 1052/1502: training loss=0.05
