In [2]:
!pip install openai backoff tiktoken

Collecting openai
  Downloading openai-1.12.0-py3-none-any.whl.metadata (18 kB)
Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.6.1-py3-none-any.whl.metadata (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Either load the openai api key as an environment variable or paste it in

In [7]:
import os
import openai
import backoff
import json
import tiktoken
import pandas as pd
from openai import OpenAI


# Read the key from the environment (export OPENAI_API_KEY before starting Jupyter).
# If you paste a key here instead, do it locally only and never commit it.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Module-level key: used by the `openai.chat.completions.create` call in ItemExtractor.
openai.api_key = OPENAI_API_KEY
# Explicit client: required by the file-upload, fine-tuning and fine-tuned-model
# cells further down, which all reference `client`.
client = OpenAI(api_key=OPENAI_API_KEY)


class ItemExtractor:
    """Builds the item-number extraction prompt and runs it through a chat model.

    The same prompt template is used both to generate training outputs and to
    query the fine-tuned model, so keep the prompt text stable once training
    data has been produced from it.
    """

    def __init__(self, chat_completion_model: str = 'gpt-4-0125-preview'):
        """Store the chat model name and load the tokenizer.

        Args:
            chat_completion_model: Model name passed to the chat completions
                endpoint by `chat_completion`.
        """
        self.chat_completion_model = chat_completion_model
        # Tokenizer is only used by `num_tokens`.
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    def generate_messages(self, model_type: str, field):
        """Return the chat messages for a prompt type, or None if it is unknown.

        Args:
            model_type: Prompt selector; only 'item' is defined.
            field: Item description text interpolated into the user prompt.

        Returns:
            A new list with a system and a user message for 'item';
            None for any other `model_type`.
        """
        classification_messages = [
            {"role": "system", "content": "You are an expert in Identifying and labeling various products using SP codes."},
            {"role": "user", "content": f"""
                                        Your task is to extract the item number from the item Descrption you are given.
                                        If there is no item number to extract, return "None".

                                        ###ITEM Description###
                                        {field}
                                        
                                        Return JSON

                                        "item_number": ,
                                        "explanation":
                                        
                                        """}
        ]

        lookup = {'item': classification_messages}
        return lookup.get(model_type)

    def num_tokens(self, string: str) -> int:
        """Return the number of tokens `string` encodes to with `self.encoding`."""
        return len(self.encoding.encode(string))

    # Retries with exponential backoff whenever the API raises RateLimitError.
    @backoff.on_exception(backoff.expo, openai.RateLimitError)
    def chat_completion(self, model_type, field):
        """Send the prompt for `field` to the chat model and return the reply text.

        Args:
            model_type: Prompt selector understood by `generate_messages`.
            field: Item description text to extract from.

        Returns:
            The message content of the first choice (a string; JSON object
            output is requested via `response_format`).

        Raises:
            ValueError: If `model_type` has no prompt defined.
        """
        messages = self.generate_messages(model_type, field)
        if messages is None:
            # Fail fast with a clear message instead of sending `messages=None` to the API.
            raise ValueError(f"Unknown model_type {model_type!r}; expected 'item'.")
        response = openai.chat.completions.create(
            model=self.chat_completion_model,
            messages=messages,
            seed=10,  # fixed seed, presumably to make outputs more repeatable
            response_format={"type": "json_object"},
        )
        return response.choices[0].message.content

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [8]:
# Source spreadsheet; later cells read its `Description` column.
df = pd.read_excel('CPP_Dataset_TrainingDataSet_V1.xlsx')

# initialize llm

In [None]:
# Uses the chat model configured in ItemExtractor.__init__ ('gpt-4-0125-preview').
llm = ItemExtractor()

# Test on one description

Iterate here to find the best-performing prompt.

In [None]:
# Single .loc lookup (row label 0, column 'Description') instead of chained indexing.
sample_prediction = llm.chat_completion('item', df.loc[0, 'Description'])

In [None]:
# Raw reply string from the model; the prompt asks for `item_number` and `explanation`.
print(sample_prediction)

# Take sample to generate gpt4 outputs for training data creation

In [10]:
# Fixed random_state makes the 300-row sample repeatable; the index is reset so the
# new positional index can be used as the key into `outputs` in the cells below.
sample_dataset = df.sample(n=300, random_state=1).reset_index(drop=True)

In [None]:
def generate_fine_tuning_data(message, test_output):
    """Return the prompt messages followed by an assistant turn.

    A new list is returned and `message` is left untouched, so calling this
    again with the same prompt list cannot accumulate extra assistant turns.

    Args:
        message: List of chat message dicts (the prompt).
        test_output: Model output to record as the assistant reply; it is
            converted to a string.

    Returns:
        A new list: the items of `message` plus
        {"role": "assistant", "content": <test_output as str>}.
    """
    assistant_turn = {"role": "assistant", "content": f"{test_output}"}
    return [*message, assistant_turn]

In [None]:
# outputs for training data: sample_dataset row index -> raw reply string from the model
outputs = {}

In [None]:
# Resume-friendly: rows already present in `outputs` are skipped, so re-running this
# cell after an interruption does not repeat (and re-bill) completed API calls.
# Re-run the `outputs = {}` cell first to force a full regeneration.
for index, description in sample_dataset['Description'].items():
    if index in outputs:
        continue
    print(index)
    outputs[index] = llm.chat_completion('item', description)

# Create fine tuned training data

In [None]:
# Holds one {'messages': [...]} record per sampled row.
fine_tuned_data = []

In [None]:
# Rebuild the list from scratch on every run so re-executing this cell cannot
# append duplicate examples to a list left over from a previous run.
fine_tuned_data = []
for index, current_desc in sample_dataset['Description'].items():
    # prompt messages for this description
    current_message = llm.generate_messages('item', current_desc)
    # add the stored GPT-4 output as the assistant turn
    current_message = generate_fine_tuning_data(current_message, outputs[index])
    fine_tuned_data.append({'messages': current_message})

# save training data, must be JSONL (one JSON object per line)

In [None]:
# Serialize each example as one JSON object per line (JSONL);
# ensure_ascii=False writes non-ASCII characters as-is rather than \u-escaped.
with open('item_extraction_fine_tuned_data_1.jsonl', 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in fine_tuned_data
    )

# upload fine tuning training dataset

response will look something like this:

```
FileObject(id='file-aHU7gXwuZ8k6piUW6VL4sXe6', bytes=378875, created_at=1707760105, filename='test_item_extraction_fine_tuned_data_1.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
```

In [None]:
# Open the file in a context manager so the handle is closed after the upload,
# and keep the response: its `id` is what the fine-tuning job needs.
with open("item_extraction_fine_tuned_data_1.jsonl", "rb") as training_fh:
    uploaded_file = client.files.create(
        file=training_fh,
        purpose="fine-tune"
    )
uploaded_file

# when you upload a file, the response will have the training file id (e.g. `file-...`); use that to create the job below

Response will look something like this:

```
FineTuningJob(id='ftjob-Aj3dNE4UM0AWtumph0i7HYHG', created_at=1707760159, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-LCxZpk5IFggag3CMCzOB4Zku', result_files=[], status='validating_files', trained_tokens=None, training_file='file-aHU7gXwuZ8k6piUW6VL4sXe6', validation_file=None)
```

In [None]:
client.fine_tuning.jobs.create(
  training_file="",  # fill in: the `id` from the upload response (e.g. 'file-...')
  model="gpt-3.5-turbo"
)

# When you create a job, you can retrieve its status by calling below. You will likely have to keep checking every so often to see if it is done. Response will have a status field.

The create jobs response will have the job id

Response will look something like this:

```
FineTuningJob(id='ftjob-Aj3dNE4UM0AWtumph0i7HYHG', created_at=1707760159, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-LCxZpk5IFggag3CMCzOB4Zku', result_files=[], status='validating_files', trained_tokens=None, training_file='file-aHU7gXwuZ8k6piUW6VL4sXe6', validation_file=None)
```

In [None]:
# fill in: the job `id` from the create response (e.g. 'ftjob-...')
client.fine_tuning.jobs.retrieve("")

# call the fine tuned model

the jobs retrieve response, when finished, will have the model id (the `fine_tuned_model` field)

In [None]:
# generate message, fill description value
# (built with the same prompt template that produced the training examples)
description = ""
message = llm.generate_messages('item', description)

In [None]:
response = client.chat.completions.create(
    model="",  # fill in: `fine_tuned_model` from the finished job's retrieve response
    messages=message)

In [None]:
# view response: text content of the first returned choice
print(response.choices[0].message.content)

# Notes

1. You can't overwrite a previously uploaded training dataset, you must give it a new name, so I suggest a versioning approach.

2. There is likely a point of diminishing returns for a fine-tuned model with regard to the number of training samples; depending on the task, somewhere between 100 and 300 samples may be sufficient.

3. Very complex prompts with complex outputs may not perform that well.