In [1]:
# required imports/libraries
import os
import math
import json
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

#### Necessary pip installation commands include:
- pip install os
- pip install openai
- pip install pandas
- pip install python-dotenv

#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key


In [2]:
# load .env file values 
load_dotenv()

# insert OpenAI API key in here from .env
FINE_TUNED_TOXIC_DETECTION_API_KEY = os.getenv("FINE_TUNED_TOXIC_DETECTION_API_KEY")

In [3]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=FINE_TUNED_TOXIC_DETECTION_API_KEY
)

#### These next steps pull your preprocessed data into a valid format for fine-tuning. This includes:
- reading data from csv file
- transforming data into json format needed for fine-tuning by OpenAI's platform
- splitting data into training and validation (even though this is not training, OpenAI uses this naming structure)
- writing newly created formatted data to jsonl files

In [4]:
# retrieve preprocessed set of choice for fine-tuning
fine_tuning_set = pd.read_csv('./data/csv/toxic_classification_training_set.csv')

In [5]:
training_json = []
evaluation_prompt = ''
# Read the evaluation prompt from the text file of choice with utf-8 encoding
with open("./data/text/fine_tuned_prompts/gpt-3-5-1106-two-shot-prompt.txt", "r", encoding="utf-8") as file:
    evaluation_prompt += file.read()

In [6]:
# appends text and toxic values to a readable format for GPT models
# can input prompt of choice within system->content through hard write or from file
for index, row in fine_tuning_set.iterrows():
    training_json.append({
        "messages" : [
            {
                "role": "system", 
                "content": evaluation_prompt
            },
            {
                "role": "user",
                "content": row['Text']
            },
            {
                "role": "assistant",
                "content": str(row['Toxic'])
            },
        ]
    })

In [7]:
# half length of dataset
half = math.floor(len(training_json) / 2)
half

2000

In [8]:
# trains first half and validates second half
training = training_json[:half]
validation = training_json[half + 1:]

In [9]:
# saves training and validation data to jsonl format for finetuning and validation
with open('./data/jsonl/new_training_kaggle_toxic_classification_data.jsonl', 'w') as f:
    for entry in training:
        f.write(f"{json.dumps(entry)}\n")

with open('./data/jsonl/new_validation_kaggle_toxic_classification_data.jsonl', 'w') as f:
    for entry in validation:
        f.write(f"{json.dumps(entry)}\n")

#### Create fine-tuning files for storing within OpenAI platform
#### visit <b>https://platform.openai.com/docs/guides/fine-tuning</b> to see how <b>.jsonl</b> files are formatted

In [10]:
# insert .jsonl validation and training files for fine-tuning
client.files.create(
  file=open("./data/jsonl/new_training_kaggle_toxic_classification_data.jsonl", "rb"),
  purpose="fine-tune"
)

client.files.create(
  file=open("./data/jsonl/new_validation_kaggle_toxic_classification_data.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-kJYVD13EjKaomidTppvhfbqS', bytes=1469663, created_at=1731294840, filename='new_validation_kaggle_toxic_classification_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [11]:
# use for checking for successful file(s) creation
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-kJYVD13EjKaomidTppvhfbqS', bytes=1469663, created_at=1731294840, filename='new_validation_kaggle_toxic_classification_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-NwP2wPPJskd1o7NRYF4a3k4U', bytes=1530740, created_at=1731294839, filename='new_training_kaggle_toxic_classification_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-iUgNNVTLRkRONwd0x4wQxo0S', bytes=31340, created_at=1730099682, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None), FileObject(id='file-WPKuhg74ZjLRBriUMwpvBP26', bytes=1314247, created_at=1730097407, filename='unprocessed_two_thousand_validation_kaggle_toxic_classification_dataset.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-vlOGiQQRCG6HmJajVHIjJxFu', bytes=1268455, crea

#### Created files are located within the <b>Storage</b> of your OpenAI Dashboard
#### Link: <b>https://platform.openai.com/storage/files</b>

#### Here are the successful models for fine-tuning:
- <b>gpt-4o-2024-08-06</b>
- <b>gpt-4o-mini-2024-07-18</b>
- <b>gpt-4-0613</b>
- <b>gpt-3.5-turbo-0125</b>
- <b>gpt-3.5-turbo-1106</b>
- <b>gpt-3.5-turbo-0613</b> (model not functioning during testing)

In [12]:
# insert created training and validation files from platform 
# along with model of choice to begin fine-tuning jobs
client.fine_tuning.jobs.create(
    training_file="file-NwP2wPPJskd1o7NRYF4a3k4U",
    validation_file="file-kJYVD13EjKaomidTppvhfbqS",
    model="gpt-3.5-turbo-0125"
)

FineTuningJob(id='ftjob-8VAiHPyhnW7gIDJNp5ji8aZm', created_at=1731295020, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-S652ryxRoR0nJQpnMy7g1xcm', result_files=[], seed=1372160539, status='validating_files', trained_tokens=None, training_file='file-NwP2wPPJskd1o7NRYF4a3k4U', validation_file='file-kJYVD13EjKaomidTppvhfbqS', estimated_finish=None, integrations=[], user_provided_suffix=None)

In [13]:
# status check of fine-tuning jobs
client.fine_tuning.jobs.retrieve("ftjob-Mymd6RM7VvSJwOVBBpqNwx5p")

FineTuningJob(id='ftjob-Mymd6RM7VvSJwOVBBpqNwx5p', created_at=1727979309, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-2024-08-06:personal::AELSRgXJ', finished_at=1727982438, hyperparameters=Hyperparameters(n_epochs=1, batch_size=10, learning_rate_multiplier=2), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-S652ryxRoR0nJQpnMy7g1xcm', result_files=['file-5gCTWySFk0MgmOZ2YjQvyEZJ'], seed=69129836, status='succeeded', trained_tokens=1277205, training_file='file-FpwsaDITa0co3Z4sNvzknGGX', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)