In [1]:
# required imports/libraries
import os
from dotenv import load_dotenv
from openai import OpenAI

#### Necessary pip installation commands include:
- pip install os
- pip install openai
- pip install pandas
- pip install python-dotenv

#### <b>Note</b>: have a <b>.env</b> file already created for accessing API key


In [2]:
# load .env file values 
load_dotenv()

# insert OpenAI API key in here from .env
FINE_TUNED_TOXIC_DETECTION_API_KEY = os.getenv("FINE_TUNED_TOXIC_DETECTION_API_KEY")

In [3]:
# instantiate OpenAI client with API key
client = OpenAI(
    api_key=FINE_TUNED_TOXIC_DETECTION_API_KEY
)

#### Create fine-tuning files for storing within OpenAI platform
#### visit <b>https://platform.openai.com/docs/guides/fine-tuning</b> to see how <b>.jsonl</b> files are formatted

In [6]:
# insert .jsonl validation and training files for fine-tuning
client.files.create(
  file=open("../data/jsonl/two_hundred_training_kaggle_toxic_classification_dataset.jsonl", "rb"),
  purpose="fine-tune"
)

client.files.create(
  file=open("../data/jsonl/two_hundred_validation_kaggle_toxic_classification_dataset.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-WPKuhg74ZjLRBriUMwpvBP26', bytes=1314247, created_at=1730097407, filename='unprocessed_two_thousand_validation_kaggle_toxic_classification_dataset.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [5]:
# use for checking for successful file(s) creation
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-N3D3ggRs7GtWFsuhxZSPRvOh', bytes=1200903, created_at=1730097350, filename='two_half_thousand_validation_kaggle_toxic_classification_dataset.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-uw8VUBBvA5ZocdlSNq9d4NaH', bytes=1268455, created_at=1730097349, filename='unprocessed_two_thousand_training_kaggle_toxic_classification_dataset.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-ha6eNWpQpU9VIyrAA7h3b1mJ', bytes=31888, created_at=1730093282, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None), FileObject(id='file-HPI70my3NvcIsEgnnSKJP445', bytes=1200903, created_at=1730090514, filename='two_half_thousand_validation_kaggle_toxic_classification_dataset.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-aEDGQWAZl2D1

#### Created files are located within the <b>Storage</b> of your OpenAI Dashboard
#### Link: <b>https://platform.openai.com/storage/files</b>

#### Here are the successful models for fine-tuning:
- <b>gpt-4o-2024-08-06</b>
- <b>gpt-4o-mini-2024-07-18</b>
- <b>gpt-4-0613</b>
- <b>gpt-3.5-turbo-0125</b>
- <b>gpt-3.5-turbo-1106</b>
- <b>gpt-3.5-turbo-0613</b> (model not functioning during testing)

In [7]:
# insert created training and validation files from platform 
# along with model of choice to begin fine-tuning jobs
client.fine_tuning.jobs.create(
    training_file="file-vlOGiQQRCG6HmJajVHIjJxFu",
    validation_file="file-WPKuhg74ZjLRBriUMwpvBP26",
    model="gpt-3.5-turbo-0125"
)

FineTuningJob(id='ftjob-ZVK0wj8WwvYUMa2MjZ1H6SL7', created_at=1730097428, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-S652ryxRoR0nJQpnMy7g1xcm', result_files=[], seed=427831886, status='validating_files', trained_tokens=None, training_file='file-vlOGiQQRCG6HmJajVHIjJxFu', validation_file='file-WPKuhg74ZjLRBriUMwpvBP26', estimated_finish=None, integrations=[], user_provided_suffix=None)

In [13]:
# status check of fine-tuning jobs
client.fine_tuning.jobs.retrieve("ftjob-Mymd6RM7VvSJwOVBBpqNwx5p")

FineTuningJob(id='ftjob-Mymd6RM7VvSJwOVBBpqNwx5p', created_at=1727979309, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-2024-08-06:personal::AELSRgXJ', finished_at=1727982438, hyperparameters=Hyperparameters(n_epochs=1, batch_size=10, learning_rate_multiplier=2), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-S652ryxRoR0nJQpnMy7g1xcm', result_files=['file-5gCTWySFk0MgmOZ2YjQvyEZJ'], seed=69129836, status='succeeded', trained_tokens=1277205, training_file='file-FpwsaDITa0co3Z4sNvzknGGX', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)