# Fine-tuning GPT-4o-mini for Sentiment Analysis

In [141]:
# auto-reload modules and imported functions when they are changed
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [142]:
from langfuse.openai import openai
from sklearn.metrics import classification_report
import pandas as pd
import os
import json


OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_ORG_ID = os.environ.get('OPENAI_ORG_ID')

## 1. Load dataset

In [138]:
# run download_dataset.py before to prepare the dataset
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [139]:
df_test.head()

Unnamed: 0,text,score,label
0,"Based on an actual story, John Boorman shows t...",9,1
1,This is a gem. As a Film Four production - the...,9,1
2,"I really like this show. It has drama, romance...",9,1
3,This is the best 3-D experience Disney has at ...,10,1
4,"Of the Korean movies I've seen, only three had...",10,1


In [140]:
len(df_test)

25000

## 2. Zero-shot classification

In [147]:
# selecting a subset of the test dataset
df_test2 = df_test.sample(1000, random_state=123).copy()
X = df_test2["text"].values
y = df_test2["label"].values

In [58]:
# from skllm.models.gpt.classification.zero_shot import ZeroShotGPTClassifier


# clf = ZeroShotGPTClassifier(key=OPENAI_API_KEY, org=OPENAI_ORG_ID, model="gpt-4o-mini-2024-07-18")
# clf.fit(None, ["positive", "negative", "neutral"])
# labels = clf.predict(X)

In [148]:
from langchain.prompts import ChatPromptTemplate
from llm_utils import run_zeroshot_clf


user_prompt = "Classify the sentiment of the following movie review as either 'positive' or 'negative'. Movie review: {input}"

prompt_template = ChatPromptTemplate.from_template(
    user_prompt
)

In [149]:
model_id = "gpt-4o-mini-2024-07-18"
predicted = run_zeroshot_clf(docs=X, prompt=prompt_template, model=model_id)

In [150]:
df_test2["predicted"] = [v.sentiment for v in predicted]
df_test2["predicted"] = df_test2["predicted"].map({"positive": 1, "negative": 0})
df_test2

Unnamed: 0,text,score,label,predicted
20000,As a Bruce Campbell fan for nearly two decades...,3,0,0.0
5515,I think One True Thing is one of Meryl Streeps...,10,1,1.0
966,"This film grabbed me right from its start, whe...",8,1,1.0
22726,The film starts well enough. It is a truly ter...,3,0,0.0
2690,Walter Matthau can always improve a mediocre f...,7,1,1.0
...,...,...,...,...
18889,"As I am a fan of hospital and medical shows, I...",4,0,0.0
18110,The idea for the movie wasn't too bad: a horro...,3,0,0.0
18953,I like underground films when they have someth...,2,0,0.0
19865,Mario Racocevic from Europe is the only user w...,4,0,0.0


In [151]:
df_test2["predicted"].isna().sum()
df_test2.dropna(subset=["predicted"], inplace=True)

In [152]:
from sklearn.metrics import classification_report

print(classification_report(df_test2["label"], df_test2["predicted"]))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       511
           1       0.97      0.92      0.95       488

    accuracy                           0.95       999
   macro avg       0.95      0.95      0.95       999
weighted avg       0.95      0.95      0.95       999



In [127]:
# df_test2.to_csv("fpredictions.csv")

## 3. Fine-tuning

### 3.1 Prepare ft instances

In [103]:
# Define your training data 80/20 split
training_data = []
validation_data = []

for i, (_, row) in enumerate(df_train.query("label == 1").sample(100, random_state=123).iterrows()):
    if i < 80:
        training_data.append({"messages": [{"role": "user", "content": user_prompt.format(input=row["text"])}, {"role": "assistant", "content": "positive"}]})
    else:
        validation_data.append({"messages": [{"role": "user", "content": user_prompt.format(input=row["text"])}, {"role": "assistant", "content": "positive"}]})

for i, (_, row) in enumerate(df_train.query("label == 0").sample(100, random_state=123).iterrows()):
    if i < 80:
        training_data.append({"messages": [{"role": "user", "content": user_prompt.format(input=row["text"])}, {"role": "assistant", "content": "negative"}]})
    else:
        validation_data.append({"messages": [{"role": "user", "content": user_prompt.format(input=row["text"])}, {"role": "assistant", "content": "negative"}]})

In [104]:
len(training_data), len(validation_data)

(160, 40)

In [105]:
with open("finetuning_training_movie_reviews.jsonl", "w") as f:
    for data in training_data:
        f.write(json.dumps(data) + "\n")

with open("finetuning_validation_movie_reviews.jsonl", "w") as f:
    for data in validation_data:
        f.write(json.dumps(data) + "\n")

### 3.2 Upload training file

In [106]:
# Upload a training file
from openai import OpenAI


client = OpenAI()

res = client.files.create(
  file=open("finetuning_training_movie_reviews.jsonl", "rb"),
  purpose="fine-tune"
)

training_file_id = res.id

res

FileObject(id='file-czqmGrEWG6NAmtns3WjeompB', bytes=224766, created_at=1725639802, filename='finetuning_training_movie_reviews.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [107]:
res = client.files.create(
  file=open("finetuning_validation_movie_reviews.jsonl", "rb"),
  purpose="fine-tune"
)

validation_file_id = res.id

res

FileObject(id='file-HUA4edsRJxwX2UBf0uLtNHos', bytes=62273, created_at=1725639804, filename='finetuning_validation_movie_reviews.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [108]:
training_file_id, validation_file_id

('file-czqmGrEWG6NAmtns3WjeompB', 'file-HUA4edsRJxwX2UBf0uLtNHos')

### 3.3 Run fine-tuning job

In [115]:
# Create a fine-tuned model
from openai import OpenAI


client = OpenAI()

ft_res = client.fine_tuning.jobs.create(
  training_file=training_file_id, 
  validation_file=validation_file_id,
  model="gpt-4o-mini-2024-07-18",
  suffix="aitecit",
  # hyperparameters={"n_epochs": 3},
  seed=123
)

ft_res

In [114]:
job_id = ft_res.id

FineTuningJob(id='ftjob-2J6w6jlIY9Wih1H9oZTQeoy7', created_at=1725640326, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-HRXXhIy6fEPecJi7HZUMzoEv', result_files=[], seed=123, status='validating_files', trained_tokens=None, training_file='file-czqmGrEWG6NAmtns3WjeompB', validation_file='file-HUA4edsRJxwX2UBf0uLtNHos', estimated_finish=None, integrations=[], user_provided_suffix='aitecit')

In [119]:
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

In [120]:
fine_tuned_model_id

'ft:gpt-4o-mini-2024-07-18:technology-it:aitecit:A4WO5hk6'

### 3.4 Inference

In [134]:
fine_tuned_model_id = "ft:gpt-4o-mini-2024-07-18:technology-it:aitecit:A4WO5hk6"
predicted = run_zeroshot_clf(docs=X, prompt=prompt_template, model=fine_tuned_model_id)

In [135]:
df_test2["predicted_ft"] = [v.sentiment for v in predicted]
df_test2["predicted_ft"] = df_test2["predicted_ft"].map({"positive": 1, "negative": 0})
print(classification_report(df_test2["label"], df_test2["predicted_ft"]))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       511
           1       0.94      0.98      0.96       489

    accuracy                           0.96      1000
   macro avg       0.96      0.96      0.96      1000
weighted avg       0.96      0.96      0.96      1000

