# ChatGPT fine-tuning for smishing detection

In [1]:
from openai import OpenAI
import json
import pickle
import time
# Shared API client, reused by the file-upload and fine-tuning-job cells below.
# NOTE(review): no API key is passed explicitly, so the client presumably reads
# its credentials from the environment — confirm before running.
client = OpenAI()

### Building the fine-tuning input file from the training dataset

In [2]:
# Load the pickled training split from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open("./data/train_data.pkl", "rb") as train_file:
    train_data = pickle.load(train_file)

In [3]:
# Unpack the messages and their labels from the loaded training split.
X_train = train_data["X_train"]
y_train = train_data["y_train"]

# Class balance: count how many labels fall into each of the two classes.
# Labels that are neither "ham" nor "smish" are not counted.
total_hams_count = sum(1 for label in y_train if label == "ham")
total_smishes_count = sum(1 for label in y_train if label == "smish")

In [4]:
# Report the class balance computed in the previous cell.
print(f"There is {total_hams_count} hams and {total_smishes_count} smishes in the train dataset.")

There is 3873 hams and 586 smishes in the train dataset.


In [5]:
# Build the fine-tuning file in JSON Lines format: one chat example per line,
# pairing the classification prompt (user turn) with the known label (assistant turn).

# The prompt is written with explicit "\n" escapes so its whitespace is unambiguous.
# The four-space indents are part of the prompt text; changing them changes the
# training data.
PROMPT_TEMPLATE = (
    "Do you think it is a ham or smish message?\n"
    "    Your output should be a single word 'smish' or 'ham'.\n"
    "    Do not write a sentence.\n"
    "    Output is case-sensitive.\n"
    "    \n"
    "    {}\n"
    "    "
)

# Open the file once, in write mode, so that re-running this cell regenerates the
# file instead of appending a duplicate copy of every example. The explicit
# encoding matches the one used when the file is read back for validation.
with open("./data/fine_tuning_data.json", "w", encoding="utf-8") as fine_tuning_data:
    # Index-based access pairs X_train[i] with y_train[i] whatever container
    # type the two sequences are.
    for i in range(len(X_train)):
        user_content = PROMPT_TEMPLATE.format(X_train[i])
        assistant_content = y_train[i]
        messages = {"messages": [{"role": "user", "content": user_content},
                                 {"role": "assistant", "content": assistant_content}]}

        fine_tuning_data.write(json.dumps(messages) + "\n")

### Fine-tuning data format validation

Source: https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [6]:
# source: https://cookbook.openai.com/examples/chat_finetuning_data_prep
data_path = "./data/fine_tuning_data.json"

# Read the JSON Lines file back: every line is one standalone JSON training example
dataset = []
with open(data_path, 'r', encoding='utf-8') as jsonl_file:
    for raw_line in jsonl_file:
        dataset.append(json.loads(raw_line))

# Quick sanity check: dataset size and the messages of the first example
print("Num examples:", len(dataset))
print("First example:")
for first_example_message in dataset[0]["messages"]:
    print(first_example_message)

Num examples: 4459
First example:
{'role': 'user', 'content': "Do you think it is a ham or smish message?\n    Your output should be a single word 'smish' or 'ham'.\n    Do not write a sentence.\n    Output is case-sensitive.\n    \n    FREE2DAY sexy St George's Day pic of Jordan!Txt PIC to 89080 dont miss out, then every wk a saucy celeb!4 more pics c PocketBabe.co.uk 0870241182716 Â£3/wk\n    "}
{'role': 'assistant', 'content': 'smish'}


In [7]:
# source: https://cookbook.openai.com/examples/chat_finetuning_data_prep
from collections import defaultdict

# Keys and roles a chat message is allowed to carry
ALLOWED_MESSAGE_KEYS = ("role", "content", "name", "function_call")
ALLOWED_ROLES = ("system", "user", "assistant", "function")

# Tally of every format problem found, keyed by error name
format_errors = defaultdict(int)

for example in dataset:
    # Each example must be a JSON object
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ...holding a non-empty "messages" list
    turns = example.get("messages")
    if not turns:
        format_errors["missing_messages_list"] += 1
        continue

    for turn in turns:
        # Both mandatory keys must be present
        if not ("role" in turn and "content" in turn):
            format_errors["message_missing_key"] += 1

        # No keys outside the allowed set
        if not all(key in ALLOWED_MESSAGE_KEYS for key in turn):
            format_errors["message_unrecognized_key"] += 1

        if turn.get("role") not in ALLOWED_ROLES:
            format_errors["unrecognized_role"] += 1

        # A message needs a payload (content or function call), and content must be a string
        content = turn.get("content")
        has_payload = bool(content) or bool(turn.get("function_call"))
        if not has_payload or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant reply to learn from
    roles_in_example = [turn.get("role") for turn in turns]
    if "assistant" not in roles_in_example:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, error_count in format_errors.items():
        print(f"{error_name}: {error_count}")

No errors found


### Creating fine-tuning job

In [8]:
# Upload the JSON Lines training file to OpenAI for use in a fine-tuning job.
# The context manager closes the file handle once the upload call returns.
with open("./data/fine_tuning_data.json", "rb") as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )

# Display the returned file object; its `id` is what the fine-tuning job needs.
uploaded_file

FileObject(id='file-JJcljNgAXqK5DBWPoEJ5EyDD', bytes=1568253, created_at=1705889196, filename='fine_tuning_data.json', object='file', purpose='fine-tune', status='processed', status_details=None)

##### ChatGPT 3.5

In [9]:
# ID of the uploaded training file, copied from the output of the upload cell.
# It must be replaced whenever the file is uploaded again.
TRAINING_FILE_ID = "file-JJcljNgAXqK5DBWPoEJ5EyDD"

# Start the fine-tuning job on top of the base chat model; the suffix is used
# to label the resulting fine-tuned model.
client.fine_tuning.jobs.create(
    training_file=TRAINING_FILE_ID,
    model="gpt-3.5-turbo",
    suffix="smishing_detection",
)

FineTuningJob(id='ftjob-9TTYC2UpaS4UK2EqpxOEhXp8', created_at=1705889245, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-Dd5j4OKgmIVEpx7Sdr8jsS4u', result_files=[], status='validating_files', trained_tokens=None, training_file='file-JJcljNgAXqK5DBWPoEJ5EyDD', validation_file=None)

###### Model name: ft:gpt-3.5-turbo-0613:personal:smishing-detection:8jf7LHPv

###### epochs: 3, tokens: 352087, time: ~1h 11min, cost: ~$8.45

##### ChatGPT 4

At the time of writing, GPT-4 fine-tuning is available only to eligible users.