## GPT Fine-Tuning

This notebook creates and validates the datasets used for fine-tuning GPT, then uploads them to OpenAI.

In [1]:
from openai import OpenAI
from datasets import load_dataset
import json
from collections import defaultdict
import re
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Reading the CounselChat Dataset

Train Data

In [2]:
# Load the pre-processed CounselChat training split from its pickle file.
# NOTE(review): pickle can execute arbitrary code while loading; only load files this project produced.
train_pickle_path = 'processed_data/counselchat_top_votes_train.pkl'
with open(train_pickle_path, 'rb') as pickle_handle:
    dataset_top_votes_train = pickle.load(pickle_handle)

# Preview the first few rows (presumably a pandas DataFrame; confirm against the preprocessing step).
dataset_top_votes_train.head()

Unnamed: 0,topic,question,answerText
0,relationships,I am currently suffering from erectile dysfunc...,"Hi, First and foremost, I want to acknowledge ..."
1,family-conflict,For the past week or so me and my boyfriend ha...,Forgetting one's emotions is impossible.Since ...
2,depression,I am in high school and have been facing anxie...,"Hi Helena,I felt a bit sad when I read this. T..."
3,anxiety,I'm concerned about my boyfriend. I suffer fro...,Hello! Thank you for your question. There are ...
4,spirituality,"I'm a Christian teenage girl, and I have lost ...",Having sex with your boyfriend is and was a mi...


Test Data

In [3]:
# Load the pre-processed CounselChat test split from its pickle file.
# NOTE(review): pickle can execute arbitrary code while loading; only load files this project produced.
test_pickle_path = 'processed_data/counselchat_top_votes_test.pkl'
with open(test_pickle_path, 'rb') as pickle_handle:
    dataset_top_votes_test = pickle.load(pickle_handle)

# Preview the first few rows (presumably a pandas DataFrame; confirm against the preprocessing step).
dataset_top_votes_test.head()

Unnamed: 0,topic,question,answerText
0,relationships,I had to go to the emergency room today to get...,It is extremely frustrating when our significa...
1,marriage,What makes a healthy marriage last? What makes...,"This is a fantastic question. In one sentence,..."
2,relationships,"I'm a female freshman in high school, and this...","First off, I think it is great that you are wi..."
3,intimacy,"My wife and I are newly married, about 2 month...","You are newly married, you Have a hectic sched..."
4,legal-regulatory,"I think I have depression, anxiety, bipolar di...",It can be difficult to get counseling if you d...


#### Preparing the data into the GPT based format

Train Data

In [4]:
# The system prompt is the same for every example, so it is defined once instead of per row.
system_prompt = "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health"

# Build one chat-format record (system / user / assistant) per row of the training split.
# Iterating over the two needed columns directly avoids the per-row Series that
# `iterrows()` constructs; the resulting list is the same.
gpt_fine_tune_train_data = [
    {"messages": [{"role": "system", "content": system_prompt},
                  {"role": "user", "content": question},
                  {"role": "assistant", "content": answer}]}
    for question, answer in zip(dataset_top_votes_train['question'],
                                dataset_top_votes_train['answerText'])
]

Test Data

In [5]:
# The system prompt is the same for every example, so it is defined once instead of per row.
system_prompt = "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health"

# Build one chat-format record (system / user / assistant) per row of the test split.
# Iterating over the two needed columns directly avoids the per-row Series that
# `iterrows()` constructs; the resulting list is the same.
gpt_fine_tune_test_data = [
    {"messages": [{"role": "system", "content": system_prompt},
                  {"role": "user", "content": question},
                  {"role": "assistant", "content": answer}]}
    for question, answer in zip(dataset_top_votes_test['question'],
                                dataset_top_votes_test['answerText'])
]

Saving the data to `jsonl` files

Train Data

In [6]:
file_path = "gpt_fine_tune_train_data.jsonl"

# Write one JSON document per line (JSONL). The encoding is stated explicitly so the
# file is written the same way the validation cell reads it back (UTF-8), rather than
# depending on the platform's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(json.dumps(element) + '\n' for element in gpt_fine_tune_train_data)

Test Data

In [7]:
file_path = "gpt_fine_tune_test_data.jsonl"

# Write one JSON document per line (JSONL). The encoding is stated explicitly so the
# file is written the same way the validation cell reads it back (UTF-8), rather than
# depending on the platform's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(json.dumps(element) + '\n' for element in gpt_fine_tune_test_data)

#### Verifying the format using the OpenAI-provided script

Train Data

In [8]:
data_path = "gpt_fine_tune_train_data.jsonl"

# Load the dataset: one JSON object per line; blank lines are skipped so a stray
# empty line does not abort the load with a JSONDecodeError.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))

# Show one sample conversation. The label names the index actually printed (it was
# previously announced as the "first" example), and the index is bounds-checked so a
# smaller dataset does not raise IndexError.
sample_index = 47
if sample_index < len(dataset):
    print(f"Example {sample_index}:")
    for message in dataset[sample_index]["messages"]:
        print(message)
else:
    print(f"Example {sample_index} not available; dataset has only {len(dataset)} examples")

Num examples: 690
First example:
{'role': 'system', 'content': 'You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health'}
{'role': 'user', 'content': "In this argument my friends are all against me so I'm wondering if I'm exaggerating my feelings. I feel like I'm being attacked and that is making me blow things out of proportion and taking the role of the victim, when in reality I'm the one being unreasonable and I am simply annoying my reasonable friends. Or are they really being insensitive and not respecting me in our friendship? I feel like my friends are ganging up on me"}
{'role': 'assistant', 'content': "Conflict is a tricky beast and shows up in every one of our relationships. What I'm reading is that you find yourself in a loop of seeking validation of your thoughts and feelings from your friends rather than validating yourself. Am I right? If you aren't validating yourself, no one else will be able to either. Try fin

In [9]:
# Format error checks: tally every schema problem found in the loaded examples.
permitted_keys = ("role", "content", "name", "function_call", "weight")
permitted_roles = ("system", "user", "assistant", "function")
format_errors = defaultdict(int)

for example in dataset:
    # Each decoded JSONL line must be a dict...
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ...that carries a non-empty "messages" list.
    messages = example.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    saw_assistant = False
    for message in messages:
        # Both mandatory keys must be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # No keys outside the permitted set.
        if not all(key in permitted_keys for key in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role", None)
        if role not in permitted_roles:
            format_errors["unrecognized_role"] += 1
        if role == "assistant":
            saw_assistant = True

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # A message needs content or a function call, and content must be a string.
        has_payload = bool(content or function_call)
        if not (has_payload and isinstance(content, str)):
            format_errors["missing_content"] += 1
            print(content)

    # Every example needs at least one assistant message to learn from.
    if not saw_assistant:
        format_errors["example_missing_assistant_message"] += 1

# Report the tally.
if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_kind, count in format_errors.items():
        print(f"{error_kind}: {count}")

No errors found


Test Data

In [10]:
data_path = "gpt_fine_tune_test_data.jsonl"

# Load the dataset: one JSON object per line; blank lines are skipped so a stray
# empty line does not abort the load with a JSONDecodeError.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))

# Show one sample conversation. The label names the index actually printed (it was
# previously announced as the "first" example), and the index is bounds-checked so a
# smaller dataset does not raise IndexError.
sample_index = 47
if sample_index < len(dataset):
    print(f"Example {sample_index}:")
    for message in dataset[sample_index]["messages"]:
        print(message)
else:
    print(f"Example {sample_index} not available; dataset has only {len(dataset)} examples")

Num examples: 173
First example:
{'role': 'system', 'content': 'You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health'}
{'role': 'user', 'content': "I'm starting to think that I might have depression. The thing is though, I'm not exactly sad. I can joke with my friends and stuff, but I've stopped caring about my work, and I've been starting to think that life is pointless. I really have no clue if I need help, or if I'm just tired or something. Is it possible that I have depression?"}
{'role': 'assistant', 'content': 'Depression isn\'t something someone "has", it is a dark mood and lack of satisfaction with the way the current life is structured.If you think of depression as motivation to look for new ways to understand yourself, find ways which are more satisfying in your work and relationships, then depression is a charm.The typical attitude most people have when they think of the word "depression", is that their life will

In [11]:
# Format error checks: tally every schema problem found in the loaded examples.
permitted_keys = ("role", "content", "name", "function_call", "weight")
permitted_roles = ("system", "user", "assistant", "function")
format_errors = defaultdict(int)

for example in dataset:
    # Each decoded JSONL line must be a dict...
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ...that carries a non-empty "messages" list.
    messages = example.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    saw_assistant = False
    for message in messages:
        # Both mandatory keys must be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # No keys outside the permitted set.
        if not all(key in permitted_keys for key in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role", None)
        if role not in permitted_roles:
            format_errors["unrecognized_role"] += 1
        if role == "assistant":
            saw_assistant = True

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # A message needs content or a function call, and content must be a string.
        has_payload = bool(content or function_call)
        if not (has_payload and isinstance(content, str)):
            format_errors["missing_content"] += 1
            print(content)

    # Every example needs at least one assistant message to learn from.
    if not saw_assistant:
        format_errors["example_missing_assistant_message"] += 1

# Report the tally.
if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_kind, count in format_errors.items():
        print(f"{error_kind}: {count}")

No errors found


#### Uploading the training and test files

In [12]:
# Read the OpenAI API key from a local key file rather than hardcoding it in the notebook.
with open("../../api.key", 'r') as file:
    # Strip surrounding whitespace: key files usually end with a newline, and a key
    # that carries it is not the key that was issued.
    openai_api_key = file.read().strip()

openai_client = OpenAI(api_key=openai_api_key)

Train Data

In [13]:
# Upload the training JSONL for fine-tuning. The file is opened in a `with` block so
# the handle is closed once the upload call returns instead of being left open.
with open("gpt_fine_tune_train_data.jsonl", "rb") as train_file:
    train_file_object = openai_client.files.create(
        file=train_file,
        purpose="fine-tune"
    )

# Display the returned FileObject; keeping it in a variable also keeps its id available.
train_file_object

FileObject(id='file-MeRjhRBE4G7DUChioUd3zT', bytes=1135189, created_at=1743222571, filename='gpt_fine_tune_train_data.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

Test Data

In [14]:
# Upload the test JSONL for fine-tuning. The file is opened in a `with` block so
# the handle is closed once the upload call returns instead of being left open.
with open("gpt_fine_tune_test_data.jsonl", "rb") as test_file:
    test_file_object = openai_client.files.create(
        file=test_file,
        purpose="fine-tune"
    )

# Display the returned FileObject; keeping it in a variable also keeps its id available.
test_file_object

FileObject(id='file-3sxmE3khxksjyYXXxBhQw8', bytes=276684, created_at=1743222572, filename='gpt_fine_tune_test_data.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)