## GPT Fine-Tuning

This notebook creates, validates, and uploads the dataset used to fine-tune a GPT model.

In [None]:
from openai import OpenAI
from datasets import load_dataset
import json
from collections import defaultdict
import re

### Get the CounselChat Dataset

In [None]:
# Fetch CounselChat from the Hugging Face Hub (split="all" requests every
# available split) and shuffle with a fixed seed so the row order is
# reproducible between runs.
dataset_name = "nbertagnolli/counsel-chat"
dataset = load_dataset(dataset_name, split="all").shuffle(seed=42)

In [None]:
# Convert the dataset to a pandas DataFrame and preview its first five rows.
dataset_df = dataset.to_pandas()
dataset_df.head(n=5)

In [None]:
def _highest_voted(answers):
    """Return the row with the most upvotes among one question's answers."""
    ranked = answers.sort_values('upvotes', ascending=False)
    return ranked.iloc[0]


# Keep a single answer per question: the one with the most upvotes.
dataset_df_top_votes = (
    dataset_df
    .groupby('questionID')
    .apply(_highest_voted, include_groups=False)
    .reset_index()
)
dataset_df_top_votes

In [None]:
# Build the prompt text by joining the question body and its title with a
# single space (body first, title second).
# NOTE(review): a missing body or title makes the combined value missing too,
# so such rows are removed by the later dropna() — confirm this is intended.
body_text = dataset_df_top_votes['questionText']
title_text = dataset_df_top_votes['questionTitle']
dataset_df_top_votes['question'] = body_text + " " + title_text
dataset_df_top_votes

In [None]:
# Keep only the topic, the combined question text, and the answer text.
columns_to_keep = ['topic', 'question', 'answerText']
dataset_df_final = dataset_df_top_votes[columns_to_keep]
dataset_df_final

In [None]:
# Discard rows with a missing value in any remaining column, then renumber
# the index from 0 so it is contiguous again.
rows_without_gaps = dataset_df_final.dropna()
dataset_df_final = rows_without_gaps.reset_index(drop=True)
dataset_df_final

Removing emojis, non-breaking spaces, and double spaces from the questions and answers

In [None]:
# Pieces of the character class, kept in their original order. The ranges
# overlap heavily: U+24C2-U+1F251 combined with U+10000-U+10FFFF means that,
# besides emoji, every code point from U+24C2 upwards is matched (this
# includes CJK, Hangul and all supplementary-plane characters).
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # miscellaneous symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
    "\U00002500-\U00002BEF",  # box drawing, shapes, dingbats, arrows (not Chinese)
    "\U00002702-\U000027B0",  # dingbats
    "\U00002702-\U000027B0",  # dingbats (repeated in the original pattern)
    "\U000024C2-\U0001F251",  # very wide span, see note above
    "\U0001f926-\U0001f937",  # supplemental emoji (face palm .. shrug)
    "\U00010000-\U0010ffff",  # every supplementary-plane code point
    "\u2640-\u2642",          # female / earth / male signs
    "\u2600-\u2B55",          # miscellaneous symbols through heavy circle
    "\u200d",                 # zero-width joiner
    "\u23cf",                 # eject symbol
    "\u23e9",                 # fast-forward symbol
    "\u231a",                 # watch
    "\ufe0f",                 # variation selector-16 (emoji presentation)
    "\u3030",                 # wavy dash
)

# Compiled once and reused by every call.
_EMOJI_PATTERN = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", re.UNICODE)


def remove_emojis(df_bios):
    """Return ``df_bios`` with every run of matched characters deleted.

    Despite the parameter name, ``df_bios`` is a single string (the function
    is applied element-wise to DataFrame columns). Matched characters are
    removed outright rather than replaced by a space.
    """
    return _EMOJI_PATTERN.sub('', df_bios)

In [None]:
def _clean_text(text):
    """Normalise one question/answer string for the fine-tuning file.

    Strips emojis via ``remove_emojis``, swaps non-breaking spaces and
    newlines for plain spaces, converts the curly apostrophe (U+2019) to
    ``'`` and "é" to "ee", then collapses every run of spaces into one space.
    """
    text = remove_emojis(text)
    text = (
        text.replace('\xa0', ' ')
        .replace("\u2019", "'")
        .replace("\u00e9", "ee")
        .replace("\n", ' ')
    )
    # A single str.replace("  ", " ") pass turns three spaces into two, so a
    # regex is used to collapse a run of any length in one go.
    return re.sub(r" {2,}", " ", text)


# Apply the same cleaning to both text columns.
for column in ('question', 'answerText'):
    dataset_df_final[column] = dataset_df_final[column].apply(_clean_text)

dataset_df_final

#### Preparing the data in the GPT chat fine-tuning format

In [None]:
# Every training example shares the same system prompt.
system_prompt = "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health"


def _to_chat_example(question, answer):
    """Wrap one question/answer pair as a system/user/assistant message list."""
    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }


# One chat-format record per row, in DataFrame order.
gpt_fine_tune_data = [
    _to_chat_example(question, answer)
    for question, answer in zip(dataset_df_final['question'], dataset_df_final['answerText'])
]

Saving the data to a `jsonl` file

In [None]:
file_path = "gpt_fine_tune_data.jsonl"

# Write one JSON object per line (JSONL). The encoding is pinned to UTF-8 so
# the file does not depend on the platform default and matches the
# encoding used when the file is read back for validation.
with open(file_path, 'w', encoding='utf-8') as file:
    for element in gpt_fine_tune_data:
        file.write(json.dumps(element) + '\n')

Verifying the format using the validation script provided by OpenAI

In [None]:
data_path = "gpt_fine_tune_data.jsonl"
# Position of the example printed below as a sanity check.
example_index = 47

# Load the dataset
# NOTE: this rebinds `dataset` (previously the Hugging Face dataset) to a
# plain list with one parsed JSON object per line of the file.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
# Label the sample with the index actually shown, and avoid an IndexError
# when the file holds fewer examples than that index.
if example_index < len(dataset):
    print(f"Example {example_index}:")
    for message in dataset[example_index]["messages"]:
        print(message)
else:
    print(f"No example at index {example_index}")

In [None]:
# Format error checks
def _tally_format_errors(examples):
    """Count structural problems across chat-format training examples.

    Returns a defaultdict mapping an error label to the number of times it
    was seen. Examples that are not dicts, or that have no non-empty
    "messages" entry, are counted once and not inspected further.
    """
    known_keys = ("role", "content", "name", "function_call", "weight")
    known_roles = ("system", "user", "assistant", "function")
    tally = defaultdict(int)

    for example in examples:
        if not isinstance(example, dict):
            tally["data_type"] += 1
            continue

        turns = example.get("messages", None)
        if not turns:
            tally["missing_messages_list"] += 1
            continue

        for turn in turns:
            if not ("role" in turn and "content" in turn):
                tally["message_missing_key"] += 1

            if any(key not in known_keys for key in turn):
                tally["message_unrecognized_key"] += 1

            if turn.get("role", None) not in known_roles:
                tally["unrecognized_role"] += 1

            content = turn.get("content", None)
            function_call = turn.get("function_call", None)

            # Flag content that is absent/empty with no function call to stand
            # in for it, or content that is not a string; echo the bad value.
            if not (content or function_call) or not isinstance(content, str):
                tally["missing_content"] += 1
                print(content)

        # Each example needs at least one assistant turn.
        if not any(turn.get("role", None) == "assistant" for turn in turns):
            tally["example_missing_assistant_message"] += 1

    return tally


# Format error checks
format_errors = _tally_format_errors(dataset)

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for label, count in format_errors.items():
        print(f"{label}: {count}")

Uploading the training file

In [None]:
# Load the OpenAI API key from a local file instead of hard-coding it in the
# notebook. Surrounding whitespace is stripped because key files commonly end
# with a newline, which would otherwise become part of the key.
with open("../../api.key", 'r') as file:
    openai_api_key = file.read().strip()

openai_client = OpenAI(api_key=openai_api_key)

In [None]:
# Upload the JSONL training file. The `with` block closes the file handle once
# the upload call returns; the returned file object is kept in a variable and
# displayed as the cell output.
with open("gpt_fine_tune_data.jsonl", "rb") as training_data:
    training_file = openai_client.files.create(
        file=training_data,
        purpose="fine-tune",
    )

training_file