### Response Generation

This notebook generates responses to the CounselChat questions using four models:

1. Base GPT (gpt-4o)
2. Fine-Tuned GPT (gpt-4o)
3. Base LLaMA (LLaMA-3.2 3B Instruct)
4. Fine-Tuned LLaMA

In [None]:
import os

In [None]:
# Expose only GPU 0 to this process. The assignment sits in a cell that runs
# before torch/unsloth are imported — presumably so the restriction is already
# in place when CUDA is first initialised (TODO confirm the ordering is required).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
from unsloth import FastLanguageModel
import torch
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
import pickle
import numpy as np

### Read the Processed CounselChat Test Dataset

In [None]:
# Deserialize the pre-processed CounselChat split from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only open pickle
# files that come from a trusted source.
with open('processed_data/counselchat_top_votes_test.pkl', 'rb') as dataset_file:
    dataset_top_votes_test = pickle.load(dataset_file)

# Preview the first rows.
dataset_top_votes_test.head()

Initialize a response-generation DataFrame that will record the answers from every model.

In [None]:
# Work on an independent copy: a plain assignment would only alias the loaded
# frame, so every response column added to `df_response_generation` would also
# appear on `dataset_top_votes_test`, which later cells still iterate over.
df_response_generation = dataset_top_votes_test.copy()
df_response_generation.head()

### OpenAI Configuration and Responses

In [None]:
# Read the OpenAI API key from a local key file rather than hard-coding it.
# Surrounding whitespace is stripped: a trailing newline in the file would
# otherwise become part of the key that is sent with every request.
with open("../../api.key", 'r') as file:
    openai_api_key = file.read().strip()

# Shared client used by both the base and the fine-tuned GPT helpers.
openai_client = OpenAI(api_key=openai_api_key)

### OpenAI Base Model

In [None]:
def get_openai_response(user_prompt: str) -> str:
    """Ask the base ``gpt-4o`` model to answer a single question.

    Uses the notebook-level ``openai_client`` and requests temperature 0.

    Args:
        user_prompt: The question text sent as the user message.

    Returns:
        The content of the first choice in the chat completion.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health. Limit your response to a maximum of 250 words",
    }
    user_message = {"role": "user", "content": user_prompt}

    chat_result = openai_client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[system_message, user_message],
    )

    return chat_result.choices[0].message.content

Generate the base GPT response for every test question and cache the results to disk.

In [None]:
# Query the base GPT model once per test question. Exactly one entry is
# appended per question, so the list stays aligned with the dataframe rows even
# when a request fails.
gpt_responses_base = []
for question_input in tqdm(dataset_top_votes_test['question'], total=len(dataset_top_votes_test)):
    try:
        gpt_resp = get_openai_response(user_prompt=question_input)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so Ctrl-C can still stop the run.
        # The failure is reported instead of being hidden, and an empty string is
        # stored (the same placeholder the fine-tuned GPT loop uses) instead of
        # the literal text 'gpt_resp', which would look like a real model answer.
        tqdm.write(f"Base GPT request failed: {exc!r}")
        gpt_resp = ''
    gpt_responses_base.append(gpt_resp)

# Cache the answers so the paid API calls do not have to be repeated.
with open('response_generation_data/gpt_base_resp.pkl', 'wb') as file:
    pickle.dump(gpt_responses_base, file)

In [None]:
# Reload the cached base GPT answers from disk.
with open('response_generation_data/gpt_base_resp.pkl', 'rb') as cached_file:
    gpt_responses_base = pickle.load(cached_file)

In [None]:
# Store the base GPT answers as a new column.
# NOTE(review): assumes the list holds one answer per row, in row order —
# confirm if the pickle was produced by a different run.
df_response_generation['gpt_responses_base'] = gpt_responses_base
df_response_generation.head()

### OpenAI Fine-Tuned Model Response

In [None]:
def get_openai_response_finetuned(user_prompt: str) -> str:
    """Ask the fine-tuned GPT-4o model to answer a single question.

    Uses the notebook-level ``openai_client``, requests temperature 0 and caps
    the completion at 2048 tokens. Unlike the base helper, the system prompt
    carries no word limit.

    Args:
        user_prompt: The question text sent as the user message.

    Returns:
        The content of the first choice in the chat completion.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health",
    }
    user_message = {"role": "user", "content": user_prompt}

    chat_result = openai_client.chat.completions.create(
        model="ft:gpt-4o-2024-08-06:university-of-texas-at-austin:counselchat-train:BGJdvzQV",
        temperature=0,
        messages=[system_message, user_message],
        max_tokens=2048,
    )

    return chat_result.choices[0].message.content

In [None]:
# Query the fine-tuned GPT model once per test question. Exactly one entry is
# appended per question, so the list stays aligned with the dataframe rows even
# when a request fails.
gpt_responses_ft = []
for question_input in tqdm(dataset_top_votes_test['question'], total=len(dataset_top_votes_test)):
    try:
        gpt_resp = get_openai_response_finetuned(user_prompt=question_input)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so Ctrl-C can still stop the
        # run; the failure is reported and an empty string keeps the row's slot.
        tqdm.write(f"Fine-tuned GPT request failed: {exc!r}")
        gpt_resp = ''
    gpt_responses_ft.append(gpt_resp)

# Cache the answers so the paid API calls do not have to be repeated.
with open('response_generation_data/gpt_ft_resp.pkl', 'wb') as file:
    pickle.dump(gpt_responses_ft, file)

In [None]:
# Reload the cached fine-tuned GPT answers from disk.
with open('response_generation_data/gpt_ft_resp.pkl', 'rb') as cached_file:
    gpt_responses_ft = pickle.load(cached_file)

In [None]:
# Store the fine-tuned GPT answers as a new column.
# NOTE(review): assumes the list holds one answer per row, in row order —
# confirm if the pickle was produced by a different run.
df_response_generation['gpt_responses_ft'] = gpt_responses_ft
df_response_generation.head()

### Inference with the LLaMA Base Model

Preparing the batches of data

In [None]:
batch_size = 10
question_list = dataset_top_votes_test['question'].to_list()
# Batch boundaries: 0, batch_size, 2 * batch_size, ..., len(question_list).
# np.arange never includes its stop value, so the closing boundary always has
# to be appended. Doing it unconditionally also avoids indexing an empty array
# (IndexError) when there are no questions; that case yields [0], i.e. no batches.
batch_indices = np.append(np.arange(0, len(question_list), batch_size), len(question_list))

Loading the base model from Unsloth

In [None]:
# Loader settings for the base checkpoint.
max_seq_length = 2048
dtype = None  # None for auto-detection.
load_in_4bit = False  # Use 4bit quantization to reduce memory usage. Can be False.

# Binds the notebook-level `model` / `tokenizer` used by the inference helpers.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto",
)

Implementing batch Inference

In [None]:
def get_llama_response_base(question_inputs: list[str]) -> list[str]:
    """Generate base-LLaMA answers for one batch of questions.

    Relies on the notebook-level ``model``, ``tokenizer`` and
    ``max_seq_length``.

    Args:
        question_inputs: The batch of user questions. This is a list of
            strings; the earlier ``str`` annotation did not match how the
            argument is iterated.

    Returns:
        One generated answer per question, in input order, with the decoded
        prompt text removed from the front of each decoded output.
    """
    # Nothing to tokenize or generate for an empty batch.
    if len(question_inputs) == 0:
        return []

    system_prompt = "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health. Limit your response to a maximum of 250 words"
    llama_inputs = [[{"role": "system", "content": system_prompt},
                     {"role": "user", "content": question}] for question in question_inputs]

    prompt = tokenizer.apply_chat_template(llama_inputs, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    # Decoded prompts; their lengths are used below to cut the prompt off the output.
    temp_texts = tokenizer.batch_decode(inputs['input_ids'], skip_special_tokens=True)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_seq_length,
        num_return_sequences=1,
        # Greedy decoding for deterministic answers. `temperature=0.0` is not a
        # valid way to request this: transformers expects a strictly positive
        # temperature when sampling and ignores it otherwise.
        do_sample=False,
    )

    texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Each decoded output starts with its prompt; keep only what follows it.
    return [text[len(prompt_text):] for text, prompt_text in zip(texts, temp_texts)]

In [None]:
# Switch the loaded model into Unsloth's fast inference mode
FastLanguageModel.for_inference(model)

# Run the base model batch by batch, collecting the answers in question order.
llama_responses_base = []
n_batches = len(batch_indices) - 1
for batch_no in tqdm(range(n_batches)):
    batch_start, batch_stop = batch_indices[batch_no], batch_indices[batch_no + 1]
    batch_answers = get_llama_response_base(question_list[batch_start:batch_stop])
    llama_responses_base.extend(batch_answers)

# Cache the generated answers on disk.
with open('response_generation_data/llama_responses_base.pkl', 'wb') as out_file:
    pickle.dump(llama_responses_base, out_file)

In [None]:
# Reload the cached base LLaMA answers from disk.
with open('response_generation_data/llama_responses_base.pkl', 'rb') as cached_file:
    llama_responses_base = pickle.load(cached_file)

In [None]:
# Store the base LLaMA answers as a new column.
# NOTE(review): assumes the list holds one answer per row, in row order —
# confirm if the pickle was produced by a different run.
df_response_generation['llama_responses_base'] = llama_responses_base
df_response_generation.head()

### Inference with the Fine-Tuned LLaMA Model

Preparing the batches of data

In [None]:
batch_size = 10
question_list = dataset_top_votes_test['question'].to_list()
# Batch boundaries: 0, batch_size, 2 * batch_size, ..., len(question_list).
# np.arange never includes its stop value, so the closing boundary always has
# to be appended. Doing it unconditionally also avoids indexing an empty array
# (IndexError) when there are no questions; that case yields [0], i.e. no batches.
batch_indices = np.append(np.arange(0, len(question_list), batch_size), len(question_list))

Loading the fine-tuned model

In [None]:
# Name of the fine-tuned checkpoint handed to the loader.
model_id = "llama32-sft-fine-tune-counselchat"

# Loader settings for the fine-tuned checkpoint.
max_seq_length = 2048
dtype = None  # None for auto-detection.
load_in_4bit = False  # Use 4bit quantization to reduce memory usage. Can be False.

# Rebinds the notebook-level `model` / `tokenizer`; from here on they refer to
# the fine-tuned checkpoint instead of the base one.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto",
)

Implementing batch Inference

In [None]:
def get_llama_response_ft(question_inputs: list[str]) -> list[str]:
    """Generate fine-tuned-LLaMA answers for one batch of questions.

    Relies on the notebook-level ``model``, ``tokenizer`` and
    ``max_seq_length``. Unlike the base helper, the system prompt carries no
    word limit.

    Args:
        question_inputs: The batch of user questions. This is a list of
            strings; the earlier ``str`` annotation did not match how the
            argument is iterated.

    Returns:
        One generated answer per question, in input order, with the decoded
        prompt text removed from the front of each decoded output.
    """
    # Nothing to tokenize or generate for an empty batch.
    if len(question_inputs) == 0:
        return []

    system_prompt = "You are an expert mental health professional trained to counsel and guide patients suffering from ill mental-health."
    llama_inputs = [[{"role": "system", "content": system_prompt},
                     {"role": "user", "content": question}] for question in question_inputs]

    prompt = tokenizer.apply_chat_template(llama_inputs, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt")
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    # Decoded prompts; their lengths are used below to cut the prompt off the output.
    temp_texts = tokenizer.batch_decode(inputs['input_ids'], skip_special_tokens=True)

    # NOTE(review): no decoding options are passed, so the model's default
    # generation settings apply and the output may be sampled rather than
    # deterministic — confirm this is intended, given that the other three
    # generators in this notebook request temperature 0.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_seq_length,
        num_return_sequences=1,
    )

    texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Each decoded output starts with its prompt; keep only what follows it.
    return [text[len(prompt_text):] for text, prompt_text in zip(texts, temp_texts)]

In [None]:
# Switch the loaded (fine-tuned) model into Unsloth's fast inference mode
FastLanguageModel.for_inference(model)

# Run the fine-tuned model batch by batch, collecting answers in question order.
llama_responses_ft = []
for i in tqdm(range(0, len(batch_indices) - 1)):
    questions_input = question_list[batch_indices[i]:batch_indices[i+1]]
    # Use the fine-tuned helper here. Calling `get_llama_response_base` would
    # apply the base system prompt (with its 250-word limit) and the base
    # decoding settings to the fine-tuned model.
    llama_resp = get_llama_response_ft(questions_input)
    # extend() appends in place instead of rebuilding the whole list per batch.
    llama_responses_ft.extend(llama_resp)

# Cache the generated answers on disk.
with open('response_generation_data/llama_responses_ft.pkl', 'wb') as file:
    pickle.dump(llama_responses_ft, file)

In [None]:
# Reload the cached fine-tuned LLaMA answers from disk.
with open('response_generation_data/llama_responses_ft.pkl', 'rb') as cached_file:
    llama_responses_ft = pickle.load(cached_file)

In [None]:
# Store the fine-tuned LLaMA answers as a new column.
# NOTE(review): assumes the list holds one answer per row, in row order —
# confirm if the pickle was produced by a different run.
df_response_generation['llama_responses_ft'] = llama_responses_ft
df_response_generation.head()