In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm
import pickle
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Get the Counsel Chat Dataset

Keep the 5 most-upvoted question/answer rows from each topic

In [None]:
# Load every split of the CounselChat dataset and shuffle it with a fixed seed
# so the row order is the same on each run.
dataset_name = "nbertagnolli/counsel-chat"
dataset = load_dataset(dataset_name, split="all").shuffle(seed=42)

In [None]:
# Convert the shuffled dataset to a pandas DataFrame and preview the first rows.
dataset_df = dataset.to_pandas()
dataset_df.head()

In [None]:
# For every topic keep the five rows with the most upvotes, restricted to the
# columns used later; missing values are replaced by empty strings so the text
# fields can be concatenated safely.
dataset_df_filt = (
    dataset_df[['topic', 'questionTitle', 'questionText', 'answerText', 'upvotes']]
    .groupby('topic', group_keys=False)
    .apply(lambda topic_rows: topic_rows.sort_values(['upvotes'], ascending=False).head(5))
    .reset_index(drop=True)
    .fillna('')
)
dataset_df_filt

### Use OpenAI to generate synthetic data

We generate a synthetic question/answer pair with OpenAI by including a relevant example from the CounselChat dataset as a one-shot example in the prompt.

In [None]:
# Read the OpenAI API key from a file two directories above the notebook.
# Surrounding whitespace is stripped: key files usually end with a newline, and
# a key that still contains "\n" is not a valid credential / header value.
with open("../../api.key", 'r') as file:
    openai_api_key = file.read().strip()

openai_client = OpenAI(api_key=openai_api_key)

In [None]:
def get_openai_response(
    system_prompt: str,
    user_prompt: str,
    model: str = "gpt-4o",
    temperature: float = 1,
) -> str:
    """Send one system/user message pair to the OpenAI chat API and return the reply text.

    Uses the module-level ``openai_client``.

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        model: Chat model name. Defaults to ``"gpt-4o"``, the value that was
            previously hard-coded.
        temperature: Sampling temperature. Defaults to ``1``, the value that was
            previously hard-coded.

    Returns:
        The ``content`` of the first choice of the completion.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    # Only a single completion is requested, so the first choice is the reply.
    return completion.choices[0].message.content

In [None]:
# Prompts used to generate one synthetic question/answer pair per example row.
# `user_prompt_qa` is a str.format template with {topic}, {question} and {answer}
# placeholders; the doubled braces around the requested output format are
# format escapes that render as single literal braces.
# NOTE(review): the output format shown to the model uses unquoted keys, so the
# example itself is not valid JSON - confirm the replies parse reliably.
system_prompt_qa = 'You are an expert mental-health counsellor'
user_prompt_qa = '''You are given a broad topic which covers a specific area in which humans suffer from ill mental health.
You job is to generate a topic relevant question/answer pair with question describing the mental state of the patient and answer describing the counselling advice given to the patient.

Topic: {topic}

Example:
Question-> {question}
Answer-> {answer}

You must return response in a json serializable format as following {{question: question_text, answer:answer_text}}
'''

In [None]:
# One-off generation of the synthetic QA pairs: one OpenAI call per row of
# dataset_df_filt, with the raw replies pickled to openai_responses_qa.pkl.
# The code is left commented out; the notebook reads the pickle instead.
# NOTE(review): presumably disabled so the API is not called on every run - confirm.
# NOTE(review): `question` joins questionText and questionTitle without a separator.

# openai_responses_qa = []

# for index, row in tqdm(dataset_df_filt.iterrows(), total=len(dataset_df_filt)):
    
#     topic = row['topic']
#     question = row['questionText'] + row['questionTitle']
#     answer = row['answerText']
    
#     response_qa = get_openai_response(system_prompt=system_prompt_qa, user_prompt=user_prompt_qa.format(topic=topic, question=question, answer=answer))
    
#     openai_responses_qa.append(response_qa)

# with open('openai_responses_qa.pkl', 'wb') as file:
#     pickle.dump(openai_responses_qa, file)

### Reading the OpenAI Response files and converting to json for processing

In [None]:
# Load the cached raw OpenAI replies for the QA-generation prompts.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that were produced locally by this notebook.
with open('openai_responses_qa.pkl', 'rb') as file:
    openai_responses_qa = pickle.load(file)

Converting to json responses

In [None]:
def _parse_json_reply(raw_reply: str):
    """Parse a model reply that is expected to contain a single JSON object.

    If the reply contains a Markdown code fence, only the text inside the first
    fence is parsed. A leading ``json`` language tag is dropped. Unlike removing
    every "json" substring, this leaves the payload text itself untouched.

    Args:
        raw_reply: Reply text as returned by the model.

    Returns:
        The decoded JSON value, or ``None`` when the text is not valid JSON.
    """
    payload = raw_reply.split("```")[1] if "```" in raw_reply else raw_reply
    payload = payload.strip()
    if payload[:4].lower() == "json":
        payload = payload[4:]
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return None


json_responses = []
for index, response in enumerate(openai_responses_qa):
    parsed = _parse_json_reply(response)
    if parsed is None:
        # Print the position so the reply can be repaired by hand afterwards;
        # a None placeholder keeps the list aligned with the input.
        print(index)
    json_responses.append(parsed)

Manually processing the ones with errors

In [None]:
# Repair entry 3 by hand: take the text inside the first ``` fence of the raw
# reply, remove the "json" tag and parse that.
# NOTE(review): presumably index 3 is the only position printed by the previous
# cell for the cached pickle; regenerated replies may fail at other positions.
json_responses[3] = json.loads(openai_responses_qa[3].split("```")[1].replace('json',''))

In [None]:
# Attach the source topic to every parsed reply. Entries are matched purely by
# position: entry i of json_responses gets the topic of row i of dataset_df_filt.
dataset_df_topic_list = dataset_df_filt['topic'].to_list()

# Fail early on a length mismatch instead of silently leaving some entries
# without a 'topic' key (which would only surface later as a KeyError).
if len(json_responses) != len(dataset_df_topic_list):
    raise ValueError(
        f"Expected {len(dataset_df_topic_list)} parsed responses, got {len(json_responses)}"
    )

for index, topic in enumerate(dataset_df_topic_list):
    json_responses[index]['topic'] = topic

Adding the topics to each json response for later processing or analysis

In [None]:
# Preview the first few entries instead of dumping the whole list into the cell output.
json_responses[:3]

### Inference from already fine-tuned model

In [None]:
# Load the tokenizer that belongs to the fine-tuned model.
model_id = "llama32-sft-fine-tune-counselchat"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Pad on the left so that, in a batch of prompts of different lengths, every
# prompt ends at the last position - presumably chosen for batched generation.
tokenizer.padding_side = "left"
# Upper bound on sequence length applied when truncation is requested.
tokenizer.model_max_length = 2048

In [None]:
# Load the fine-tuned causal LM in bfloat16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto") # Must be float32 for MacBooks!
# Keep the model config's pad token in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id # Updating the model config to use the special pad token

Preparing a list of questions from GPT QA Responses for batch inference

In [None]:
# One single-turn conversation per synthetic question.
llama_inputs = [[{"role": "user", "content": response['question']}] for response in json_responses]

# Render the conversations to prompt strings with the model's chat template.
texts = tokenizer.apply_chat_template(llama_inputs, tokenize=False, add_generation_prompt=True)
# The rendered template text already carries its special tokens, so they must not
# be added a second time when the strings are tokenized (otherwise the
# beginning-of-sequence token is duplicated).
inputs = tokenizer(texts, padding="longest", truncation=True, return_tensors="pt", add_special_tokens=False)
inputs = {key: val.to(model.device) for key, val in inputs.items()}
# Decoded prompts without special tokens; their lengths are used later to cut
# the prompt prefix off the generated text.
temp_texts = tokenizer.batch_decode(inputs['input_ids'], skip_special_tokens=True)

In [None]:
# Token ids that stop generation: the tokenizer's EOS token and the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

In [None]:
# One-off batched generation with the fine-tuned model, left commented out; its
# output is pickled to llama_responses_qa.pkl.
# The prompt is removed from each decoded sequence by slicing off
# len(temp_texts[idx]) characters, so it relies on the decoded output starting
# with exactly the decoded prompt.
# NOTE(review): presumably disabled so generation is not repeated on every run - confirm.

# gen_tokens = model.generate(
#     **inputs,
#     max_new_tokens=2048,
#     pad_token_id=tokenizer.pad_token_id,
#     eos_token_id=terminators,
#     do_sample=True,
#     temperature=0.6,
#     top_p=0.9
# )

# gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
# gen_text = [i[len(temp_texts[idx]):] for idx, i in enumerate(gen_text)]

# with open('llama_responses_qa.pkl', 'wb') as file:
#     pickle.dump(gen_text, file)

In [None]:
# Load the cached answers generated by the fine-tuned LLaMA model.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that were produced locally by this notebook.
with open('llama_responses_qa.pkl', 'rb') as file:
    llama_response = pickle.load(file)

### Integrating the LLaMA Response within the json responses

In [None]:
# Store each LLaMA answer next to the question it was generated for. Entries
# are matched by position, so both lists must have the same length; fail early
# instead of leaving some entries without a 'llama_answer' key.
if len(llama_response) != len(json_responses):
    raise ValueError(
        f"Expected {len(json_responses)} LLaMA answers, got {len(llama_response)}"
    )

for index, response in enumerate(llama_response):
    json_responses[index]['llama_answer'] = response

### Asking GPT to rate the responses using the CTRS (Cognitive Therapy Rating Scale) criteria

In [None]:
# Preview the first few entries instead of dumping the whole list into the cell output.
json_responses[:3]

In [None]:
# Prompts used to have GPT rate a therapist response on six criteria, each on a
# 0-6 scale. `user_prompt_eval` is a str.format template with {patient} and
# {therapist} placeholders; the doubled braces around the requested output
# format are format escapes that render as single literal braces.
# The criterion names listed here are the keys read back from the parsed
# evaluation when the ratings are collected for plotting.
system_prompt_eval = 'You are an expert mental-health counsellor'
user_prompt_eval = '''You are given a conversation between a patient and therapist. Your job is to evaluate the response of therapist against the problem described by the patient as per the criterion of
Understanding, Interpersonal Effectiveness, Collaboration, Guided Discovery, Focus and Strategy. The definition for each of these criterion is mentioned below:

Understanding: How accurately does the therapist demonstrate understanding of the client’s issues and concerns?
Interpersonal Effectiveness: How effective is the therapist in maintaining a positive and therapeutic relationship with the client?
Collaboration: To what extent does the therapist engage the client in collaborative goalsetting and decision-making?
Guided Discovery: How effectively does the therapist use guided discovery techniques to facilitate client self-reflection and insight?
Focus: How well does the therapist identify and address the client’s key cognitions or behaviors that need change?
Strategy: How appropriate and coherent is the therapist’s strategy for promoting change in the client’s problematic behaviors or thoughts?

For each of these criterion, you need to assign a rating of 0 to 6 based on how well the therapist response fulfills the definition of the specific criterion.

You final response must not contain any description about any criterion and it must ONLY be in a json serializable format as following {{
    Understanding: understanding_rating,
    Interpersonal Effectiveness: interpersonal_effectiveness_rating,
    Collaboration: collaboration_rating,
    Guided Discovery: guided_discovery_rating,
    Focus: focus_rating,
    Strategy: strategy_rating,
}}

[PATIENT Problem]:
{patient}

[THERAPIST Response]:
{therapist}
'''

Evaluating the Responses of LLaMA

In [None]:
# One-off evaluation of the LLaMA answers: one OpenAI call per entry of
# json_responses, with the raw replies pickled to openai_responses_eval_llama.pkl.
# The code is left commented out; the notebook reads the pickle instead.
# NOTE(review): presumably disabled so the API is not called on every run - confirm.

# openai_responses_eval_llama = []

# for response in tqdm(json_responses):
    
#     patient_problem = response['question']
#     therapist_response = response['llama_answer'] #LLaMA Response
    
#     response_eval = get_openai_response(system_prompt=system_prompt_eval, user_prompt=user_prompt_eval.format(patient=patient_problem, therapist=therapist_response))
    
#     openai_responses_eval_llama.append(response_eval)

# with open('openai_responses_eval_llama.pkl', 'wb') as file:
#     pickle.dump(openai_responses_eval_llama, file)

In [None]:
# Load the cached raw GPT evaluations of the LLaMA answers.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that were produced locally by this notebook.
with open('openai_responses_eval_llama.pkl', 'rb') as file:
    openai_responses_eval_llama = pickle.load(file)

Adding to Json Responses

In [None]:
# Parse GPT's rating of each LLaMA answer and store it under 'llama_answer_eval'.
for index, response in enumerate(openai_responses_eval_llama):
    # Keep only the text inside the first ``` fence when there is one, then drop
    # a leading "json" language tag. Removing every "json" substring instead
    # could also alter the payload itself.
    payload = response.split("```")[1] if "```" in response else response
    payload = payload.strip()
    if payload[:4].lower() == "json":
        payload = payload[4:]
    try:
        json_responses[index]['llama_answer_eval'] = json.loads(payload)
    except json.JSONDecodeError:
        # Print the position of the unparseable reply and leave a None placeholder.
        print(index)
        json_responses[index]['llama_answer_eval'] = None

Evaluating the Responses of GPT

In [None]:
# One-off evaluation of the GPT answers: one OpenAI call per entry of
# json_responses, with the raw replies pickled to openai_responses_eval_gpt.pkl.
# The code is left commented out; the notebook reads the pickle instead.
# NOTE(review): presumably disabled so the API is not called on every run - confirm.

# openai_responses_eval_gpt = []

# for response in tqdm(json_responses):
    
#     patient_problem = response['question']
#     therapist_response = response['answer'] #GPT Response
    
#     response_eval = get_openai_response(system_prompt=system_prompt_eval, user_prompt=user_prompt_eval.format(patient=patient_problem, therapist=therapist_response))
    
#     openai_responses_eval_gpt.append(response_eval)

# with open('openai_responses_eval_gpt.pkl', 'wb') as file:
#     pickle.dump(openai_responses_eval_gpt, file)

In [None]:
# Load the cached raw GPT evaluations of the GPT-written answers.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that were produced locally by this notebook.
with open('openai_responses_eval_gpt.pkl', 'rb') as file:
    openai_responses_eval_gpt = pickle.load(file)

In [None]:
# Parse GPT's rating of each GPT-written answer and store it under 'gpt_answer_eval'.
for index, response in enumerate(openai_responses_eval_gpt):
    # Keep only the text inside the first ``` fence when there is one, then drop
    # a leading "json" language tag. Removing every "json" substring instead
    # could also alter the payload itself.
    payload = response.split("```")[1] if "```" in response else response
    payload = payload.strip()
    if payload[:4].lower() == "json":
        payload = payload[4:]
    try:
        json_responses[index]['gpt_answer_eval'] = json.loads(payload)
    except json.JSONDecodeError:
        # Print the position of the unparseable reply and leave a None placeholder.
        print(index)
        json_responses[index]['gpt_answer_eval'] = None

### Forming the visualizations using the evaluation criteria

In [None]:
# Column suffix -> criterion key used in the parsed evaluation dictionaries.
criterion_keys = {
    'undr': 'Understanding',
    'intr_eff': 'Interpersonal Effectiveness',
    'collab': 'Collaboration',
    'gd_disc': 'Guided Discovery',
    'foc': 'Focus',
    'strat': 'Strategy',
}

topic_data = []
# One list of ratings per "<source>_<suffix>" column, e.g. 'llama_undr'.
ratings_by_column = {
    f'{source}_{suffix}': []
    for source in ('llama', 'gpt')
    for suffix in criterion_keys
}

for response in tqdm(json_responses):
    topic_data.append(response['topic'])

    for source in ('llama', 'gpt'):
        # An evaluation that could not be parsed is stored as None. Record NaN
        # for it (and for any missing criterion) so the row stays aligned with
        # topic_data and is skipped by the mean aggregations instead of raising.
        ratings = response[f'{source}_answer_eval'] or {}
        for suffix, criterion in criterion_keys.items():
            ratings_by_column[f'{source}_{suffix}'].append(ratings.get(criterion, float('nan')))

# Keep the flat variable names that the DataFrame construction cell expects.
llama_undr = ratings_by_column['llama_undr']
llama_intr_eff = ratings_by_column['llama_intr_eff']
llama_collab = ratings_by_column['llama_collab']
llama_gd_disc = ratings_by_column['llama_gd_disc']
llama_foc = ratings_by_column['llama_foc']
llama_strat = ratings_by_column['llama_strat']
gpt_undr = ratings_by_column['gpt_undr']
gpt_intr_eff = ratings_by_column['gpt_intr_eff']
gpt_collab = ratings_by_column['gpt_collab']
gpt_gd_disc = ratings_by_column['gpt_gd_disc']
gpt_foc = ratings_by_column['gpt_foc']
gpt_strat = ratings_by_column['gpt_strat']

In [None]:
# Assemble the per-response ratings into one frame indexed by topic, with the
# LLaMA and GPT columns of each criterion placed side by side.
eval_df = pd.DataFrame(
    data={
        'llama_undr': llama_undr,
        'gpt_undr': gpt_undr,
        'llama_intr_eff': llama_intr_eff,
        'gpt_intr_eff': gpt_intr_eff,
        'llama_collab': llama_collab,
        'gpt_collab': gpt_collab,
        'llama_gd_disc': llama_gd_disc,
        'gpt_gd_disc': gpt_gd_disc,
        'llama_foc': llama_foc,
        'gpt_foc': gpt_foc,
        'llama_strat': llama_strat,
        'gpt_strat': gpt_strat,
    },
    index=topic_data,
)

eval_df.head()

In [None]:
# Mean rating of every column across all responses.
avg_metrics = eval_df.mean()
avg_metrics

In [None]:
# Overall mean rating per criterion, LLaMA (red) next to GPT (blue).
fig, ax = plt.subplots(figsize=(15, 6))

# Derive each bar's colour from its column prefix instead of a fixed-length
# colour list, so the colouring stays correct if columns are added or reordered.
bar_colors = ['r' if column.startswith('llama_') else 'b' for column in avg_metrics.index]
ax.bar(avg_metrics.index.to_list(), avg_metrics.values.tolist(), color=bar_colors)

# Proxy rectangles give the legend exactly one entry per model.
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, color='r'),
    plt.Rectangle((0, 0), 1, 1, color='b'),
]
ax.legend(legend_handles, ['LLaMA (fine-tuned)', 'GPT'])
ax.set_title('Mean rating per criterion: LLaMA vs GPT')
ax.set_xlabel('Model / criterion')
ax.set_ylabel('Mean rating (0-6)')
plt.show()

In [None]:
# Mean rating of every column within each topic (the frame's index holds the topic).
topic_groupby_avg = eval_df.groupby(level=0).mean()
topic_groupby_avg

In [None]:
# Mean "Understanding" rating of the LLaMA answers for each topic.
fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(topic_groupby_avg.index.to_list(), topic_groupby_avg['llama_undr'].values.tolist())
ax.set_title('LLaMA mean Understanding rating by topic')
ax.set_xlabel('Topic')
ax.set_ylabel('Mean rating (0-6)')
# Topic names are long, so tilt the tick labels to keep them readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()