#### install necessary package

In [1]:
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install scikit-learn

Found existing installation: unsloth 2024.12.4
Uninstalling unsloth-2024.12.4:
  Successfully uninstalled unsloth-2024.12.4
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-kjbzjhpt/unsloth_15ef7430fc9c4ab3bd391f5c9a0c2b34
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-kjbzjhpt/unsloth_15ef7430fc9c4ab3bd391f5c9a0c2b34
  Resolved https://github.com/unslothai/unsloth.git to commit 85f1fa096afde5efe2fb8521d8ceec8d13a00715
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25ldone
[?25h  Created wheel for unsloth: filename=unsloth-2024.12.4-py3-none-any.whl

### Imports and Setup

In [2]:
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
import pandas as pd
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


### Load Fine-Tuned Model

In [3]:

# Load fine-tuned model  
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./ft_model/1B_finetuned_llama3.2",
    max_seq_length=5020,
    load_in_4bit=True, 
    dtype=None
)
ft_model = FastLanguageModel.for_inference(ft_model)
    
   

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.61 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Clear CUDA cache

In [4]:
import gc
def clear_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

### Define Inference Function

In [5]:
def generate_response(model, tokenizer, text, data_prompt):
    clear_memory()
    
    with torch.no_grad():
        inputs = tokenizer(
            [data_prompt.format(input=text)],
            return_tensors="pt",
            padding=True
        ).to("cuda")
        
        outputs = model.generate(
            **inputs,
            max_new_tokens=5020,  # Keeping the original parameter
            use_cache=True,
            do_sample=True,
            num_return_sequences=1,
            # streamer=TextStreamer(tokenizer)
        )
        
        response = tokenizer.batch_decode(outputs)[0]
        cleaned_response = response.split("Response:")[-1].strip()
        
        # Remove unwanted tokens
        cleaned_response = cleaned_response.replace("<|begin_of_text|>", "").replace("<|end_of_text|>", "").strip()
        
    clear_memory()
    return cleaned_response

In [6]:
data_prompt = """Analyze the mental health aspects in this text:
{input}
Response:"""

In [9]:
text = "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone?"

response = generate_response(ft_model, ft_tokenizer, text, data_prompt)
print(response)

I can't say for sure but based on your description of how you've felt all your life and have experienced as a young adult and still experience today what is new is feeling like you've never tried or contemplated suicide. Since suicide is a serious decision and because it's illegal, I'm wondering if talking about your feelings and what's going on right now would be helpful? Maybe an appropriate counselor can point you in the right direction.


In [115]:

dataset = pd.read_json("hf://datasets/Amod/mental_health_counseling_conversations/combined_dataset.json", lines=True)

In [16]:
# Load dataset from CSV file
dataset = pd.read_csv("./dataset/test_mental_health.csv")

# Select only Context and Response columns
test_dataset = dataset[['Context', 'Response']]

# Display first few rows to verify
print("Dataset shape:", test_dataset.shape)
print("\nFirst few rows:")
print(test_dataset.head())

Dataset shape: (696, 2)

First few rows:
                                             Context  \
0  Any time my family and friends are in an alter...   
1  I've only been married three months. Every wee...   
2  I was in a car accident and totaled my car. I ...   
3  My dad is doing some really bad drugs, and I'm...   
4  My daughter is in later elementary school. She...   

                                            Response  
0  Breaking the patterns of relating to family me...  
1  At the begin g of marriages, we expect to be l...  
2  When we feel overwhelmed by life situations, i...  
3  It seems like you are going trough stages of g...  
4  Depending on your daughter’s age, this could b...  


In [24]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

In [25]:
def calculate_f1_score(model, tokenizer, dataset, data_prompt):
    y_true = []
    y_pred = []
    
    for index, row in dataset.iterrows():
        context = row['Context']
        actual_response = row['Response']
        
        generated_response = generate_response(model, tokenizer, context, data_prompt)
        
        y_true.append(actual_response)
        y_pred.append(generated_response)
    
    # Preprocess responses for F1 score calculation
    vectorizer = CountVectorizer().fit(y_true + y_pred)
    y_true_vectors = vectorizer.transform(y_true).toarray()
    y_pred_vectors = vectorizer.transform(y_pred).toarray()
    
    # Calculate F1 score for each word and average them
    f1_scores = []
    for i in range(y_true_vectors.shape[1]):
        f1 = f1_score(y_true_vectors[:, i], y_pred_vectors[:, i], average='weighted', zero_division=1)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

In [47]:

# Calculate F1 score
f1 = calculate_f1_score(ft_model, ft_tokenizer, test_dataset, data_prompt)
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.6863


In [29]:
# Load original model
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=5020,
    load_in_4bit=True, 
    dtype=None
)
base_model = FastLanguageModel.for_inference(base_model)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.61 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [30]:
# Calculate F1 score with original model
f1_base = calculate_f1_score(base_model, base_tokenizer, test_dataset, data_prompt)
print(f"F1 Score with Original Model: {f1_base:.4f}")

F1 Score with Original Model: 0.6994


### K-shot prompt

In [31]:
def create_k_shot_prompt(examples, k, query):
    prompt = ""
    for i in range(k):
        prompt += f"Example {i+1}:\nContext: {examples.iloc[i]['Context']}\nResponse: {examples.iloc[i]['Response']}\n\n"
    prompt += f"Now analyze this:\nContext: {query}\nResponse:"
    return prompt

In [32]:
def generate_k_shot_response(model, tokenizer, text, data_prompt, examples, k):
    clear_memory()
    
    prompt = create_k_shot_prompt(examples, k, text)
    
    with torch.no_grad():
        inputs = tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True
        ).to("cuda")
        
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,  # Adjusted for faster inference
            use_cache=True,
            do_sample=True,
            num_return_sequences=1,
        )
        
        response = tokenizer.batch_decode(outputs)[0]
        cleaned_response = response.split("Response:")[-1].strip()
        
        # Remove unwanted tokens
        cleaned_response = cleaned_response.replace("<|begin_of_text|>", "").replace("<|end_of_text|>", "").strip()
        
    clear_memory()
    return cleaned_response

In [33]:
def calculate_k_shot_f1_score(model, tokenizer, dataset, data_prompt, k):
    y_true = []
    y_pred = []
    
    examples = dataset.head(k)  # Use the first k examples for prompting
    
    for index, row in dataset.iterrows():
        context = row['Context']
        actual_response = row['Response']
        
        generated_response = generate_k_shot_response(model, tokenizer, context, data_prompt, examples, k)
        
        y_true.append(actual_response)
        y_pred.append(generated_response)
    
    # Preprocess responses for F1 score calculation
    vectorizer = CountVectorizer().fit(y_true + y_pred)
    y_true_vectors = vectorizer.transform(y_true).toarray()
    y_pred_vectors = vectorizer.transform(y_pred).toarray()
    
    # Calculate F1 score for each word and average them
    f1_scores = []
    for i in range(y_true_vectors.shape[1]):
        f1 = f1_score(y_true_vectors[:, i], y_pred_vectors[:, i], average='weighted', zero_division=1)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

In [34]:
# Calculate F1 score with k-shot prompting using the fine-tuned model
k = 1  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(ft_model, ft_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Fine-Tuned Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Fine-Tuned Model (K-Shot): 0.6834


In [37]:
k = 2  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(ft_model, ft_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Fine-Tuned Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Fine-Tuned Model (K-Shot): 0.7031


In [40]:
k = 3  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(ft_model, ft_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Fine-Tuned Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Fine-Tuned Model (K-Shot): 0.7117


In [48]:
# Calculate F1 score with k-shot prompting using original model
k = 1  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(base_model, base_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Base Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Base Model (K-Shot): 0.6657


In [51]:
# Calculate F1 score with k-shot prompting using original model
k = 2  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(base_model, base_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Base Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Base Model (K-Shot): 0.6845


In [55]:
# Calculate F1 score with k-shot prompting using original model
k = 3  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(base_model, base_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Base Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Base Model (K-Shot): 0.6799


### Benchmarking GPT3.5 Turbo

In [57]:
!pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openai
  Downloading openai-1.58.1-py3-none-any.whl.metadata (27 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.7.0-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.4-py3-none-any.whl.metadata (29 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Using cached h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Co

In [83]:
from openai import OpenAI
from dotenv import load_dotenv
import os
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)
import time

In [79]:
def generate_gpt_response(text, data_prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Analyze mental health aspects in the given text."},
                {"role": "user", "content": data_prompt.format(input=text)}
            ],
            max_tokens=512,
            temperature=0.7
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating response: {e}")
        return ""

In [80]:
def calculate_gpt_f1_score(dataset, data_prompt):
    y_true = []
    y_pred = []
    
    for index, row in dataset.iterrows():
        context = row['Context']
        actual_response = row['Response']
        generated_response = generate_gpt_response(context, data_prompt)
        
        y_true.append(actual_response)
        y_pred.append(generated_response)
    
    vectorizer = CountVectorizer().fit(y_true + y_pred)
    y_true_vectors = vectorizer.transform(y_true).toarray()
    y_pred_vectors = vectorizer.transform(y_pred).toarray()
    
    f1_scores = []
    for i in range(y_true_vectors.shape[1]):
        f1 = f1_score(y_true_vectors[:, i], y_pred_vectors[:, i], average='weighted', zero_division=1)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

In [81]:
# Calculate GPT-3.5 F1 Score
gpt_f1 = calculate_gpt_f1_score(test_dataset, data_prompt)
print(f"GPT-3.5 Turbo F1 Score: {gpt_f1:.4f}")

GPT-3.5 Turbo F1 Score: 0.6691


### GPT 3.5 K-shot

In [84]:
def generate_gpt_k_shot_response(text, examples, k):
    try:
        k_shot_prompt = create_k_shot_prompt(examples, k, text)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a mental health expert analyzing text."},
                {"role": "user", "content": k_shot_prompt}
            ],
            max_tokens=512,
            temperature=0.7
        )
        time.sleep(1)
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error: {e}")
        return ""

In [85]:
def calculate_gpt_k_shot_f1(dataset, k):
    y_true = []
    y_pred = []
    examples = dataset.head(k)  # Get k examples for prompting
    
    for index, row in dataset.iterrows():
        generated_response = generate_gpt_k_shot_response(row['Context'], examples, k)
        y_true.append(row['Response'])
        y_pred.append(generated_response)
    
    vectorizer = CountVectorizer().fit(y_true + y_pred)
    y_true_vectors = vectorizer.transform(y_true).toarray()
    y_pred_vectors = vectorizer.transform(y_pred).toarray()
    
    f1_scores = []
    for i in range(y_true_vectors.shape[1]):
        f1 = f1_score(y_true_vectors[:, i], y_pred_vectors[:, i], average='weighted', zero_division=1)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

In [86]:
# Calculate k-shot F1 score
k = 1  # Number of examples
gpt_k_shot_f1 = calculate_gpt_k_shot_f1(test_dataset, k)
print(f"GPT-3.5 Turbo {k}-Shot F1 Score: {gpt_k_shot_f1:.4f}")

GPT-3.5 Turbo 1-Shot F1 Score: 0.6740


In [87]:
# Calculate k-shot F1 score
k = 2  # Number of examples
gpt_k_shot_f1 = calculate_gpt_k_shot_f1(test_dataset, k)
print(f"GPT-3.5 Turbo {k}-Shot F1 Score: {gpt_k_shot_f1:.4f}")

GPT-3.5 Turbo 2-Shot F1 Score: 0.6799


In [88]:
# Calculate k-shot F1 score
k = 3  # Number of examples
gpt_k_shot_f1 = calculate_gpt_k_shot_f1(test_dataset, k)
print(f"GPT-3.5 Turbo {k}-Shot F1 Score: {gpt_k_shot_f1:.4f}")

GPT-3.5 Turbo 3-Shot F1 Score: 0.6765
