In [1]:
!pip install rouge_score
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Found existing installation: unsloth 2024.12.4
Uninstalling unsloth-2024.12.4:
  Successfully uninstalled unsloth-2024.12.4
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-bfmi4ku_/unsloth_8a02d9be91ad4cffaecde82a39ffb921
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-bfmi4ku_/unsloth_8a02d9be91ad4cffaecde82a39ffb921
  Resolved https://github.com/unslothai/unsloth.git to commit 85f1fa096afde5efe2fb8521d8ceec8d13a00715
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25ldone
[?25h  Created wheel for unsloth: filename=unsloth-2024.12.4-py3-none-any.whl

In [86]:
!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading scipy-1.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (40.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-l

### Imports and Setup

In [87]:
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
import pandas as pd
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Load Fine-Tuned Model

In [3]:

# Load fine-tuned model  
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./unsloth/model/1B_finetuned_llama3.2",
    max_seq_length=5020,
    load_in_4bit=True, 
    dtype=None
)
ft_model = FastLanguageModel.for_inference(ft_model)
    
   

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.61 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


### Clear CUDA cache

In [4]:
import gc
def clear_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

### Define Inference Function

In [15]:
def generate_response(model, tokenizer, text, data_prompt):
    clear_memory()
    
    with torch.no_grad():
        inputs = tokenizer(
            [data_prompt.format(input=text)],
            return_tensors="pt",
            padding=True
        ).to("cuda")
        
        outputs = model.generate(
            **inputs,
            max_new_tokens=5020,  # Keeping the original parameter
            use_cache=True,
            do_sample=True,
            num_return_sequences=1,
            # streamer=TextStreamer(tokenizer)
        )
        
        response = tokenizer.batch_decode(outputs)[0]
        cleaned_response = response.split("Response:")[-1].strip()
        
        # Remove unwanted tokens
        cleaned_response = cleaned_response.replace("<|begin_of_text|>", "").replace("<|end_of_text|>", "").strip()
        
    clear_memory()
    return cleaned_response

In [16]:
data_prompt = """Analyze the mental health aspects in this text:
{input}
Response:"""

In [103]:
text = "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here. I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it. How can I change my feeling of being worthless to everyone?"

response = generate_response(ft_model, ft_tokenizer, text, data_prompt)
print(response)

We don't want to say that we aren't worthy of human lives. It is a common sentiment that you carry, and that there may be other valid reasons why you may not realize that you are worthy of respect and care. People often express that they don't even realize that they think that those they love may feel that they do not deserve them. With self compassion we realize that we have many virtues to feel proud of. I wonder if you can acknowledge all the ways that you are a worthy, wonderful person? Are there any ways that you think you may feel unworthy of being who you are? Can you feel that you don't always think of yourself as worthy? Self Compassion means being fully loving, accepting, confident in all of who you are. When we feel confident knowing we are worthy of others, our behavior is usually more caring and compassionate. Self Compassion is a gift that we all get to give to ourselves if we will let ourselves know it is OK to care about who you are, and to be grateful for all the wonde

In [115]:

dataset = pd.read_json("hf://datasets/Amod/mental_health_counseling_conversations/combined_dataset.json", lines=True)

In [152]:
# Use random sampling to get 100 rows for testing
test_dataset = dataset.sample(n=5, random_state=42)  # setting random_state for reproducibility
test_dataset

Unnamed: 0,Context,Response
495,I've hit my head on walls and floors ever sinc...,The best way to handle anxiety of this level i...
1592,Over a year ago I had a female friend. She tur...,We women really do tend to struggle with the c...
2314,"My long-distance girlfriend is in a sorority, ...",You may already be doing as much as possible f...
1475,Cheating is something unacceptable for me but ...,It is completely understandable that you are s...
2772,I have twin toddlers. I experienced a death of...,"First, let me say that you are a survivor and ..."


In [153]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer

In [155]:
def calculate_f1_score(model, tokenizer, dataset, data_prompt):
    y_true = []
    y_pred = []
    
    for index, row in dataset.iterrows():
        context = row['Context']
        actual_response = row['Response']
        
        generated_response = generate_response(model, tokenizer, context, data_prompt)
        
        y_true.append(actual_response)
        y_pred.append(generated_response)
    
    # Preprocess responses for F1 score calculation
    vectorizer = CountVectorizer().fit(y_true + y_pred)
    y_true_vectors = vectorizer.transform(y_true).toarray()
    y_pred_vectors = vectorizer.transform(y_pred).toarray()
    
    # Calculate F1 score for each word and average them
    f1_scores = []
    for i in range(y_true_vectors.shape[1]):
        f1 = f1_score(y_true_vectors[:, i], y_pred_vectors[:, i], average='weighted', zero_division=1)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

In [156]:

# Calculate F1 score
f1 = calculate_f1_score(ft_model, ft_tokenizer, test_dataset, data_prompt)
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.6708


In [157]:
# Load original model
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=5020,
    load_in_4bit=True, 
    dtype=None
)
base_model = FastLanguageModel.for_inference(base_model)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.61 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [158]:
# Calculate F1 score with original model
f1_base = calculate_f1_score(base_model, base_tokenizer, test_dataset, data_prompt)
print(f"F1 Score with Original Model: {f1_base:.4f}")

F1 Score with Original Model: 0.7042


### K-shot prompt

In [165]:
def create_k_shot_prompt(examples, k, query):
    prompt = ""
    for i in range(k):
        prompt += f"Example {i+1}:\nContext: {examples.iloc[i]['Context']}\nResponse: {examples.iloc[i]['Response']}\n\n"
    prompt += f"Now analyze this:\nContext: {query}\nResponse:"
    return prompt

In [166]:
def generate_k_shot_response(model, tokenizer, text, data_prompt, examples, k):
    clear_memory()
    
    prompt = create_k_shot_prompt(examples, k, text)
    
    with torch.no_grad():
        inputs = tokenizer(
            [prompt],
            return_tensors="pt",
            padding=True
        ).to("cuda")
        
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,  # Adjusted for faster inference
            use_cache=True,
            do_sample=True,
            num_return_sequences=1,
        )
        
        response = tokenizer.batch_decode(outputs)[0]
        cleaned_response = response.split("Response:")[-1].strip()
        
        # Remove unwanted tokens
        cleaned_response = cleaned_response.replace("<|begin_of_text|>", "").replace("<|end_of_text|>", "").strip()
        
    clear_memory()
    return cleaned_response

In [167]:
def calculate_k_shot_f1_score(model, tokenizer, dataset, data_prompt, k):
    y_true = []
    y_pred = []
    
    examples = dataset.head(k)  # Use the first k examples for prompting
    
    for index, row in dataset.iterrows():
        context = row['Context']
        actual_response = row['Response']
        
        generated_response = generate_k_shot_response(model, tokenizer, context, data_prompt, examples, k)
        
        y_true.append(actual_response)
        y_pred.append(generated_response)
    
    # Preprocess responses for F1 score calculation
    vectorizer = CountVectorizer().fit(y_true + y_pred)
    y_true_vectors = vectorizer.transform(y_true).toarray()
    y_pred_vectors = vectorizer.transform(y_pred).toarray()
    
    # Calculate F1 score for each word and average them
    f1_scores = []
    for i in range(y_true_vectors.shape[1]):
        f1 = f1_score(y_true_vectors[:, i], y_pred_vectors[:, i], average='weighted', zero_division=1)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

In [168]:
# Calculate F1 score with k-shot prompting using the fine-tuned model
k = 1  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(ft_model, ft_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Fine-Tuned Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Fine-Tuned Model (K-Shot): 0.7352


In [169]:
# Calculate F1 score with k-shot prompting using original model
k = 1  # Number of examples to include in the prompt
f1_k_shot = calculate_k_shot_f1_score(base_model, base_tokenizer, test_dataset, data_prompt, k)
print(f"F1 Score with Fine-Tuned Model (K-Shot): {f1_k_shot:.4f}")

F1 Score with Fine-Tuned Model (K-Shot): 0.6811
