In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

from datasets import Dataset
import pandas as pd
from peft import LoraConfig, get_peft_model, TaskType
from peft import PeftModel, PeftConfig

In [6]:
# df = pd.read_csv("../data/raw/counsel_chat.csv")
df = pd.read_csv("hf://datasets/nbertagnolli/counsel-chat/20220401_counsel_chat.csv")

In [3]:
df.sample(5)

Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views
1948,685,Where does a child go for help?,I'm having issues with my relative. The police...,https://counselchat.com/questions/where-does-a...,family-conflict,Danielle AlvarezLicensed Professional Counselor,https://counselchat.com/therapists/danielle-al...,I think it would be wise for you to call a hot...,1,135
284,35,Do I have anxiety?,I stress over everything. If I don't have enou...,https://counselchat.com/questions/do-i-have-an...,depression,"Kaileen McMickle, MS, LPCLicensed Professional...",https://counselchat.com/therapists/kaileen-mcm...,Feeling neglected in a romantic relationship c...,0,94
1387,430,How can I get therapy for posttraumatic stress...,,https://counselchat.com/questions/how-can-i-ge...,trauma,"Candice Conroy, LMHCFind relief from anxiety, ...",https://counselchat.com/therapists/candice-con...,"Depending on where you are located, you may wa...",1,164
2087,728,My boyfriend is upset about my friendship with...,I have a friend that who I used to be in a rel...,https://counselchat.com/questions/my-boyfriend...,relationships,Anna McElearneyHelping Couples Build Stronger ...,https://counselchat.com/therapists/anna-mcelea...,Thank you for submitting this question. I thin...,0,277
96,4,How can I help my girlfriend?,My girlfriend just quit drinking and she becam...,https://counselchat.com/questions/how-can-i-he...,depression,"Kristi King-Morgan, LMSWSocial Worker, Psychot...",https://counselchat.com/therapists/kristi-king...,You're probably not going to like my answer.Yo...,3,824


In [4]:
df=df[["questionText", "topic"]]

In [5]:
df=df.drop_duplicates().dropna()

In [6]:
df=df.rename(columns={"questionText": "question", "topic": "answer"})

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 865 entries, 0 to 2769
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  865 non-null    object
 1   answer    865 non-null    object
dtypes: object(2)
memory usage: 20.3+ KB


In [8]:
df["answer"].value_counts()

answer
depression                  137
intimacy                    108
relationships               104
anxiety                     100
family-conflict              60
parenting                    54
self-esteem                  42
relationship-dissolution     33
behavioral-change            31
anger-management             26
trauma                       24
marriage                     20
domestic-violence            16
lgbtq                        15
social-relationships         12
workplace-relationships      11
substance-abuse              10
grief-and-loss                9
counseling-fundamentals       7
spirituality                  7
professional-ethics           6
legal-regulatory              6
eating-disorders              5
sleep-improvement             5
addiction                     4
human-sexuality               4
stress                        3
diagnosis                     3
children-adolescents          2
military-issues               1
Name: count, dtype: int64

In [9]:
df.head()

Unnamed: 0,question,answer
0,I have so many issues to address. I have a his...,depression
86,I have been diagnosed with general anxiety and...,depression
91,My mother is combative with me when I say I do...,depression
93,There are many people willing to lovingly prov...,depression
96,My girlfriend just quit drinking and she becam...,depression


In [10]:
from transformers import AutoTokenizer
import pandas as pd

def preprocess_qwen_chatml(df, tokenizer_name, max_length=512):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    processed = []

    for _, row in df.iterrows():
        instruction = row["question"]
        response = row["answer"]

        chat_prompt = (
            "<|im_start|>system\n"
            "You are a mental health assistant. Based on the user's description, respond with a single sentence indicating the most relevant diagnosis from the mental health domain.<|im_end|>\n"
            f"<|im_start|>user\n{instruction}<|im_end|>\n"
            f"<|im_start|>assistant\nBased on what you've described, this sounds like {response}.<|im_end|>"
        )

        # Tokenize full prompt
        tokenized = tokenizer(chat_prompt, truncation=True, padding="max_length", max_length=max_length)

        # Mask everything before assistant's response in labels
        assistant_start = chat_prompt.find("<|im_start|>assistant")
        response_start = tokenizer(chat_prompt[:assistant_start], truncation=True, max_length=max_length, padding="max_length")["input_ids"]
        labels = tokenized["input_ids"].copy()
        labels[:len(response_start)] = [-100] * len(response_start)

        processed.append({
            "chat_prompt": chat_prompt,
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": labels
        })

    return processed


In [11]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [12]:
device = "mps"
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-0.5B-Chat",
    torch_dtype="auto",
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", use_fast=True, trust_remote_code=True)

In [15]:
chatbot = pipeline("text-generation", model=
                   model, tokenizer=tokenizer)

Device set to use mps


In [16]:
issue = """<|im_start|>system\n"
"You are a mental health assistant. Based on the user's description, respond with a single sentence indicating the most relevant diagnosis from the mental health domain.<|im_end|>\n"
<|im_start|>user
I am broke and I want to kill myself<|im_end|>
<|im_start|>assistant
"""
response = chatbot(issue, max_new_tokens=100, do_sample=True, temperature=0.7)
generated = response[0]['generated_text']
assistant_start = generated.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
reply = generated[assistant_start:].strip().split("<|im_end|>")[0].strip()
print("Assistant:", reply)

Assistant: The most relevant diagnosis from the mental health domain is suicide.


In [None]:

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [59]:
hf_dataset = Dataset.from_pandas(df[["question", "answer"]])

In [60]:
tokenized_dataset = hf_dataset.map(lambda ex: preprocess_qwen_chatml(pd.DataFrame([ex]), "Qwen/Qwen1.5-0.5B-Chat")[0])

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

In [44]:
tokenized_dataset[0]

{'question': 'I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\n   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?',
 'answer': 'depression',
 '__index_level_0__': 0,
 'chat_prompt': "You are a compassionate therapist. Given a patient's description, respond with empathy and insight based on their concerns.<|im_end|>\n<|im_start|>user\nI have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\n   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?<|im_end|>\

In [61]:
# save the processed dataset
tokenized_dataset.save_to_disk("../data/processed/qwen_chatml_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/865 [00:00<?, ? examples/s]

In [3]:
# load the processed dataset
tokenized_dataset = Dataset.load_from_disk("../data/processed/qwen_chatml_dataset")

In [None]:
training_args = TrainingArguments(
    output_dir="./finetuned-model",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=500,
    logging_steps=100,
    use_mps_device=True,
    label_names=["labels"],  # Explicitly specify label names for PEFT models
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
100,3.2654
200,1.8224
300,1.6515
400,1.6103
500,1.5845
600,1.5205


TrainOutput(global_step=651, training_loss=1.8837883710494971, metrics={'train_runtime': 352.4117, 'train_samples_per_second': 7.364, 'train_steps_per_second': 1.847, 'total_flos': 2464826602291200.0, 'train_loss': 1.8837883710494971, 'epoch': 3.0})

In [None]:
peft_model_id = "./finetuned-model/checkpoint-651"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, return_dict=True)

# Load adapter into base model
model = PeftModel.from_pretrained(base_model, peft_model_id)

# Merge LoRA weights into base model
merged_model = model.merge_and_unload()

merged_model.save_pretrained("./finetuned-model/merged")

tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
tokenizer.save_pretrained("./finetuned-model/merged")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
tokenizer.save_pretrained("./finetuned-model/merged")

('./finetuned-model/merged/tokenizer_config.json',
 './finetuned-model/merged/special_tokens_map.json',
 './finetuned-model/merged/chat_template.jinja',
 './finetuned-model/merged/vocab.json',
 './finetuned-model/merged/merges.txt',
 './finetuned-model/merged/added_tokens.json',
 './finetuned-model/merged/tokenizer.json')

In [11]:
tuned_model = AutoModelForCausalLM.from_pretrained("./finetuned-model/merged", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("./finetuned-model/merged", use_fast=True, trust_remote_code=True)

chatbot = pipeline("text-generation", model=tuned_model, tokenizer=tokenizer, device=0)

Device set to use mps:0


In [None]:

issue = """<|im_start|>system\n"
"You are a mental health assistant. Based on the user's description, respond with a single sentence indicating the most relevant diagnosis from the mental health domain.<|im_end|>\n"
<|im_start|>user
I am broke and I want to kill myself<|im_end|>
<|im_start|>assistant
"""
response = chatbot(issue, max_new_tokens=100, do_sample=True, temperature=0.7)
generated = response[0]['generated_text']
assistant_start = generated.find("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")
reply = generated[assistant_start:].strip().split("<|im_end|>")[0].strip()
print("Assistant:", reply)


Assistant: Based on what you've described, this sounds like depression.


In [1]:
response

NameError: name 'response' is not defined

In [13]:
import os
from pathlib import Path


checkpoints = sorted(Path("./finetuned-model").glob(
    "checkpoint-*"), key=os.path.getmtime)

In [14]:
checkpoints

[PosixPath('finetuned-model/checkpoint-500'),
 PosixPath('finetuned-model/checkpoint-651')]

In [15]:
str(checkpoints[-1]) if checkpoints else None


'finetuned-model/checkpoint-651'