In [1]:
#imports

import pandas as pd
import numpy as np
import os
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training, PeftModel, PeftConfig
#from trl import SFTConfig, SFTTrainer

2025-11-27 11:35:33.017938: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764243333.040621     190 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764243333.047312     190 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

## Preprocessing

In [2]:
data_path = "/kaggle/input/annomi/"

# Annotation / metadata columns that the fine-tuning pipeline never reads.
unused_columns = [
    'video_title', 'video_url', 'topic', 'timestamp', 'annotator_id',
    'therapist_input_exists', 'therapist_input_subtype',
    'reflection_exists', 'reflection_subtype',
    'question_exists', 'question_subtype',
    'main_therapist_behaviour', 'client_talk_type',
]

# Load -> keep only high-quality MI transcripts -> one row per
# (transcript_id, utterance_id) -> drop the now-constant quality column.
annomi = (
    pd.read_csv(data_path + "AnnoMI-full.csv")
    .drop(columns=unused_columns)
    .query("mi_quality == 'high'")
    .drop_duplicates(subset=['transcript_id', 'utterance_id'])
    .drop(columns=['mi_quality'])
)

# Text clean-up, applied in order: bracketed spans are removed outright,
# hyphens and the mis-encoded dash sequence become a single space.
utterance_text = annomi['utterance_text']
for pattern, replacement in [(r'\[.*?\]', ''), (r'-', ' '), (r'â€”', ' ')]:
    utterance_text = utterance_text.str.replace(pattern, replacement, regex=True)

# Lower-case, trim the ends, then collapse every run of whitespace to one space.
utterance_text = utterance_text.str.lower()
annomi['utterance_text'] = utterance_text.str.strip().replace(r'\s+', ' ', regex=True)

# Trim transcript edges so that no transcript opens with a therapist
# utterance or closes with a client utterance.
ids_per_transcript = annomi.groupby('transcript_id')['utterance_id']
first_utterances = ids_per_transcript.transform('min') == annomi['utterance_id']
last_utterances = ids_per_transcript.transform('max') == annomi['utterance_id']
is_therapist = annomi['interlocutor'] == 'therapist'
is_client = annomi['interlocutor'] == 'client'
annomi = annomi[~(first_utterances & is_therapist) & ~(last_utterances & is_client)]

# Sanity checks on the cleaned frame.
print(annomi.head())
print(annomi.shape)
max_length = annomi['utterance_text'].str.len().max()
print(f"Max sentence length: {max_length} characters")

# Utterance counts per speaker role.
speaker_counts = annomi['interlocutor'].value_counts()
print(speaker_counts)
interviewer_count = speaker_counts['therapist']
interviewed_count = speaker_counts['client']
print(f"Therapist: {interviewer_count}")
print(f"Client: {interviewed_count}")

   transcript_id  utterance_id interlocutor  \
1              0             1       client   
2              0             2    therapist   
3              0             3       client   
4              0             4    therapist   
5              0             5       client   

                                      utterance_text  
1                                              sure.  
2  so, let's see. it looks that you put you drink...  
3                                            mm hmm.  
4  and you usually have three to four drinks when...  
5          usually three drinks and glasses of wine.  
(8693, 4)
Max sentence length: 1193 characters
interlocutor
therapist    4347
client       4346
Name: count, dtype: int64
Therapist: 4347
Client: 4346


In [3]:
#putting examples in instruct format for mistral 7b

def build_instruct_examples(utterance_frame):
    """Turn cleaned utterances into Mistral-instruct training strings.

    Each transcript is walked in ``utterance_id`` order. Consecutive client
    utterances are joined with a single space to form the prompt, and the
    first non-empty therapist utterance that follows becomes the response:
    ``<s>[INST] {client text} [/INST] {therapist text}</s>``.

    Utterances that are empty after cleaning (or are not strings at all) are
    skipped as if they were absent. This avoids training examples with an
    empty response and avoids double spaces inside a prompt.

    Args:
        utterance_frame: DataFrame with the columns ``transcript_id``,
            ``utterance_id``, ``interlocutor`` and ``utterance_text``.

    Returns:
        list[str]: one formatted string per client -> therapist exchange.
    """
    examples = []

    for _, transcript in utterance_frame.groupby('transcript_id'):
        ordered = transcript.sort_values(by='utterance_id')
        pending_client_parts = []

        for interlocutor, text in zip(ordered['interlocutor'], ordered['utterance_text']):
            text = text.strip() if isinstance(text, str) else ""
            if not text:
                # Nothing was actually said; keep any pending client text
                # so it pairs with the next real therapist reply.
                continue

            speaker = interlocutor.lower()
            if speaker == 'client':
                pending_client_parts.append(text)
            elif speaker == 'therapist':
                if pending_client_parts:
                    examples.append(
                        "<s>[INST] "
                        + " ".join(pending_client_parts)
                        + " [/INST] "
                        + text
                        + "</s>"
                    )
                # A therapist turn always closes the current prompt; a second
                # therapist utterance in a row has no prompt and is dropped.
                pending_client_parts = []

    return examples


training_examples = build_instruct_examples(annomi)
assert training_examples, "no client -> therapist pairs were produced from annomi"

annomi_finetune = pd.DataFrame(training_examples, columns=['text'])

print(len(annomi_finetune))
print(annomi_finetune['text'].iloc[0])
print(annomi_finetune.head())
print(annomi_finetune.shape)

dataset = Dataset.from_pandas(annomi_finetune)
print(dataset)

# Spot-check one example from further into the data; clamp the index so the
# cell also runs on a reduced dataset.
sample_index = min(100, len(dataset) - 1)
print(dataset[sample_index]["text"])

4325
<s>[INST] sure. [/INST] so, let's see. it looks that you put you drink alcohol at least four times a week on average</s>
                                                text
0  <s>[INST] sure. [/INST] so, let's see. it look...
1  <s>[INST] mm hmm. [/INST] and you usually have...
2  <s>[INST] usually three drinks and glasses of ...
3  <s>[INST] something like that. [/INST] okay. j...
4  <s>[INST] okay. [/INST] uh, what else can you ...
(4325, 1)
Dataset({
    features: ['text'],
    num_rows: 4325
})
<s>[INST] yeah, no, it's easy. [/INST] yeah. yeah. you know, i've been thinking about, um, thinking about you, and thinking about, um, that first meeting we had, and where you are now. so, wh what do you think some of the changes are that that you've made in this in this, sort of, five five to six weeks that we've been meeting?</s>


## Finetuning

In [4]:
#configurations

#bitsandbytes: load the base model in 4-bit NF4 with double quantisation,
#using bfloat16 as the compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

#lora: rank-16 adapters on the four attention projections only
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

#trainer
trainer_config = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,

    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    warmup_ratio=0.03,
    # The plain "constant" schedule ignores warmup entirely, which made
    # warmup_ratio above a silent no-op. "constant_with_warmup" ramps the
    # learning rate up over the first 3% of steps and then holds it.
    lr_scheduler_type="constant_with_warmup",

    max_grad_norm=0.3,
    weight_decay=0.001,

    # NOTE(review): bf16 needs a GPU with bfloat16 support - confirm for the
    # accelerator this notebook is run on.
    bf16=True,
    save_steps=100,
    logging_steps=20,
    output_dir="/kaggle/working/",
    report_to="none",
    gradient_checkpointing = False,
    label_names=["labels"]
)

In [None]:
# Disabled one-off setup step. The code below is wrapped in a triple-quoted
# string, so nothing in this cell is executed. If enabled, it would fetch the
# tokenizer and the quantised model named by `model_name` and save both to
# "/kaggle/working/mistral-7b-saved". The loading cells read from that
# directory with local_files_only=True, so it must already exist before they
# are run.
'''#downloading model
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.save_pretrained("/kaggle/working/mistral-7b-saved")

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto", trust_remote_code=True, low_cpu_mem_usage=True)
model.save_pretrained("/kaggle/working/mistral-7b-saved")'''

In [None]:
#loading model and tokenizer

tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/mistral-7b-saved", local_files_only=True)

# The padding token must NOT be the EOS token here. The language-modelling
# collator used for training replaces every label equal to pad_token_id with
# -100, so with pad == eos the closing "</s>" of every example is excluded
# from the loss and the model is never trained to stop generating.
# Padding with the unknown token keeps "</s>" as a real training target.
if tokenizer.unk_token is not None:
    tokenizer.pad_token = tokenizer.unk_token
else:
    # No spare token available: fall back to the previous behaviour rather
    # than failing, but make the consequence visible.
    print("warning: tokenizer has no unk_token; padding with eos_token, so EOS labels will be masked during training")
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# "<s>" and "</s>" are already written into every training string, so the
# tokenizer must not add its own copies.
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
tokenizer.model_max_length = 4096

# Base model is loaded from the local snapshot in 4-bit (see quantization_config).
model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/working/mistral-7b-saved",
    quantization_config=quantization_config,
    device_map="auto",
    local_files_only=True,
    low_cpu_mem_usage=True
)

# Prepare the quantised model for training, then attach the LoRA adapters
# described by lora_config.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [None]:
#tokenizing the dataset
def tokenize_function(dialogues):
    """Tokenise a batch of formatted dialogue strings.

    Sequences are truncated to 4096 tokens but deliberately left unpadded.
    Padding inside a batched ``map`` would stretch every row to the longest
    row of its map batch; the data collator below pads each training batch
    to its own longest sequence instead.

    Args:
        dialogues: batch mapping with a ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(dialogues["text"], max_length=4096, truncation=True)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# Causal-LM collator (mlm=False): pads each batch dynamically and builds the
# labels from input_ids.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [None]:
#training the model
# Collect the trainer inputs in one place: the LoRA-wrapped model, the
# tokenised examples, the training arguments and the causal-LM collator.
trainer_inputs = {
    "model": model,
    "args": trainer_config,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_inputs)

trainer.train()

In [None]:
#saving the model
# Write the trained model held by the trainer to the Kaggle working directory.
adapter_output_dir = "/kaggle/working/mistral-7b-annomi-adapter"
trainer.model.save_pretrained(adapter_output_dir)

## Inference

In [None]:
# PeftConfig is already imported in the notebook's import cell, so it is not
# re-imported here. Loading the adapter config is a quick check that the
# uploaded adapter directory is readable before the base model is loaded.
adapter_config = PeftConfig.from_pretrained("/kaggle/input/mistral-7b-annomi-adapter/transformers/default/1")
print('loaded')

In [5]:
#loading model for inference
base_model_dir = "/kaggle/working/mistral-7b-saved"
adapter_dir = "/kaggle/input/mistral-7b-annomi-adapter/transformers/default/1"

# Quantised base model from the local snapshot.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_dir,
    local_files_only=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_dir, local_files_only=True)

# Same tokenizer settings as in the training section; prompts carry their own
# "<s>" marker, so automatic BOS/EOS insertion is switched off.
tokenizer.pad_token = tokenizer.eos_token
for setting_name, setting_value in (
    ("padding_side", "right"),
    ("add_bos_token", False),
    ("add_eos_token", False),
):
    setattr(tokenizer, setting_name, setting_value)

# Attach the fine-tuned LoRA adapter on top of the base model.
model = PeftModel.from_pretrained(base_model, adapter_dir)
print(model)



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

In [6]:
#running prompts
model.eval()

# Generation below samples (do_sample=True). Seeding torch makes the sampled
# responses repeatable across re-runs on the same hardware and library
# versions, instead of changing every time the notebook is executed.
torch.manual_seed(42)

# Text-generation pipeline around the adapter-augmented model.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Single-turn prompt in the same "<s>[INST] ... [/INST]" layout as the
# training examples.
prompt_text = "I am having trouble getting out of bed"
formatted_prompt = f"<s>[INST] {prompt_text} [/INST]"

response = generator(formatted_prompt, max_new_tokens=120, num_return_sequences=1, temperature=0.7, do_sample=True)

print(response)

Device set to use cuda:0


[{'generated_text': "<s>[INST] I am having trouble getting out of bed [/INST] mm hmm. and how long have you been having trouble getting out of bed? i'm curious about that. how long have you been having trouble getting out of bed? is this a new thing? how long have you been having trouble getting out of bed? how long has this issue lasted? what's the history there? and what have you done to get out of bed? what have you done to get yourself out of bed? what's your history with getting out of bed? so that i can understand your situation a little bit better. can you kind of give me a"}]


In [7]:
# Single-turn probe on an alcohol-use statement, wrapped in the instruct template.
prompt_text = "I have been drinking too much recently"
formatted_prompt = f"<s>[INST] {prompt_text} [/INST]"

# Sampling settings for this run; up to 200 new tokens.
sampling_options = {
    "max_new_tokens": 200,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

[{'generated_text': "<s>[INST] I have been drinking too much recently [/INST] uh huh. and how has that been a problem for you? um, um, and can you tell me what you know about drinking during pregnancy? have you picked up any information? have you talked with anyone about this? um, what have you heard? what do you know about that? what's your knowledge base? do you have any ideas about drinking and pregnancy? what do you know? um, and what concerns do you have about that? what's your feeling about that? do you want to continue drinking during pregnancy? what would you like to do? how do you feel about that? how would you like to proceed? yeah, go ahead. go ahead, go ahead. so what is your knowledge base on this? where does your information come from? where do you have any concerns? what do you have any idea about, um, the impact of alcohol during pregnancy? um, and what do you think you would like to do? um, what would"}]


In [8]:
# Single-turn probe on work motivation, wrapped in the instruct template.
prompt_text = "I am finding it quite hard to find motivation to go to work"
formatted_prompt = f"<s>[INST] {prompt_text} [/INST]"

# Short generation: at most 50 new tokens.
sampling_options = {
    "max_new_tokens": 50,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

[{'generated_text': "<s>[INST] I am finding it quite hard to find motivation to go to work [/INST] right. so what do you think is the reason for that? what do you think is the reason you're struggling to find motivation to go to work? what do you think is the reason why you're finding it hard to get up in"}]


In [12]:
# Same "getting out of bed" probe, now with an instruction sentence placed
# in front of the user text inside the [INST] block.
system_prompt = "You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses."
prompt_text = "I am having trouble getting out of bed"

formatted_prompt = f"<s>[INST] {system_prompt} {prompt_text} [/INST]"

sampling_options = {
    "max_new_tokens": 120,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

[{'generated_text': "<s>[INST] You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses. I am having trouble getting out of bed [/INST] it's a bit of a struggle then. you're not enjoying it, you're just not. what's going on? what's getting you out of bed? what motivates you to get out of bed? what's the good part of getting out of bed? what's the bad part of staying in bed? what's going on? what's going on, uh, with your mind? what's going on with your body? what's going on with your life? why do you wanna get out of bed? what's"}]


In [18]:
# Alcohol-use probe with the instruction sentence prepended inside [INST].
system_prompt = "You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses."
prompt_text = "I have been drinking too much recently"

formatted_prompt = f"<s>[INST] {system_prompt} {prompt_text} [/INST]"

sampling_options = {
    "max_new_tokens": 60,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

[{'generated_text': "<s>[INST] You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses. I have been drinking too much recently [/INST] okay. so, i'm hearing that you've been drinking a little bit more than you normally do. and, i'm wondering what brought you here to talk about that. what what's going on that made you come in today? what's making you want to talk about"}]


In [16]:
# Work-motivation probe with the instruction sentence prepended inside [INST].
system_prompt = "You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses."
prompt_text = "I am finding it quite hard to find motivation to go to work"

formatted_prompt = f"<s>[INST] {system_prompt} {prompt_text} [/INST]"

sampling_options = {
    "max_new_tokens": 50,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

[{'generated_text': "<s>[INST] You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses. I am finding it quite hard to find motivation to go to work [/INST] yeah. you're just yeah, you're just thinking of all the things you don't want to do. um, it's definitely can be hard to find the motivation. so what would it take for you to go to work"}]


In [35]:
# Second turn of a conversation. old_prompt holds the first user message and
# the model's earlier reply, pasted in by hand; the new user message is
# appended as a fresh [INST] block after the closing "</s>".
old_prompt = "I am finding it quite hard to find motivation to go to work [/INST] yeah. you're just yeah, you're just thinking of all the things you don't want to do. um, it's definitely can be hard to find the motivation. so what would it take for you to go to work"
system_prompt = "You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses."
prompt_text = "I think I would need to be treated with more respect and be paid better honestly"

formatted_prompt = f"<s>[INST] {system_prompt}\n\n{old_prompt} </s>[INST] {prompt_text} [/INST]"
# Show the assembled prompt so the turn structure can be inspected.
print(formatted_prompt)

sampling_options = {
    "max_new_tokens": 70,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

<s>[INST] You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses.

I am finding it quite hard to find motivation to go to work [/INST] yeah. you're just yeah, you're just thinking of all the things you don't want to do. um, it's definitely can be hard to find the motivation. so what would it take for you to go to work </s>[INST] I think I would need to be treated with more respect and be paid better honestly [/INST]
[{'generated_text': "<s>[INST] You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses.\n\nI am finding it quite hard to find motivation to go to work [/INST] yeah. you're just yeah, you're just thinking of all the things you don't want to do. um, it's definitely can be hard to find the motivation. so what would it take for you to go to work </s>[INST] I think I would need to be treated wit

In [38]:
# Third turn of the same conversation. old_prompt now carries both earlier
# exchanges, pasted in by hand, and the new user message is appended as a
# fresh [INST] block.
old_prompt = "I am finding it quite hard to find motivation to go to work [/INST] yeah. you're just yeah, you're just thinking of all the things you don't want to do. um, it's definitely can be hard to find the motivation. so what would it take for you to go to work </s>[INST] I think I would need to be treated with more respect and be paid better honestly [/INST] okay. so what would be the first step for you to get to work? what would need to happen? and what would you need to do in order to get that to happen? what would need to happen? um, what would be the first step? what would be the second step? how would we get you on a path to being able"
system_prompt = "You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses."
prompt_text = "Actually I am thinking of just quitting. I am very tired of everything"

formatted_prompt = f"<s>[INST] {system_prompt}\n\n{old_prompt} </s>[INST] {prompt_text} [/INST]"

sampling_options = {
    "max_new_tokens": 100,
    "num_return_sequences": 1,
    "temperature": 0.7,
    "do_sample": True,
}
response = generator(formatted_prompt, **sampling_options)

print(response)

[{'generated_text': "<s>[INST] You are a helpful and empathetic mental health assistant that is a specialist in motivational interviewing. Provide supportive, thoughtful responses.\n\nI am finding it quite hard to find motivation to go to work [/INST] yeah. you're just yeah, you're just thinking of all the things you don't want to do. um, it's definitely can be hard to find the motivation. so what would it take for you to go to work </s>[INST] I think I would need to be treated with more respect and be paid better honestly [/INST] okay. so what would be the first step for you to get to work? what would need to happen? and what would you need to do in order to get that to happen? what would need to happen? um, what would be the first step? what would be the second step? how would we get you on a path to being able </s>[INST] Actually I am thinking of just quitting. I am very tired of everything [/INST] okay. so you're considering quitting your job. what would that look like to you? what