In [None]:
import os

import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

In [32]:
# Fetch the counseling Q&A corpus by its dataset id; the result is a
# DatasetDict, displayed below to show its splits and columns.
dataset = load_dataset(path="Amod/mental_health_counseling_conversations")
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [33]:
# Materialise the "train" split as a pandas DataFrame for row-wise formatting.
df = pd.DataFrame(data=dataset["train"])

In [34]:
# Preview the first five Context/Response pairs.
df.head(n=5)

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [None]:
def format_row(row):
    """Render one Context/Response record as a single training string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            "Context" and "Response" keys.

    Returns:
        The question wrapped in [INST]...[/INST] markers, followed by the
        answer on the next line, with a trailing newline.
    """
    template = "[INST]Q: {question}[/INST] \nA: {answer}\n"
    return template.format(question=row["Context"], answer=row["Response"])

In [36]:
# Build the training text for every record by applying format_row row-wise.
df["Text"] = df.apply(format_row, axis="columns")

In [37]:
# Spot-check the first five formatted strings.
df["Text"].head(n=5)

0    [INST] Q: I'm going through some things with m...
1    [INST] Q: I'm going through some things with m...
2    [INST] Q: I'm going through some things with m...
3    [INST] Q: I'm going through some things with m...
4    [INST] Q: I'm going through some things with m...
Name: Text, dtype: object

In [38]:
# Keep only the formatted "Text" column and persist it as CSV so it can be
# reloaded from disk as the training set.
formatted_csv_path = "../data/mental_health_counseling_conversations_formatted.csv"

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the target directory exists before writing (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(formatted_csv_path), exist_ok=True)

df_formatted = df[["Text"]]
df_formatted.to_csv(formatted_csv_path, index=False)
df_formatted.head()

Unnamed: 0,Text
0,[INST] Q: I'm going through some things with m...
1,[INST] Q: I'm going through some things with m...
2,[INST] Q: I'm going through some things with m...
3,[INST] Q: I'm going through some things with m...
4,[INST] Q: I'm going through some things with m...


In [None]:
# Reload the formatted CSV with the "csv" loader; asking for split="train"
# yields a single Dataset, displayed below.
train_dataset = load_dataset(
    path="csv",
    split="train",
    data_files="../data/mental_health_counseling_conversations_formatted.csv",
)
train_dataset

Dataset({
    features: ['Text'],
    num_rows: 3512
})

In [41]:
## Fine-tuning code starts here

In [None]:
# base_model: id of the checkpoint loaded by the tokeniser and model cells.
# new_model: name used for the output directory of the fine-tuned model.
base_model, new_model = "microsoft/phi-2", "phi-2-mental-health-counseling"

In [None]:
## Tokeniser
# Load the fast tokeniser that matches the base checkpoint.
tokeniser = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokeniser.padding_side = "right"  # pad to the right of the input
tokeniser.pad_token = tokeniser.eos_token  # reuse the EOS token for padding

In [None]:
## bnb configuration
# 4-bit NF4 weight quantisation with float16 compute; double (nested)
# quantisation is explicitly switched off.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
## model
# Load the base checkpoint with the 4-bit quantisation config, then prepare
# it for k-bit training with gradient checkpointing enabled.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    # NOTE(review): the flash_* flags are presumably options of the phi-2
    # remote-code config; confirm the pinned checkpoint revision accepts them.
    flash_attn=True,
    flash_rotary=True,
    flash_dense=True,
    low_cpu_mem_usage=True,
    # device_map must be a dict (or a string such as "auto"), not a set.
    # The empty module name "" maps the whole model onto device 0.
    device_map={"": 0},
    use_safetensors=True,
)

# Disable the generation KV cache for training.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a setting from other model configs;
# confirm it has any effect for phi-2.
model.config.pretraining_tp = 1

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [None]:
# Training hyper-parameters. Checkpoints and outputs go under
# ../models/<new_model>.
train_args = TrainingArguments(
    output_dir=f"../models/{new_model}",
    overwrite_output_dir=True,
    num_train_epochs=1,  # 2
    per_device_train_batch_size=1,  # 2
    gradient_accumulation_steps=8,  # effective batch of 8 sequences per device
    # No evaluation dataset is passed to the trainer, so periodic evaluation
    # is disabled; set this back to "steps" (with eval_steps) once an
    # eval_dataset is supplied.
    eval_strategy="no",
    logging_steps=10,
    optim="paged_adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    # 0.05 is a fraction of total training steps, so it belongs in
    # warmup_ratio; warmup_steps expects an absolute step count.
    warmup_ratio=0.05,
    weight_decay=0.01,
    save_steps=1000,
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    # save_total_limit=1,
    # logging_dir="../logs",
    # report_to="none",
)

# LoRA adapter configuration: rank 32, scaling alpha 64, 5% adapter dropout.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    # The LoraConfig field is `bias`; `bias_type` is not an accepted keyword.
    bias="none",
    # PEFT task types are the upper-case enum values.
    task_type="CAUSAL_LM",
    # NOTE(review): these module names must exist in the loaded phi-2
    # implementation; verify against model.named_modules().
    target_modules=["Wqkv", "fc1", "fc2"],
)

In [None]:
# Assemble the supervised fine-tuning trainer from the quantised model, the
# LoRA config, the training arguments, and the formatted "Text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokeniser,
    args=train_args,
    peft_config=peft_config,
    train_dataset=train_dataset,
    dataset_text_field="Text",
    max_seq_length=512,
)

In [42]:
# Train the model; this will take a while. Requires the cell that builds
# `trainer` to have been run first in the current kernel.
trainer.train()

NameError: name 'trainer' is not defined