In [347]:
import os

import pandas as pd
import torch
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

from config import settings

In [348]:
torch.cuda.empty_cache()

In [349]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## 1. Data

In [350]:
## load dataset
dataset = load_dataset("Amod/mental_health_counseling_conversations")
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [351]:
# watch the dataset
df = pd.DataFrame(dataset["train"])
df.head()

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [352]:
# process the dataset
def format_row(row):
    question = row["Context"]
    answer = row["Response"]
    return f"[INST]Q: {question}[/INST] \nA: {answer}\n"


df["text"] = df.apply(format_row, axis=1)

In [353]:
# split data to train and eval
train_size = int(0.8 * len(df))
df = df.sample(frac=1)
df_train = df[:train_size].reset_index(drop=True)
df_valid = df[train_size:].reset_index(drop=True)

In [354]:
# create train and valid file paths
train_path = os.path.join(settings.DATA_DIR, "mental_health_counseling", "train.csv")
valid_path = os.path.join(settings.DATA_DIR, "mental_health_counseling", "valid.csv")
# Create directory if it doesn't exist
directory = os.path.dirname(train_path)
if not os.path.exists(directory):
    os.makedirs(directory)

In [355]:
# save the dataset to local
df_train[["text"]].to_csv(train_path, index=False)
df_valid[["text"]].to_csv(valid_path, index=False)

In [356]:
# load the dataset
train_dataset = load_dataset("csv", data_files=train_path, split="train")
train_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 2809
})

In [357]:
valid_dataset = load_dataset("csv", data_files=valid_path, split="train")
valid_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 703
})

## Model

In [358]:
# set bnb config for 4bit training
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    # bnb_4bit_compute_dtype=torch.float16,
)

In [360]:
# load model
base_model = "microsoft/phi-2"
new_model = "phi-2-mental-health-counseling"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map="cuda:0",
)
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [361]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # pad to the right of the input

In [362]:
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): 

In [363]:
print(model.get_memory_footprint() / 1e6)

2310.697024


In [364]:
# setting up lora config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [365]:
# get peft model
model = model.to(device)
# peft_model = get_peft_model(model, peft_config=peft_config, low_cpu_mem_usage=True)
peft_model = get_peft_model(model, peft_config=peft_config, low_cpu_mem_usage=False)

In [366]:
print(peft_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

In [367]:
print(peft_model.get_memory_footprint() / 1e6)

2342.154304


In [368]:
# config SFT training parameters
sft_args = SFTConfig(
    max_seq_length=128,
    output_dir=settings.MODEL_DIR,
    # memory optimize
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # gradient_checkpointing=False,  # Disable gradient checkpointing
    gradient_accumulation_steps=1,
    # training config
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    auto_find_batch_size=True,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    eval_steps=10,
    save_total_limit=1,
    save_steps=10,
    # logging
    logging_dir=settings.LOG_DIR,
    logging_steps=10,
    report_to="none",
)

In [369]:
trainer = SFTTrainer(
    model=peft_model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    args=sft_args,
)

Map:   0%|          | 0/2809 [00:00<?, ? examples/s]

Map:   0%|          | 0/703 [00:00<?, ? examples/s]

In [370]:
trainer.train()

Step,Training Loss
10,2.8466
20,2.7829
30,2.7685
40,2.8007
50,2.501
60,2.355
70,2.3044
80,2.2548
90,2.2925
100,2.3155


TrainOutput(global_step=1406, training_loss=2.1099391926403235, metrics={'train_runtime': 1296.2812, 'train_samples_per_second': 4.334, 'train_steps_per_second': 1.085, 'total_flos': 1.146169579536384e+16, 'train_loss': 2.1099391926403235, 'epoch': 2.0})