In [None]:
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

## 1. Data

In [3]:
# Fetch the counseling conversations dataset by its repository id.
# The result is a DatasetDict; the bare name below displays its splits.
dataset = load_dataset(path="Amod/mental_health_counseling_conversations")
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [4]:
# Preview the data: turn the "train" split into a pandas DataFrame
# and display its first five rows.
df = pd.DataFrame(data=dataset["train"])
df.head(n=5)

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [5]:
# process the dataset
def format_row(row):
    """Render one question/answer record as a single prompt string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            "Context" (question) and "Response" (answer) keys.

    Returns:
        str: The question wrapped in [INST]...[/INST] tags, followed by the
        answer on its own "A:" line, with a trailing newline.
    """
    template = "[INST]Q: {question}[/INST] \nA: {answer}\n"
    return template.format(question=row["Context"], answer=row["Response"])


# Build the "text" column by formatting every row (row-wise apply).
df["text"] = df.apply(func=format_row, axis="columns")

In [6]:
# split data into train (80%) and eval (20%)
# A fixed seed makes the shuffle, and therefore the split, reproducible
# across kernel restarts.
SPLIT_SEED = 42
train_size = int(0.8 * len(df))
# Shuffle into a new frame instead of rebinding `df`: re-running this cell
# then always starts from the same row order and yields the same split.
df_shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
# The first `train_size` shuffled rows are for training, the rest for eval;
# both get a fresh 0..n-1 index.
df_train = df_shuffled[:train_size].reset_index(drop=True)
df_valid = df_shuffled[train_size:].reset_index(drop=True)

In [7]:
# save the train/eval splits to local CSV files
# Create the output directory first: DataFrame.to_csv does not create missing
# parent directories, so this cell would otherwise fail on a fresh checkout.
output_dir = Path("../data/mental_health_counseling")
output_dir.mkdir(parents=True, exist_ok=True)
# Only the formatted "text" column is written; the DataFrame index is omitted.
df_train[["text"]].to_csv(output_dir / "train.csv", index=False)
df_valid[["text"]].to_csv(output_dir / "valid.csv", index=False)

In [11]:
# Read the saved training CSV back through the "csv" loader.
# No `split` is passed, so the result is a DatasetDict whose rows sit under
# the "train" key.
dataset_train = load_dataset(path="csv", data_files="../data/mental_health_counseling/train.csv")
dataset_train

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2809
    })
})

In [10]:
# Read the saved validation CSV back through the "csv" loader.
# No `split` is passed, so the result is a DatasetDict; note that the
# validation rows are also stored under the "train" key.
dataset_valid = load_dataset(path="csv", data_files="../data/mental_health_counseling/valid.csv")
dataset_valid

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 703
    })
})

## 2. Model

In [13]:
# Checkpoint to load, plus a second model name that this cell does not use
# (presumably the name for the fine-tuned model; confirm against later cells).
base_model = "microsoft/phi-2"
new_model = "phi-2-mental-health-counseling"

# Quantization settings for 4-bit training: NF4 quant type, double
# quantization enabled, bfloat16 as the compute dtype.
# NOTE: this cell needs the `bitsandbytes` package; without it the cell fails
# with PackageNotFoundError.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the base model with the quantization config, then prepare it for
# k-bit training with gradient checkpointing turned on.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
)
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
)

# Load the matching tokenizer and use its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

PackageNotFoundError: No package metadata was found for bitsandbytes