In [59]:
!pip install bitsandbytes accelerate transformers peft einops torch datasets trl -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/365.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [60]:
from datasets import load_dataset
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import TrainingArguments
from peft import prepare_model_for_kbit_training
from peft import LoraConfig

from trl import SFTTrainer

In [14]:
dataset_name = "Amod/mental_health_counseling_conversations"

dataset = load_dataset(dataset_name, split="train")

In [15]:
dataset

Dataset({
    features: ['Context', 'Response'],
    num_rows: 3512
})

In [18]:
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...


In [23]:
def transform_data(data):
    context = data['Context']
    response = data['Response']
    return f"[INST] {context} [/INST] {response}"

In [24]:
df['transformmed_data'] = df.apply(transform_data, axis=1)

In [25]:
df.head()

Unnamed: 0,Context,Response,transformmed_data
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb...",[INST] I'm going through some things with my f...
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see...",[INST] I'm going through some things with my f...
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...,[INST] I'm going through some things with my f...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...,[INST] I'm going through some things with my f...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...,[INST] I'm going through some things with my f...


In [31]:
transformmed_data = df[['transformmed_data']]

In [32]:
transformmed_data.to_csv("transformed_data.csv", index=False)

In [33]:
df = pd.read_csv("transformed_data.csv")
df.head()

Unnamed: 0,transformmed_data
0,[INST] I'm going through some things with my f...
1,[INST] I'm going through some things with my f...
2,[INST] I'm going through some things with my f...
3,[INST] I'm going through some things with my f...
4,[INST] I'm going through some things with my f...


In [34]:
dataset = load_dataset("csv", data_files="/content/transformed_data.csv", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [35]:
dataset

Dataset({
    features: ['transformmed_data'],
    num_rows: 3512
})

In [36]:
base_model = "microsoft/phi-2"

In [38]:
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [41]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [44]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

In [46]:
model = AutoModelForCausalLM.from_pretrained(base_model, trust_remote_code=True, quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [47]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [51]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [63]:
args =TrainingArguments(
    output_dir="./result",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    # eval_strategy="steps",
    eval_steps=200,
    logging_steps=15,
    optim="paged_adamw_8bit",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    save_steps=2000,
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_steps=-1
)

In [64]:
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["Wqkv", "fc1", "fc2"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [65]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="transformmed_data",
    tokenizer=tokenizer,
    args=args
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  return fn(*args, **kwargs)


Step,Training Loss
15,6.0453
30,5.3096
45,4.519
60,4.0137
75,3.4782
