In [None]:
!pip install transformers trl accelerate torch bitsandbytes peft sentencepiece wandb datasets -qU 
!pip install huggingface-hub -qU

In [None]:
from huggingface_hub import notebook_login
import wandb
import json
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
import torch
from trl import SFTTrainer

In [None]:
# Authenticate with the Hugging Face Hub; later cells download the base model
# and push the fine-tuned result to the Hub.
notebook_login()


In [None]:
# Log in to Weights & Biases; the training arguments use report_to="wandb".
wandb.login()

In [None]:
from datasets import load_dataset, Dataset

def create_text_row(data):
    """Render one dataset record as a single chat-formatted training string.

    Args:
        data: Mapping-like record (a dict or a pandas row) providing
            ``instruction`` and ``output`` entries and an optional ``input``
            entry.

    Returns:
        str: A user turn followed by a model turn. The ``with <input>`` suffix
        is included only when the record carries a non-empty input.
    """
    extra = data.get("input")
    # A missing input can appear as None, as NaN (after a pandas round-trip),
    # or as blank text; all of these mean "no additional input".
    input_missing = (
        extra is None
        or (isinstance(extra, float) and extra != extra)
        or (isinstance(extra, str) and not extra.strip())
    )
    # NOTE(review): the templates contain a literal backslash followed by "n",
    # not a newline character — confirm that this is intended.
    if input_missing:
        text_row = f"""<start_of_turn>user {data['instruction']}<end_of_turn>\\n<start_of_turn>model {data['output']}<end_of_turn>"""
    else:
        text_row = f"""<start_of_turn>user {data['instruction']} with {data['input']} <end_of_turn> \\n<start_of_turn>model {data['output']}<end_of_turn>"""
    return text_row
    

def prepare_train_data(data_id):
    """Load the ``train`` split of a dataset and attach a formatted ``text`` column.

    Args:
        data_id: Dataset identifier passed straight to ``load_dataset``.

    Returns:
        Dataset: The split rebuilt from a pandas frame that carries an extra
        ``text`` column produced by ``create_text_row`` for every row.
    """
    frame = load_dataset(data_id, split="train").to_pandas()
    # Build the training string row by row.
    frame["text"] = frame.apply(create_text_row, axis=1)
    return Dataset.from_pandas(frame)

In [None]:
# Load the instruction dataset and add the formatted "text" column used for training.
instruct_tune_dataset = prepare_train_data("gouthamsk/embedded_dataset_mixed_small")

In [None]:
# Shuffle with a fixed seed so the row order is repeatable across runs.
instruct_tune_dataset = instruct_tune_dataset.shuffle(seed=1234)

In [None]:
# Display the dataset summary as the cell output.
instruct_tune_dataset

In [None]:
# Inspect a single record (index 1), including its generated "text" field.
instruct_tune_dataset[1]

In [None]:
# Checkpoint to fine-tune.
model_id = "google/gemma-7b-it"
# model_id = "google/gemma-2b-it"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer is configured to append the EOS token to encoded sequences.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# Load the quantized model with every module placed on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=nf4_config,
)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 64, alpha 32, 5% dropout, no bias terms,
# applied to the listed projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    lora_dropout=0.05,
    lora_alpha=32,
    r=64,
    target_modules=[
        'o_proj',
        'q_proj',
        'up_proj',
        'v_proj',
        'k_proj',
        'down_proj',
        'gate_proj',
    ],
)

# Wrap the loaded model with the adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Report how many parameters are trainable versus the total, and the ratio as a percentage.
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

In [None]:
# Fine-tuning setup using TRL's SFTTrainer.
import transformers

from trl import SFTTrainer

args = TrainingArguments(
    output_dir="outputs",
    # num_train_epochs=10,
    max_steps=400,  # comment out this line if you want to train in epochs
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # 0.03 is a fraction of the run, which is what warmup_ratio expresses;
    # warmup_steps expects a number of steps.
    # NOTE(review): with lr_scheduler_type='constant' warmup is presumably not
    # applied at all; switch to 'constant_with_warmup' if warmup is wanted — confirm.
    warmup_ratio=0.03,
    # logging_steps used to be passed twice (10 and 1), which is a SyntaxError
    # ("keyword argument repeated"); a single value is kept.
    logging_steps=1,
    save_strategy="epoch",
    # evaluation_strategy="epoch",
    # evaluation_strategy="steps",
    # eval_steps=20, # comment out this line if you want to evaluate at the end of each epoch
    learning_rate=2e-4,
    bf16=True,
    lr_scheduler_type='constant',
    report_to="wandb",
    optim="paged_adamw_8bit",
)

# DataCollatorForLanguageModeling excludes pad-token positions from the loss,
# so reusing EOS as the pad token would also hide the EOS targets. Fall back
# to EOS only when the tokenizer does not define a pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Release cached GPU memory before the trainer is built.
torch.cuda.empty_cache()

# NOTE(review): `model` is already wrapped by get_peft_model in an earlier cell
# and lora_config is passed again here — confirm how the installed TRL version
# treats that combination.
trainer = SFTTrainer(
    model=model,
    train_dataset=instruct_tune_dataset,
    # eval_dataset=test_data,
    dataset_text_field="text",
    peft_config=lora_config,
    args=args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Start the training process using the trainer configured in the previous cell.
trainer.train()

In [None]:
# Output directory for the trained model; the same name is reused later as the
# Hub repository name when pushing.
new_model="gemma_embedded_c_7b"
# Save the model held by the trainer (the LoRA-wrapped model) to that directory.
trainer.model.save_pretrained(new_model)

In [None]:
# Reload the base checkpoint in float16 (not quantized) so the adapter weights
# can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

# Attach the saved adapter from `new_model`, then fold it into the base weights.
merged_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = merged_model.merge_and_unload()

# Configure the tokenizer BEFORE saving it. These settings used to be applied
# after save_pretrained, so the copy written to "merged_model" did not reflect them.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the merged model together with its tokenizer.
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

In [None]:
# Upload the merged model and the tokenizer to the Hub repository named by `new_model`.
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)