# Finetune RedPajama using LoRA


In [1]:
# URL of the JSON Lines dataset; the file is downloaded with wget and parsed
# one JSON object per line in the cells below.
raw_dataset_link = "https://huggingface.co/datasets/Fredithefish/Instruction-Tuning-with-GPT-4-RedPajama-Chat/resolve/main/alpaca_gpt4_data.jsonl" #@param {type: "string"}


# Hugging Face model id of the base model to fine-tune.
base_model_hf = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" #@param {type: "string"}
# Forwarded to TrainingArguments as num_train_epochs.
finetune_epochs = 1 #@param {type: "integer"}

# "yes" mounts Google Drive after training and saves the adapters there;
# any other value skips saving.
save_lora_adapters_to_google_drive = "yes"  #@param ["yes", "no"]
# Folder name for the saved adapters under "Colab Notebooks/Models" on Drive.
ADAPTERS_NAME='RedPajama-LoRA' #@param {type: "string"}
#@markdown  *Name under which the adapters will be saved (not needed, if save_lora_adapters_to_google_drive is "no")*  


In [None]:
!pip install --upgrade transformers datasets accelerate peft

In [None]:
import torch 
import torch.nn as nn 
import json
import transformers 
from datasets import Dataset 
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [None]:
import subprocess

# Download the dataset file into the current working directory with wget.
# The command is passed as an argument list, so no shell is involved.
wget_cmd = ["wget", str(raw_dataset_link)]

# check=True raises subprocess.CalledProcessError when wget exits with a
# non-zero status, so a failed download stops the notebook here instead of
# surfacing later as a missing-file error in the parsing cell.
subprocess.run(wget_cmd, check=True)

In [None]:
# The downloaded file is expected under the last path component of the URL.
raw_splitted_link = raw_dataset_link.split("/")

# Read the JSON Lines dataset: one JSON object per line.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform
# default locale.
with open(f'./{raw_splitted_link[-1]}', 'r', encoding='utf-8') as fp:
    # Iterate the file object directly instead of materialising every line,
    # and skip blank lines (e.g. a trailing newline), which json.loads rejects.
    data = [json.loads(line) for line in fp if line.strip()]

In [None]:
# Load the base model selected in the parameters cell (base_model_hf) rather
# than a hard-coded id, so changing the form value actually takes effect.
# device_map='auto' delegates weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model_hf,
    device_map='auto',
)

In [None]:
# Load the tokenizer that belongs to the base model selected in the
# parameters cell (base_model_hf), so model and tokenizer always match.
tokenizer = AutoTokenizer.from_pretrained(base_model_hf)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the tokenizer ships without a pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Freeze every weight of the base model; the adapters are attached and
# trained in later cells.
for weight in model.parameters():
  weight.requires_grad = False
  # Only one-dimensional tensors (e.g. layernorm) are upcast to fp32 for
  # stability; everything else keeps its dtype.
  if weight.ndim != 1:
    continue
  weight.data = weight.data.to(torch.float32)

In [None]:
# Prepare the model for memory-efficient adapter training.
model.gradient_checkpointing_enable()  # reduce number of stored activations
# Make the input embedding outputs require gradients.
# NOTE(review): presumably needed so gradients can flow to the adapters
# while all base weights are frozen and checkpointing is on - confirm.
model.enable_input_require_grads()

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs; each ``param`` must provide ``numel()``
            and a ``requires_grad`` attribute.

    Prints a single line with the trainable parameter count, the total
    parameter count and the trainable percentage. A model without any
    parameters is reported as 0.0% instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division: an empty model has all_param == 0.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
# LoRA adapter configuration: rank-16 adapters with scaling alpha 32 and 5%
# dropout, no bias training, for a causal language modelling task.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    # Only the attention projection is adapted. The former "xxx" entry was a
    # leftover placeholder that names no module, so it has been dropped.
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
# Wrap the frozen base model with the LoRA adapters described by `config`;
# `model` now refers to the wrapped model.
model = get_peft_model(model, config)
# Report the trainable vs. total parameter counts after wrapping.
print_trainable_parameters(model)

In [None]:
def _tokenize_batch(samples):
    """Run the tokenizer over the 'text' field of one batch of records."""
    return tokenizer(samples['text'])


# Turn the parsed records into a Dataset, then tokenize it batch by batch.
data = Dataset.from_list(data)
data = data.map(_tokenize_batch, batched=True)

In [None]:
# Training hyperparameters. With a per-device batch of 4 and 4 accumulation
# steps, each optimizer step covers 16 sequences per device.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    num_train_epochs=finetune_epochs,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=80,
    fp16=True,
    logging_steps=1,
    report_to="none",
)

# mlm=False: batches are collated for causal (not masked) language modelling.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=causal_lm_collator,
)

In [None]:
# Turn off the generation cache for training.
# NOTE(review): presumably because caching conflicts with gradient
# checkpointing - confirm, and re-enable before using the model for inference.
model.config.use_cache = False

In [None]:
import os
# Disable Weights & Biases via its environment switch before training starts
# (the trainer is additionally configured with report_to="none").
os.environ["WANDB_DISABLED"] = "true"
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
# Persist the trained adapters to Google Drive only when requested in the
# parameters cell; any other value means there is nothing to do.
if save_lora_adapters_to_google_drive == "yes":
  from google.colab import drive

  drive.mount('/gdrive')
  # Destination folder is named after ADAPTERS_NAME.
  adapters_dir = f"/gdrive/My Drive/Colab Notebooks/Models/{ADAPTERS_NAME}"
  model.save_pretrained(adapters_dir)