<a href="https://colab.research.google.com/github/dame-cell/Gaja/blob/main/training_the_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steps followed for the full fine-tuning run:

1. First, we install and import the dependencies.
2. Second, we load the configuration file, which makes life easier.
3. Mount Google Drive so that we can store our checkpoints.
4. Download the base model in 4-bit using Unsloth.
5. Get the PEFT configuration.
6. Load the dataset and pre-process it a bit.
7. Finally, start the training.



In [None]:
%%capture
# Install Unsloth with the extra that matches this GPU's CUDA compute capability,
# then install transformers from source.
# NOTE: `%%capture` has to stay the first line of the cell; it captures the cell's
# output so the pip logs are not shown.
# NOTE(review): both installs track each repository's default branch (no version
# pin), so the installed versions can differ from one run to the next.
import torch
major_version, minor_version = torch.cuda.get_device_capability()
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab_ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
pass

!pip install "git+https://github.com/huggingface/transformers.git" # Native 4bit loading works!

In [None]:
import yaml

# Path to the YAML file that holds every tunable setting used by this notebook.
config_file_path = "config.yaml"

# safe_load only constructs plain Python objects (dict, list, str, numbers, None).
with open(config_file_path, "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# An empty YAML file parses to None; fail here with a clear message instead of
# an AttributeError on the first lookup below.
if not isinstance(config, dict):
    raise ValueError(f"{config_file_path} must contain a top-level mapping of settings")


def _first_present(*keys):
    """Return the value of the first of ``keys`` found in ``config``, else None."""
    for key in keys:
        if key in config:
            return config[key]
    return None


# --- Model loading ---
max_seq_length = config.get('max_seq_length')
dtype = config.get('dtype')
load_in_4bit = config.get('load_in_4bit')
model_name = config.get('model_name')

# --- PEFT / LoRA ---
target_modules = config.get('target_modules')
# Read the correctly spelled key first; fall back to the misspelled 'ra_alpha'
# so an existing config.yaml that still contains the typo keeps working.
lora_alpha = _first_present('lora_alpha', 'ra_alpha')
ra_alpha = lora_alpha  # legacy (misspelled) variable name, kept for existing cells
lora_dropout = config.get('lora_dropout')
bias = config.get('bias')
use_gradient_checkpointing = config.get('use_gradient_checkpointing')
random_state = config.get('random_state')
use_rslora = config.get('use_rslora')
loftq_config = config.get('loftq_config')

# --- Dataset handling ---
dataset_text_field = config.get('dataset_text_field')
# Same approach as lora_alpha: prefer 'dataset_num_proc', accept the legacy
# misspelling 'ataset_num_proc'.
dataset_num_proc = _first_present('dataset_num_proc', 'ataset_num_proc')
packing = config.get('packing')

# --- Training arguments ---
per_device_train_batch_size = config.get('per_device_train_batch_size')
gradient_accumulation_steps = config.get('gradient_accumulation_steps')
warmup_steps = config.get('warmup_steps')
hub_strategy = config.get('hub_strategy')
num_train_epochs = config.get('num_train_epochs')
push_to_hub = config.get('push_to_hub')
push_to_hub_model_id = config.get('push_to_hub_model_id')
learning_rate = config.get('learning_rate')
resume_from_checkpoint = config.get('resume_from_checkpoint')

logging_steps = config.get('logging_steps')
optim = config.get('optim')
weight_decay = config.get('weight_decay')
save_total_limit = config.get('save_total_limit')
save_steps = config.get('save_steps')
lr_scheduler_type = config.get('lr_scheduler_type')
seed = config.get('seed')

In [None]:
from google.colab import drive

# Mount Google Drive at this path; the notebook's intro says checkpoints are
# meant to be stored there.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
from unsloth import FastLanguageModel
import torch

# Base checkpoint used when config.yaml does not provide `model_name`.
DEFAULT_BASE_MODEL = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"

# Load the base model and its tokenizer. `model_name` is read from config.yaml
# in the configuration cell; it was previously loaded but ignored in favour of
# a hard-coded literal, so editing the config had no effect here.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name or DEFAULT_BASE_MODEL,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# LoRA rank and alpha come from config.yaml when present and fall back to 16,
# the values this notebook previously hard-coded. The alpha lookup also accepts
# the misspelled legacy key 'ra_alpha'.
lora_rank = config.get('r')
if lora_rank is None:
    lora_rank = 16

lora_alpha_value = config.get('lora_alpha', config.get('ra_alpha'))
if lora_alpha_value is None:
    lora_alpha_value = 16

# Wrap the loaded base model with LoRA adapters using the settings from config.yaml.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = target_modules,
    lora_alpha = lora_alpha_value,
    lora_dropout = lora_dropout,
    bias = bias,
    use_gradient_checkpointing = use_gradient_checkpointing,
    random_state = random_state,
    use_rslora = use_rslora,
    loftq_config = loftq_config,
)

In [None]:
from datasets import load_dataset

# Hub repository id and split of the instruction dataset used for fine-tuning.
DATASET_REPO_ID = "damerajee/insutrct-vls"
DATASET_SPLIT = 'train'

dataset = load_dataset(DATASET_REPO_ID, split=DATASET_SPLIT)

In [None]:
# Bare expression: shows the loaded dataset's repr as this cell's output.
dataset

In [None]:
# Drop the 'Unnamed: 0' column; the result is bound to a new name, `data`,
# so `dataset` itself is left untouched by this cell.
column_to_drop = 'Unnamed: 0'
data = dataset.remove_columns(column_to_drop)

In [None]:
# Bare expression: shows the repr of `data` (the dataset after the column drop).
data

In [None]:
# Prompt template with three positional `{}` slots, filled in order with the
# instruction, the input and the response. Each list item is one line of the
# final prompt; empty strings produce the blank separator lines.
alpaca_prompt = "\n".join([
    "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
    "### Instruction:",
    "{}",
    "",
    "### Input:",
    "{}",
    "",
    "### Response:",
    "{}",
])

In [None]:



# End-of-sequence token taken from the tokenizer; appended to every example.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Build the prompt text for a batch of examples.

    Args:
        examples: batch mapping with parallel "instruction", "input" and
            "output" sequences (the function is applied with ``batched=True``).

    Returns:
        A dict with a single "text" key holding one string per example:
        ``alpaca_prompt`` filled with the three fields, followed by EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }


# Add the formatted "text" column, processing the rows in batches.
dataset = data.map(formatting_prompts_func, batched=True)

In [None]:
# Preview the first formatted example. Indexing the row first fetches a single
# record instead of materialising the whole "text" column just to read one item.
dataset[0]['text']

In [None]:
# Shuffle the formatted examples with a fixed seed and rebind `dataset` to the result.
# NOTE(review): the seed is hard-coded to 42 rather than the `seed` value read
# from config.yaml — confirm whether that is intentional.
dataset = dataset.shuffle(seed=42)

In [None]:
# Directory passed to TrainingArguments(output_dir=...) when the trainer is built.
# NOTE(review): "output-path" is a relative placeholder — presumably it should be
# replaced with a folder under the mounted Drive (/content/drive/...) so that
# checkpoints are stored there; confirm before training.
output_dir = "output-path"

In [None]:
# Log in to the Hugging Face Hub via the CLI (shell command).
# Presumably needed because the trainer is configured with `push_to_hub` from
# config.yaml — confirm whether it can be skipped when pushing is disabled.
!huggingface-cli login

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Honour the learning rate from config.yaml (it was previously loaded but
# ignored) and fall back to the former hard-coded 2e-4 when it is missing.
# float() is required because PyYAML parses dot-less exponent literals such as
# `2e-4` as strings rather than floats.
effective_learning_rate = float(learning_rate) if learning_rate is not None else 2e-4

# Query bf16 support once: use bf16 when the GPU supports it, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = dataset_text_field,
    max_seq_length = max_seq_length,
    dataset_num_proc = dataset_num_proc,
    packing = packing, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        hub_strategy = hub_strategy,
        num_train_epochs = num_train_epochs,
        push_to_hub = push_to_hub,
        # `push_to_hub_model_id` is deprecated in transformers; `hub_model_id`
        # is its replacement and accepts the same repository name.
        hub_model_id = push_to_hub_model_id,
        learning_rate = effective_learning_rate,
        # Stored on the arguments only; the Trainer does not act on this field,
        # so the value must also be passed to `trainer.train(...)`.
        resume_from_checkpoint = resume_from_checkpoint,
        fp16 = not bf16_supported,
        bf16 = bf16_supported,
        logging_steps = logging_steps,
        optim = optim,
        weight_decay = weight_decay,
        save_total_limit = save_total_limit,
        save_steps = save_steps,
        lr_scheduler_type = lr_scheduler_type,
        seed = seed,
        output_dir = output_dir,
    ),
)

# Some cool information

1. First, you can easily run all the code and try it yourself.
2. The max memory taken before training was 4.3 GB, and after training it was almost 8 GB.


In [None]:
#@title Show current memory stats
# Report the GPU's name and total memory, plus the peak memory reserved by
# PyTorch so far, all expressed in GB (bytes / 1024**3) rounded to 3 decimals.
BYTES_PER_GB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Start training. The first positional argument of `train` is
# `resume_from_checkpoint`, so the former literal "checkpoint-path" made a fresh
# run fail unless a checkpoint existed at that exact path. Pass the value from
# config.yaml instead: None starts from scratch, True resumes from the latest
# checkpoint in `output_dir`, and a path resumes from that checkpoint.
trainer_stats = trainer.train(resume_from_checkpoint = resume_from_checkpoint)