In [1]:
# Block 1: Imports and Environment Setup
#!/usr/bin/env python3
import os
import sys
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["WANDB_DISABLED"] = "true"
print("Environment variables set up.")

print("Python version:", sys.version)
print("Torch version:", torch.__version__)

Environment variables set up.
Python version: 3.10.12 (main, Feb  4 2025, 14:57:36) [GCC 11.4.0]
Torch version: 2.7.0.dev20250224+cu128


In [2]:
# Block 2: Tokenizer and Model Loading Functions
def load_tokenizer(model_checkpoint, hf_token):
    print("Loading tokenizer from:", model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        token=hf_token,
        trust_remote_code=True
    )
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Tokenizer loaded. Vocabulary size:", tokenizer.vocab_size)
    return tokenizer

def load_model(model_checkpoint, hf_token):
    print("Loading model from:", model_checkpoint)
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_checkpoint,
            token=hf_token,
            trust_remote_code=True,
            device_map="auto"
        )
        print("Model loaded successfully.")
    except Exception as e:
        print("Error loading model:", e)
        raise
    print("Model loaded in full precision.")
    return model

In [3]:
# Block 3: LoRA Application Function
def apply_lora(model, tokenizer):
    print("Resizing token embeddings to:", len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))
    
    print("Applying LoRA config...")
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    try:
        model = get_peft_model(model, lora_config)
        print("LoRA applied successfully.")
    except Exception as e:
        print("Error applying LoRA:", e)
        raise
    return model

In [4]:
# Block 4: Data Loading and Tokenization Functions
def load_and_prepare_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    stories = data.split("START_OF_STORY")
    stories = [story.strip() for story in stories if story.strip()]
    return stories

def tokenize_dataset(tokenizer, stories):
    dataset = Dataset.from_dict({"text": stories})
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=512)
    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )
    print("Sample tokenized row:", tokenized_dataset[0])
    return tokenized_dataset

In [5]:
# Block 5: Main Training Function
def main():
    model_checkpoint = "sarvamai/sarvam-1"
    hf_token = "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"
    
    tokenizer = load_tokenizer(model_checkpoint, hf_token)
    model = load_model(model_checkpoint, hf_token)
    model = apply_lora(model, tokenizer)
    
    preprocessed_file = "final_dataset.md"
    stories = load_and_prepare_data(preprocessed_file)
    print(f"Number of stories: {len(stories)}")
    
    tokenized_dataset = tokenize_dataset(tokenizer, stories)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    
    output_dir = "finetuned_sarvam"
    print("Output directory:", output_dir)
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        weight_decay=0.01,
        save_steps=10,
        logging_steps=5,
        fp16=False,
        no_cuda=False,
        gradient_checkpointing=False,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )
    print("Trainer created.")
    
    print("Starting training...")
    try:
        trainer.train()
        print("Training finished!")
    except Exception as e:
        print("Error during training:", e)
        raise
    
    model.save_pretrained("./my_finetuned_model")
    tokenizer.save_pretrained("./my_finetuned_model")
    
    try:
        os.system("zip -r my_finetuned_model.zip ./my_finetuned_model")
    except Exception as e:
        print("Error zipping the model directory:", e)

if __name__ == "__main__":
    main()

Loading tokenizer from: sarvamai/sarvam-1
Tokenizer loaded. Vocabulary size: 68096
Loading model from: sarvamai/sarvam-1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model loaded successfully.
Model loaded in full precision.
Resizing token embeddings to: 68097
Applying LoRA config...
LoRA applied successfully.
Number of stories: 404
Tokenizing dataset...


Map:   0%|          | 0/404 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Sample tokenized row: {'input_ids': [1, 4489, 67659, 11799, 4898, 19869, 6842, 16075, 41115, 10747, 16295, 4844, 6996, 50042, 67621, 4489, 36499, 15432, 4373, 31538, 5179, 4578, 7196, 5705, 35845, 11025, 4373, 10072, 7642, 4424, 4373, 66471, 67491, 4615, 5058, 4366, 21618, 4412, 36365, 60150, 8659, 4809, 5705, 4366, 19999, 41161, 8382, 4844, 67494, 67709, 67494, 27067, 13164, 67494, 14732, 9455, 5058, 54509, 6294, 4741, 38188, 4900, 5058, 4373, 6642, 21187, 4408, 5992, 18082, 5956, 5058, 4366, 53374, 4643, 67491, 4427, 4900, 26060, 4412, 4373, 20041, 4408, 22308, 67491, 22452, 6232, 4373, 21040, 7869, 8687, 4424, 6270, 4412, 4366, 24269, 67494, 6879, 5058, 36525, 7196, 10072, 4427, 43366, 5051, 55522, 5548, 67494, 4615, 9177, 8760, 4412, 4366, 8435, 30105, 67481, 5310, 67594, 67484, 4397, 15075, 7237, 4373, 65570, 9183, 8760, 4969, 5992, 11103, 9455, 67491, 4427, 4900, 9183, 4373, 65570, 4489, 67782, 8167, 36694, 4385, 6761, 54509, 6294, 30783, 67491, 4366, 11486, 29253, 25633, 67491, 

Step,Training Loss
5,2.6517
10,2.615
15,2.7552
20,2.6802
25,2.6009
30,2.3949
35,2.671
40,2.4832
45,2.7004
50,2.4939




Training finished!




  adding: my_finetuned_model/ (stored 0%)
  adding: my_finetuned_model/added_tokens.json (stored 0%)
  adding: my_finetuned_model/adapter_config.json (deflated 54%)
  adding: my_finetuned_model/special_tokens_map.json (deflated 78%)
  adding: my_finetuned_model/tokenizer_config.json (deflated 97%)
 (deflated 84%)netuned_model/tokenizer.json
  adding: my_finetuned_model/tokenizer.model (deflated 62%)
 (deflated 53%)netuned_model/adapter_model.safetensors
  adding: my_finetuned_model/README.md (deflated 66%)
