<a href="https://colab.research.google.com/github/dhruv716/Nemesis/blob/main/Nemesis_llama_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install kagglehub torch transformers unsloth peft trl accelerate bitsandbytes datasets gradio

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting gradio
  Downloading gradio-5.24.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.

In [None]:
import torch
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer
import pandas as pd
import json
import os
from datasets import Dataset
import kagglehub

# Fetch the latest published version of the Kaggle dataset; the returned value
# is the local directory the files were downloaded to.
DATASET_HANDLE = "regressingaddict/indian-judicial-codes-and-acts"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Downloading from https://www.kaggle.com/api/v1/datasets/download/regressingaddict/indian-judicial-codes-and-acts?dataset_version_number=2...


100%|██████████| 1.04M/1.04M [00:00<00:00, 1.12MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/regressingaddict/indian-judicial-codes-and-acts/versions/2





In [None]:
# Function to read JSON files
def read_json(file_path, expected_keys=None):
    """Load a JSON file and optionally project each record onto fixed keys.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).
        expected_keys: Optional iterable of keys to keep from every record of a
            top-level JSON list. A key missing from a record is filled with "".
            When None, the records are returned exactly as parsed.

    Returns:
        For a top-level JSON list: the list of records, projected onto
        ``expected_keys`` when keys were given. For any other top-level value:
        the parsed value unchanged. An empty list when the file cannot be read,
        parsed, or projected; the error is printed rather than raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Nothing to project: either the payload is not a list of records, or
        # the caller did not ask for specific keys (iterating None would raise).
        if not isinstance(data, list) or expected_keys is None:
            return data
        # Materialise once so a one-shot iterable is not exhausted by the first record.
        keys = list(expected_keys)
        return [{key: item.get(key, "") for key in keys} for item in data]
    except Exception as e:
        # Deliberately best-effort: one bad file must not abort the whole load.
        print(f"Error processing {file_path}: {e}")
        return []

# Function to read CSV files
def read_csv(file_path, column_mapping):
    """Read a CSV file into a list of row dicts with renamed columns.

    Columns listed in ``column_mapping`` are renamed to their mapped names,
    missing cells are replaced by empty strings, and every row is returned as
    one dict. Any failure is printed and results in an empty list.
    """
    try:
        frame = pd.read_csv(file_path)
        frame = frame.rename(columns=column_mapping)
        frame = frame.fillna("")
        rows = frame.to_dict(orient='records')
    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return []
    return rows

# Define column mappings for CSV files
# Several files share the same header layout, so the rename rules are written
# once per layout and copied for each file that uses it.
_SECTION_HEADERS = {"Part": "part", "Section": "section", "Content": "content"}
_ARTICLE_HEADERS = {"PART": "part", "ARTICLE": "article", "CONTENT": "content"}
_CONSTITUTION_HEADERS = {"article_id": "article", "article_desc": "content"}

csv_mappings = {
    filename: dict(headers)
    for filename, headers in [
        ("Copyright Amendment Act 2012.csv", _SECTION_HEADERS),
        ("CPC.csv", _ARTICLE_HEADERS),
        ("CrPC Dataset.csv", _ARTICLE_HEADERS),
        ("Indian Constitution Final.csv", _CONSTITUTION_HEADERS),
        ("Indian Evidence Act of 1872.csv", _SECTION_HEADERS),
        ("The Arbitration Act 1940.csv", _SECTION_HEADERS),
        ("The Indian Contract Act 1872.csv", _SECTION_HEADERS),
        ("The Copyright Act 1957.csv", _ARTICLE_HEADERS),
    ]
}

# Load JSON datasets
# Each entry: dataset key -> (file name inside the download directory, fields to keep).
_json_sources = {
    "ipc_sections": ("IPC Sections DB.json", ["chapter", "title", "description"]),
    "constitution_qa": ("constitution_qa.json", ["question", "answer"]),
    "crpc_qa": ("crpc_qa.json", ["question", "answer"]),
    "ipc_qa": ("ipc_qa.json", ["question", "answer"]),
}
json_data = {
    name: read_json(os.path.join(path, file_name), fields)
    for name, (file_name, fields) in _json_sources.items()
}

# Load CSV datasets
csv_data = {
    filename: read_csv(os.path.join(path, filename), mapping)
    for filename, mapping in csv_mappings.items()
}

# Merge all datasets into one
# JSON-sourced records come first, then CSV-sourced ones, each group in the
# order its sources were loaded.
data = [
    record
    for source in (json_data, csv_data)
    for records in source.values()
    for record in records
]

# Load dataset
merged_frame = pd.DataFrame(data)
raw_dataset = Dataset.from_pandas(merged_frame)

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 512 # Choose any! We auto support RoPE Scaling internally!
# None = auto detection (Float16 for Tesla T4, V100, Bfloat16 for Ampere+).
# Hard-coding torch.float16 here disagreed with the trainer cell, which picks
# bf16/fp16 from is_bfloat16_supported(): on a bf16-capable GPU the model would
# be loaded in float16 while training was configured for bfloat16.
dtype = None
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only; the model actually loaded is named in from_pretrained below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# LoRA adapter settings, gathered in one place before the model is wrapped.
lora_settings = {
    "r": 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Attention projections followed by the MLP projections.
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    "lora_alpha": 16,
    "lora_dropout": 0, # Supports any, but = 0 is optimized
    "bias": "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    "use_gradient_checkpointing": "unsloth", # True or "unsloth" for very long context
    "random_state": 3407,
    "use_rslora": False,  # We support rank stabilized LoRA
    "loftq_config": None, # And LoftQ
}

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Show which columns the merged dataset ended up with.
print(raw_dataset.column_names)

['chapter', 'title', 'description', 'question', 'answer', 'part', 'section', 'content', 'article']


In [None]:
# Prompt template with three positional slots: instruction, input, response.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def format_data(examples):
    """Render a batch of rows into prompt strings for supervised fine-tuning.

    Args:
        examples: Mapping of column name -> list of values for one batch (the
            form a batched ``map`` call passes in). The "question",
            "description" and "answer" columns fill the instruction, input and
            response slots of ``prompt``; an absent column is treated as blank.

    Returns:
        ``{"text": [...]}`` with one rendered prompt per row, each terminated
        by ``tokenizer.eos_token``.
    """
    # `examples` is keyed by column, so len(examples) is the number of columns,
    # not rows. Size the blank fallback from the actual column length so that
    # zip() below does not silently truncate the batch when a column is absent.
    batch_size = max((len(column) for column in examples.values()), default=0)
    blank_column = [""] * batch_size

    instructions = examples.get("question", blank_column)
    inputs = examples.get("description", blank_column)
    outputs = examples.get("answer", blank_column)

    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Safely handle None values (any falsy value renders as an empty slot)
        instruction = str(instruction or "").strip()
        input_text = str(input_text or "").strip()
        output = str(output or "").strip()

        # The EOS token marks where the response ends.
        text = prompt.format(
            instruction,
            input_text,
            output
        ) + tokenizer.eos_token
        texts.append(text)
    return {"text": texts}


# Add a "text" column holding the rendered prompt for every row.
# NOTE(review): format_data only reads question/description/answer, so rows that
# came from the CSV files (part/section/article/content) presumably render as an
# empty template -- confirm whether they should be filtered or mapped first.
formatted_dataset = raw_dataset.map(format_data, batched=True)


Map:   0%|          | 0/16714 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from sklearn.model_selection import train_test_split

# Split your dataset (80% train, 20% validation)
train_val = formatted_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_val["train"]
eval_dataset = train_val["test"]  # This is your validation set

# Supervised fine-tuning on the rendered "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8, # Effective batch size = 4 x 8 = 32 per device.
        warmup_steps = 5,
        num_train_epochs = 1, # Only takes effect if max_steps is removed or set to -1.
        max_steps = 200, # Overrides num_train_epochs while positive.
        learning_rate = 1e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb",
        eval_strategy = "steps",
        eval_steps = 50,
        logging_strategy = "steps",
        # Log at the same cadence as evaluation so every validation loss has a
        # training loss beside it (200 logged only once, at the final step).
        logging_steps = 50,
        save_strategy = "steps",
        save_steps = 100,
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/13371 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/3343 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 13,371 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdhruv-pai167[0m ([33mdhruv-pai167-self[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,No log,0.860027
100,No log,0.818434
150,No log,0.801103
200,0.928200,0.796997


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
training_logs = trainer.state.log_history

# Convert to pandas DataFrame for analysis
import pandas as pd
logs_df = pd.DataFrame(training_logs)

# Training-loss and validation-loss entries are separate rows of the history,
# so show both metrics side by side. reindex() tolerates a metric that was
# never logged, where plain column selection would raise KeyError.
metric_columns = ['step', 'epoch', 'loss', 'eval_loss', 'learning_rate']
metrics_view = logs_df.reindex(columns=metric_columns)
# Drop rows that carry neither loss (e.g. an end-of-run summary entry).
metrics_view = metrics_view.dropna(how='all', subset=['loss', 'eval_loss'])
print(metrics_view.tail())

      epoch    loss  learning_rate
1  0.239306     NaN            NaN
2  0.358959     NaN            NaN
3  0.478612  0.9282            0.0
4  0.478612     NaN            NaN
5  0.478612     NaN            NaN


In [None]:
# Save the fine-tuned model and tokenizer under /content.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this presumably
# writes only the LoRA adapter weights rather than a merged model -- confirm.
save_path = "/content/my_finetuned_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Save a second copy under the mounted Google Drive folder; this needs
# drive.mount('/content/drive') to have run first.
save_path = '/content/drive/MyDrive/legal_model' # Replace with your path
model.save_pretrained(save_path)
# Left as the last expression so the cell displays the written tokenizer files.
tokenizer.save_pretrained(save_path)

('/content/drive/MyDrive/legal_model/tokenizer_config.json',
 '/content/drive/MyDrive/legal_model/special_tokens_map.json',
 '/content/drive/MyDrive/legal_model/tokenizer.json')