In [1]:
%%capture
!pip install accelerate peft bitsandbytes transformers trl wandb

In [2]:
import os
import random
from datasets import load_dataset, Dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import torch
from trl import SFTTrainer
import wandb

In [7]:
# Model from Hugging Face hub
base_model = "princeton-nlp/Sheared-LLaMA-2.7B"

# New instruction dataset
dataset = "percins/IN-ABS"

# Fine-tuned model
new_model = "Legal-Sheared-LLaMA-2.7B"

In [8]:
# Load the Indian Legal dataset including all splits
dataset = load_dataset(dataset)

# Load train, validation, and test datasets separately
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Load only random 50, 5, and 5 samples for the train, validation and test dataset for now
random.seed(42)  # Set a seed for reproducibility

train_indices = random.sample(range(len(train_dataset)), k=50)
train_dataset = train_dataset.select(train_indices)

validation_indices = random.sample(range(len(validation_dataset)), k=5)
validation_dataset = validation_dataset.select(validation_indices)

test_indices = random.sample(range(len(test_dataset)), k=5)
test_dataset = test_dataset.select(test_indices)

# Define a function to merge intent and snippet into a single column
def merge_columns(example):
    text = example["text"]
    summary = example["summary"]
    merged_text = f"<s> [INST]{text} [/INST] {summary} </s>"
    return {"legal_text": merged_text}  # Return a dictionary with the merged text

# Apply the merge function to each split of the dataset
train_dataset = train_dataset.map(merge_columns)
validation_dataset = validation_dataset.map(merge_columns)
test_dataset = test_dataset.map(merge_columns)

# Convert dictionaries to strings
train_dataset = [example["legal_text"] for example in train_dataset]
validation_dataset = [example["legal_text"] for example in validation_dataset]
test_dataset = [example["legal_text"] for example in test_dataset]

# Show the number of samples in each split
print("Number of samples in train split:", len(train_dataset))
print("Number of samples in validation split:", len(validation_dataset))
print("Number of samples in test split:", len(test_dataset))

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Number of samples in train split: 50
Number of samples in validation split: 5
Number of samples in test split: 5


In [9]:
# Convert the list of strings to a dataset object
train_dataset = Dataset.from_dict({"text": train_dataset})
validation_dataset = Dataset.from_dict({"text": validation_dataset})
test_dataset = Dataset.from_dict({"text": test_dataset})

In [10]:
train_dataset[0:5]

{'text': ["<s> [INST]Appeal No. 1104 of 1970.\nAppeal by special leave from the judgment and order dated April 9, 1970, of the Rajasthan High Court in D. B. Civil Special Appeal No. 126 of 1970.\nM. C. Chagla, F. section Nariman, P. N. Tiwari and O. C. Malther, for the appellant.\nM. C. Setalvad and B. P. Maheshwari, for the respondent.\nThe Judgment of the Court was delivered by Grover, J.\nThis is an appeal from a judgment of the Rajasthan High Court holding that the appellant was not entitled to file an appeal against the order of the Company Judge directing sale of lease hold rights of the Golcha Properties (P) Ltd. (in liquidation) in the land belonging to the appellant.\nThe facts briefly are that on November 5, 1960 an agreement was entered into between the appellant and the respondent company allowing Golcha Properties (P) Ltd. to construct a cinema threatre within three years from the issue of the 'No Objection Certificate ' on land measuring 42,900 sq.\nfeet at Bhagwandas Roa

In [11]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

In [12]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/857M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [13]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [14]:
# Load LoRA configuration
peft_args = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [15]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [16]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_args,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [17]:
# Train model
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss


TrainOutput(global_step=6, training_loss=1.806861400604248, metrics={'train_runtime': 178.0524, 'train_samples_per_second': 0.281, 'train_steps_per_second': 0.034, 'total_flos': 778748118958080.0, 'train_loss': 1.806861400604248, 'epoch': 0.96})

In [18]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [23]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer from the local directory
new_model = AutoModelForCausalLM.from_pretrained("Legal-Sheared-LLaMA-2.7B")
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/Sheared-LLaMA-2.7B", trust_remote_code=True)

# Ignore warnings
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Run text generation pipeline with our fine-tuned model
prompt = """<s> <INST> The Judgment of the Court was delivered by SHAH, J. Aktiebolaget Svenska Kullakerfabriken of Gothenburg is a company incorporated under the laws of Sweden, and is engaged in the manufacture of ball bearing equipment. section K. F. Ball Bearing Co., Ltd., which will hereinafter be referred to as " the section K. F." is a company registered under the Indian Companies Act, 1913. By an agreement dated January 1, 1939, the section K. F. was appointed by the Swedish company as its sole selling agent in India. On account ,of the commencement of hostilities in the second world war, a corporation known as the Panrope Corporation was incorporated in the Republic of Panama in 1940, to take over as a war time arrangement the assets and business of that Swedish company. With effect from July 1, 1947, the Panrope Corporation conveyed the property and business to the Swedish company. In the years 1947, 1948, 1949 and 1950 the section K. F. sold in India as the agent 'of the Swedish and Panamian companies which will hereinafter be collectively referred to as the " foreign corporations " the goods manufactured by them. </INST>"""
pipe = pipeline(task="text-generation", model=new_model, tokenizer=tokenizer, max_new_tokens=500)
result = pipe(prompt)
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<s> <INST> The Judgment of the Court was delivered by SHAH, J. Aktiebolaget Svenska Kullakerfabriken of Gothenburg is a company incorporated under the laws of Sweden, and is engaged in the manufacture of ball bearing equipment. section K. F. Ball Bearing Co., Ltd., which will hereinafter be referred to as " the section K. F." is a company registered under the Indian Companies Act, 1913. By an agreement dated January 1, 1939, the section K. F. was appointed by the Swedish company as its sole selling agent in India. On account ,of the commencement of hostilities in the second world war, a corporation known as the Panrope Corporation was incorporated in the Republic of Panama in 1940, to take over as a war time arrangement the assets and business of that Swedish company. With effect from July 1, 1947, the Panrope Corporation conveyed the property and business to the Swedish company. In the years 1947, 1948, 1949 and 1950 the section K. F. sold in India as the agent 'of the Swedish and Pan