# LegalEase_ChatBot 

In [1]:
!pip install pandas
!pip install -q transformers datasets accelerate evaluate sentencepiece  





# 1. importing libabries

In [2]:
import math
import pandas as pd
import numpy as np
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


# 2. importing Dataset

In [3]:
# Load the dataset
data = pd.read_csv('dataset/legal_qa.csv')

data

Unnamed: 0,question,answer
0,What is the difference between a petition and ...,A petition is a formal request submitted to a ...
1,When should a writ petition be filed in India?,A writ petition in India should be filed when ...
2,What is the procedure for filing a plaint in a...,To file a plaint in a civil case in Indiayou m...
3,What are the common reliefs sought through a p...,Public interest litigation (PIL) petitions in ...
4,Can a plaint be amended after it has been file...,Yesa plaint can be amended in a civil case in ...
...,...,...
28684,Which schedule includes small scale industries...,The Eleventh Schedule includes these industries.
28685,What welfare programs are included in the Elev...,"Family welfare, Women and child development, S..."
28686,Which Schedule includes programs such as famil...,These programs are included in the Eleventh Sc...
28687,What does the twelfth schedule of Article 243W...,"Urban planning including town planning, regula..."


# 3) Basic cleaning + formatting


In [4]:
data = data.copy()

data["question"] = data["question"].astype(str).str.strip()
data["answer"] = data["answer"].astype(str).str.strip()

# remove empty rows
data = data[(data["question"].str.len() > 0) & (data["answer"].str.len() > 0)].reset_index(drop=True)

def format_qa(q, a):
    return f"### Question:\n{q}\n\n### Answer:\n{a}\n"

data["text"] = data.apply(lambda row: format_qa(row["question"], row["answer"]), axis=1)

print(data["text"].iloc[0])
print("Rows:", len(data))


### Question:
What is the difference between a petition and a plaint in Indian law?

### Answer:
A petition is a formal request submitted to a courttribunalor authority to seek a specific remedy or relief. It is commonly used for various purposessuch as filing a writ petition in the High Court or submitting a petition for divorce. On the other handa plaint is a formal written statement of a plaintiff's claim in a civil lawsuit. The key difference is that a petition is more versatile and can be used for various legal matterswhile a plaint is specific to civil cases.

Rows: 28689



# 4) Convert to HuggingFace Dataset + split


In [5]:
hf_ds = Dataset.from_pandas(data[["text"]])

split = hf_ds.train_test_split(test_size=0.1, seed=42)
val_ds = split["train"]
train_ds = split["test"]

print(train_ds, val_ds)

Dataset({
    features: ['text'],
    num_rows: 2869
}) Dataset({
    features: ['text'],
    num_rows: 25820
})



# 5) Load GPT-2 tokenizer + model


In [6]:
MODEL_NAME = "gpt2"   # small GPT-2

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# GPT-2 has no pad token by default → set pad token = eos token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


Loading weights: 100%|██████████| 148/148 [00:00<00:00, 1194.38it/s, Materializing param=transformer.wte.weight]             
[1mGPT2LMHeadModel LOAD REPORT[0m from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



# 6) Tokenization


In [7]:
MAX_LEN = 384  # for MacBook keep smaller; later on GPU you can use 512/1024

def tokenize_fn(batch):
    out = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length"
    )
    out["labels"] = out["input_ids"].copy()
    return out

train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
val_tok = val_ds.map(tokenize_fn, batched=True, remove_columns=["text"])


Map: 100%|██████████| 2869/2869 [00:00<00:00, 5536.19 examples/s]
Map: 100%|██████████| 25820/25820 [00:04<00:00, 5441.45 examples/s]


# 7) Data collator


In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)



# 8) Training arguments (Mac-friendly)


In [9]:
training_args = TrainingArguments(
    output_dir="models",

    # NEW in v5
    eval_strategy="steps",
    eval_steps=300,

    save_strategy="steps",
    save_steps=300,
    save_total_limit=2,

    logging_strategy="steps",
    logging_steps=50,

    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=5e-5,

    weight_decay=0.01,
    warmup_steps=100,

    fp16=False,
    bf16=False,

    report_to="none",
)



# 9) Trainer + Train

In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator
)

trainer.train()

  super().__init__(loader)
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
300,7.297839,6.512819


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


TrainOutput(global_step=359, training_loss=6.189120810676086, metrics={'train_runtime': 6839.8412, 'train_samples_per_second': 0.419, 'train_steps_per_second': 0.052, 'total_flos': 562235129856000.0, 'train_loss': 6.189120810676086, 'epoch': 1.0})

# 10) Evaluation metrics (Loss + Perplexity)


In [None]:

eval_results = trainer.evaluate()
print(eval_results)

# perplexity = exp(loss)
loss = eval_results["eval_loss"]
perplexity = math.exp(loss)

print(f"\nEval Loss: {loss:.4f}")
print(f"Perplexity: {perplexity:.2f}")



# 11) Save model


In [11]:
trainer.save_model("models")
tokenizer.save_pretrained("models")

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.11it/s]


('models/tokenizer_config.json', 'models/tokenizer.json')


# 12) Inference function (Ask the chatbot)

In [12]:

def ask_bot(question, max_new_tokens=180, temperature=0.7, top_p=0.9):
    prompt = f"### Question:\n{question}\n\n### Answer:\n"

    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Return only the answer part
    if "### Answer:" in decoded:
        return decoded.split("### Answer:")[-1].strip()
    return decoded.strip()


In [16]:

### Test it:
print(ask_bot("What is the punishment for cheque bounce in India?"))


​ in the power in the jurisdiction in a law as a the
b: theט the right of the any other that the legislative so of a issue this
 theb the law
 the
 the�b
 the the 15
 the specifically how

 theט (
, the a the a a in the the

 the
 a the
 the've

​ itsb a

 a the the services the
 the
 the
btheless
 the
 the the
 the
 the

 the

 from a in the the

� law



 or this 2 aא the
 the

 that a

 the
 the has

b

" theאו�ט its
 the the

 the
 and on a

 the, a the 2

b
 a and be


# 13. To save pickel file

In [15]:
import pickle

# Save the trained model to a file
with open("models/LegalQA.pkl", "wb") as file:  #
    pickle.dump(model, file)
