In [1]:
#/kaggle/input/asasasaa/final_legal_laws_metadata.json

In [None]:
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import random

# ============================================
# CONFIGURATION
# ============================================
# Hugging Face model identifier; used below to load both the tokenizer and the model.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
# JSON file of legal chunks read by main().
INPUT_JSON = "/kaggle/input/asasasaa/final_legal_laws_metadata.json"
# Destination for the generated dataset (periodic checkpoints and the final save).
OUTPUT_JSON = "/kaggle/working/nepal_legal_instruction_dataset.json"

# Generation parameters (forwarded to model.generate in generate_output)
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 600
TOP_P = 0.9
DO_SAMPLE = True
# Number of instruction/response samples produced for every input chunk.
SAMPLES_PER_CHUNK = 4  # 4 samples per chunk

# ============================================
# 25 CONTROLLED INSTRUCTION TEMPLATES
# ============================================
# Pool of task instructions. process_legal_chunks draws SAMPLES_PER_CHUNK
# distinct entries from this list (random.sample) for every chunk, and the
# chosen text becomes both the "Task:" line of the prompt and the
# "instruction" field of the dataset entry.
INSTRUCTION_TEMPLATES = [
    "Explain the legal meaning of this provision in simple language.",
    "Summarize the main points of this law in 2–3 sentences.",
    "Identify the rights and obligations established by this section.",
    "Clarify this clause for a non-lawyer audience.",
    "Interpret how this provision would be applied in practice.",
    "Provide a real-world example illustrating this law.",
    "Highlight the exceptions and limitations of this provision.",
    "Explain the historical or legislative context of this section.",
    "Compare this provision with related laws or sections.",
    "Analyze the consequences of violating this clause.",
    "Reword this law in plain language suitable for general public understanding.",
    "Identify key terms in this section and explain them.",
    "Explain which offences are covered under this provision.",
    "Describe any conditions under which this law does not apply.",
    "Provide a hypothetical scenario showing how this law would operate.",
    "Explain the responsibilities of public authorities under this clause.",
    "Analyze the connection of this section with human rights principles.",
    "Identify the punishments or legal outcomes defined by this provision.",
    "Explain the scope and jurisdiction of this law.",
    "Discuss the rationale or legal reasoning behind this section.",
    "Describe how this law affects individuals, corporations, or government entities.",
    "Explain the limitations on exercising rights (e.g., private defence, consent) under this clause.",
    "Highlight any conditions that make an act excused or not an offence.",
    "Identify the specific offences mentioned and classify them (e.g., grave, heinous, minor).",
    "Analyze this provision in the context of Nepal's National Penal Code 2017 and related chapters."
]

# ============================================
# DYNAMIC INPUT GENERATORS
# ============================================
def generate_dynamic_inputs(chunk):
    """Build the three metadata-driven query strings for one legal chunk.

    Args:
        chunk: mapping describing a legal chunk. The keys "law",
            "chapter_title", "section_title" and "section" are read; each
            falls back to a generic placeholder when absent.

    Returns:
        A list of exactly three strings, in this order: a direct question
        about the section, a chapter-context question, and an application
        question.
    """
    law = chunk.get("law", "this law")
    chapter = chunk.get("chapter_title", "this chapter")
    title = chunk.get("section_title", "this section")
    number = chunk.get("section", "")

    # 1) Direct question about what the section states.
    question = f"What does {law}, Section {number} ({title}) state?"
    # 2) Interpretation of the section within its chapter.
    contextual = f"In the context of {chapter}, how should '{title}' be understood?"
    # 3) Practical application of the provision.
    application = f"How does the provision on '{title}' apply to situations in Nepal under {law}?"

    return [question, contextual, application]

# ============================================
# CHECK GPU AVAILABILITY
# ============================================
# Report CUDA visibility (and the first device's name and total memory)
# before the model is loaded.
print("=" * 70, "GPU CHECK", "=" * 70, sep="\n")
print("CUDA Available: {}".format(torch.cuda.is_available()))
if torch.cuda.is_available():
    print("GPU Device: {}".format(torch.cuda.get_device_name(0)))
    print("GPU Memory: {:.2f} GB".format(torch.cuda.get_device_properties(0).total_memory / 1e9))
print("=" * 70)

# ============================================
# LOAD MODEL WITH GPU
# ============================================
# Load the tokenizer and the causal LM named by MODEL_NAME. Both objects are
# module-level globals that generate_output reads directly.
print("\nLoading Mistral 7B model on GPU...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): the run output recorded in this file shows a warning that
# `torch_dtype` is deprecated in favour of `dtype`; rename the argument once
# the installed transformers version is known to accept `dtype`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # load weights in half precision
    device_map="auto",  # presumably lets the library pick device placement - confirm against transformers docs
    low_cpu_mem_usage=True
)

# Put the module in evaluation mode; this file only uses it for generation.
model.eval()
# The message below says "on GPU" unconditionally; the device actually used
# is reported by the following line.
print("✓ Model loaded successfully on GPU!")
print(f"✓ Model device: {model.device}")
print(f"✓ Model dtype: {model.dtype}")

# ============================================
# GENERATION FUNCTION
# ============================================
def generate_output(instruction, input_text, chunk):
    """Generate one model response for an instruction/query about a legal chunk.

    The chunk's law, chapter, section title and text are embedded in a
    context prompt together with the user query and the task instruction,
    the prompt is wrapped with the tokenizer's chat template, and the
    module-level ``model`` samples a completion.

    Args:
        instruction: task description placed on the "Task:" line.
        input_text: user query placed on the "User Query:" line.
        chunk: mapping with the chunk fields; "law", "chapter_title",
            "section_title" and "text" are read (missing keys become "").

    Returns:
        The stripped model response. If the response is shorter than 20
        characters, a fallback built from the section title and the first
        200 characters of the legal text is returned instead. If anything
        raises during tokenization/generation/decoding, the error is printed
        and a fixed placeholder sentence is returned (best-effort: one bad
        sample must not abort the whole dataset run).
    """

    # Create context-aware prompt
    prompt_template = f"""You are a legal expert on Nepal's National Penal Code 2017.

Legal Context:
- Law: {chunk.get('law', '')}
- Chapter: {chunk.get('chapter_title', '')}
- Section: {chunk.get('section_title', '')}
- Legal Text: {chunk.get('text', '')}

User Query: {input_text}

Task: {instruction}

Provide a clear, accurate, and comprehensive response based on the legal provision above."""

    messages = [{"role": "user", "content": prompt_template}]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    try:
        # The chat template has already rendered the special tokens into the
        # prompt text, so the tokenizer must not add them a second time.
        # NOTE(review): truncation presumably drops tokens from the end of the
        # prompt (including the template's closing tag) for very long chunks -
        # confirm the tokenizer's truncation side if inputs can exceed 2048.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
            add_special_tokens=False
        ).to(model.device)
        prompt_token_count = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                top_p=TOP_P,
                do_sample=DO_SAMPLE,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the tokens produced after the prompt. Slicing the
        # decoded string by len(prompt) is unreliable: the prompt string still
        # contains special tokens that skip_special_tokens removes from the
        # decoded text, so the character offsets do not line up and the start
        # of the answer gets cut off.
        new_token_ids = outputs[0][prompt_token_count:]
        assistant_response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

        # Fall back to a text-derived answer when the model returned
        # (almost) nothing.
        if len(assistant_response) < 20:
            assistant_response = f"Based on {chunk.get('section_title', 'this provision')}, {chunk.get('text', '')[:200]}"

        return assistant_response

    except Exception as e:
        # Deliberately broad: keep the long-running dataset job alive and
        # record a placeholder for this sample.
        print(f"Error generating output: {str(e)}")
        return "Error processing this legal provision. Please refer to the original text."

# ============================================
# PROCESS CHUNKS
# ============================================
def process_legal_chunks(legal_chunks):
    """Generate SAMPLES_PER_CHUNK instruction samples for every legal chunk.

    For each chunk, SAMPLES_PER_CHUNK distinct instructions are drawn at
    random from INSTRUCTION_TEMPLATES. The first sample uses the chunk's raw
    text as its input; the following samples use the queries returned by
    generate_dynamic_inputs, cycling through them if more samples are
    requested than queries exist. Every 25 chunks the dataset built so far
    is checkpointed to OUTPUT_JSON.

    Args:
        legal_chunks: sized iterable of dict-like chunk records.

    Returns:
        A list of dataset entries, each a dict with the keys "instruction",
        "input", "output" and "metadata".
    """

    instruction_dataset = []
    total_chunks = len(legal_chunks)

    print(f"\nProcessing {total_chunks} legal chunks...")
    print(f"Generating {SAMPLES_PER_CHUNK} samples per chunk = {total_chunks * SAMPLES_PER_CHUNK} total samples")
    print("="*70)

    # Chunk fields copied verbatim into each entry's metadata (order is the
    # order in which they appear in the output JSON).
    metadata_keys = (
        "law", "part", "chapter", "chapter_title", "section",
        "section_title", "subsection", "chunk_id", "source",
    )

    for chunk_idx, chunk in enumerate(tqdm(legal_chunks, desc="Processing chunks")):

        # Distinct random instructions for this chunk
        selected_instructions = random.sample(INSTRUCTION_TEMPLATES, SAMPLES_PER_CHUNK)

        # Metadata-derived queries used as inputs for all but the first sample
        dynamic_inputs = generate_dynamic_inputs(chunk)
        original_text = chunk.get("text", "")

        for sample_idx, instruction in enumerate(selected_instructions):
            if sample_idx == 0:
                # First sample: the raw legal text is the input
                input_text = original_text
                sample_type = "original_text_input"
            else:
                # Wrap around so a SAMPLES_PER_CHUNK larger than
                # 1 + len(dynamic_inputs) reuses queries instead of raising
                # IndexError; the label names the query actually used.
                slot = (sample_idx - 1) % len(dynamic_inputs)
                input_text = dynamic_inputs[slot]
                sample_type = f"dynamic_input_{slot + 1}"

            output = generate_output(instruction, input_text, chunk)

            metadata = {key: chunk.get(key, "") for key in metadata_keys}
            metadata["original_text"] = original_text
            metadata["sample_type"] = sample_type

            instruction_dataset.append({
                "instruction": instruction,
                "input": input_text,
                "output": output,
                "metadata": metadata,
            })

        # Periodic checkpoint every 25 chunks
        if (chunk_idx + 1) % 25 == 0:
            # Write to a sibling temp file and rename it over the target so an
            # interruption mid-write cannot corrupt the previous checkpoint.
            tmp_path = f"{OUTPUT_JSON}.tmp"
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(instruction_dataset, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, OUTPUT_JSON)

            # Progress is reported on every checkpoint; the GPU figure is
            # appended only when CUDA is present.
            print(f"\n✓ Checkpoint: Processed {chunk_idx + 1}/{total_chunks} chunks")
            progress = f"  Generated: {len(instruction_dataset)} samples"
            if torch.cuda.is_available():
                gpu_memory = torch.cuda.max_memory_allocated() / 1e9
                progress += f" | GPU Memory: {gpu_memory:.2f} GB"
            print(progress)

    return instruction_dataset

# ============================================
# MAIN EXECUTION
# ============================================
def main():
    """Run the whole pipeline: load chunks, generate samples, save, report.

    Reads the chunk list from INPUT_JSON, hands it to process_legal_chunks,
    writes the returned dataset to OUTPUT_JSON, then prints summary counts,
    a preview of the first two samples and, when CUDA is available, the peak
    GPU memory allocated.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("NEPAL LEGAL INSTRUCTION DATASET GENERATOR")
    print(rule)

    # --- Load the input chunks ---
    print(f"\nLoading input dataset from: {INPUT_JSON}")
    with open(INPUT_JSON, 'r', encoding='utf-8') as source_file:
        legal_chunks = json.load(source_file)
    chunk_count = len(legal_chunks)
    print(f"✓ Loaded {chunk_count} legal chunks")

    # --- Generate ---
    instruction_dataset = process_legal_chunks(legal_chunks)
    sample_count = len(instruction_dataset)

    # --- Final save ---
    print(f"\n{rule}")
    print(f"Saving final dataset to: {OUTPUT_JSON}")
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as target_file:
        json.dump(instruction_dataset, target_file, ensure_ascii=False, indent=2)

    print(f"✓ COMPLETE! Generated {sample_count} instruction-output pairs")
    print(f"✓ Total chunks processed: {chunk_count}")
    print(f"✓ Samples per chunk: {SAMPLES_PER_CHUNK}")
    print(rule)

    # --- Statistics: split by how the sample's input was produced ---
    print("\nDATASET STATISTICS:")
    print(rule)
    sample_types = [entry['metadata']['sample_type'] for entry in instruction_dataset]
    original_text_samples = sample_types.count('original_text_input')
    dynamic_samples = sample_count - original_text_samples

    print(f"Original text input samples: {original_text_samples}")
    print(f"Dynamic input samples: {dynamic_samples}")
    print(f"Total samples: {sample_count}")

    # --- Preview of the first two entries (skipped for an empty dataset) ---
    if instruction_dataset:
        print("\n" + rule)
        print("SAMPLE OUTPUTS (First 2 samples):")
        print(rule)

        for number, sample in enumerate(instruction_dataset[:2], start=1):
            meta = sample['metadata']
            print(f"\n--- SAMPLE {number} ---")
            print(f"Instruction: {sample['instruction']}")
            print(f"\nInput: {sample['input'][:150]}...")
            print(f"\nOutput: {sample['output'][:300]}...")
            print("\nMetadata:")
            print(f"  - Law: {meta['law']}")
            print(f"  - Section: {meta['section_title']}")
            print(f"  - Sample Type: {meta['sample_type']}")
            print("-" * 70)

    # --- Final GPU stats ---
    if torch.cuda.is_available():
        peak_gb = torch.cuda.max_memory_allocated() / 1e9
        print(f"\n✓ Peak GPU Memory Used: {peak_gb:.2f} GB")

    print("\n" + rule)
    print("GENERATION COMPLETE!")
    print(rule)

# Entry point: run the full pipeline when this code executes as the top-level
# module (the recorded output in this file shows it also runs from the
# notebook cell).
if __name__ == "__main__":
    main()

GPU CHECK
CUDA Available: True
GPU Device: Tesla P100-PCIE-16GB
GPU Memory: 17.06 GB

Loading Mistral 7B model on GPU...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
2026-01-16 11:35:03.924004: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768563304.103924      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768563304.156567      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768563304.580678      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768563304.580718      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768563304.580721      55

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✓ Model loaded successfully on GPU!
✓ Model device: cuda:0
✓ Model dtype: torch.float16

NEPAL LEGAL INSTRUCTION DATASET GENERATOR

Loading input dataset from: /kaggle/input/asasasaa/final_legal_laws_metadata.json
✓ Loaded 731 legal chunks

Processing 731 legal chunks...
Generating 4 samples per chunk = 2924 total samples


Processing chunks:   0%|          | 0/731 [00:00<?, ?it/s]