In [None]:
import json
import os
import random
from typing import Dict, List, Optional

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file; override=True lets values from the
# file replace ones that are already set in the process environment.
load_dotenv(override=True)
# Single client instance handed to both generator classes further down.
# Constructed without arguments — presumably it picks up its API key from the
# environment populated above (confirm against the OpenAI SDK docs).
client = OpenAI()

In [None]:
# Seed topics for synthetic legal sections. One is picked at random when the
# pipeline is asked for an entry without an explicit topic, and the same list
# populates the topic dropdown in the UI.
LEGAL_TOPIC_SEEDS = [
    "criminal offenses and penalties",
    "property rights and disputes",
    "contract law and breach remedies",
    "civil procedure and court processes",
    "evidence admissibility rules",
    "constitutional rights protections",
    "family law and inheritance",
    "corporate governance regulations",
    "intellectual property protections",
    "cyber crime and digital law"
]

# Question angles the Q&A prompt can focus on. One is picked at random when no
# question type is supplied, and the list populates the question-type dropdown.
QUESTION_TYPES = [
    "definition",
    "procedure",
    "penalty",
    "rights",
    "obligations",
    "exceptions",
    "examples"
]

In [None]:
class SyntheticLegalGenerator:
    """Generates synthetic (fictional) legal sections via a chat-completion model.

    The model is asked to answer in a ``SECTION: / TITLE: / PROVISION:`` layout,
    which is then parsed into a plain dict.
    """

    # Labels requested in the prompt, mapped to the keys of the returned dict.
    _FIELD_KEYS = {
        "SECTION": "section_number",
        "TITLE": "title",
        "PROVISION": "provision",
    }

    # The annotation is quoted so this cell can be defined (and unit-tested)
    # without the OpenAI import having been evaluated first.
    def __init__(self, client: "OpenAI", model: str = "gpt-4o-mini"):
        """Store the chat client and the model name used for every request."""
        self.client = client
        self.model = model

    @classmethod
    def _parse_section(cls, content: str) -> Dict[str, str]:
        """Split a labelled model reply into section number, title and provision.

        Labels are matched case-insensitively and may carry leading markdown
        decoration (e.g. ``**SECTION:**``). Non-empty lines that follow a label
        without starting a new one are treated as a continuation of that field,
        so a provision spread over several lines is kept in full.
        """
        fields = {key: "" for key in cls._FIELD_KEYS.values()}
        current_key = None

        for raw_line in content.splitlines():
            text = raw_line.strip()
            if not text:
                continue

            # Ignore markdown decoration in front of a possible label.
            label, sep, rest = text.lstrip("*#> ").partition(":")
            key = cls._FIELD_KEYS.get(label.strip("* ").upper()) if sep else None

            if key is not None:
                current_key = key
                fields[key] = rest.strip().strip("*").strip()
            elif current_key is not None:
                # Wrapped text belonging to the most recent label.
                fields[current_key] = f"{fields[current_key]} {text}".strip()

        return fields

    def generate_legal_section(self, topic: str) -> Dict[str, str]:
        """Generate one synthetic legal section about ``topic``.

        Args:
            topic: Subject area the fictional provision should cover.

        Returns:
            Dict with keys ``section_number``, ``title``, ``provision`` and
            ``topic``. Fields the reply did not contain are empty strings. If
            the request fails or the reply is empty, a placeholder section
            (``"IPC 000"`` / ``"Error"``) describing the failure is returned
            instead of raising.
        """

        prompt = f"""Create a SYNTHETIC (fictional but realistic) Indian legal section about: {topic}

Generate:
1. A section number (format: IPC XXX or CrPC XXX or IEA XXX)
2. A clear title
3. A detailed legal provision (2-3 sentences)

Make it realistic but completely fictional. Use legal language.

Format:
SECTION: [number]
TITLE: [title]
PROVISION: [detailed text]"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a legal content generator creating synthetic Indian legal provisions for educational purposes."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.8,
                max_tokens=400
            )

            # The message content can be None; treat that like an empty reply.
            content = (response.choices[0].message.content or "").strip()
            if not content:
                raise ValueError("model returned an empty response")

            return {**self._parse_section(content), "topic": topic}

        except Exception as e:
            # Best-effort by design: callers get a recognisable placeholder.
            print(f"Error generating section: {e}")
            return {
                "section_number": "IPC 000",
                "title": "Error",
                "provision": f"Failed to generate: {e}",
                "topic": topic
            }

In [None]:
class SyntheticQAGenerator:
    """Generates question/answer pairs from synthetic legal sections.

    The model is asked to answer in a ``Q: / A:`` layout, which is parsed into
    the ``question`` and ``answer`` fields of the returned record.
    """

    # Accepted line labels, mapped to the field they start.
    _LABEL_KEYS = {
        "Q": "question",
        "QUESTION": "question",
        "A": "answer",
        "ANSWER": "answer",
    }

    # The annotation is quoted so this cell can be defined (and unit-tested)
    # without the OpenAI import having been evaluated first.
    def __init__(self, client: "OpenAI", model: str = "gpt-4o-mini"):
        """Store the chat client and the model name used for every request."""
        self.client = client
        self.model = model

    @classmethod
    def _parse_qa(cls, content: str) -> Dict[str, str]:
        """Extract the question and answer text from a labelled model reply.

        Labels are matched case-insensitively and may carry leading markdown
        decoration. Non-empty lines that follow a label without starting a new
        one are appended to that field, so multi-line answers are kept in full.
        """
        fields = {"question": "", "answer": ""}
        current_key = None

        for raw_line in content.splitlines():
            text = raw_line.strip()
            if not text:
                continue

            # Ignore markdown decoration in front of a possible label.
            label, sep, rest = text.lstrip("*#> ").partition(":")
            key = cls._LABEL_KEYS.get(label.strip("* ").upper()) if sep else None

            if key is not None:
                current_key = key
                fields[key] = rest.strip().strip("*").strip()
            elif current_key is not None:
                # Wrapped text belonging to the most recent label.
                fields[current_key] = f"{fields[current_key]} {text}".strip()

        return fields

    def generate_qa_pair(self, legal_section: Dict[str, str], question_type: str) -> Dict[str, str]:
        """Generate one Q&A pair about ``legal_section``.

        Args:
            legal_section: Dict providing ``section_number``, ``title`` and
                ``provision`` (a missing key raises ``KeyError``).
            question_type: Angle the question should focus on.

        Returns:
            Dict with keys ``section_number``, ``section_title``, ``provision``,
            ``question_type``, ``question`` and ``answer``. Parts the reply did
            not contain are empty strings. If the request fails or the reply is
            empty, ``question``/``answer`` hold fixed error placeholders instead
            of an exception being raised.
        """

        prompt = f"""Based on this SYNTHETIC legal section, create a {question_type}-type question and answer:

Section: {legal_section['section_number']}
Title: {legal_section['title']}
Provision: {legal_section['provision']}

Create ONE question (focusing on {question_type}) and a clear, accurate answer based on this provision.

Format:
Q: [question]
A: [answer]

Keep it educational and clear."""

        # Fields copied from the section regardless of how generation goes.
        record_base = {
            "section_number": legal_section['section_number'],
            "section_title": legal_section['title'],
            "provision": legal_section['provision'],
            "question_type": question_type,
        }

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are creating educational Q&A pairs from synthetic legal content."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=350
            )

            # The message content can be None; treat that like an empty reply.
            content = (response.choices[0].message.content or "").strip()
            if not content:
                raise ValueError("model returned an empty response")

            return {**record_base, **self._parse_qa(content)}

        except Exception as e:
            # Best-effort by design: callers get recognisable placeholders.
            print(f"Error generating Q&A: {e}")
            return {
                **record_base,
                "question": "Error generating question",
                "answer": "Error generating answer"
            }

In [None]:
class SyntheticDataPipeline:
    """Complete pipeline for synthetic legal Q&A generation.

    Chains a legal-section generator and a Q&A generator, and keeps the
    accumulated records in ``self.dataset``.
    """

    # Annotations are quoted so this cell does not depend on the generator
    # classes having been defined before it.
    def __init__(self, legal_gen: "SyntheticLegalGenerator", qa_gen: "SyntheticQAGenerator"):
        """Store both generators and start with an empty dataset."""
        self.legal_gen = legal_gen
        self.qa_gen = qa_gen
        self.dataset: List[Dict[str, str]] = []

    def generate_complete_entry(self, topic: Optional[str] = None, question_type: Optional[str] = None) -> Dict[str, str]:
        """Generate a synthetic legal section and a Q&A pair for it.

        Args:
            topic: Seed topic; a random entry of ``LEGAL_TOPIC_SEEDS`` when None.
            question_type: Question angle; a random entry of ``QUESTION_TYPES``
                when None.

        Returns:
            The Q&A record produced by the Q&A generator, with the seed topic
            stored under ``"topic"`` if the record does not already carry one.
            The record is NOT appended to ``self.dataset`` here.
        """
        if topic is None:
            topic = random.choice(LEGAL_TOPIC_SEEDS)

        if question_type is None:
            question_type = random.choice(QUESTION_TYPES)

        # Step 1: synthetic legal section; Step 2: Q&A derived from it.
        legal_section = self.legal_gen.generate_legal_section(topic)
        qa_pair = self.qa_gen.generate_qa_pair(legal_section, question_type)

        # Keep the seed topic on the record so summaries can group by topic
        # rather than by the (practically unique) section title.
        qa_pair.setdefault("topic", topic)

        return qa_pair

    def generate_batch(self, count: int, progress_callback=None) -> List[Dict[str, str]]:
        """Generate ``count`` random entries and append each to the dataset.

        Args:
            count: Number of entries to generate; values <= 0 generate nothing.
            progress_callback: Optional callable invoked before each entry as
                ``progress_callback(fraction, desc=...)``.

        Returns:
            The list of entries generated by this call.
        """
        batch = []

        for i in range(count):
            if progress_callback:
                progress_callback((i + 1) / count, desc=f"Generating {i+1}/{count}...")

            entry = self.generate_complete_entry()
            batch.append(entry)
            self.dataset.append(entry)

        return batch

    def save_dataset(self, filename: str = "synthetic_legal_qa.json") -> str:
        """Write the dataset to ``filename`` as UTF-8 JSON.

        Returns:
            A human-readable status string; errors are reported in the string
            rather than raised.
        """
        try:
            # Serialise before opening the file so that a serialisation error
            # cannot leave a truncated file in place of an existing one.
            payload = json.dumps(self.dataset, indent=2, ensure_ascii=False)
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(payload)
            return f"✅ Saved {len(self.dataset)} synthetic Q&A pairs to {filename}"
        except Exception as e:
            return f"❌ Error saving: {e}"

    def get_summary(self) -> str:
        """Return a markdown summary: total record count and records per topic.

        Records without a ``"topic"`` value are grouped under their section
        title, or ``"Unknown"`` if that is missing too.
        """
        if not self.dataset:
            return "No synthetic data generated yet."

        summary = f"**Total Synthetic Q&A Pairs:** {len(self.dataset)}\n\n"
        summary += "**Topics Covered:**\n"

        topics = {}
        for entry in self.dataset:
            topic = entry.get('topic') or entry.get('section_title', 'Unknown')
            topics[topic] = topics.get(topic, 0) + 1

        for topic, count in topics.items():
            summary += f"- {topic}: {count}\n"

        return summary

In [None]:
# Build both generators on the shared client (each uses its default model) and
# combine them into the module-level pipeline that the UI callbacks operate on.
legal_generator = SyntheticLegalGenerator(client)
qa_generator = SyntheticQAGenerator(client)
pipeline = SyntheticDataPipeline(legal_generator, qa_generator)

print("✅ Synthetic data pipeline initialized!")

In [None]:
# Cell 8: UI functions with real-time progress updates
def generate_single_synthetic(topic_choice: str, question_type: str, progress=gr.Progress()):
    """Generate a single synthetic entry, streaming intermediate UI updates.

    This is a generator: each ``yield`` is a ``(markdown_output, summary_text)``
    pair for the two output components.

    Args:
        topic_choice: Seed topic for the legal section.
        question_type: Angle the generated question should focus on.
        progress: Gradio progress tracker.

    The record is appended to ``pipeline.dataset`` only when both generation
    steps produced usable content. Error placeholders and unparsed (empty)
    fields are reported to the user and kept out of the dataset.
    """

    # Step 1: Generate legal section
    progress(0.2, desc="🔍 Generating synthetic legal section...")
    yield "⏳ Creating synthetic legal provision...", pipeline.get_summary()

    legal_section = pipeline.legal_gen.generate_legal_section(topic_choice)

    # The section generator signals failure with an "IPC 000"/"Error"
    # placeholder; an empty provision means the reply could not be parsed.
    section_failed = (
        not legal_section.get('provision')
        or (legal_section.get('section_number') == "IPC 000" and legal_section.get('title') == "Error")
    )
    if section_failed:
        # Stop here: a Q&A about a failed section would waste a request and
        # only produce garbage.
        reason = legal_section.get('provision') or "The model response could not be parsed."
        failure = "### ⚠️ Could not generate a legal section\n\n"
        failure += f"{reason}\n\n"
        failure += "---\n❌ **Nothing was added to the dataset.** Please try again."
        progress(1.0, desc="⚠️ Generation failed")
        yield failure, pipeline.get_summary()
        return

    # Show intermediate result
    intermediate = f"### 📜 Generated Section\n\n"
    intermediate += f"**{legal_section['section_number']}**: {legal_section['title']}\n\n"
    intermediate += f"_{legal_section['provision']}_\n\n"
    intermediate += "⏳ Now generating Q&A pair..."

    progress(0.5, desc="💭 Creating Q&A pair...")
    yield intermediate, pipeline.get_summary()

    # Step 2: Generate Q&A
    qa_pair = pipeline.qa_gen.generate_qa_pair(legal_section, question_type)

    # The Q&A generator signals failure with a fixed placeholder question;
    # empty fields mean the reply did not follow the Q:/A: layout.
    qa_failed = (
        not qa_pair.get('question')
        or not qa_pair.get('answer')
        or qa_pair.get('question') == "Error generating question"
    )
    if qa_failed:
        failure = f"### 📜 {legal_section['section_number']}: {legal_section['title']}\n\n"
        failure += f"_{legal_section['provision']}_\n\n"
        failure += "---\n❌ **Q&A generation failed — nothing was added to the dataset.** Please try again."
        progress(1.0, desc="⚠️ Generation failed")
        yield failure, pipeline.get_summary()
        return

    # Keep the seed topic with the record so saved data can be grouped by it.
    qa_pair.setdefault("topic", legal_section.get("topic", topic_choice))
    pipeline.dataset.append(qa_pair)

    progress(0.9, desc="✨ Finalizing...")

    # Final result
    result = f"### 🏛️ {qa_pair['section_number']}: {qa_pair['section_title']}\n\n"
    result += f"**Provision:** {qa_pair['provision']}\n\n"
    result += f"**Question Type:** _{qa_pair['question_type']}_\n\n"
    result += f"**Q:** {qa_pair['question']}\n\n"
    result += f"**A:** {qa_pair['answer']}\n\n"
    result += "---\n✅ **Added to dataset!**"

    progress(1.0, desc="✅ Complete!")
    yield result, pipeline.get_summary()

def generate_batch_synthetic(num_pairs: int, progress=gr.Progress()):
    """Generate a batch of random entries, streaming the output after each one.

    This is a generator: each ``yield`` is a ``(markdown_output, summary_text)``
    pair for the two output components.

    Args:
        num_pairs: Number of entries to attempt (converted with ``int``).
        progress: Gradio progress tracker.

    Entries whose generation failed (error placeholders or empty parsed fields)
    are listed as skipped and are not appended to ``pipeline.dataset``. The
    closing message reports how many entries actually succeeded.
    """

    count = int(num_pairs)
    rendered = ""   # markdown accumulated so far; grows by one entry per step
    failures = 0

    for i in range(count):
        # Update progress
        progress((i + 1) / count, desc=f"🔄 Generating {i+1}/{count}...")

        # Generate entry
        entry = pipeline.generate_complete_entry()

        # Failure signals: the section placeholder ("IPC 000"/"Error"), the
        # fixed Q&A placeholder question, or fields left empty by parsing.
        section_failed = (
            not entry.get('provision')
            or (entry.get('section_number') == "IPC 000" and entry.get('section_title') == "Error")
        )
        qa_failed = (
            not entry.get('question')
            or not entry.get('answer')
            or entry.get('question') == "Error generating question"
        )

        if section_failed or qa_failed:
            failures += 1
            rendered += f"### {i+1}. ⚠️ Generation failed — entry skipped\n\n"
        else:
            pipeline.dataset.append(entry)
            rendered += f"### {i+1}. {entry['section_number']}: {entry['section_title']}\n"
            rendered += f"**Q:** {entry['question']}\n"
            rendered += f"**A:** {entry['answer']}\n\n"

        # Yield intermediate results to update UI in real-time
        yield rendered + f"\n---\n⏳ **Progress: {i+1}/{count} completed**", pipeline.get_summary()

    # Final output
    if failures:
        closing = f"\n---\n⚠️ **{count - failures} of {count} Q&A pairs generated; {failures} failed and were skipped.**"
    else:
        closing = f"\n---\n✅ **All {count} Q&A pairs generated successfully!**"

    progress(1.0, desc="✅ Batch complete!")
    yield rendered + closing, pipeline.get_summary()

def save_synthetic_dataset():
    """Write the pipeline's dataset to its default JSON file.

    Returns the status string produced by ``pipeline.save_dataset()``.
    """
    status_message = pipeline.save_dataset()
    return status_message

def clear_dataset():
    """Empty the pipeline's dataset in place.

    Returns a ``(status_message, summary_text)`` pair, where the summary is
    computed after the dataset has been emptied.
    """
    del pipeline.dataset[:]
    refreshed_summary = pipeline.get_summary()
    return "✅ Dataset cleared!", refreshed_summary

In [None]:
# Cell 9: Enhanced UI with real-time updates
# Three-tab Gradio app: single generation, batch generation, dataset management.
# Every tab re-reads the dataset summary when it is selected, so a summary box
# never shows a state that another tab has since changed (for example the
# Manage tab's overview after a generation, or a generation tab after "Clear").
with gr.Blocks(title="Synthetic Legal Q&A Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Synthetic Legal Q&A Data Generator")
    gr.Markdown("**Generates completely synthetic Indian legal sections AND Q&A pairs from scratch**")
    gr.Markdown("_Watch the magic happen in real-time! 🎬_")
    
    with gr.Tab("🎯 Single Generation") as single_tab:
        gr.Markdown("### Generate one synthetic legal section with Q&A")
        gr.Markdown("_See each step of generation as it happens_")
        
        with gr.Row():
            with gr.Column(scale=1):
                topic_dropdown = gr.Dropdown(
                    choices=LEGAL_TOPIC_SEEDS,
                    label="🎯 Select Legal Topic",
                    value=LEGAL_TOPIC_SEEDS[0]
                )
                qtype_dropdown = gr.Dropdown(
                    choices=QUESTION_TYPES,
                    label="❓ Question Type",
                    value=QUESTION_TYPES[0]
                )
                gen_single_btn = gr.Button(
                    "🎲 Generate Synthetic Entry", 
                    variant="primary",
                    size="lg"
                )
            
            with gr.Column(scale=2):
                output_single = gr.Markdown(
                    label="Generated Content",
                    value="Click **Generate** to create synthetic legal content..."
                )
        
        summary_single = gr.Textbox(
            label="📊 Dataset Summary", 
            lines=6,
            interactive=False
        )
        
        # Generator callback: streams (markdown, summary) pairs as it progresses.
        gen_single_btn.click(
            fn=generate_single_synthetic,
            inputs=[topic_dropdown, qtype_dropdown],
            outputs=[output_single, summary_single]
        )
        
        # Re-read the summary whenever this tab is opened.
        single_tab.select(
            fn=pipeline.get_summary,
            inputs=None,
            outputs=[summary_single]
        )
    
    with gr.Tab("🚀 Batch Generation") as batch_tab:
        gr.Markdown("### Generate multiple synthetic legal Q&A pairs")
        gr.Markdown("_Live updates as each Q&A pair is created!_")
        
        with gr.Row():
            with gr.Column(scale=1):
                num_slider = gr.Slider(
                    minimum=5,
                    maximum=1000,
                    value=5,
                    step=5,
                    label="📦 Number of Synthetic Q&A Pairs"
                )
                gr.Markdown("**Tip:** Start with 10-20 pairs to see live generation")
                gen_batch_btn = gr.Button(
                    "🔥 Generate Batch", 
                    variant="primary",
                    size="lg"
                )
            
            with gr.Column(scale=2):
                output_batch = gr.Markdown(
                    label="Generated Synthetic Data",
                    value="Click **Generate Batch** to start creating multiple Q&A pairs..."
                )
        
        summary_batch = gr.Textbox(
            label="📊 Dataset Summary", 
            lines=6,
            interactive=False
        )
        
        # Generator callback: streams (markdown, summary) pairs after each entry.
        gen_batch_btn.click(
            fn=generate_batch_synthetic,
            inputs=[num_slider],
            outputs=[output_batch, summary_batch]
        )
        
        # Re-read the summary whenever this tab is opened.
        batch_tab.select(
            fn=pipeline.get_summary,
            inputs=None,
            outputs=[summary_batch]
        )
    
    with gr.Tab("💾 Manage Dataset") as manage_tab:
        gr.Markdown("### Save or Clear Your Synthetic Dataset")
        
        with gr.Row():
            with gr.Column():
                gr.Markdown("**💾 Save your generated data**")
                gr.Markdown("Exports all Q&A pairs to `synthetic_legal_qa.json`")
                save_btn = gr.Button(
                    "💾 Save to JSON", 
                    variant="primary",
                    size="lg"
                )
            
            with gr.Column():
                gr.Markdown("**🗑️ Clear current dataset**")
                gr.Markdown("⚠️ This will remove all generated Q&A pairs")
                clear_btn = gr.Button(
                    "🗑️ Clear Dataset", 
                    variant="stop",
                    size="lg"
                )
        
        manage_status = gr.Textbox(
            label="Status", 
            lines=2,
            interactive=False
        )
        # The initial value is computed once when the layout is built; the
        # select handler below keeps it current afterwards.
        manage_summary = gr.Textbox(
            label="Current Dataset Overview", 
            lines=10,
            interactive=False,
            value=pipeline.get_summary()
        )
        
        save_btn.click(
            fn=save_synthetic_dataset,
            inputs=[],
            outputs=[manage_status]
        )
        
        clear_btn.click(
            fn=clear_dataset,
            inputs=[],
            outputs=[manage_status, manage_summary]
        )
        
        # Without this the overview would stay at its build-time value and
        # never reflect entries generated on the other tabs.
        manage_tab.select(
            fn=pipeline.get_summary,
            inputs=None,
            outputs=[manage_summary]
        )
    
    # Footer
    gr.Markdown("---")
    gr.Markdown("🎓 **LLM Engineering Week 3** | Synthetic Data Generation Challenge")

# Serve locally only (no public share link) and open a browser tab.
demo.launch(share=False, inbrowser=True)