# AI-Powered Malaysian Legal PDF Analysis
## Extract PDF Content with Unstructured + Label Context with OpenAI/Gemini

This notebook demonstrates how to:
1. Extract text from Malaysian legal PDFs using the `unstructured` library
2. Use AI models (OpenAI GPT-4 or Google Gemini) to intelligently label and categorize content
3. Optimize for Google Colab GPU environment
4. Export structured, labeled results for legal AI/RAG systems

**Perfect for:** Legal document processing, content classification, and building intelligent legal knowledge bases

---

### 🚀 **Optimized for Google Colab GPU Runtime**
- Make sure to enable GPU: Runtime → Change runtime type → Hardware accelerator: GPU

## 📦 Install Required Libraries

Install all necessary packages for PDF processing and AI model integration.

In [None]:
# Install required packages for Google Colab
!pip install -q unstructured[pdf] 
!pip install -q openai>=1.0.0
!pip install -q google-generativeai
!pip install -q python-dotenv
!pip install -q tqdm
!pip install -q pandas
!pip install -q matplotlib
!pip install -q seaborn

# Install additional dependencies for unstructured PDF processing
!apt-get update -qq
!apt-get install -qq poppler-utils tesseract-ocr

print("‚úÖ All packages installed successfully!")

## 🔧 Import Dependencies and Setup API Keys

Import all necessary libraries and configure secure API key handling.

In [None]:
import os
import json
import time
from pathlib import Path
from typing import List, Dict, Any, Optional
import logging
from datetime import datetime

# PDF Processing
from unstructured.partition.pdf import partition_pdf

# AI Model Libraries
import openai
import google.generativeai as genai

# Data Processing
import pandas as pd
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("‚úÖ All libraries imported successfully!")

# API Key Setup
print("\nüîë API Key Configuration:")
print("Please set your API keys using one of these methods:")
print("1. Use Colab Secrets (recommended)")
print("2. Set environment variables")
print("3. Direct assignment (not recommended for production)")

# Secure API key handling for Colab. The secrets helper only exists inside
# Colab, so its absence simply means "use environment variables".
try:
    from google.colab import userdata
except ImportError:
    userdata = None


def _load_api_key(name):
    """Return the API key called *name*, or None when it is not configured.

    Colab secrets are tried first. Each key is looked up independently, so a
    missing or inaccessible secret for one provider does not discard the key
    of the other provider. The environment variable of the same name is the
    fallback.
    """
    if userdata is not None:
        try:
            secret = userdata.get(name)
        except Exception as exc:
            # Secret not defined, or notebook access not granted: fall through
            # to the environment instead of aborting the whole setup.
            logger.info("Colab secret %s unavailable (%s); trying environment", name, type(exc).__name__)
        else:
            if secret:
                return secret
    return os.getenv(name)


OPENAI_API_KEY = _load_api_key('OPENAI_API_KEY')
GEMINI_API_KEY = _load_api_key('GEMINI_API_KEY')

if userdata is not None:
    print("‚úÖ Using Colab secrets for API keys")

if not OPENAI_API_KEY and not GEMINI_API_KEY:
    print("‚ö†Ô∏è  No API keys found. Please set them manually:")
    print("OPENAI_API_KEY = 'your-openai-key-here'")
    print("GEMINI_API_KEY = 'your-gemini-key-here'")

# Set API keys
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY
    print("‚úÖ OpenAI API key configured")

if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    print("‚úÖ Gemini API key configured")

## 🚀 Configure GPU and Environment

Check GPU availability and optimize environment for efficient processing.

In [None]:
# Check GPU availability
try:
    import torch
    gpu_available = torch.cuda.is_available()
    if gpu_available:
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"‚úÖ GPU Available: {gpu_name}")
        print(f"üìä GPU Memory: {gpu_memory:.1f} GB")
        
        # Set optimal settings for GPU
        torch.backends.cudnn.benchmark = True
    else:
        print("‚ö†Ô∏è  No GPU detected. Using CPU (will be slower)")
except ImportError:
    print("‚ö†Ô∏è  PyTorch not available. Install if you need GPU acceleration.")

# Check system resources. psutil is not among the packages this notebook
# installs explicitly, so fall back to the standard library when it is missing.
try:
    import psutil
except ImportError:
    psutil = None

if psutil is not None:
    # psutil.cpu_count() returns None when the count cannot be determined,
    # which would break the min() call below.
    cpu_count = psutil.cpu_count() or os.cpu_count() or 1
    memory_gb = psutil.virtual_memory().total / 1024**3
else:
    cpu_count = os.cpu_count() or 1
    memory_gb = None  # total RAM is unknown without psutil

print(f"üñ•Ô∏è  CPU cores: {cpu_count}")
if memory_gb is not None:
    print(f"üíæ RAM: {memory_gb:.1f} GB")
else:
    print("üíæ RAM: unknown (psutil not installed)")

# Optimize environment settings
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Avoid tokenizer warnings
os.environ['OMP_NUM_THREADS'] = str(min(4, cpu_count))  # Cap OpenMP threads at 4

print("‚úÖ Environment optimized for processing")

## 📄 Load and Extract PDF Content

Use unstructured library to extract elements from Malaysian legal PDFs with high accuracy.

In [None]:
# shutil is used by copy_pdf_to_workspace() regardless of the environment, so
# it is imported unconditionally rather than only when Colab is present.
import shutil

# Detect Google Colab by probing for its Drive helper module
try:
    from google.colab import drive
    print("‚úÖ Google Colab environment detected")
    COLAB_ENV = True
except ImportError:
    print("‚ö†Ô∏è  Not in Google Colab - manual file upload required")
    COLAB_ENV = False

def mount_and_find_pdfs():
    """Mount Google Drive and list the PDF files found on it.

    Returns:
        Sorted list of unique PDF paths. Empty when not running in Colab or
        when mounting/searching raises.
    """
    if not COLAB_ENV:
        print("‚ùå This function requires Google Colab environment")
        return []

    try:
        import glob

        print("üîó Mounting Google Drive...")
        drive.mount('/content/drive')
        print("‚úÖ Google Drive mounted successfully!")

        # Locations searched recursively for PDFs
        drive_globs = (
            '/content/drive/MyDrive/**/*.pdf',
            '/content/drive/MyDrive/legal_documents/**/*.pdf',
            '/content/drive/MyDrive/malaysian_acts/**/*.pdf',
            '/content/drive/MyDrive/PDFs/**/*.pdf',
            '/content/drive/Shareddrives/**/*.pdf',
        )

        print("üîç Searching for PDF files...")

        # A set collapses paths matched by more than one pattern
        matches = set()
        for drive_glob in drive_globs:
            matches.update(glob.glob(drive_glob, recursive=True))
        unique_pdfs = sorted(matches)

        print(f"üìÅ Found {len(unique_pdfs)} PDF files in Google Drive")

        if unique_pdfs:
            print("\nüìã Available PDF files:")
            # Only the first 20 entries are listed, numbered from 1
            for position, pdf_path in enumerate(unique_pdfs[:20], start=1):
                size_mb = os.path.getsize(pdf_path) / 1024 / 1024
                print(f"   {position:2d}. {Path(pdf_path).name} ({size_mb:.1f} MB)")
                print(f"       üìÇ {pdf_path}")

            not_shown = len(unique_pdfs) - 20
            if not_shown > 0:
                print(f"   ... and {not_shown} more files")

        return unique_pdfs

    except Exception as e:
        print(f"‚ùå Error accessing Google Drive: {e}")
        return []

def copy_pdf_to_workspace(drive_pdf_path: str, workspace_name: str = None) -> str:
    """Copy a PDF into ``/content`` so it can be processed locally.

    Args:
        drive_pdf_path: Source path of the PDF.
        workspace_name: File name to use under ``/content``; defaults to the
            source file's own name.

    Returns:
        The destination path, or an empty string if anything goes wrong.
    """
    try:
        source_name = Path(drive_pdf_path).name
        target_name = source_name if workspace_name is None else workspace_name
        workspace_path = f"/content/{target_name}"

        print(f"üìã Copying {source_name} to workspace...")
        shutil.copy2(drive_pdf_path, workspace_path)

        size_mb = os.path.getsize(workspace_path) / 1024 / 1024
        print(f"‚úÖ PDF copied to: {workspace_path} ({size_mb:.1f} MB)")
    except Exception as e:
        print(f"‚ùå Error copying PDF: {e}")
        return ""

    return workspace_path

def extract_pdf_content(pdf_path: str, max_pages: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Extract content from PDF using unstructured library with high accuracy settings.

    The whole PDF is partitioned first; ``max_pages`` is applied afterwards as
    a filter on each element's page number, so it limits the output but not
    the extraction work.

    Args:
        pdf_path: Path to the PDF file (local or Google Drive)
        max_pages: Maximum number of pages to process (None for all pages)

    Returns:
        List of dicts with keys ``index``, ``page_number``, ``element_type``,
        ``text``, ``character_count``, ``coordinates`` and ``metadata``,
        sorted by page and extraction order. Elements whose stripped text is
        5 characters or shorter are dropped. Returns an empty list if
        extraction fails.
    """
    logger.info(f"üîç Extracting content from: {Path(pdf_path).name}")

    try:
        # Configure extraction settings for maximum accuracy
        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",  # High resolution for legal documents
            infer_table_structure=True,  # Detect tables
            extract_images_in_pdf=False,  # Skip images for text focus
            include_page_breaks=True,  # Preserve page structure
        )

        logger.info(f"‚úÖ Extracted {len(elements)} elements from entire PDF")

        source_name = Path(pdf_path).name
        structured_elements = []
        # Page number carried forward for elements that do not report one.
        # A None page number would otherwise make the page-limit comparison
        # and the final sort raise TypeError and discard the whole document.
        current_page = 1

        for i, element in enumerate(elements):
            metadata = getattr(element, 'metadata', None)

            page_number = getattr(metadata, 'page_number', None)
            if page_number is not None:
                current_page = page_number
            page_num = current_page

            # Keep None as None instead of storing the string "None"
            raw_coordinates = getattr(metadata, 'coordinates', None)
            coordinates = str(raw_coordinates) if raw_coordinates is not None else None

            # Filter by page limit if specified
            if max_pages and page_num > max_pages:
                continue

            # Measure the same stripped text that is stored, so whitespace-only
            # elements do not pass the minimum-length filter below.
            text = str(element).strip()
            if len(text) <= 5:
                continue  # Remove tiny elements

            structured_elements.append({
                'index': i,
                'page_number': page_num,
                'element_type': str(type(element).__name__),
                'text': text,
                'character_count': len(text),
                'coordinates': coordinates,
                'metadata': {
                    'extraction_timestamp': datetime.now().isoformat(),
                    'source_file': source_name
                }
            })

        # Sort by page, then by original extraction order
        structured_elements.sort(key=lambda x: (x['page_number'], x['index']))

        logger.info(f"üìä Final elements: {len(structured_elements)} (after filtering)")
        return structured_elements

    except Exception as e:
        logger.error(f"‚ùå Error extracting PDF: {e}")
        return []

def process_drive_pdf(pdf_index: Optional[int] = None, pdf_path: Optional[str] = None, max_pages: int = 3):
    """
    Process a PDF from Google Drive by index or path

    Selection order: ``pdf_index`` if given, else ``pdf_path`` if given, else
    the first PDF found. An explicit ``pdf_index`` of 0 is reported as an
    invalid index rather than being treated as "not given".

    Args:
        pdf_index: Index of PDF from the found list (1-based)
        pdf_path: Direct path to PDF file (must be one of the found paths)
        max_pages: Maximum pages to process

    Returns:
        The list of extracted elements, or None when not in Colab, no PDF is
        found, the selection is invalid, the copy fails, or extraction
        yields nothing.
    """
    if not COLAB_ENV:
        print("‚ùå Google Drive integration requires Google Colab environment")
        return None

    # Get available PDFs
    available_pdfs = mount_and_find_pdfs()

    if not available_pdfs:
        print("‚ùå No PDF files found in Google Drive")
        return None

    # Select PDF
    if pdf_index is not None:
        if 1 <= pdf_index <= len(available_pdfs):
            selected_pdf = available_pdfs[pdf_index - 1]
        else:
            print(f"‚ùå Invalid index. Please choose 1-{len(available_pdfs)}")
            return None
    elif pdf_path:
        if pdf_path in available_pdfs:
            selected_pdf = pdf_path
        else:
            print(f"‚ùå PDF not found: {pdf_path}")
            return None
    else:
        # Use first PDF as example
        selected_pdf = available_pdfs[0]
        print(f"üìù Using first PDF as example: {Path(selected_pdf).name}")

    # Copy to workspace and process
    workspace_pdf = copy_pdf_to_workspace(selected_pdf)
    if not workspace_pdf:
        # copy_pdf_to_workspace already printed the reason
        return None

    print(f"\nüîÑ Processing PDF: {Path(workspace_pdf).name}")
    elements = extract_pdf_content(workspace_pdf, max_pages=max_pages)

    if elements:
        print(f"‚úÖ Successfully extracted {len(elements)} elements")
        return elements

    print("‚ùå Failed to extract content")
    return None

# Demo usage and setup
print("üöÄ Google Drive PDF Processing Setup")
print("=" * 50)

if COLAB_ENV:
    print("üì± Available commands:")
    print("1. mount_and_find_pdfs() - Find all PDFs in your Google Drive")
    print("2. process_drive_pdf(pdf_index=1) - Process first PDF")
    print("3. process_drive_pdf(pdf_index=2, max_pages=5) - Process 2nd PDF, 5 pages")
    print("\nüéØ Quick start:")
    print("   available_pdfs = mount_and_find_pdfs()")
    print("   elements = process_drive_pdf(pdf_index=1, max_pages=3)")
    
    # Uncomment the next line to automatically find PDFs
    # available_pdfs = mount_and_find_pdfs()
    
else:
    print("üìÅ Manual PDF upload required (not in Colab)")
    print("1. Upload your PDF to Colab files")
    print("2. Use: elements = extract_pdf_content('/content/your_file.pdf')")
    
    # Example for manual upload
    pdf_path = "/content/sample_legal_document.pdf"  # Update this path
    max_pages = 3
    
    print(f"\nüîÑ Ready to extract from: {pdf_path}")
    print(f"üìÑ Max pages: {max_pages}")
    print("# Uncomment when ready: elements = extract_pdf_content(pdf_path, max_pages=max_pages)")

## 🤖 Setup AI Model Clients

Initialize OpenAI GPT and Google Gemini clients for intelligent content labeling.

In [None]:
class AILabelingSystem:
    """Unified system for labeling PDF content using OpenAI or Gemini.

    ``openai_client`` and ``gemini_model`` stay ``None`` unless the matching
    API key is set and client construction succeeds; callers check them for
    truthiness before use.
    """
    
    def __init__(self):
        self.openai_client = None
        self.gemini_model = None
        self.setup_clients()
    
    def setup_clients(self):
        """Initialize available AI model clients.

        Reads the module-level OPENAI_API_KEY / GEMINI_API_KEY. A failure to
        build one client is logged and does not prevent the other from being
        set up.
        """
        
        # Setup OpenAI client
        if OPENAI_API_KEY:
            try:
                self.openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
                logger.info("‚úÖ OpenAI client initialized")
            except Exception as e:
                logger.error(f"‚ùå OpenAI setup failed: {e}")
        
        # Setup Gemini client
        if GEMINI_API_KEY:
            try:
                self.gemini_model = genai.GenerativeModel('gemini-1.5-flash')
                logger.info("‚úÖ Gemini client initialized")
            except Exception as e:
                logger.error(f"‚ùå Gemini setup failed: {e}")
        
        if not self.openai_client and not self.gemini_model:
            logger.warning("‚ö†Ô∏è  No AI models available. Please configure API keys.")
    
    def test_models(self):
        """Send one small request to each configured model and print the outcome.

        Models that are not configured are skipped silently; request failures
        are printed rather than raised.
        """
        test_text = "This is a test of the legal document analysis system."
        
        # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
        print("üß™ Testing AI Models:\n")
        
        # Test OpenAI
        if self.openai_client:
            try:
                response = self.openai_client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": f"Classify this text: {test_text}"}
                    ],
                    max_tokens=50
                )
                print("‚úÖ OpenAI GPT: Working")
                print(f"   Response: {response.choices[0].message.content.strip()}")
            except Exception as e:
                print(f"‚ùå OpenAI test failed: {e}")
        
        # Test Gemini
        if self.gemini_model:
            try:
                response = self.gemini_model.generate_content(
                    f"Classify this text in one sentence: {test_text}"
                )
                print("‚úÖ Gemini: Working")
                print(f"   Response: {response.text.strip()}")
            except Exception as e:
                print(f"‚ùå Gemini test failed: {e}")

# Initialize the AI labeling system (builds whichever clients have API keys);
# later cells read this module-level instance.
ai_system = AILabelingSystem()

# Test the models: sends one small request per configured model and prints the result
ai_system.test_models()

## 🏷️ Create Context Labeling Functions

Define intelligent functions to classify and label legal document content using AI models.

In [None]:
def create_labeling_prompt(text: str, element_type: str) -> str:
    """Build the classification prompt sent to the model for one element.

    The prompt embeds ``text`` in double quotes, states ``element_type``, and
    asks for a JSON object with ten named fields.
    """
    # The ten fields the model is asked to return, one prompt line each
    requested_fields = (
        '1. "content_category": Main category (e.g., "section_header", "legal_definition", "procedural_requirement", "penalty_clause", "table_of_contents", "preamble", "schedule", "interpretation")',
        '2. "legal_significance": Level of legal importance (e.g., "high", "medium", "low")',
        '3. "subject_matter": What the text is about (e.g., "criminal_procedure", "corporate_law", "tax_provisions", "regulatory_compliance")',
        '4. "contains_definitions": Whether it contains legal definitions (true/false)',
        '5. "contains_penalties": Whether it mentions penalties or sanctions (true/false)',
        '6. "references_other_sections": Whether it references other legal sections (true/false)',
        '7. "actionable_requirements": Whether it contains specific requirements or obligations (true/false)',
        '8. "confidence_score": Your confidence in this classification (0.0 to 1.0)',
        '9. "keywords": Key legal terms found in the text (array)',
        '10. "summary": Brief summary of the content (1-2 sentences)',
    )

    prompt_lines = [
        "You are an expert legal document analyst specializing in Malaysian law. ",
        "Analyze the following text from a legal document and provide a comprehensive classification.",
        "",
        "TEXT TO ANALYZE:",
        f'"{text}"',
        "",
        f"ELEMENT TYPE: {element_type}",
        "",
        "Please provide a JSON response with the following fields:",
        *requested_fields,
        "",
        "Respond with valid JSON only.",
    ]
    return "\n".join(prompt_lines).strip()

def _parse_label_json(raw_content) -> Dict[str, Any]:
    """Parse a model reply into a dict, tolerating Markdown fences and stray prose.

    Handles replies wrapped in ```json ... ``` or plain ``` ... ``` fences.
    If the remaining text is not valid JSON, the span from the first "{" to
    the last "}" is tried. Raises ``json.JSONDecodeError`` or ``ValueError``
    when no JSON object can be recovered.
    """
    content = (raw_content or "").strip()

    if content.startswith("```"):
        content = content[3:]
        # Optional language tag directly after the opening fence
        if content[:4].lower() == "json":
            content = content[4:]
    if content.endswith("```"):
        content = content[:-3]
    content = content.strip()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(content[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}")
    return parsed


def label_with_openai(ai_system, text: str, element_type: str) -> Dict[str, Any]:
    """Label content using OpenAI GPT

    Args:
        ai_system: Object exposing an ``openai_client`` attribute.
        text: Text content to label.
        element_type: Element type name included in the prompt.

    Returns:
        The parsed label dict plus ``model_used``, ``tokens_used`` (total
        tokens reported by the API, or None) and ``processing_time`` (request
        duration in seconds). On any failure, a dict with
        ``content_category`` "unknown", ``error`` and ``model_used``.
    """
    try:
        prompt = create_labeling_prompt(text, element_type)
        
        started = time.perf_counter()
        response = ai_system.openai_client.chat.completions.create(
            model="gpt-4",  # Use GPT-4 for better accuracy
            messages=[
                {"role": "system", "content": "You are a legal document analysis expert. Always respond with valid JSON."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=500,
            temperature=0.1  # Low temperature for consistent results
        )
        elapsed = time.perf_counter() - started
        
        result = _parse_label_json(response.choices[0].message.content)
        result['model_used'] = 'gpt-4'
        # usage may be absent or None; never let that discard a valid label
        usage = getattr(response, 'usage', None)
        result['tokens_used'] = getattr(usage, 'total_tokens', None)
        # Previously this key held the token count; it now holds elapsed seconds
        result['processing_time'] = round(elapsed, 3)
        
        return result
        
    except Exception as e:
        logger.error(f"OpenAI labeling failed: {e}")
        return {
            "content_category": "unknown",
            "error": str(e),
            "model_used": "gpt-4"
        }

def label_with_gemini(ai_system, text: str, element_type: str) -> Dict[str, Any]:
    """Label content using Google Gemini

    Args:
        ai_system: Object exposing a ``gemini_model`` attribute.
        text: Text content to label.
        element_type: Element type name included in the prompt.

    Returns:
        The parsed label dict plus ``model_used``. On any failure, a dict
        with ``content_category`` "unknown", ``error`` and ``model_used``.
    """
    try:
        prompt = create_labeling_prompt(text, element_type)
        
        response = ai_system.gemini_model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=0.1,
                max_output_tokens=500,
            )
        )
        
        result = _parse_label_json(response.text)
        result['model_used'] = 'gemini-1.5-flash'
        
        return result
        
    except Exception as e:
        logger.error(f"Gemini labeling failed: {e}")
        return {
            "content_category": "unknown", 
            "error": str(e),
            "model_used": "gemini-1.5-flash"
        }

def _pick_auto_model(ai_system, text: str) -> str:
    """Resolve "auto": OpenAI for texts over 200 chars, else Gemini, else OpenAI.

    Returns "auto" unchanged when neither client is available.
    """
    if ai_system.openai_client and len(text) > 200:
        return "openai"
    if ai_system.gemini_model:
        return "gemini"
    if ai_system.openai_client:
        return "openai"
    return "auto"


def smart_label_content(ai_system, text: str, element_type: str, preferred_model: str = "auto") -> Dict[str, Any]:
    """
    Label one element with the requested (or automatically chosen) model.

    Text shorter than 10 characters after stripping is labeled by rule and
    never sent to a model. If the chosen model's client is not available, an
    "unlabeled" result is returned and a warning is logged.

    Args:
        ai_system: AILabelingSystem instance
        text: Text content to label
        element_type: Type of element from unstructured
        preferred_model: "openai", "gemini", or "auto"
    """
    if len(text.strip()) < 10:
        return {
            "content_category": "minimal_content",
            "confidence_score": 0.9,
            "model_used": "rule_based"
        }

    chosen = _pick_auto_model(ai_system, text) if preferred_model == "auto" else preferred_model

    if chosen == "openai" and ai_system.openai_client:
        return label_with_openai(ai_system, text, element_type)
    if chosen == "gemini" and ai_system.gemini_model:
        return label_with_gemini(ai_system, text, element_type)

    logger.warning("No AI model available for labeling")
    return {
        "content_category": "unlabeled",
        "error": "No AI model available",
        "model_used": "none"
    }

print("‚úÖ Context labeling functions created!")
print("üéØ Available functions:")
print("   ‚Ä¢ smart_label_content() - Auto-select best model")
print("   ‚Ä¢ label_with_openai() - Use GPT-4")
print("   ‚Ä¢ label_with_gemini() - Use Gemini")
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nüß™ Test with a sample:")

# Test the labeling system. The sample uses a real newline between the section
# heading and its first subsection, as extracted text would.
sample_text = "31A. Establishment of the Commission\n(1) There is established a commission to be known as the Malaysian Communications and Multimedia Commission."
sample_result = smart_label_content(ai_system, sample_text, "NarrativeText")
print(f"Sample result: {json.dumps(sample_result, indent=2)}")

## 🔄 Process PDF Elements with AI

Extract PDF content and apply intelligent AI labeling to each element.

In [None]:
def process_pdf_with_ai_labeling(
    pdf_path: str, 
    max_pages: Optional[int] = None,
    preferred_model: str = "auto",
    rate_limit_delay: float = 1.0,
    batch_size: int = 10
) -> Dict[str, Any]:
    """
    Complete pipeline: Extract PDF content and label with AI

    Uses the module-level ``ai_system`` for labeling.

    Args:
        pdf_path: Path to PDF file
        max_pages: Max pages to process
        preferred_model: "openai", "gemini", or "auto"
        rate_limit_delay: Delay after each real API call (seconds)
        batch_size: Number of elements per progress-bar step (values below 1
            are treated as 1)

    Returns:
        Dict with ``document_info``, ``analysis_summary`` and
        ``labeled_elements``; or ``{"error": ..., "elements": []}`` when
        nothing could be extracted. ``api_calls_made`` counts only elements
        actually sent to a model, and ``errors`` counts every element whose
        labeling failed, whether by exception or by an ``error`` entry in the
        returned label.
    """
    
    logger.info(f"üöÄ Starting AI-powered PDF analysis: {Path(pdf_path).name}")
    start_time = time.time()
    
    # Step 1: Extract PDF content
    elements = extract_pdf_content(pdf_path, max_pages)
    if not elements:
        return {"error": "Failed to extract PDF content", "elements": []}
    
    logger.info(f"üìÑ Processing {len(elements)} elements with AI labeling...")
    
    # Step 2: Process elements with AI
    labeled_elements = []
    api_calls_made = 0
    errors = 0
    # range() rejects a zero step and a negative step would skip every element
    batch_size = max(1, batch_size)
    # model_used values that indicate no request was sent to any model
    no_call_models = ('rule_based', 'none')
    
    # Process in batches with progress bar
    for i in tqdm(range(0, len(elements), batch_size), desc="ü§ñ AI Labeling"):
        batch = elements[i:i + batch_size]
        
        for element in batch:
            try:
                # AI labeling
                ai_label = smart_label_content(
                    ai_system, 
                    element['text'], 
                    element['element_type'], 
                    preferred_model
                )
                
                # Combine original element with AI labels
                labeled_elements.append({
                    **element,  # Original element data
                    'ai_labels': ai_label,  # AI-generated labels
                    'processing_timestamp': datetime.now().isoformat()
                })
                
                # The labeling helpers report failures in the label itself
                if 'error' in ai_label:
                    errors += 1
                
                # Count and throttle only when a model was actually called
                if ai_label.get('model_used') not in no_call_models:
                    api_calls_made += 1
                    time.sleep(rate_limit_delay)
                
            except Exception as e:
                logger.error(f"Error processing element {element['index']}: {e}")
                errors += 1
                
                # Add element without AI labels
                labeled_elements.append({
                    **element,
                    'ai_labels': {'error': str(e), 'content_category': 'processing_failed'},
                    'processing_timestamp': datetime.now().isoformat()
                })
    
    # Step 3: Generate analysis summary
    processing_time = time.time() - start_time
    
    # Analyze results
    categories = {}
    high_importance = 0
    total_chars = 0
    
    for element in labeled_elements:
        ai_labels = element.get('ai_labels', {})
        category = ai_labels.get('content_category', 'unknown')
        categories[category] = categories.get(category, 0) + 1
        
        if ai_labels.get('legal_significance') == 'high':
            high_importance += 1
        
        total_chars += element.get('character_count', 0)
    
    total_labeled = len(labeled_elements)
    
    # Compile final results
    results = {
        'document_info': {
            'source_file': Path(pdf_path).name,
            'total_elements': total_labeled,
            'total_characters': total_chars,
            'pages_processed': max_pages if max_pages else "all",
            'processing_time_seconds': round(processing_time, 2),
            'api_calls_made': api_calls_made,
            'errors': errors
        },
        'analysis_summary': {
            'content_categories': categories,
            'high_importance_elements': high_importance,
            'category_distribution': {
                k: round(v / total_labeled * 100, 1) for k, v in categories.items()
            } if total_labeled else {},
            'avg_chars_per_element': round(total_chars / total_labeled) if total_labeled else 0
        },
        'labeled_elements': labeled_elements
    }
    
    logger.info(f"‚úÖ Analysis complete! Processed {total_labeled} elements in {processing_time:.1f}s")
    return results

# Example processing function
def run_example_analysis(pdf_path: str = "/content/sample_legal_document.pdf"):
    """Run a complete example analysis and print a summary.

    Processes the first 3 pages of ``pdf_path`` with automatic model selection
    and a 1 second rate-limit delay.

    Returns:
        The results dict from ``process_pdf_with_ai_labeling``, or None when
        the file does not exist or the analysis produced no document info.
    """
    
    if not Path(pdf_path).exists():
        print(f"‚ùå PDF not found: {pdf_path}")
        print("üìÅ Please upload your PDF to Colab and update the path")
        return None
    
    print(f"üîç Analyzing: {pdf_path}")
    print("‚öôÔ∏è  Settings: First 3 pages, auto model selection, 1s rate limit")
    
    # Run analysis
    results = process_pdf_with_ai_labeling(
        pdf_path=pdf_path,
        max_pages=3,  # First 3 pages only
        preferred_model="auto",
        rate_limit_delay=1.0,
        batch_size=5
    )
    
    if results and 'document_info' in results:
        # Display summary
        info = results['document_info']
        summary = results['analysis_summary']
        
        # Section headers start with "\n" (not "\\n") so they print a blank
        # line instead of a literal backslash-n.
        print("\nüìä ANALYSIS RESULTS:")
        print(f"üìÑ Elements processed: {info['total_elements']}")
        print(f"‚è±Ô∏è  Processing time: {info['processing_time_seconds']}s")
        print(f"ü§ñ API calls made: {info['api_calls_made']}")
        print(f"‚ö†Ô∏è  Errors: {info['errors']}")
        
        print("\nüè∑Ô∏è  CONTENT CATEGORIES:")
        for category, count in summary['content_categories'].items():
            percentage = summary['category_distribution'][category]
            print(f"   ‚Ä¢ {category}: {count} ({percentage}%)")
        
        print(f"\nüéØ High importance elements: {summary['high_importance_elements']}")
        
        return results
    
    return None

print("‚úÖ PDF processing functions ready!")
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
print("\nüöÄ To run analysis:")
print("1. Upload your PDF to Colab")
print("2. Update the pdf_path in run_example_analysis()")
print("3. Call: results = run_example_analysis()")

# Uncomment to run with your PDF
# results = run_example_analysis("/content/your_legal_document.pdf")

## ⚡ Batch Processing and Optimization

Advanced techniques for processing multiple PDFs efficiently with caching and GPU optimization.

In [None]:
import hashlib
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed

class OptimizedPDFProcessor:
    """Advanced PDF processor with caching and batch optimization

    Results are pickled into ``cache_dir``, keyed by the MD5 of the PDF's
    bytes combined with the processing options, so changing either the file
    or the options triggers a fresh run.
    """
    
    def __init__(self, cache_dir: str = "/content/pdf_cache"):
        self.cache_dir = Path(cache_dir)
        # parents=True so the cache can be created where the parent directory
        # (e.g. /content outside Colab) does not exist yet
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.processed_files = {}
        
    def get_file_hash(self, file_path: str) -> str:
        """Return the MD5 hex digest of the file's contents.

        The file is read in 1 MiB chunks so large PDFs are not loaded into
        memory at once. MD5 is used only as a cache key here.
        """
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(chunk)
        return digest.hexdigest()
    
    @staticmethod
    def _cache_key(file_hash: str, options: Dict[str, Any]) -> str:
        """Combine the file hash with the processing options into one cache key.

        With no options the key is the bare file hash, which keeps cache files
        written under the hash alone usable.
        """
        if not options:
            return file_hash
        options_repr = repr(sorted(options.items()))
        options_digest = hashlib.md5(options_repr.encode('utf-8')).hexdigest()[:12]
        return f"{file_hash}_{options_digest}"
    
    def load_from_cache(self, file_hash: str) -> Optional[Dict]:
        """Load processed results from cache; None if absent or unreadable."""
        cache_file = self.cache_dir / f"{file_hash}.pkl"
        if cache_file.exists():
            try:
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; only point cache_dir at a location you control.
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)
            except Exception as e:
                logger.warning(f"Cache load failed: {e}")
        return None
    
    def save_to_cache(self, file_hash: str, results: Dict):
        """Save results to cache; failures are logged, not raised."""
        cache_file = self.cache_dir / f"{file_hash}.pkl"
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(results, f)
        except Exception as e:
            logger.warning(f"Cache save failed: {e}")
    
    def process_single_pdf_optimized(
        self, 
        pdf_path: str,
        use_cache: bool = True,
        **kwargs
    ) -> Dict:
        """Process single PDF with caching

        Args:
            pdf_path: Path to the PDF file.
            use_cache: Read from and write to the on-disk cache.
            **kwargs: Forwarded to ``process_pdf_with_ai_labeling``; they are
                part of the cache key.

        Returns:
            The results dict, from cache or freshly computed. Results that
            contain an ``error`` key are returned but never cached, so a
            failed run is retried next time.
        """
        
        cache_key = self._cache_key(self.get_file_hash(pdf_path), kwargs)
        
        # Check cache first
        if use_cache:
            cached_result = self.load_from_cache(cache_key)
            if cached_result is not None:
                logger.info(f"üìÅ Loaded from cache: {Path(pdf_path).name}")
                return cached_result
        
        # Process if not cached
        logger.info(f"üîÑ Processing (not cached): {Path(pdf_path).name}")
        results = process_pdf_with_ai_labeling(pdf_path, **kwargs)
        
        # Save to cache (successful results only)
        if use_cache and results and "error" not in results:
            self.save_to_cache(cache_key, results)
        
        return results
    
    def batch_process_pdfs(
        self,
        pdf_paths: List[str],
        max_workers: int = 3,  # Conservative for API rate limits
        use_cache: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """Process multiple PDFs in parallel with optimization

        Args:
            pdf_paths: Paths of the PDFs to process.
            max_workers: Size of the thread pool.
            use_cache: Passed through to ``process_single_pdf_optimized``.
            **kwargs: Forwarded to ``process_single_pdf_optimized``.

        Returns:
            Dict with ``batch_info`` (counts and timings) and ``file_results``
            keyed by file name. Because the key is the bare file name, two
            inputs with the same name in different directories share one entry.
        """
        
        logger.info(f"üöÄ Starting batch processing of {len(pdf_paths)} PDFs")
        
        results = {}
        total_start_time = time.time()
        
        # Process with controlled parallelism
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all jobs
            future_to_path = {
                executor.submit(
                    self.process_single_pdf_optimized, 
                    pdf_path, 
                    use_cache, 
                    **kwargs
                ): pdf_path for pdf_path in pdf_paths
            }
            
            # Collect results with progress bar
            for future in tqdm(as_completed(future_to_path), total=len(pdf_paths), desc="üìÑ Processing PDFs"):
                pdf_path = future_to_path[future]
                try:
                    result = future.result()
                    results[Path(pdf_path).name] = result
                except Exception as e:
                    logger.error(f"Error processing {pdf_path}: {e}")
                    results[Path(pdf_path).name] = {"error": str(e)}
        
        # Generate batch summary
        total_time = time.time() - total_start_time
        successful = sum(1 for r in results.values() if "error" not in r)
        failed = len(results) - successful
        
        batch_summary = {
            'batch_info': {
                'total_files': len(pdf_paths),
                'successful': successful,
                'failed': failed,
                'total_processing_time': round(total_time, 2),
                # An empty batch must not divide by zero
                'avg_time_per_file': round(total_time / len(pdf_paths), 2) if pdf_paths else 0.0
            },
            'file_results': results
        }
        
        logger.info(f"‚úÖ Batch complete: {successful}/{len(pdf_paths)} successful in {total_time:.1f}s")
        return batch_summary

def demonstrate_optimization():
    """Print an overview of the optimization features and create a processor.

    Writes four feature sections (caching, parallel processing, memory
    optimization, performance monitoring) to stdout, instantiates
    ``OptimizedPDFProcessor`` with its default arguments, and reports the
    processor's cache directory plus the number of ``*.pkl`` files currently
    found in it.

    Returns:
        The newly created ``OptimizedPDFProcessor`` instance.
    """
    # NOTE: section headers use "\n" (a real newline). The previous "\\n"
    # printed a literal backslash-n instead of a blank separator line.
    print("‚ö° OPTIMIZATION FEATURES:")
    print("\n1. üìÅ Intelligent Caching:")
    print("   ‚Ä¢ Files are hashed to detect changes")
    print("   ‚Ä¢ Processed results cached to disk")
    print("   ‚Ä¢ Automatic cache invalidation")

    print("\n2. üîÑ Parallel Processing:")
    print("   ‚Ä¢ Multiple PDFs processed simultaneously")
    print("   ‚Ä¢ Controlled concurrency for API limits")
    print("   ‚Ä¢ Progress tracking and error handling")

    print("\n3. üéØ Memory Optimization:")
    print("   ‚Ä¢ Batch processing to manage memory")
    print("   ‚Ä¢ Garbage collection between files")
    print("   ‚Ä¢ GPU memory monitoring")

    print("\n4. üìä Performance Monitoring:")
    print("   ‚Ä¢ Processing time tracking")
    print("   ‚Ä¢ API call counting")
    print("   ‚Ä¢ Success/failure rates")

    # Example usage
    processor = OptimizedPDFProcessor()

    # Count the cached pickle files without materialising the whole listing.
    cached_files = sum(1 for _ in processor.cache_dir.glob('*.pkl'))
    print(f"\nüíæ Cache directory: {processor.cache_dir}")
    print(f"üìÅ Cache size: {cached_files} files")

    return processor

# Initialize the module-level optimized processor. Calling
# demonstrate_optimization() also prints the feature overview as a side effect.
# run_batch_example() below reads this `processor` global.
processor = demonstrate_optimization()

# Example batch processing
def run_batch_example(pdf_directory: str = "/content/legal_pdfs/") -> Optional[Dict[str, Any]]:
    """Run a small demo batch over every ``*.pdf`` directly inside a directory.

    Uses the module-level ``processor`` to process the files with two workers,
    limited to the first two pages of each document, and prints a short
    summary of the batch.

    Args:
        pdf_directory: Directory that is searched (non-recursively) for PDFs.

    Returns:
        The batch summary returned by ``processor.batch_process_pdfs``, or
        ``None`` when the directory does not exist or contains no PDF files.
    """
    pdf_dir = Path(pdf_directory)
    if not pdf_dir.exists():
        print(f"‚ùå Directory not found: {pdf_directory}")
        print("üìÅ Create directory and upload PDFs to test batch processing")
        return None

    # Sort so the submission order is reproducible; glob order depends on the
    # underlying filesystem.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    if not pdf_files:
        print(f"‚ùå No PDF files found in {pdf_directory}")
        return None

    print(f"üîç Found {len(pdf_files)} PDF files")

    # Run batch processing
    batch_results = processor.batch_process_pdfs(
        pdf_paths=[str(p) for p in pdf_files],
        max_workers=2,  # Conservative for API limits
        max_pages=2,    # First 2 pages only for demo
        preferred_model="auto",
        rate_limit_delay=0.5
    )

    # Display results ("\n" is a real newline; "\\n" printed a literal backslash-n)
    print("\nüìä BATCH RESULTS:")
    info = batch_results['batch_info']
    print(f"‚úÖ Successful: {info['successful']}/{info['total_files']}")
    print(f"‚è±Ô∏è  Total time: {info['total_processing_time']}s")
    print(f"üìà Avg per file: {info['avg_time_per_file']}s")

    return batch_results

# Closing banner for the optimization cell. The leading "\n" is a real newline;
# the previous "\\n" printed a literal backslash-n.
print("\nüöÄ Ready for optimized processing!")
print("üí° Tips for Google Colab:")
print("   ‚Ä¢ Use GPU runtime for faster processing")
print("   ‚Ä¢ Enable high-RAM if processing many files")
print("   ‚Ä¢ Monitor API usage to avoid rate limits")
print("   ‚Ä¢ Cache results to avoid reprocessing")

## 📊 Export Results and Visualization

Save labeled PDF content and create insightful visualizations of the analysis results.

In [None]:
def _element_to_csv_row(element: Dict, doc_name: str, language: str) -> Dict[str, Any]:
    """Flatten one labeled element into a single row of the review CSV."""
    # 'ai_labels' may be missing or None when labeling failed; treat both as "no labels".
    labels = element.get('ai_labels') or {}
    text = element['text']

    keywords = labels.get('keywords') or []
    if isinstance(keywords, str):
        # Presumably the model sometimes returns one pre-joined string; keep it verbatim.
        keywords_text = keywords
    else:
        keywords_text = ', '.join(str(keyword) for keyword in keywords)

    return {
        'document_name': doc_name,
        'language': language,
        'index': element['index'],
        'page_number': element['page_number'],
        'element_type': element['element_type'],
        'character_count': element['character_count'],
        'content_category': labels.get('content_category', 'unknown'),
        'legal_significance': labels.get('legal_significance', 'unknown'),
        'subject_matter': labels.get('subject_matter', 'unknown'),
        'confidence_score': labels.get('confidence_score', 0),
        'contains_definitions': labels.get('contains_definitions', False),
        'contains_penalties': labels.get('contains_penalties', False),
        'actionable_requirements': labels.get('actionable_requirements', False),
        'references_other_sections': labels.get('references_other_sections', False),
        'model_used': labels.get('model_used', 'unknown'),
        'keywords': keywords_text,
        'summary': labels.get('summary', ''),
        'text_preview': text[:200] + "..." if len(text) > 200 else text
    }


def _write_legal_summary_report(report_file: Path, results: Dict, summary: Dict) -> None:
    """Write the human-readable plain-text report for one analyzed document."""
    info = results['document_info']
    rule = '-' * 30

    lines = [
        "MALAYSIAN LEGAL DOCUMENT ANALYSIS REPORT",
        '=' * 60,
        "",
        f"Document: {info['source_file']}",
        f"Language: {info.get('language', 'Unknown')}",
        f"Source Path: {info.get('source_path', 'Unknown')}",
        f"Processed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Processing Time: {info['processing_time_seconds']}s",
        f"Extraction Method: {info.get('extraction_method', 'Unknown')}",
        "",
        "DOCUMENT STATISTICS:",
        rule,
        f"Total Elements: {info['total_elements']}",
        f"Total Characters: {info['total_characters']:,}",
        f"API Calls Made: {info['api_calls_made']}",
        f"Models Used: {', '.join(info.get('models_used', []))}",
        f"Average Confidence: {summary['average_confidence']}",
        "",
        "LEGAL CONTENT ANALYSIS:",
        rule,
    ]

    for category, count in summary['content_categories'].items():
        percentage = summary['category_distribution'][category]
        lines.append(f"{category}: {count} elements ({percentage}%)")

    lines += [
        "",
        "LEGAL SIGNIFICANCE:",
        rule,
        f"High Importance Elements: {summary['high_importance_elements']}",
    ]

    high_importance_elements = [
        el for el in results['labeled_elements']
        if (el.get('ai_labels') or {}).get('legal_significance') == 'high'
    ]
    if high_importance_elements:
        lines += ["", "HIGH IMPORTANCE SECTIONS:", rule]
        for rank, element in enumerate(high_importance_elements[:10], start=1):  # Top 10
            labels = element.get('ai_labels') or {}
            text = element['text']
            lines.append(f"{rank}. Page {element['page_number']} - {labels.get('content_category', 'Unknown')}")
            lines.append(f"   Subject: {labels.get('subject_matter', 'Unknown')}")
            lines.append(f"   Text: {text[:150]}{'...' if len(text) > 150 else ''}")
            lines.append("")  # blank line between entries

    # Join with real newlines: the earlier "\\n" literals put the whole report
    # on one line with visible backslash-n sequences.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(lines) + "\n")


def export_results_to_formats(results: Dict, output_dir: str = "/content/parsed/EN") -> Optional[Dict[str, Any]]:
    """Export one document's analysis results as JSON, CSV, text report and metadata.

    Four files named after the source document are written into ``output_dir``
    (created if missing): ``*_complete_analysis.json``, ``*_legal_analysis.csv``,
    ``*_legal_summary.txt`` and ``*_metadata.json``.

    Args:
        results: Analysis results containing ``document_info``,
            ``labeled_elements`` and ``analysis_summary``.
        output_dir: Directory that receives the exported files.

    Returns:
        A dict with the paths of the four files (``json_file``, ``csv_file``,
        ``report_file``, ``metadata_file``), the CSV ``dataframe`` and the
        ``output_directory``; ``None`` when ``results`` has no
        ``document_info`` entry.

    Raises:
        KeyError: If ``analysis_summary``, ``labeled_elements`` or a required
            field inside them is missing.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    if 'document_info' not in results:
        logger.error("Invalid results format")
        return None

    doc_info = results['document_info']
    # Looked up before anything is written so a malformed result does not
    # leave a partial set of export files behind.
    summary = results['analysis_summary']
    doc_name = doc_info['source_file'].replace('.pdf', '')
    language = doc_info.get('language', 'unknown')

    # 1. Complete JSON export (this is the main output for parsed folder)
    json_file = output_path / f"{doc_name}_complete_analysis.json"
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # 2. Structured CSV for legal review and analysis
    df = pd.DataFrame([
        _element_to_csv_row(element, doc_name, language)
        for element in results['labeled_elements']
    ])
    csv_file = output_path / f"{doc_name}_legal_analysis.csv"
    df.to_csv(csv_file, index=False, encoding='utf-8')

    # 3. Legal summary report for human review
    report_file = output_path / f"{doc_name}_legal_summary.txt"
    _write_legal_summary_report(report_file, results, summary)

    # 4. Create a metadata index file for the parsed folder
    metadata_file = output_path / f"{doc_name}_metadata.json"
    metadata = {
        'document_info': doc_info,
        'analysis_summary': summary,
        'files_generated': {
            'complete_analysis': json_file.name,
            'csv_analysis': csv_file.name,
            'text_summary': report_file.name,
            'metadata': metadata_file.name
        },
        'generation_timestamp': datetime.now().isoformat()
    }

    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    logger.info(f"‚úÖ Malaysian legal analysis exported to: {output_path}")
    logger.info(f"üìÑ Files created:")
    logger.info(f"   ‚Ä¢ {json_file.name} (Complete analysis - main output)")
    logger.info(f"   ‚Ä¢ {csv_file.name} (Structured data for review)")
    logger.info(f"   ‚Ä¢ {report_file.name} (Human-readable summary)")
    logger.info(f"   ‚Ä¢ {metadata_file.name} (Document metadata)")

    return {
        'json_file': str(json_file),
        'csv_file': str(csv_file),
        'report_file': str(report_file),
        'metadata_file': str(metadata_file),
        'dataframe': df,
        'output_directory': str(output_path)
    }

def create_analysis_visualizations(results: Dict, output_dir: str = "/content/parsed/EN"):
    """Render, save and display a 3x3 dashboard for one analyzed document.

    The dashboard is saved as ``<doc_name>_legal_analysis_dashboard.png`` in
    ``output_dir`` (created if missing), shown with ``plt.show()``, and a
    textual summary is printed to stdout.

    Args:
        results: Analysis results containing ``document_info`` and
            ``labeled_elements``.
        output_dir: Directory that receives the PNG dashboard.

    Returns:
        The per-element ``DataFrame`` used for plotting. It is empty (and no
        figure is produced) when there are no labeled elements. ``None`` is
        returned when ``results`` has no ``labeled_elements`` key.
    """
    if 'labeled_elements' not in results:
        logger.error("No labeled elements found for visualization")
        return None

    doc_info = results['document_info']
    doc_name = doc_info['source_file'].replace('.pdf', '')
    language = doc_info.get('language', 'unknown')

    # Prepare data
    elements_data = []
    for element in results['labeled_elements']:
        # 'ai_labels' may be missing or None when labeling failed.
        ai_labels = element.get('ai_labels') or {}
        elements_data.append({
            'page': element['page_number'],
            'element_type': element['element_type'],
            'category': ai_labels.get('content_category', 'unknown'),
            'significance': ai_labels.get('legal_significance', 'unknown'),
            'confidence': ai_labels.get('confidence_score', 0),
            'char_count': element['character_count'],
            'has_definitions': ai_labels.get('contains_definitions', False),
            'has_penalties': ai_labels.get('contains_penalties', False),
            'has_requirements': ai_labels.get('actionable_requirements', False)
        })

    df = pd.DataFrame(elements_data)
    if df.empty:
        # An empty frame has no columns, so every plot below would raise KeyError.
        logger.error("No labeled elements found for visualization")
        return df

    # Create enhanced visualizations for legal documents
    fig, axes = plt.subplots(3, 3, figsize=(20, 15))
    fig.suptitle(f'Malaysian Legal Document Analysis: {doc_name} ({language})', fontsize=16, fontweight='bold')

    # 1. Content Categories Distribution
    category_counts = df['category'].value_counts()
    colors = plt.cm.Set3(range(len(category_counts)))
    axes[0, 0].pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%',
                  startangle=90, colors=colors)
    axes[0, 0].set_title('Legal Content Categories')

    # 2. Legal Significance Distribution
    significance_counts = df['significance'].value_counts()
    colors_sig = {'high': 'red', 'medium': 'orange', 'low': 'green', 'unknown': 'gray'}
    bar_colors = [colors_sig.get(sig, 'gray') for sig in significance_counts.index]
    axes[0, 1].bar(significance_counts.index, significance_counts.values, color=bar_colors)
    axes[0, 1].set_title('Legal Significance Levels')
    axes[0, 1].set_ylabel('Count')

    # 3. Elements Distribution by Page
    page_counts = df['page'].value_counts().sort_index()
    axes[0, 2].plot(page_counts.index, page_counts.values, marker='o', linewidth=2, markersize=6, color='blue')
    axes[0, 2].set_title('Elements per Page')
    axes[0, 2].set_xlabel('Page Number')
    axes[0, 2].set_ylabel('Element Count')
    axes[0, 2].grid(True, alpha=0.3)

    # 4. Element Types Distribution
    type_counts = df['element_type'].value_counts()
    axes[1, 0].barh(type_counts.index, type_counts.values, color='skyblue')
    axes[1, 0].set_title('Document Element Types')
    axes[1, 0].set_xlabel('Count')

    # 5. AI Confidence Score Distribution
    axes[1, 1].hist(df['confidence'], bins=20, color='lightgreen', alpha=0.7, edgecolor='black')
    axes[1, 1].set_title('AI Confidence Score Distribution')
    axes[1, 1].set_xlabel('Confidence Score')
    axes[1, 1].set_ylabel('Frequency')
    mean_conf = df['confidence'].mean()
    axes[1, 1].axvline(mean_conf, color='red', linestyle='--', label=f'Mean: {mean_conf:.2f}')
    axes[1, 1].legend()

    # 6. Character Count vs Confidence (colored by page)
    scatter = axes[1, 2].scatter(df['char_count'], df['confidence'], c=df['page'],
                               cmap='viridis', alpha=0.6, s=30)
    axes[1, 2].set_title('Content Length vs AI Confidence')
    axes[1, 2].set_xlabel('Character Count')
    axes[1, 2].set_ylabel('Confidence Score')
    plt.colorbar(scatter, ax=axes[1, 2], label='Page Number')

    # 7. Legal Features Analysis
    features = ['has_definitions', 'has_penalties', 'has_requirements']
    feature_counts = [df[feature].sum() for feature in features]
    feature_labels = ['Contains Definitions', 'Contains Penalties', 'Has Requirements']
    axes[2, 0].bar(feature_labels, feature_counts, color=['purple', 'orange', 'green'])
    axes[2, 0].set_title('Legal Features Detection')
    axes[2, 0].set_ylabel('Count')
    plt.setp(axes[2, 0].xaxis.get_majorticklabels(), rotation=45, ha='right')

    # 8. High Importance Elements by Page
    high_importance = df[df['significance'] == 'high']
    if not high_importance.empty:
        high_by_page = high_importance['page'].value_counts().sort_index()
        axes[2, 1].bar(high_by_page.index, high_by_page.values, color='red', alpha=0.7)
        axes[2, 1].set_title('High Importance Elements by Page')
        axes[2, 1].set_xlabel('Page Number')
        axes[2, 1].set_ylabel('High Importance Count')
    else:
        # "\n" is a real line break inside the placeholder text
        # (was "\\n", which rendered a literal backslash-n).
        axes[2, 1].text(0.5, 0.5, 'No High Importance\nElements Found',
                       ha='center', va='center', transform=axes[2, 1].transAxes)
        axes[2, 1].set_title('High Importance Elements by Page')

    # 9. Category vs Significance Heatmap (df is known to be non-empty here)
    pivot_table = df.pivot_table(values='confidence', index='category',
                               columns='significance', aggfunc='count', fill_value=0)
    if not pivot_table.empty:
        im = axes[2, 2].imshow(pivot_table.values, cmap='YlOrRd', aspect='auto')
        axes[2, 2].set_xticks(range(len(pivot_table.columns)))
        axes[2, 2].set_yticks(range(len(pivot_table.index)))
        axes[2, 2].set_xticklabels(pivot_table.columns)
        axes[2, 2].set_yticklabels(pivot_table.index)
        axes[2, 2].set_title('Category vs Significance Heatmap')
        plt.colorbar(im, ax=axes[2, 2])
    else:
        axes[2, 2].text(0.5, 0.5, 'Insufficient Data\nfor Heatmap',
                       ha='center', va='center', transform=axes[2, 2].transAxes)
        axes[2, 2].set_title('Category vs Significance Heatmap')

    plt.tight_layout()

    # Save visualization; make sure the target directory exists so savefig
    # does not fail when this function is called on its own.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    viz_file = output_path / f"{doc_name}_legal_analysis_dashboard.png"
    plt.savefig(viz_file, dpi=300, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls (e.g. batch runs) do not accumulate
    # open figures in memory.
    plt.close(fig)

    logger.info(f"üìä Legal analysis dashboard saved: {viz_file}")

    # Enhanced analysis summary
    print(f"\n? MALAYSIAN LEGAL DOCUMENT ANALYSIS SUMMARY:")
    print(f"üìÑ Document: {doc_name} ({language})")
    print(f"üìà Total elements: {len(df)}")
    print(f"üéØ Average AI confidence: {df['confidence'].mean():.2f}")
    print(f"üìù Average chars per element: {df['char_count'].mean():.0f}")
    print(f"‚≠ê High confidence elements (>0.8): {len(df[df['confidence'] > 0.8])}")
    print(f"üö® High importance elements: {len(df[df['significance'] == 'high'])}")

    print(f"\nüè∑Ô∏è  TOP LEGAL CATEGORIES:")
    for cat, count in category_counts.head().items():
        percentage = (count / len(df)) * 100
        print(f"   ‚Ä¢ {cat}: {count} ({percentage:.1f}%)")

    print(f"\n‚öñÔ∏è  LEGAL FEATURES DETECTED:")
    print(f"   ‚Ä¢ Documents with definitions: {df['has_definitions'].sum()}")
    print(f"   ‚Ä¢ Documents with penalties: {df['has_penalties'].sum()}")
    print(f"   ‚Ä¢ Documents with requirements: {df['has_requirements'].sum()}")

    return df

def demonstrate_malaysian_legal_export():
    """Print a description of the export workflow and a usage example.

    Only writes to stdout; nothing is processed or exported.

    Returns:
        Always ``True``.
    """
    # Section headers start with "\n" (a real newline). The previous "\\n"
    # printed a literal backslash-n in front of every section.
    print("üìä MALAYSIAN LEGAL DOCUMENT EXPORT WORKFLOW:")
    print("\n1. üìÑ Structured Output for Parsed Folder:")
    print("   ‚Ä¢ Complete JSON analysis (main output)")
    print("   ‚Ä¢ CSV file for legal review and analysis")
    print("   ‚Ä¢ Human-readable summary report")
    print("   ‚Ä¢ Metadata index for document tracking")

    print("\n2. üìà Legal-Specific Visualizations:")
    print("   ‚Ä¢ Content category analysis")
    print("   ‚Ä¢ Legal significance assessment")
    print("   ‚Ä¢ Feature detection (definitions, penalties, requirements)")
    print("   ‚Ä¢ Page-by-page importance mapping")
    print("   ‚Ä¢ AI confidence and quality metrics")

    print("\n3. üéØ Legal Analysis Features:")
    print("   ‚Ä¢ High importance section identification")
    print("   ‚Ä¢ Legal terminology detection")
    print("   ‚Ä¢ Cross-reference analysis")
    print("   ‚Ä¢ Language-aware processing")

    print("\nüí° Usage for Malaysian Legal Acts:")
    print("   # Process single document")
    print("   results = process_pdf_with_ai_labeling('malaysian_acts/EN/act125.pdf')")
    print("   export_data = export_results_to_formats(results, 'parsed/EN')")
    print("   df = create_analysis_visualizations(results, 'parsed/EN')")

    return True

# Print the export workflow overview when this cell runs.
demonstrate_malaysian_legal_export()

# "\n" is a real newline; the previous "\\n" printed a literal backslash-n.
print("\n‚úÖ Malaysian legal document export functions ready!")
print("?Ô∏è  Optimized for legal document analysis and parsed folder structure")

## 🚀 Complete Processing Workflow

Now we'll put everything together into a complete workflow that processes your Malaysian legal PDFs with AI-powered content labeling.

In [None]:
def _detect_language_from_path(pdf_path: str) -> str:
    """Infer "EN" or "BM" from an /EN/ or /BM/ path segment (case-insensitive), else "other"."""
    normalized = str(pdf_path).upper()
    if '/EN/' in normalized:
        return "EN"
    if '/BM/' in normalized:
        return "BM"
    return "other"


def _summarize_labeled_elements(labeled_elements: List[Dict]) -> Dict[str, Any]:
    """Aggregate category counts/percentages, high-importance count and mean confidence."""
    total_elements = len(labeled_elements)
    content_categories: Dict[str, int] = {}
    high_importance = 0
    confidence_total = 0

    for element in labeled_elements:
        ai_labels = element.get('ai_labels') or {}
        category = ai_labels.get('content_category', 'unknown')
        content_categories[category] = content_categories.get(category, 0) + 1
        if ai_labels.get('legal_significance', 'unknown') == 'high':
            high_importance += 1
        confidence_total += ai_labels.get('confidence_score', 0)

    category_distribution = {
        category: round((count / total_elements) * 100, 1)
        for category, count in content_categories.items()
    }

    return {
        'content_categories': content_categories,
        'category_distribution': category_distribution,
        'high_importance_elements': high_importance,
        'average_confidence': round(confidence_total / total_elements, 3) if total_elements > 0 else 0
    }


async def complete_malaysian_legal_analysis(
    pdf_path: str,
    use_openai: bool = True,
    use_gemini: bool = False,
    output_dir: str = "/content/parsed",
    language: str = "auto"  # "EN", "BM", or "auto"
):
    """
    Complete workflow for analyzing Malaysian legal PDFs with AI labeling

    Extracts the PDF elements, labels each one with a single AI provider,
    builds summary statistics, writes ``<stem>_analysis.json`` into
    ``output_dir/<language>`` and then produces the additional exports and
    the visualization dashboard in the same folder.

    Args:
        pdf_path: Path to the PDF file (from malaysian_acts folder)
        use_openai: Whether to use OpenAI GPT-4 for labeling
        use_gemini: Whether to use Google Gemini for labeling; Gemini is only
            called when the OpenAI client is not selected or not configured
        output_dir: Directory to save results (parsed folder)
        language: Document language ("EN", "BM", or "auto" for detection
            from an /EN/ or /BM/ segment of ``pdf_path``)

    Returns:
        Dict containing complete analysis results, or ``None`` when any step
        fails (the failure is logged with its traceback).
    """
    # Local import: only needed for the non-blocking rate-limit pause below.
    import asyncio

    logger.info(f"üèõÔ∏è  Starting Malaysian Legal PDF Analysis: {Path(pdf_path).name}")
    start_time = time.time()

    try:
        # Step 1: Detect language if auto
        if language == "auto":
            language = _detect_language_from_path(pdf_path)

        print(f"üåê Document language: {language}")

        # Step 2: Extract PDF content with unstructured
        print("üìÑ Step 1: Extracting PDF content...")
        elements = extract_pdf_content(pdf_path)
        if not elements:
            raise ValueError("No elements extracted from PDF")

        print(f"‚úÖ Extracted {len(elements)} elements")

        # Step 3: Set up AI models
        print("ü§ñ Step 2: Setting up AI models...")

        clients = {}
        if use_openai and ai_system.openai_client:
            clients['openai'] = ai_system.openai_client
            print("‚úÖ OpenAI GPT-4 ready")

        if use_gemini and ai_system.gemini_model:
            clients['gemini'] = ai_system.gemini_model
            print("‚úÖ Google Gemini ready")

        if not clients:
            raise ValueError("No AI models available. Please configure API keys.")

        # Choose AI model once (prefer OpenAI for legal documents); the choice
        # does not change between elements.
        provider = 'openai' if 'openai' in clients else 'gemini'
        model_name = 'gpt-4' if provider == 'openai' else 'gemini-pro'

        # Step 4: Process elements with AI labeling
        print("üè∑Ô∏è  Step 3: AI content labeling...")

        labeled_elements = []
        total_api_calls = 0

        for i, element in enumerate(elements):
            # end="\r" returns the cursor to the line start so the progress
            # line is overwritten in place (was "\\r", a literal backslash-r).
            print(f"Processing element {i+1}/{len(elements)} (Page {element['page_number']})...", end="\r")

            ai_labels = smart_label_content(ai_system, element['text'], element['element_type'], provider)
            if ai_labels and 'error' not in ai_labels:
                ai_labels['model_used'] = model_name
                total_api_calls += 1

            # Normalise a missing/None label result to {} so the statistics and
            # export steps can always call .get() on it.
            element['ai_labels'] = ai_labels or {}
            labeled_elements.append(element)

            # Rate limiting: brief pause every 10 elements. asyncio.sleep keeps
            # the event loop free during the pause, unlike time.sleep.
            if i % 10 == 0 and i > 0:
                await asyncio.sleep(1)

        print(f"\n‚úÖ Completed AI labeling with {total_api_calls} API calls")

        # Step 5: Create analysis summary
        print("üìä Step 4: Generating analysis summary...")

        total_chars = sum(len(el['text']) for el in labeled_elements)
        total_elements = len(labeled_elements)
        analysis_summary = _summarize_labeled_elements(labeled_elements)
        high_importance = analysis_summary['high_importance_elements']

        # Step 6: Compile complete results
        processing_time = time.time() - start_time

        results = {
            'document_info': {
                'source_file': Path(pdf_path).name,
                'source_path': str(pdf_path),
                'language': language,
                'total_elements': total_elements,
                'total_characters': total_chars,
                'processing_time_seconds': round(processing_time, 2),
                'api_calls_made': total_api_calls,
                # Only the provider that was actually called is reported.
                'models_used': [provider],
                'processed_at': datetime.now().isoformat(),
                'extraction_method': 'unstructured_hi_res'
            },
            'labeled_elements': labeled_elements,
            'analysis_summary': analysis_summary
        }

        # Step 7: Save to parsed folder with proper structure
        print("üíæ Step 5: Saving to parsed folder...")

        # Create output directory structure
        lang_output_dir = Path(output_dir) / language
        lang_output_dir.mkdir(parents=True, exist_ok=True)

        # Save complete JSON analysis
        output_file = lang_output_dir / f"{Path(pdf_path).stem}_analysis.json"

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f"‚úÖ Analysis saved: {output_file}")

        # Step 8: Export additional formats
        print("üìä Step 6: Creating additional exports...")
        export_results_to_formats(results, str(lang_output_dir))

        # Step 9: Create visualizations
        print("üìà Step 7: Creating visualizations...")
        create_analysis_visualizations(results, str(lang_output_dir))

        logger.info(f"üéâ Analysis completed in {processing_time:.2f} seconds")
        logger.info(f"üìä Summary: {total_elements} elements, {high_importance} high importance")
        logger.info(f"üìÅ Saved to: {output_file}")

        return results

    except Exception as e:
        # Top-level boundary of the workflow: log with traceback and signal
        # failure to the caller with None, as before.
        logger.exception(f"‚ùå Analysis failed: {str(e)}")
        return None

# Updated demo function for Malaysian legal documents
def quick_malaysian_legal_demo():
    """Print an overview of the analysis system and a quick-start guide.

    Covers the feature list, the expected input/output folder layout, API key
    setup in Colab, processing notes and the classification categories. Only
    writes to stdout.

    Returns:
        Always ``True``.
    """
    # Section headers start with "\n" (a real newline). The previous "\\n"
    # printed a literal backslash-n in front of every section.
    print("?Ô∏è  MALAYSIAN LEGAL PDF AI ANALYSIS SYSTEM")
    print("=" * 60)

    print("\nüìã Specialized Features for Malaysian Legal Documents:")
    print("‚Ä¢ üìÑ PDF extraction optimized for legal formatting")
    print("‚Ä¢ ü§ñ AI models trained to understand legal terminology")
    print("‚Ä¢ üèõÔ∏è  Malaysian legal document structure recognition")
    print("‚Ä¢ üåê Language-aware processing (English/Bahasa Malaysia)")
    print("‚Ä¢ üìä Legal significance classification")
    print("‚Ä¢ üíæ Structured output in JSON format for parsed folder")
    print("‚Ä¢ ‚ö° Batch processing for multiple acts")

    print("\nüìÇ Folder Structure:")
    print("   üìÇ malaysian_acts/ (Input)")
    print("      üìÇ EN/ (English legal documents)")
    print("      üìÇ BM/ (Bahasa Malaysia legal documents)")
    print("   üìÇ parsed/ (Output)")
    print("      üìÇ EN/ (English analysis results)")
    print("      üìÇ BM/ (Bahasa Malaysia analysis results)")

    print("\nüöÄ Quick Start for Malaysian Legal Acts:")
    print("1. Ensure your malaysian_acts folder is uploaded to Colab")
    print("2. Configure API keys (OpenAI or Gemini)")
    print("3. Run batch processing or individual file analysis")
    print("4. Review results in the parsed folder")

    print("\nüí° API Key Setup (Colab Secrets):")
    print("‚Ä¢ In Colab: Go to üîë Secrets panel")
    print("‚Ä¢ Add OPENAI_API_KEY and/or GEMINI_API_KEY")
    print("‚Ä¢ Enable 'Notebook access' for each key")

    print("\n‚ö†Ô∏è  Processing Notes:")
    print("‚Ä¢ Legal documents are processed with high-resolution extraction")
    print("‚Ä¢ AI models provide specialized legal classification")
    print("‚Ä¢ Results include legal significance ratings")
    print("‚Ä¢ Processing time varies with document complexity")
    print("‚Ä¢ API costs apply for AI analysis")

    print("\nüéØ Legal Classification Categories:")
    print("‚Ä¢ section_header, legal_definition, procedural_requirement")
    print("‚Ä¢ penalty_clause, schedule, interpretation, preamble")
    print("‚Ä¢ Legal significance: high, medium, low")
    print("‚Ä¢ Subject matter: criminal_procedure, corporate_law, etc.")

    return True

# Print the system overview and quick-start guide when this cell runs.
quick_malaysian_legal_demo()

## 🧪 Testing Your Malaysian Legal PDFs

Ready to test the system! Upload your Malaysian legal PDF files and run the analysis.

In [None]:
# üìÅ STEP 1: Setup Malaysian Legal PDFs from Google Drive
# Complete setup for accessing your PDF files in Google Colab
import os
import shutil
from pathlib import Path
from google.colab import drive, files

def setup_malaysian_legal_pdfs_from_drive():
    """Make the ``malaysian_acts`` folder available at ``/content/malaysian_acts``.

    If the folder already exists in the Colab workspace it is reused.
    Otherwise Google Drive is mounted, a fixed list of Drive locations is
    searched for a ``malaysian_acts`` folder, and the first match is copied
    into the workspace.

    Returns:
        The workspace path ``"/content/malaysian_acts"`` on success, or
        ``None`` when the Drive mount fails, the folder is not found on
        Drive, or the copy fails.
    """
    print("üèõÔ∏è  MALAYSIAN LEGAL PDF SETUP FOR GOOGLE COLAB")
    print("=" * 60)

    # Step 1: Check if files already exist in Colab workspace
    colab_dir = "/content/malaysian_acts"
    if os.path.exists(colab_dir):
        print("‚úÖ malaysian_acts folder already exists in Colab")
        check_folder_contents(colab_dir)
        return colab_dir

    # Step 2: Mount Google Drive. Only the mount itself is guarded here so
    # that later failures are not misreported as mount failures.
    print("üîÑ Mounting Google Drive...")
    try:
        drive.mount('/content/drive')
    except Exception as e:
        print(f"‚ùå Google Drive mount failed: {e}")
        print("üí° Alternative: Use manual file upload")
        return None
    print("‚úÖ Google Drive mounted successfully")

    # Step 3: Look for malaysian_acts folder in common Drive locations
    print("üîç Searching for malaysian_acts folder in Google Drive...")
    drive_search_paths = [
        "/content/drive/MyDrive/malaysian_acts",
        "/content/drive/MyDrive/reg-intel/malaysian_acts",
        "/content/drive/MyDrive/Github/reg-intel/malaysian_acts",
        "/content/drive/MyDrive/Downloads/malaysian_acts",
        "/content/drive/MyDrive/Documents/malaysian_acts"
    ]

    source_dir = next((path for path in drive_search_paths if os.path.exists(path)), None)
    if source_dir is None:
        print("‚ùå malaysian_acts folder not found in Google Drive")
        print("\nüì§ UPLOAD INSTRUCTIONS:")
        print("1. Go to your Google Drive (drive.google.com)")
        print("2. Upload your 'malaysian_acts' folder with EN/ and BM/ subfolders")
        print("3. Recommended location: MyDrive/malaysian_acts/")
        print("4. Re-run this cell after upload")
        return None
    print(f"‚úÖ Found malaysian_acts folder at: {source_dir}")

    # Step 4: Copy from Drive to Colab workspace for faster processing
    print("üîÑ Copying malaysian_acts folder to Colab workspace...")
    try:
        shutil.copytree(source_dir, colab_dir)
    except OSError as e:  # shutil.Error is a subclass of OSError
        # Remove a partially copied tree; otherwise the next run would take the
        # "already exists" shortcut above and use an incomplete dataset.
        shutil.rmtree(colab_dir, ignore_errors=True)
        print(f"‚ùå Copy from Google Drive failed: {e}")
        print("üí° Alternative: Use manual file upload")
        return None
    print("‚úÖ Successfully copied malaysian_acts folder to /content/")

    # Verify the copy
    check_folder_contents(colab_dir)
    return colab_dir

def check_folder_contents(base_dir):
    """Print an inventory of the PDFs found under ``base_dir``.

    For each of the ``EN`` and ``BM`` subfolders the number of ``*.pdf``
    files is printed together with the names of at most five of them, or a
    "not found" line when the subfolder is absent. A recursive total over
    ``base_dir`` is printed last. Nothing is printed beyond the header and an
    error line when ``base_dir`` itself does not exist.
    """
    print(f"\nüìÇ MALAYSIAN LEGAL ACTS FOLDER: {base_dir}")
    print("-" * 50)

    root = Path(base_dir)
    if not root.exists():
        print("‚ùå Folder does not exist")
        return

    preview_limit = 5  # how many file names to list per language folder
    for code, label in (("EN", "English"), ("BM", "Bahasa Malaysia")):
        lang_dir = root / code
        if not lang_dir.exists():
            print(f"‚ùå {code}/ folder not found")
            continue

        pdfs = list(lang_dir.glob("*.pdf"))
        print(f"üìÇ {code}/ ({label}): {len(pdfs)} PDF files")
        for pdf in pdfs[:preview_limit]:
            print(f"   üìÑ {pdf.name}")
        hidden = len(pdfs) - preview_limit
        if hidden > 0:
            print(f"   ... and {hidden} more files")

    # Summary: recursive count across every subfolder of the base directory
    pdf_total = sum(1 for _ in root.glob("**/*.pdf"))
    print(f"\nüìä Total PDF files found: {pdf_total}")

def setup_output_directories(local_root="/content/parsed",
                             drive_root="/content/drive/MyDrive"):
    """Create the folder tree that parsed results are written to.

    Creates ``EN``, ``BM`` and ``other`` subfolders under *local_root*. When
    *drive_root* exists (Google Drive is mounted), the same tree is also
    created under ``<drive_root>/parsed`` as a backup location. The created
    locations are printed to stdout.

    Args:
        local_root: Folder for local results. Defaults to the Colab path
            ``/content/parsed``.
        drive_root: Folder whose existence signals a mounted Drive. Defaults
            to ``/content/drive/MyDrive``.

    Returns:
        None.
    """
    subfolders = ("EN", "BM", "other")

    local_output = Path(local_root)
    drive_base = Path(drive_root)
    # Only mirror the tree to Drive when Drive is actually mounted
    drive_output = drive_base / "parsed" if drive_base.exists() else None

    for root in (local_output, drive_output):
        if root is None:
            continue
        for name in subfolders:
            # parents=True also creates the root itself and any missing
            # ancestors, so this works when /content does not exist yet.
            (root / name).mkdir(parents=True, exist_ok=True)

    if drive_output is not None:
        print("‚úÖ Output directories created:")
        print(f"   üìÅ Local: {local_output}")
        print(f"   üìÅ Drive backup: {drive_output}")
    else:
        print("‚úÖ Local output directory created:")
        print(f"   üìÅ Local: {local_output}")

def manual_upload_alternative():
    """Print fallback instructions for getting the PDFs into Colab by hand.

    Nothing is uploaded or checked here; the function only writes the
    step-by-step guidance to stdout and returns None.
    """
    instructions = (
        "\nüì§ ALTERNATIVE: MANUAL FILE UPLOAD",
        "-" * 40,
        "If Google Drive setup doesn't work, you can upload files manually:",
        "",
        "1. Run this code to upload files:",
        "   uploaded = files.upload()",
        "",
        "2. Or use the file browser:",
        "   ‚Ä¢ Click üìÅ folder icon in left sidebar",
        "   ‚Ä¢ Create folder structure: malaysian_acts/EN/ and malaysian_acts/BM/",
        "   ‚Ä¢ Upload PDF files to appropriate folders",
        "",
        "3. After upload, verify with:",
        "   check_folder_contents('/content/malaysian_acts')",
    )
    # One print per entry; empty entries produce the blank separator lines
    for line in instructions:
        print(line)

# üöÄ MAIN SETUP EXECUTION
print("üá≤üáæ STARTING MALAYSIAN LEGAL PDF SETUP...")
pdf_directory = setup_malaysian_legal_pdfs_from_drive()

if not pdf_directory:
    # Drive setup produced no usable folder: show the manual upload route
    print("\n‚ùå SETUP INCOMPLETE")
    manual_upload_alternative()
else:
    print(f"\n‚úÖ SUCCESS: PDFs ready at {pdf_directory}")

    # Create the result folders before telling the user how to start a run
    setup_output_directories()

    print("\n".join((
        "\nüéØ READY FOR ANALYSIS!",
        "You can now run:",
        "   results = await process_malaysian_acts_folder()",
        "   # or for specific language:",
        "   results = await process_malaysian_acts_folder(language_filter='EN')",
    )))

# üéØ STEP 2: Configure AI Analysis for Malaysian Legal Documents
def configure_colab_analysis():
    """Report which AI API keys are available through Colab Secrets.

    Looks up ``OPENAI_API_KEY`` and ``GEMINI_API_KEY`` via
    ``google.colab.userdata`` and prints a status line for each. When neither
    is available, instructions for adding a secret are printed. The key
    values themselves are only tested for truthiness and are not stored.

    Returns:
        bool: True when at least one key is available. False when no key is
        configured, or when ``google.colab`` cannot be imported (the code is
        not running in Colab).
    """
    print("\nü§ñ AI MODEL CONFIGURATION FOR GOOGLE COLAB:")
    print("-" * 50)

    try:
        from google.colab import userdata
    except ImportError:
        print("‚ùå Not running in Google Colab")
        print("üí° For local development, set environment variables:")
        print("   export OPENAI_API_KEY='your-key-here'")
        return False

    def secret_available(name):
        # A failed lookup is treated as "not configured". Catch Exception
        # rather than using a bare except so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            return bool(userdata.get(name))
        except Exception:
            return False

    def status(available):
        return "‚úÖ Available" if available else "‚ùå Not configured"

    openai_available = secret_available('OPENAI_API_KEY')
    gemini_available = secret_available('GEMINI_API_KEY')

    print("üîë API Key Status (Colab Secrets):")
    print(f"   OpenAI: {status(openai_available)}")
    print(f"   Gemini: {status(gemini_available)}")

    if openai_available or gemini_available:
        print("\n‚úÖ AI models configured!")
        return True

    print("\n‚ö†Ô∏è  NO API KEYS CONFIGURED!")
    print("To add API keys in Google Colab:")
    print("1. Click üîë 'Secrets' in the left sidebar")
    print("2. Add new secret:")
    print("   ‚Ä¢ Name: OPENAI_API_KEY")
    print("   ‚Ä¢ Value: your OpenAI API key")
    print("   ‚Ä¢ Enable 'Notebook access'")
    print("3. Optional: Add GEMINI_API_KEY the same way")
    print("4. Restart runtime: Runtime ‚Üí Restart runtime")
    return False

# Run the API-key check once when this cell executes. The resulting module-level
# flag is read by run_colab_legal_analysis() and by the status summary printed
# at the end of this cell.
analysis_ready = configure_colab_analysis()

# üöÄ STEP 3: Quick Analysis Function for Google Colab
async def run_colab_legal_analysis(
    language_filter: str = "EN",  # "EN", "BM", or "all"
    max_files: int = 3,           # Limit for testing
    max_pages_per_file: int = 5   # Limit pages for faster testing
):
    """Run a size-limited batch analysis of the Malaysian acts in Colab.

    Checks that the PDFs were copied to ``/content/malaysian_acts`` and that
    the module-level ``analysis_ready`` flag is set, then delegates to
    ``process_malaysian_acts_folder`` with results written to
    ``/content/parsed``. If Google Drive is mounted, the results folder is
    afterwards copied to ``MyDrive/parsed`` on a best-effort basis.

    Args:
        language_filter: Passed through to ``process_malaysian_acts_folder``;
            "EN", "BM", or "all".
        max_files: Passed through; upper bound on files to process.
        max_pages_per_file: Passed through; upper bound on pages per file.

    Returns:
        Whatever ``process_malaysian_acts_folder`` returns, or None when the
        PDF folder is missing or no API key is configured.
    """
    # Function-scope import: the backup below needs shutil, and it is not
    # guaranteed that an earlier cell imported it.
    import shutil

    acts_dir = "/content/malaysian_acts"
    output_dir = "/content/parsed"
    drive_root = "/content/drive/MyDrive"

    print("üèõÔ∏è  MALAYSIAN LEGAL PDF ANALYSIS - GOOGLE COLAB")
    print("=" * 60)

    # Check if setup is complete
    if not os.path.exists(acts_dir):
        print("‚ùå PDF files not found. Run the setup cell first!")
        return None

    if not analysis_ready:
        print("‚ùå API keys not configured. Set up Colab Secrets first!")
        return None

    print("üéØ Analysis Configuration:")
    print(f"   Language filter: {language_filter}")
    print(f"   Max files: {max_files}")
    print(f"   Max pages per file: {max_pages_per_file}")
    print(f"   Input: {acts_dir}")
    print(f"   Output: {output_dir}")

    # Execute the batch processing
    results = await process_malaysian_acts_folder(
        acts_dir=acts_dir,
        output_dir=output_dir,
        language_filter=language_filter,
        max_files=max_files,
        max_pages_per_file=max_pages_per_file
    )

    # Backup results to Drive if available
    if os.path.exists(drive_root) and os.path.exists(output_dir):
        print("üíæ Backing up results to Google Drive...")
        try:
            # Merge into any existing backup instead of deleting it first:
            # a small test run (or a copy that fails midway) must not wipe
            # results saved to Drive by earlier runs.
            shutil.copytree(
                output_dir,
                os.path.join(drive_root, "parsed"),
                dirs_exist_ok=True,
            )
        except Exception as e:
            # Best effort: a failed backup must not discard the results
            print(f"‚ö†Ô∏è  Drive backup failed: {e}")
        else:
            print("‚úÖ Results backed up to Google Drive")

    return results

# Closing banner
print("", "=" * 60, "üéØ GOOGLE COLAB SETUP COMPLETE!", "=" * 60, sep="\n")

# Show the run commands only when both the PDFs and an API key are in place
if not pdf_directory or not analysis_ready:
    print("\nüìã Complete the setup steps above before running analysis")
else:
    print("\n".join((
        "\n‚ö° Ready to analyze! Run:",
        "   results = await run_colab_legal_analysis()",
        "\nüéõÔ∏è  Or with custom settings:",
        "   results = await run_colab_legal_analysis(",
        "       language_filter='EN',",
        "       max_files=5,",
        "       max_pages_per_file=10",
        "   )",
    )))

# General usage tips, printed regardless of setup state
print("\n".join((
    "\nüí° Google Colab Tips:",
    "‚Ä¢ Use GPU runtime: Runtime ‚Üí Change runtime type ‚Üí GPU",
    "‚Ä¢ Files in /content/ are temporary - backup important results to Drive",
    "‚Ä¢ Colab sessions timeout after ~12 hours of inactivity",
)))