# PDF Extraction dengan Groq API

Notebook ini menunjukkan cara membaca teks dari file PDF dengan PyPDF2, lalu mengekstrak informasi terstruktur dari teks tersebut menggunakan Groq API dengan model LLaMA 3.1.

## 1. Install Dependencies
Install package yang dibutuhkan untuk PDF extraction dengan Groq

In [None]:
# # Install required packages
# import subprocess
# import sys

# packages = ["groq", "PyPDF2", "python-dotenv", "reportlab"]

# for package in packages:
#     try:
#         __import__(package)
#     except ImportError:
#         print(f"Installing {package}...")
#         subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# print("All packages installed successfully!")

## 2. Import Libraries dan Setup Environment

In [None]:
import os
import json
from groq import Groq
import PyPDF2
from dotenv import load_dotenv
from typing import Dict, List, Optional

# Load variables from a local .env file (if one exists) into the process environment
load_dotenv()

# Build the module-level Groq client used by extract_with_groq().
# os.getenv returns None when GROQ_API_KEY is unset, so None is what is passed as api_key then.
client = Groq(api_key=os.getenv('GROQ_API_KEY'))

print("Setup complete! Groq client initialized.")

## 3. PDF Reader Function
Fungsi untuk membaca file PDF dan mengekstrak teks

In [None]:
def read_pdf(file_path: str) -> Dict:
    """
    Read a PDF file and return its text content together with basic metadata.

    Every page that yields text contributes a ``[Page N]`` header followed by
    the page text; pages without extractable text are skipped.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        Dict: On success ``success`` is True and the dict carries ``metadata``
        (title, author, subject, creator, pages), ``text``, ``total_chars``
        (the length of the returned ``text``) and ``total_pages``. On any
        failure ``success`` is False, ``error`` holds the exception message
        and the remaining fields are empty/zero. Exceptions are never raised
        to the caller.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            metadata = pdf_reader.metadata
            num_pages = len(pdf_reader.pages)

            # Collect page chunks and join once instead of growing a string with +=
            page_chunks = []
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(f"[Page {page_number}]\n{page_text}\n\n")

            text = "".join(page_chunks).strip()

            def meta_field(key):
                # The reader may expose no metadata object at all
                return metadata.get(key, 'Unknown') if metadata else 'Unknown'

            return {
                "success": True,
                "metadata": {
                    "title": meta_field('/Title'),
                    "author": meta_field('/Author'),
                    "subject": meta_field('/Subject'),
                    "creator": meta_field('/Creator'),
                    "pages": num_pages
                },
                "text": text,
                # Measure the text that is actually returned, not the unstripped buffer
                "total_chars": len(text),
                "total_pages": num_pages
            }

    except Exception as e:
        # Deliberate best-effort boundary: report the failure as data
        return {
            "success": False,
            "error": str(e),
            "text": "",
            "metadata": {},
            "total_chars": 0,
            "total_pages": 0
        }

## 4. Groq Extraction Function
Fungsi untuk mengekstrak informasi dari teks menggunakan Groq API

In [None]:
def extract_with_groq(text: str, model: str = "llama-3.1-8b-instant", max_chars: int = 8000) -> Dict:
    """
    Extract structured information from document text using the Groq API.

    Args:
        text (str): Text to analyse.
        model (str): Groq model to use (default: llama-3.1-8b-instant).
        max_chars (int): Maximum number of characters of ``text`` sent to the
            model; anything beyond it is dropped (default: 8000).

    Returns:
        Dict: On success, the JSON object returned by the model with
        ``model_used`` and ``success`` (True) added. On failure, a dict with
        ``success`` False, an ``error`` message and ``model_used``; when the
        response could not be used as a JSON object, ``raw_response`` holds
        the raw content. Exceptions are never raised to the caller.
    """

    # Nothing to analyse: do not spend an API call on an empty prompt
    if not text or not text.strip():
        return {
            "success": False,
            "error": "No text to analyze: input is empty",
            "model_used": model
        }

    # Slicing is a no-op when the text is already short enough
    truncated_text = text[:max_chars]

    prompt = f"""
    Anda adalah asisten ahli dalam analisis dokumen. Analisis teks berikut dari dokumen PDF dan ekstrak informasi berikut dalam format JSON yang valid:
    
    1. **judul_dokumen**: Judul atau nama dokumen
    2. **ringkasan**: Ringkasan isi dokumen dalam Bahasa Indonesia (maksimal 150 kata)
    3. **topik_utama**: Topik utama yang dibahas
    4. **tanggal_tersimpan**: Semua tanggal yang disebutkan dalam format YYYY-MM-DD
    5. **nama_penting**: Nama-nama orang, perusahaan, atau institusi penting
    6. **angka_penting**: Angka-angka atau nilai penting (revenue, profit, jumlah, dll)
    7. **kategori_dokumen**: Kategori dokumen (laporan, kontrak, surat, dll)
    8. **kata_kunci**: 5-7 kata kunci utama
    9. **bahasa**: Bahasa utama dokumen
    10. **jenis_file**: Tipe dokumen (laporan keuangan, surat resmi, dll)
    
    Teks PDF:
    {truncated_text}
    
    Format JSON Response:
    {{
        "judul_dokumen": "...",
        "ringkasan": "...",
        "topik_utama": "...",
        "tanggal_tersimpan": ["YYYY-MM-DD", ...],
        "nama_penting": ["...", ...],
        "angka_penting": ["...", ...],
        "kategori_dokumen": "...",
        "kata_kunci": ["...", ...],
        "bahasa": "...",
        "jenis_file": "...",
        "confidence": 0.95
    }}
    
    Pastikan JSON yang dihasilkan valid dan tidak ada karakter khusus yang mengganggu format.
    """

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "Anda adalah asisten ahli dalam analisis dokumen PDF. Berikan respons dalam format JSON yang valid dan akurat. Gunakan Bahasa Indonesia untuk semua respons kecuali untuk istilah teknis."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=0.2,
            max_tokens=1000,
            response_format={"type": "json_object"}
        )

        content = response.choices[0].message.content

        # Parse JSON response; TypeError covers a missing (None) message content
        try:
            result = json.loads(content)
        except (json.JSONDecodeError, TypeError) as e:
            return {
                "success": False,
                "error": f"Invalid JSON response: {str(e)}",
                "raw_response": content,
                "model_used": model
            }

        # Valid JSON that is not an object cannot carry the bookkeeping keys below
        if not isinstance(result, dict):
            return {
                "success": False,
                "error": "Invalid JSON response: expected a JSON object",
                "raw_response": content,
                "model_used": model
            }

        result["model_used"] = model
        result["success"] = True
        return result

    except Exception as e:
        # Deliberate best-effort boundary: API/network failures are reported as data
        return {
            "success": False,
            "error": str(e),
            "model_used": model
        }

## 5. Create Sample PDF for Testing

In [None]:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch

def create_sample_pdf(filename: str = "sample_report_groq.pdf") -> str:
    """Write a sample quarterly financial report PDF to *filename* and return that path."""

    pdf = canvas.Canvas(filename, pagesize=letter)
    _, page_height = letter

    left_margin = 1*inch
    bottom_margin = 1*inch
    line_step = 0.25*inch

    # Title line followed by two sub-header lines in a smaller font
    pdf.setFont("Helvetica-Bold", 18)
    pdf.drawString(left_margin, page_height - 1*inch, "LAPORAN KEUANGAN TRIWULANAN")

    pdf.setFont("Helvetica", 14)
    for offset, subtitle in (
        (1.3, "PT United Tractors Tbk"),
        (1.6, "Periode: Januari - Maret 2024"),
    ):
        pdf.drawString(left_margin, page_height - offset*inch, subtitle)

    # Body lines; empty strings act as vertical spacing
    body_lines = [
        "RINGKASAN EKSEKUTIF",
        "",
        "Perusahaan mencatat kinerja yang solid pada triwulan pertama 2024:",
        "• Revenue: Rp 15,7 triliun (naik 25% YoY)",
        "• Net Profit: Rp 2,8 triliun (naik 30% YoY)",
        "• EBITDA: Rp 4,2 triliun",
        "• Jumlah unit terjual: 1.250 unit",
        "• Pangsa pasar: 45% di sektor alat berat",
        "",
        "PENCAPAIAN STRATEGIS",
        "",
        "1. Ekspansi ke Kalimantan Timur",
        "2. Kerja sama strategis dengan PT Pertamina",
        "3. Peluncuran produk baru: Excavator EC950F",
        "4. Target tahun 2024: Rp 75 triliun revenue",
        "",
        "TIM MANAJEMEN",
        "",
        "• Direktur Utama: Frans Kesuma",
        "• Direktur Keuangan: Sarah Wijaya",
        "• Tanggal laporan: 30 April 2024",
        "• Kantor pusat: Jakarta, Indonesia"
    ]

    pdf.setFont("Helvetica", 11)
    cursor_y = page_height - 2.5*inch

    for entry in body_lines:
        # Start a new page once the cursor drops below the bottom margin
        if cursor_y < bottom_margin:
            pdf.showPage()
            cursor_y = page_height - 1*inch

        # All-uppercase lines are section headings and are drawn in bold
        is_heading = bool(entry) and entry.isupper()
        font_name, font_size = ("Helvetica-Bold", 12) if is_heading else ("Helvetica", 11)
        pdf.setFont(font_name, font_size)

        pdf.drawString(left_margin, cursor_y, entry)
        cursor_y -= line_step

    pdf.save()
    print(f"Sample PDF created: {filename}")
    return filename

# Generate the sample PDF when this cell runs and keep its path in `sample_pdf`,
# which the demo cell passes to process_single_pdf()
sample_pdf = create_sample_pdf()
print(f"Sample file ready: {sample_pdf}")

## 6. Setup Environment Variables

In [None]:
# Report whether GROQ_API_KEY is available; print setup hints when it is not
if os.getenv('GROQ_API_KEY'):
    print("GROQ_API_KEY found!")
else:
    # One print call; sep="\n" puts each hint on its own line
    print(
        "GROQ_API_KEY not found!",
        "\nPlease create a .env file with your Groq API key:",
        "echo GROQ_API_KEY=your_actual_api_key_here > .env",
        "\nOr set it directly:",
        "import os",
        "os.environ['GROQ_API_KEY'] = 'your_actual_api_key_here'",
        sep="\n",
    )

## 7. Demo Ekstraksi PDF

In [None]:
def process_single_pdf(file_path: str, model: str = "llama-3.1-8b-instant") -> Dict:
    """
    Read one PDF file and run Groq-based extraction on its text.

    Args:
        file_path (str): Path to the PDF file.
        model (str): Groq model to use.

    Returns:
        Dict: When the PDF could be read, a dict with ``file_path``,
        ``pdf_metadata``, ``text_length``, ``total_pages``, ``extraction``
        (the result of ``extract_with_groq``) and ``processing_time``
        (currently always None). When reading failed, the failure dict from
        ``read_pdf`` with ``file_path`` added.
    """
    print(f"Processing: {file_path}")

    # Read PDF
    pdf_data = read_pdf(file_path)

    if not pdf_data["success"]:
        print(f"Error reading PDF: {pdf_data['error']}")
        # Tag the failure with its source path so batch callers can tell which file failed
        return {**pdf_data, "file_path": file_path}

    print(f"PDF loaded: {pdf_data['total_pages']} pages, {pdf_data['total_chars']} characters")

    # Extract information with Groq
    extraction_result = extract_with_groq(pdf_data["text"], model)

    return {
        "file_path": file_path,
        "pdf_metadata": pdf_data["metadata"],
        "text_length": pdf_data["total_chars"],
        "total_pages": pdf_data["total_pages"],
        "extraction": extraction_result,
        "processing_time": None  # Timing is not measured
    }

# Demo: process the sample PDF, print the result and save it as JSON.
# Skipped with a warning when no API key is configured.
if not os.getenv('GROQ_API_KEY'):
    print("⚠️  Cannot run demo - GROQ_API_KEY not configured")
else:
    try:
        result = process_single_pdf(sample_pdf)

        print("\n RESULTS:")
        print("=" * 50)
        print(json.dumps(result, indent=2, ensure_ascii=False))

        # Persist the same structure to disk as UTF-8 JSON
        with open('groq_extraction_result.json', 'w', encoding='utf-8') as out_file:
            json.dump(result, out_file, indent=2, ensure_ascii=False)
        print(" Results saved to: groq_extraction_result.json")

    except Exception as exc:
        print(f"Error during processing: {exc}")

## 8. Batch Processing untuk Multiple PDFs

In [None]:
import glob
import os

def process_multiple_pdfs(directory_path: str, model: str = "llama-3.1-8b-instant", delay_seconds: float = 1.0) -> List[Dict]:
    """
    Process every ``*.pdf`` file directly inside a directory.

    Files are processed in sorted path order. A failure on one file is
    recorded in the result list and does not stop the batch.

    Args:
        directory_path (str): Path to the folder containing PDF files.
        model (str): Groq model to use.
        delay_seconds (float): Pause between consecutive files, intended to
            avoid rate limiting (default: 1.0). No pause follows the last file.

    Returns:
        List[Dict]: One entry per PDF found: the ``process_single_pdf`` result,
        or ``{"file_path", "error", "success": False}`` when processing raised.
        Empty when the directory contains no PDF files.
    """
    import time

    # Escape the directory part so characters such as "[" in a folder name are
    # matched literally instead of being read as glob syntax
    pattern = os.path.join(glob.escape(directory_path), "*.pdf")
    pdf_files = sorted(glob.glob(pattern))

    if not pdf_files:
        print(f"No PDF files found in {directory_path}")
        return []

    total = len(pdf_files)
    print(f"Found {total} PDF files")

    results = []

    for i, pdf_file in enumerate(pdf_files, 1):
        # Pause between requests only: never before the first file or after the last
        if i > 1 and delay_seconds > 0:
            time.sleep(delay_seconds)

        print(f"\n[{i}/{total}] Processing: {os.path.basename(pdf_file)}")

        try:
            results.append(process_single_pdf(pdf_file, model))
        except Exception as e:
            # Keep the batch going; record which file failed and why
            results.append({
                "file_path": pdf_file,
                "error": str(e),
                "success": False
            })

    return results

# Example usage (uncomment to use)
# results = process_multiple_pdfs("./sample_pdfs")
# print(f"Processed {len(results)} files")

## 9. Available Groq Models

In [None]:
# Catalogue of Groq models suggested for PDF extraction, keyed by model id.
# Each entry holds a short description, a "max_tokens" figure and a relative speed label.
# In the code shown here the dict is only printed; it is not used to validate `model` arguments.
# NOTE(review): model ids and token limits are hard-coded and hosted models get retired —
# confirm against Groq's current model list before relying on them.
groq_models = {
    "llama-3.1-8b-instant": {
        "description": "Fast and efficient for basic extraction",
        "max_tokens": 8192,
        "speed": "Fast"
    },
    "llama-3.1-70b-versatile": {
        "description": "More accurate for complex documents",
        "max_tokens": 8192,
        "speed": "Medium"
    },
    "mixtral-8x7b-32768": {
        "description": "Good balance of speed and accuracy",
        "max_tokens": 32768,
        "speed": "Medium"
    }
}

# Print one bulleted section per model
print("Available Groq Models:")
for model, info in groq_models.items():
    print(f"\n• {model}")
    print(f"  Description: {info['description']}")
    print(f"  Max Tokens: {info['max_tokens']}")
    print(f"  Speed: {info['speed']}")

## 10. Advanced Usage Examples

In [None]:
# Example: Process specific PDF from custom path
def process_pdf_from_path(pdf_path: str, model: str = "llama-3.1-8b-instant"):
    """
    Validate a path and, if it looks like an existing PDF, process it.

    Args:
        pdf_path (str): Full path to the PDF file.
        model (str): Model to use.

    Returns:
        The result of ``process_single_pdf`` for a valid path, or None (after
        printing a message) when the path does not exist or does not end in
        ``.pdf`` (case-insensitive).
    """

    if os.path.exists(pdf_path):
        if pdf_path.lower().endswith('.pdf'):
            return process_single_pdf(pdf_path, model)
        print("File must be a PDF")
    else:
        print(f"File not found: {pdf_path}")

    return None

# Example usage
# custom_pdf_path = "/path/to/your/document.pdf"
# result = process_pdf_from_path(custom_pdf_path)

# Example: Save results to different formats
def _join_values(value) -> str:
    """Render a list-like or scalar extraction field as one comma-separated string."""
    if value is None:
        return ""
    if isinstance(value, str):
        # A bare string must not be split into characters by join()
        return value
    if isinstance(value, (list, tuple)):
        # The model may return numbers (e.g. in angka_penting); join() needs strings
        return ", ".join(str(item) for item in value)
    return str(value)


def save_results_to_csv(results: List[Dict], filename: str = "extraction_results.csv"):
    """
    Save successful extraction results to a CSV file.

    Only results whose ``extraction`` reports success are written; each becomes
    one row with the document fields, with list fields flattened to
    comma-separated strings. When no result qualifies, no file is written and
    a message is printed instead.

    Args:
        results (List[Dict]): Results as returned by ``process_single_pdf``.
        filename (str): Destination CSV path (default: extraction_results.csv).
    """
    import pandas as pd

    flattened_results = []
    for result in results:
        extraction = result.get("extraction") or {}
        if not extraction.get("success"):
            continue

        flattened_results.append({
            "file_name": os.path.basename(result.get("file_path", "")),
            "judul_dokumen": extraction.get("judul_dokumen", ""),
            "ringkasan": extraction.get("ringkasan", ""),
            "topik_utama": extraction.get("topik_utama", ""),
            "kategori_dokumen": extraction.get("kategori_dokumen", ""),
            "tanggal_tersimpan": _join_values(extraction.get("tanggal_tersimpan", [])),
            "nama_penting": _join_values(extraction.get("nama_penting", [])),
            "angka_penting": _join_values(extraction.get("angka_penting", [])),
            "total_pages": result.get("total_pages", 0),
            "text_length": result.get("text_length", 0)
        })

    if flattened_results:
        df = pd.DataFrame(flattened_results)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Results saved to CSV: {filename}")
    else:
        print("No valid results to save")

# Example: Batch processing with progress bar
from tqdm import tqdm

def process_batch_with_progress(directory_path: str, model: str = "llama-3.1-8b-instant", delay_seconds: float = 1.0) -> List[Dict]:
    """
    Process every ``*.pdf`` file directly inside a directory, showing a tqdm progress bar.

    Files are processed in sorted path order. A failure on one file is
    recorded in the result list and does not stop the batch.

    Args:
        directory_path (str): Path to the folder containing PDF files.
        model (str): Groq model to use.
        delay_seconds (float): Pause between consecutive files, intended to
            avoid rate limiting (default: 1.0). No pause follows the last file.

    Returns:
        List[Dict]: One entry per PDF found: the ``process_single_pdf`` result,
        or ``{"file_path", "error", "success": False}`` when processing raised.
        Empty when the directory contains no PDF files.
    """
    import time

    # Escape the directory part so characters such as "[" in a folder name are
    # matched literally instead of being read as glob syntax
    pattern = os.path.join(glob.escape(directory_path), "*.pdf")
    pdf_files = sorted(glob.glob(pattern))

    if not pdf_files:
        print("No PDF files found")
        return []

    results = []

    for index, pdf_file in enumerate(tqdm(pdf_files, desc="Processing PDFs")):
        # Pause between requests only: never before the first file or after the last
        if index > 0 and delay_seconds > 0:
            time.sleep(delay_seconds)

        try:
            results.append(process_single_pdf(pdf_file, model))
        except Exception as e:
            # Keep the batch going; record which file failed and why
            results.append({
                "file_path": pdf_file,
                "error": str(e),
                "success": False
            })

    return results

## 11. Setup Instructions

In [None]:
# Print the setup guide in one call; sep="\n" puts each entry on its own line
# and the empty strings produce the blank separator lines.
print(
    " Setup Instructions for PDF Extraction with Groq",
    "=" * 50,
    "",
    "1. Get Groq API Key:",
    "   • Go to https://console.groq.com/keys",
    "   • Create new API key",
    "",
    "2. Create .env file:",
    "   echo GROQ_API_KEY=your_actual_api_key_here > .env",
    "",
    "3. Install dependencies:",
    "   pip install groq PyPDF2 python-dotenv reportlab pandas tqdm",
    "",
    "4. Basic usage:",
    "   from notebook.pdf_extraction_groq import process_pdf_from_path",
    "   result = process_pdf_from_path('path/to/your/file.pdf')",
    "",
    "5. Available models:",
    "   • llama-3.1-8b-instant (fastest)",
    "   • llama-3.1-70b-versatile (most accurate)",
    "   • mixtral-8x7b-32768 (balanced)",
    sep="\n",
)

## 12. Error Handling & Troubleshooting

In [None]:
# Common error handling
def safe_process_pdf(file_path: str, model: str = "llama-3.1-8b-instant"):
    """
    Process a PDF and report every failure as a dict instead of raising.

    Args:
        file_path (str): Path to the PDF file.
        model (str): Groq model to use.

    Returns:
        The ``process_single_pdf`` result when extraction succeeded; otherwise
        a dict with ``success`` False and an ``error`` message (plus
        ``details`` holding the full result when extraction itself failed).
    """
    size_limit = 50 * 1024 * 1024  # 50MB limit

    try:
        # Guard: the path must exist
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}", "success": False}

        # Guard: reject files above the size limit
        if os.path.getsize(file_path) > size_limit:
            return {"error": "File too large (>50MB)", "success": False}

        outcome = process_single_pdf(file_path, model)

        # Anything without a successful extraction is wrapped as a failure
        extraction = outcome.get("extraction", {})
        if extraction.get("success"):
            return outcome
        return {"error": "Extraction failed", "details": outcome, "success": False}

    except Exception as exc:
        return {"error": str(exc), "success": False}

print("Error handling functions ready!")