# PDF Extraction dengan OpenAI API

Notebook ini menunjukkan cara mengekstrak konten PDF menggunakan OpenAI API secara native.

## 1. Install Dependencies

In [None]:
# Install required packages
import subprocess
import sys

# pip distribution names required by this notebook ("reportlab" is used by
# the sample-PDF cell).
packages = ["openai", "PyPDF2", "python-dotenv", "reportlab"]

# Distribution names whose importable module name differs. Probing the pip
# name directly ("python-dotenv") always raises ImportError, which would
# trigger a reinstall on every run.
module_names = {"python-dotenv": "dotenv"}

for package in packages:
    try:
        __import__(module_names.get(package, package))
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

## 2. Import Libraries

In [None]:
import os
import json
from openai import OpenAI
import PyPDF2
from dotenv import load_dotenv

# Populate the process environment via python-dotenv
load_dotenv()

# Build the OpenAI client from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

print("Setup complete!")

## 3. PDF Reader Function

In [None]:
def read_pdf(file_path):
    """Read a PDF file and return its text content.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages separated by blank lines, with leading and
        trailing whitespace stripped, or ``None`` if the file could not be
        opened or parsed (the error is printed).
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # ``or ""`` keeps a page that yields no text from raising during
            # concatenation and discarding the whole document. Pages are
            # collected while the file is still open.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        # join + strip gives the same result as appending "\n\n" after every
        # page and stripping, without repeated string reallocation.
        return "\n\n".join(page_texts).strip()
    except Exception as e:
        # Deliberately broad: callers treat a None return as "could not read".
        print(f"Error reading PDF: {e}")
        return None

## 4. OpenAI Extraction Function

In [None]:
def _parse_json_reply(content):
    """Decode a model reply as JSON, tolerating code fences or prose around it."""
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span of the reply.
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(content[start:end + 1])


def extract_with_openai(text, model="gpt-3.5-turbo", max_chars=3000):
    """Extract structured information from document text using OpenAI.

    Args:
        text: Document text to analyse. ``None``, empty, or whitespace-only
            input is rejected without calling the API.
        model: Chat model name passed to the API.
        max_chars: Maximum number of characters of ``text`` placed in the
            prompt; longer text is truncated.

    Returns:
        The parsed JSON reply on success. On any failure, a dict with an
        ``"error"`` key; when the reply was received but was not valid JSON,
        the dict also carries the reply under ``"raw_response"``.
    """
    # read_pdf() returns None on failure; slicing None would raise before the
    # try block below, so reject unusable input up front.
    if not text or not text.strip():
        return {"error": "No text provided for extraction"}

    snippet = text[:max_chars]
    # Only signal truncation to the model when the text was actually cut.
    ellipsis = "..." if len(text) > max_chars else ""

    prompt = f"""
    Analisis teks berikut dari dokumen PDF dan ekstrak informasi berikut:
    
    1. Judul dokumen
    2. Ringkasan isi (maksimal 100 kata)
    3. Topik utama
    4. Tanggal yang disebutkan
    5. Nama-nama penting
    6. Angka penting
    7. Kategori dokumen
    8. Kata kunci utama (5 kata)
    
    Format JSON:
    {{
        "title": "...",
        "summary": "...",
        "main_topic": "...",
        "dates": [...],
        "important_names": [...],
        "important_numbers": [...],
        "document_type": "...",
        "keywords": [...]
    }}
    
    Teks: {snippet}{ellipsis}
    """

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Anda adalah asisten ahli dalam analisis dokumen."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=1000
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Any API or client failure is reported through the error dict.
        return {"error": str(e)}

    if not content:
        return {"error": "Empty response from model"}

    try:
        return _parse_json_reply(content)
    except json.JSONDecodeError as e:
        # Keep the raw reply so the caller can inspect what the model sent.
        return {"error": f"Invalid JSON in model response: {e}", "raw_response": content}

## 5. Create Sample PDF

In [None]:
# Create sample PDF for testing
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

def create_sample_pdf(output_path="sample_report.pdf"):
    """Create a small sample PDF report for testing.

    Args:
        output_path: Where to write the PDF. Defaults to
            ``"sample_report.pdf"`` in the current working directory.

    Returns:
        The path the PDF was written to.

    Raises:
        subprocess.CalledProcessError: If reportlab is missing and the
            on-demand ``pip install`` fails.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
    except ImportError:
        # Install on demand, then retry every name the function needs.
        import subprocess
        import sys
        subprocess.check_call([sys.executable, "-m", "pip", "install", "reportlab"])
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas

    c = canvas.Canvas(output_path, pagesize=letter)

    # Header: title in bold, then company name and reporting period.
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, 700, "Laporan Keuangan Triwulanan")
    c.setFont("Helvetica", 12)
    c.drawString(100, 680, "PT Teknologi Maju Sejahtera")
    c.drawString(100, 660, "Periode: Januari - Maret 2024")

    content = [
        "Ringkasan Eksekutif:",
        "Perusahaan mencatat pertumbuhan revenue sebesar 25%",
        "dengan total revenue mencapai Rp 15,7 miliar.",
        "Net profit: Rp 2,8 miliar",
        "Jumlah pelanggan: 1.250 aktif",
        "Direktur: Budi Santoso",
        "Tanggal: 15 April 2024",
    ]

    # Body lines start at y=620 and step down 20 units per line.
    for index, line in enumerate(content):
        c.drawString(100, 620 - 20 * index, line)

    c.save()
    return output_path

# Generate the sample file and report the path that was returned
sample_pdf = create_sample_pdf()
print("Created: " + str(sample_pdf))

## 6. Demo Eksekusi

In [None]:
# Run the full demo: read the sample PDF, extract fields, print and save them.
if not os.getenv('OPENAI_API_KEY'):
    print("Please set OPENAI_API_KEY in .env file")
else:
    # Read PDF
    text = read_pdf(sample_pdf)
    if not text:
        # read_pdf returns None on failure and "" when no text was found;
        # say so instead of finishing silently.
        print("No text could be extracted from the PDF")
    else:
        print(f"Text length: {len(text)} characters")

        # Extract information
        result = extract_with_openai(text)

        # Display results (ensure_ascii=False keeps non-ASCII text readable)
        print("\nEXTRACTION RESULTS:")
        print(json.dumps(result, indent=2, ensure_ascii=False))

        # Save to file
        with open('extraction_result.json', 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print("Saved to extraction_result.json")

## 7. Usage Instructions

### Setup
1. Buat file `.env` dengan konten:
   ```
   OPENAI_API_KEY=your_actual_api_key_here
   ```

### Basic Usage
```python
text = read_pdf("your_file.pdf")
result = extract_with_openai(text)
```

### Available Models
- `gpt-3.5-turbo` (default)
- `gpt-4` (more accurate, higher cost)