diff --git a/demos/extractor/.gitignore b/demos/extractor/.gitignore new file mode 100644 index 000000000..d55627f3e --- /dev/null +++ b/demos/extractor/.gitignore @@ -0,0 +1,20 @@ +# Dependencies +node_modules/ +package-lock.json + +# Upload directory +uploads/ + +# Logs +*.log +npm-debug.log* + +# OS files +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +*.swp +*.swo diff --git a/demos/extractor/README.md b/demos/extractor/README.md new file mode 100644 index 000000000..d8e78d0e7 --- /dev/null +++ b/demos/extractor/README.md @@ -0,0 +1,90 @@ +# PDF Data Extractor Demo + +This demo application allows you to extract structured data from PDF documents using JSON schemas and AI models. + +## Features + +- 📄 Upload and process PDF files +- 📋 Define custom JSON schemas for data extraction +- đŸŽ¯ Pre-built schema examples (Invoice, Receipt, Form) +- 📊 View extracted data with token usage statistics +- âš™ī¸ Configurable temperature and model selection + +## Prerequisites + +Before running this demo, you need: + +1. **Node.js** (version 18 or higher) +2. **Docker Model Runner** +3. **A suitable AI model** for text extraction + +## Setup Instructions + +### 1. Enable Docker Model Runner + +**Using Docker Desktop:** +- Open Docker Desktop settings +- Go to the **AI** tab +- Select **Enable Docker Model Runner** +- Enable **host-side TCP support** on port `12434` (default) + +For detailed instructions, see the [Docker Model Runner documentation](https://docs.docker.com/ai/model-runner/get-started/#enable-docker-model-runner). + +**Using Standalone Docker Engine:** +TCP support is enabled by default on port `12434`. + +#### 2. Pull a Suitable Model + +You'll need a model capable of understanding and extracting text. Recommended models: + +```bash +# Pull a general-purpose model +docker model pull ai/gemma3 +``` + +To see available models, visit [Docker Hub - AI Models](https://hub.docker.com/r/ai). + +## Installation + +1. 
**Navigate to the demo directory:** + ```bash + cd demos/extractor + ``` + +2. **Install dependencies:** + ```bash + npm install + ``` + +3. **Start the server:** + ```bash + npm start + ``` + + The server will start on `http://localhost:3000` + +4. **Open the demo:** + Open `demo.html` in your web browser (you can simply double-click the file or serve it with a local server) + +## Usage Guide + +### Basic Workflow + +1. **Configure API Settings** + - **Base API URL**: Set to `http://127.0.0.1:12434/engines/v1` for Docker Model Runner + - **Model**: Select from available models + +2. **Define Your Schema** + - Use the provided examples (Invoice, Receipt, Form) or create your own + - The schema defines what data to extract from the PDF + - Use standard JSON Schema format with `type`, `properties`, etc. + +3. **Upload a PDF** + - Click "Choose File" and select your PDF document + - Supported: Any text-based PDF (not scanned images without OCR) + - You can use sample PDFs [invoice.pdf](invoice.pdf) + +4. **Extract Data** + - Click "Extract Data" button + - Wait for processing (may take 10-30 seconds depending on PDF size and model) + - View extracted data in the result section diff --git a/demos/extractor/demo.html b/demos/extractor/demo.html new file mode 100644 index 000000000..e4d113415 --- /dev/null +++ b/demos/extractor/demo.html @@ -0,0 +1,447 @@ + + + + + + PDF Data Extractor Demo + + + + +

PDF Data Extractor Demo

+

Extract structured data from PDF documents using JSON schemas and AI models

+ + +
+
🔧 API Configuration
+ +
+ + +
+ â„šī¸ To pull a model, run: docker model pull <model-name>
+ Find more models at: https://hub.docker.com/u/ai +
+
+ +
+ + +
+
+ +
+
+ + +
+
+
+ + +
+
📋 JSON Schema
+ +
+ + +
+ Quick examples: + + + +
+
+
+ + +
+
📄 PDF Upload
+ +
+ + +
+ +
+ +
+
+ + +
+
📊 Extraction Result
+
Upload a PDF and click "Extract Data" to see results...
+
+ + + + + diff --git a/demos/extractor/invoice.pdf b/demos/extractor/invoice.pdf new file mode 100644 index 000000000..376327a73 Binary files /dev/null and b/demos/extractor/invoice.pdf differ diff --git a/demos/extractor/package.json b/demos/extractor/package.json new file mode 100644 index 000000000..20e343aca --- /dev/null +++ b/demos/extractor/package.json @@ -0,0 +1,27 @@ +{ + "name": "pdf-data-extractor-demo", + "version": "1.0.0", + "description": "Demo application for extracting structured data from PDFs using JSON schemas", + "main": "server.js", + "scripts": { + "start": "node server.js", + "dev": "nodemon server.js" + }, + "keywords": [ + "pdf", + "extraction", + "openai", + "json-schema" + ], + "author": "", + "license": "MIT", + "dependencies": { + "express": "^4.18.2", + "cors": "^2.8.5", + "multer": "2.0.2", + "pdf-data-extractor": "^1.0.1" + }, + "devDependencies": { + "nodemon": "^3.0.2" + } +} diff --git a/demos/extractor/server.js b/demos/extractor/server.js new file mode 100644 index 000000000..8d5038bbb --- /dev/null +++ b/demos/extractor/server.js @@ -0,0 +1,162 @@ +const express = require('express'); +const cors = require('cors'); +const multer = require('multer'); +const { PdfDataExtractor } = require('pdf-data-extractor'); +const fs = require('fs').promises; +const path = require('path'); + +const app = express(); +const PORT = process.env.PORT || 3000; +const UPLOADS_DIR = 'uploads/'; + +// Middleware +app.use(cors()); +app.use(express.json()); + +// Configure multer for file upload +const upload = multer({ + dest: UPLOADS_DIR, + limits: { fileSize: 10 * 1024 * 1024 } // 10MB limit +}); + +// Health check endpoint +app.get('/health', (req, res) => { + res.json({ status: 'ok', message: 'PDF Data Extractor Demo Server' }); +}); + +// Fetch available models from the API +app.post('/api/models', async (req, res) => { + try { + const { baseUrl } = req.body; + + if (!baseUrl) { + return res.status(400).json({ error: 'Base URL is required' }); + 
} + + const response = await fetch(`${baseUrl}/models`); + + if (!response.ok) { + return res.status(response.status).json({ + error: `Failed to fetch models: ${response.statusText}` + }); + } + + const data = await response.json(); + res.json(data); + } catch (error) { + console.error('Error fetching models:', error); + res.status(500).json({ + error: 'Failed to fetch models', + message: error.message + }); + } +}); + +// Extract data from PDF +app.post('/api/extract', upload.single('pdf'), async (req, res) => { + let pdfPath = null; + + try { + // Validate request + if (!req.file) { + return res.status(400).json({ error: 'No PDF file provided' }); + } + + // Store file path for cleanup in finally block + pdfPath = req.file.path; + + const { schema, baseUrl, model, apiKey, temperature, maxTokens } = req.body; + + if (!schema) { + return res.status(400).json({ error: 'No schema provided' }); + } + + if (!baseUrl) { + return res.status(400).json({ error: 'No base URL provided' }); + } + + if (!model) { + return res.status(400).json({ error: 'No model provided' }); + } + + // Parse schema + let parsedSchema; + try { + parsedSchema = JSON.parse(schema); + } catch (error) { + return res.status(400).json({ + error: 'Invalid JSON schema', + message: error.message + }); + } + + // Initialize extractor with provided configuration + const extractor = new PdfDataExtractor({ + openaiApiKey: apiKey || 'not-required-for-local-models', + model: model, + baseUrl: baseUrl + }); + + const extractOptions = { + pdfPath: pdfPath, + schema: parsedSchema + }; + + // Add optional parameters if provided + if (temperature !== undefined && temperature !== '') { + extractOptions.temperature = parseFloat(temperature); + } + if (maxTokens !== undefined && maxTokens !== '') { + extractOptions.maxTokens = parseInt(maxTokens); + } + + console.log(`Extracting data from PDF using model: ${model}`); + const result = await extractor.extract(extractOptions); + + // Return results + res.json({ + 
success: true, + data: result.data, + tokensUsed: result.tokensUsed, + model: result.model + }); + + } catch (error) { + console.error('Error extracting data:', error); + + res.status(500).json({ + success: false, + error: 'Failed to extract data from PDF', + message: error.message + }); + } finally { + // Always clean up uploaded file + if (pdfPath) { + try { + await fs.unlink(pdfPath); + console.log(`Cleaned up uploaded file: ${pdfPath}`); + } catch (cleanupError) { + console.error('Error cleaning up file:', cleanupError); + } + } + } +}); + +// Initialize server +(async () => { + try { + // Create uploads directory before starting server + const uploadsDir = path.join(__dirname, UPLOADS_DIR); + await fs.mkdir(uploadsDir, { recursive: true }); + console.log(`Uploads directory ready: ${uploadsDir}`); + + // Start server only after uploads directory is ready + app.listen(PORT, () => { + console.log(`PDF Data Extractor Demo Server running on http://localhost:${PORT}`); + console.log(`Upload endpoint: http://localhost:${PORT}/api/extract`); + }); + } catch (error) { + console.error('Failed to initialize server:', error); + process.exit(1); + } +})();