ModuleNotFoundError: No module named 'PIL'

In [4]:
!pip install requests PyMuPDF Pillow

Collecting Pillow
  Downloading pillow-11.2.1-cp310-cp310-manylinux_2_28_aarch64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: Pillow
Successfully installed Pillow-11.2.1
[0m
[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip


In [8]:
"""
Mistral OCR API Python Implementation

Required installations:
pip install requests PyMuPDF Pillow

PyMuPDF renders each PDF page to a PNG image because this client sends
documents to the chat-completions endpoint as image inputs, not as PDF files.
"""

import base64
import io
import json
import os
from pathlib import Path
from typing import Optional, Dict, Any, List

import fitz  # PyMuPDF
import requests
from PIL import Image

class MistralOCR:
    """
    Small client that performs OCR by sending images to Mistral's
    chat-completions endpoint together with a text instruction.

    PDF files are rendered page by page to PNG images with PyMuPDF and each
    page is sent as its own request; the per-page answers are concatenated.
    """

    # File extension -> MIME type used in the ``data:`` URL of the request.
    # Unknown extensions fall back to JPEG (the historical behaviour of
    # process_image_ocr).
    _MIME_TYPES = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.webp': 'image/webp',
    }
    _FALLBACK_MIME_TYPE = 'image/jpeg'

    # Resolution used when a PDF is rendered by the high-level methods.
    _DEFAULT_DPI = 300

    # Text inserted into the combined output for a page that could not be
    # processed, so a failed page is visible instead of silently missing.
    _PAGE_ERROR_TEXT = "[Error processing this page]"

    def __init__(self, api_key: str, model: str = "pixtral-12b-2409", timeout: float = 120.0):
        """
        Initialize Mistral OCR client

        Args:
            api_key: Your Mistral API key
            model: Name of the model placed in every request payload
            timeout: Seconds passed to ``requests.post`` as ``timeout`` so a
                stalled connection raises instead of hanging forever
        """
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.base_url = "https://api.mistral.ai/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    # ------------------------------------------------------------------
    # Encoding helpers
    # ------------------------------------------------------------------

    def encode_image_to_base64(self, image_path: str) -> str:
        """
        Encode image file to base64 string

        Args:
            image_path: Path to the image file

        Returns:
            Base64 encoded string

        Raises:
            FileNotFoundError: If ``image_path`` does not exist
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def image_to_base64(self, image: "Image.Image", format: str = "PNG") -> str:
        """
        Convert PIL Image to base64 string

        Args:
            image: PIL Image object
            format: Image format (PNG, JPEG, etc.)

        Returns:
            Base64 encoded string
        """
        buffer = io.BytesIO()
        image.save(buffer, format=format)
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    @staticmethod
    def _mime_type_for(file_path: str) -> str:
        """Return the MIME type for ``file_path`` based on its (case-insensitive) extension."""
        extension = Path(file_path).suffix.lower()
        return MistralOCR._MIME_TYPES.get(extension, MistralOCR._FALLBACK_MIME_TYPE)

    # ------------------------------------------------------------------
    # PDF rendering
    # ------------------------------------------------------------------

    def _render_pdf_pages(self, pdf_path: str, dpi: int, max_pages: Optional[int]):
        """
        Render the first ``max_pages`` pages of a PDF (all pages when None).

        Returns:
            A ``(images, total_pages)`` tuple: the rendered PIL images and the
            number of pages in the document (which may exceed ``len(images)``).
        """
        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            if max_pages is None:
                page_limit = total_pages
            else:
                page_limit = max(0, min(total_pages, max_pages))

            # PDF user space is 72 units per inch, so dpi/72 is the zoom factor.
            zoom = dpi / 72
            matrix = fitz.Matrix(zoom, zoom)

            images = []
            for page_num in range(page_limit):
                pixmap = doc.load_page(page_num).get_pixmap(matrix=matrix)
                images.append(Image.open(io.BytesIO(pixmap.tobytes("png"))))
            return images, total_pages
        finally:
            # Close the document even when rendering a page raises.
            doc.close()

    def pdf_to_images(self, pdf_path: str, dpi: int = _DEFAULT_DPI,
                      max_pages: Optional[int] = None) -> "List[Image.Image]":
        """
        Convert PDF pages to images

        Args:
            pdf_path: Path to the PDF file
            dpi: Resolution for conversion (higher = better quality, larger file)
            max_pages: Render only the first ``max_pages`` pages; None renders all

        Returns:
            List of PIL Images, one per rendered page
        """
        images, _total_pages = self._render_pdf_pages(pdf_path, dpi, max_pages)
        return images

    # ------------------------------------------------------------------
    # Request helpers
    # ------------------------------------------------------------------

    def _build_payload(self, prompt: str, mime_type: str, image_base64: str) -> Dict[str, Any]:
        """Build the chat-completions payload: one user message with a text part and an image part."""
        return {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 4000,
            "temperature": 0.1
        }

    def _post(self, payload: Dict[str, Any]):
        """POST ``payload`` to the API with the configured headers and timeout; return the raw response."""
        return requests.post(self.base_url, headers=self.headers, json=payload, timeout=self.timeout)

    @staticmethod
    def _wrap_content(content: str) -> Dict[str, Any]:
        """Wrap combined text in the same ``choices[0].message.content`` shape the API returns."""
        return {
            "choices": [{
                "message": {
                    "content": content
                }
            }]
        }

    def _ocr_rendered_page(self, image: "Image.Image", prompt: str, page_number: int) -> Optional[str]:
        """
        Send one rendered PDF page and return the model's text.

        Failures are best-effort: an HTTP error status or a transport error
        (including a timeout) is printed and reported as None so the caller
        can continue with the remaining pages.
        """
        payload = self._build_payload(prompt, "image/png", self.image_to_base64(image, "PNG"))
        try:
            response = self._post(payload)
        except requests.RequestException as exc:
            print(f"Error processing page {page_number}: {exc}")
            return None

        if response.status_code != 200:
            print(f"Error processing page {page_number}: {response.status_code} - {response.text}")
            return None
        return response.json()['choices'][0]['message']['content']

    def _ocr_image_file(self, image_path: str, prompt: str) -> Dict[str, Any]:
        """
        Send one image file with ``prompt`` and return the decoded API response.

        Raises:
            FileNotFoundError: If the image file does not exist (raised before
                any request is made)
            RuntimeError: If the API answers with a non-200 status
        """
        image_base64 = self.encode_image_to_base64(image_path)
        mime_type = self._mime_type_for(image_path)

        response = self._post(self._build_payload(prompt, mime_type, image_base64))
        if response.status_code == 200:
            return response.json()
        raise RuntimeError(f"API request failed: {response.status_code} - {response.text}")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def process_pdf_from_file(self, pdf_path: str, output_format: str = "markdown", max_pages: int = 10) -> Dict[str, Any]:
        """
        Process PDF file using Mistral OCR by converting to images first

        Args:
            pdf_path: Path to the PDF file
            output_format: Output format ("markdown" or "text")
            max_pages: Maximum number of pages to process (to avoid token limits)

        Returns:
            Dict shaped like an API response: the text of all processed pages
            is under ``choices[0].message.content`` and the number of pages
            sent is under ``pages_processed``. A page that failed contributes
            the placeholder "[Error processing this page]".
        """
        print("Converting PDF to images...")
        # Only the pages that will actually be sent are rendered.
        images, total_pages = self._render_pdf_pages(pdf_path, self._DEFAULT_DPI, max_pages)

        if max_pages is not None and total_pages > max_pages:
            print(f"Warning: PDF has {total_pages} pages, processing first {max_pages} pages only")

        all_extracted_text = []
        for page_number, image in enumerate(images, start=1):
            print(f"Processing page {page_number}/{len(images)}...")
            prompt = (
                f"Extract all text from this PDF page and convert it to {output_format} format. "
                f"Preserve the structure, including tables, headers, and formatting. "
                f"This is page {page_number}."
            )
            page_text = self._ocr_rendered_page(image, prompt, page_number)
            if page_text is None:
                page_text = self._PAGE_ERROR_TEXT
            all_extracted_text.append(f"# Page {page_number}\n\n{page_text}\n\n")

        result = self._wrap_content("".join(all_extracted_text))
        result["pages_processed"] = len(images)
        return result

    def process_image_ocr(self, image_path: str, output_format: str = "markdown") -> Dict[str, Any]:
        """
        Process image file using Mistral OCR

        Args:
            image_path: Path to the image file
            output_format: Output format ("markdown" or "text")

        Returns:
            API response containing extracted content

        Raises:
            FileNotFoundError: If ``image_path`` does not exist
            RuntimeError: If the API answers with a non-200 status
        """
        prompt = f"Extract all text from this image and convert it to {output_format} format. Preserve the structure and formatting."
        return self._ocr_image_file(image_path, prompt)

    def extract_structured_data(self, file_path: str, structure_prompt: str, max_pages: int = 5) -> Dict[str, Any]:
        """
        Extract structured data from document with custom prompt

        Args:
            file_path: Path to the document file (``.pdf`` or an image)
            structure_prompt: Custom prompt for specific data extraction
            max_pages: Maximum pages to process for PDFs

        Returns:
            For a PDF: a dict with the per-page answers concatenated under
            ``choices[0].message.content`` (failed pages contribute the
            placeholder "[Error processing this page]").
            For an image: the decoded API response.

        Raises:
            FileNotFoundError: If an image ``file_path`` does not exist
            RuntimeError: If the API answers an image request with a non-200 status
        """
        if Path(file_path).suffix.lower() != '.pdf':
            # Any non-PDF path is treated as an image file.
            return self._ocr_image_file(file_path, structure_prompt)

        images, _total_pages = self._render_pdf_pages(file_path, self._DEFAULT_DPI, max_pages)

        all_structured_data = []
        for page_number, image in enumerate(images, start=1):
            prompt = f"{structure_prompt}\n\nThis is page {page_number} of the document."
            page_data = self._ocr_rendered_page(image, prompt, page_number)
            if page_data is None:
                page_data = self._PAGE_ERROR_TEXT
            all_structured_data.append(f"Page {page_number}:\n{page_data}\n\n")

        return self._wrap_content("".join(all_structured_data))

# Usage examples
def main(pdf_path: str = "EP18823397W1B9.pdf", image_path: str = "image.jpg"):
    """
    Run the three usage examples: PDF -> Markdown (written to ``output.md``),
    OCR of a single image, and structured-data extraction from the PDF.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.
    SECURITY: a literal key used to be embedded here; that key must be
    treated as compromised and rotated.

    Args:
        pdf_path: PDF used by examples 1 and 3
        image_path: Image used by example 2 (skipped when the file is missing)
    """
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("Error: MISTRAL_API_KEY environment variable is not set")
        return

    ocr_client = MistralOCR(api_key)

    try:
        # Example 1: Basic PDF to Markdown conversion
        print("Processing PDF to Markdown...")
        pdf_result = ocr_client.process_pdf_from_file(pdf_path, "markdown", max_pages=5)
        extracted_text = pdf_result['choices'][0]['message']['content']

        # Save the result
        with open("output.md", "w", encoding="utf-8") as f:
            f.write(extracted_text)
        print(f"PDF processed successfully! Processed {pdf_result.get('pages_processed', 'unknown')} pages.")

        # Example 2: Image OCR (skip if no image file)
        try:
            print("\nProcessing image...")
            image_result = ocr_client.process_image_ocr(image_path, "markdown")
            image_text = image_result['choices'][0]['message']['content']
            print("Extracted text from image:")
            # Show at most a 200-character preview.
            print((image_text[:200] + "...") if len(image_text) > 200 else image_text)
        except FileNotFoundError:
            print("Image file not found, skipping image processing example.")

        # Example 3: Structured data extraction
        print("\nExtracting structured data...")
        structure_prompt = """
        Extract the following information from this document and return it as JSON:
        - Title
        - Author(s)
        - Date
        - Key points (as a list)
        - Any tables (preserve structure)
        """

        structured_result = ocr_client.extract_structured_data(pdf_path, structure_prompt, max_pages=3)
        structured_data = structured_result['choices'][0]['message']['content']
        print("Structured data:")
        # Show at most a 500-character preview.
        print((structured_data[:500] + "...") if len(structured_data) > 500 else structured_data)

    except Exception as e:
        # Demo entry point: report any failure instead of raising a traceback.
        print(f"Error: {e}")

# Quick test function for single page
def quick_test(api_key: str, pdf_path: str) -> Optional[str]:
    """
    Quick test function to process just the first page of a PDF

    Args:
        api_key: Mistral API key used to build the client
        pdf_path: Path to the PDF file

    Returns:
        The text extracted from the first page, or None when the PDF has no
        pages, the API answers with a non-200 status, or any exception occurs
        (the problem is printed in each case).
    """
    ocr_client = MistralOCR(api_key)

    try:
        print("Converting first page to image...")
        # pdf_to_images is called with the path only, so it renders the
        # document's pages before the first one is picked below.
        images = ocr_client.pdf_to_images(pdf_path)
        if not images:
            print("No images found in PDF")
            return None

        # Process only first page
        first_page = images[0]
        image_base64 = ocr_client.image_to_base64(first_page, "PNG")

        payload = {
            "model": "pixtral-12b-2409",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract all text from this PDF page and convert it to markdown format. Preserve the structure."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 4000,
            "temperature": 0.1
        }

        # Without a timeout requests waits indefinitely on a stalled connection.
        response = requests.post(
            ocr_client.base_url,
            headers=ocr_client.headers,
            json=payload,
            timeout=120,
        )

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            return None

        result = response.json()
        extracted_text = result['choices'][0]['message']['content']
        print("First page extracted successfully!")
        print("Preview:")
        # Show at most a 300-character preview.
        print((extracted_text[:300] + "...") if len(extracted_text) > 300 else extracted_text)
        return extracted_text

    except Exception as e:
        # Best-effort helper: report the failure instead of raising.
        print(f"Error in quick test: {e}")
        return None

# Alternative using the official Mistral client (if available)
def alternative_with_mistral_client(pdf_path: str = "document.pdf"):
    """
    Alternative implementation using the official Mistral client library
    Install with: pip install mistralai

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.
    SECURITY: a literal key used to be embedded here; that key must be
    treated as compromised and rotated.

    Args:
        pdf_path: Path to the PDF file to send

    Returns:
        The content of the first choice in the client's chat response

    Raises:
        KeyError: If ``MISTRAL_API_KEY`` is not set
    """
    # Imported here because mistralai is an optional dependency of this notebook.
    # NOTE(review): `mistralai.client.MistralClient` appears to be the legacy
    # client interface - confirm against the installed mistralai version.
    from mistralai.client import MistralClient

    api_key = os.environ["MISTRAL_API_KEY"]
    client = MistralClient(api_key=api_key)

    # For file-based processing, you might need to handle base64 encoding
    with open(pdf_path, "rb") as f:
        pdf_data = base64.b64encode(f.read()).decode()

    # NOTE(review): this sends the raw PDF inside an `image_url` part, while
    # this file's module docstring says the API expects images rather than
    # PDF input - verify that the endpoint accepts this before relying on it.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Convert this PDF to structured markdown format."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:application/pdf;base64,{pdf_data}"
                    }
                }
            ]
        }
    ]

    response = client.chat(
        model="pixtral-12b-2409",
        messages=messages,
        max_tokens=4000
    )

    return response.choices[0].message.content



In [9]:
# Run the usage examples defined in main(); progress is printed below.
main()

Processing PDF to Markdown...
Converting PDF to images...
Processing page 1/5...
Processing page 2/5...
Processing page 3/5...
Processing page 4/5...
Processing page 5/5...
PDF processed successfully! Processed 5 pages.

Processing image...
Image file not found, skipping image processing example.

Extracting structured data...
Structured data:
Page 1:
```json
{
  "Title": "CORRECTED EUROPEAN PATENT SPECIFICATION",
  "Author(s)": [
    "SHEN, Wang",
    "DING, Yue",
    "JIANG, Hao",
    "CHEN, Fu Ji"
  ],
  "Date": "20.11.2024",
  "Key Points": [
    "Correction information: Corrected version no. 1 (W1 B1)",
    "Claims DE, DN, FR: 4 each",
    "Correspondence issued on: 04.06.2025 Bulletin 2025/23",
    "Date of publication and mention of the grant of the patent: 20.11.2024 Bulletin 2024/47",
    "Application number: 18823397.7",
   ...


In [7]:
# Read the key from the environment instead of embedding it in the notebook;
# the literal key previously stored in this cell should be rotated.
api_key = os.environ["MISTRAL_API_KEY"]
quick_test(api_key, "EP18823397W1B9.pdf")

Converting first page to image...
First page extracted successfully!
Preview:
```markdown
# (19)

![European Patent Office Logo](https://example.com/logo.png)

# CORRECTED EUROPEAN PATENT SPECIFICATION

## (12)

## (15) Correction information:
- **Corrected version no**: 1 (W1 B1)
- **Corrections, see**: Claims DE, Claims EN, Claims FR

## (48) Corrigendum issued on:
- 04.06....


'```markdown\n# (19)\n\n![European Patent Office Logo](https://example.com/logo.png)\n\n# CORRECTED EUROPEAN PATENT SPECIFICATION\n\n## (12)\n\n## (15) Correction information:\n- **Corrected version no**: 1 (W1 B1)\n- **Corrections, see**: Claims DE, Claims EN, Claims FR\n\n## (48) Corrigendum issued on:\n- 04.06.2025 Bulletin 2025/23\n\n## (45) Date of publication and mention of the grant of the patent:\n- 20.11.2024 Bulletin 2024/47\n\n## (21) Application number:\n- 18823397.7\n\n## (22) Date of filing:\n- 21.05.2018\n\n## (51) International Patent Classification (IPC):\n- A61K 9/00 (2006.01)\n- A61K 9/08 (2006.01)\n- A61K 47/02 (2006.01)\n- C07F 9/53 (2006.01)\n- C07F 9/572 (2006.01)\n- C07F 9/574 (2006.01)\n- C07F 9/30 (2006.01)\n- C07F 9/58 (2006.01)\n- C07F 9/59 (2006.01)\n- C07F 9/65 (2006.01)\n- A61P 27/02 (2006.01)\n\n## (52) Cooperative Patent Classification (CPC):\n- C07F 9/306; A61K 9/048; A61K 9/06;\n- A61K 47/02; A61K 47/027; C07F 9/53;\n- C07F 9/572; C07F 9/574; C07F 9/5