# Docling API Client

This notebook demonstrates how to call the Docling API service to extract tables from PDF files.

## Import Required Libraries

First, we import the libraries needed for making HTTP requests, handling files and ZIP archives, and previewing the extracted tables with pandas.

In [None]:
import requests
import os
import zipfile
import tempfile
import shutil
from pathlib import Path
import pandas as pd

## Set API Endpoint and File Path

Define the base URL for the FastAPI service and specify the path to the PDF file to upload.

In [None]:
# Location of the FastAPI service; change host/port here if it runs elsewhere
base_url = "http://127.0.0.1:8000"  # Adjust this if your API is running on a different host/port
api_endpoint = base_url + "/docling/extract-tables-csv/"

# PDF to send to the service. This sample lives in the data directory;
# point it at any other PDF that contains tables.
pdf_path = "../data/forms/fw4.pdf"  # Path to a PDF file with tables

# Report up front whether the input is present so a bad path is obvious
# before the upload is attempted.
if not os.path.exists(pdf_path):
    print(f"WARNING: File {pdf_path} not found. Please update the path to a valid PDF file.")
else:
    print(f"Found PDF file: {pdf_path}")

## Define Function to Upload PDF and Download ZIP

Let's define a function that uploads a PDF file to the Docling API endpoint and extracts the returned ZIP archive of CSV tables into a temporary directory.

In [None]:
def upload_pdf_and_get_tables(pdf_path, api_url, timeout=(10, 300)):
    """
    Upload a PDF file to the Docling API and extract the returned ZIP of CSV tables.

    Args:
        pdf_path: Path to the PDF file
        api_url: URL of the API endpoint
        timeout: Forwarded to ``requests.post``. Either a single number of
            seconds or a ``(connect, read)`` tuple; pass ``None`` to wait
            indefinitely. The default gives up after 10 s if no connection
            can be made and after 300 s without a response.

    Returns:
        tuple: (status_code, output_dir)
            - (200, output_dir) on success, where output_dir is a temporary
              directory holding the extracted files. The caller owns this
              directory and is responsible for deleting it.
            - (status_code, None) when the API answers with a non-200 status.
            - (-1, None) when any exception occurs (unreadable PDF, network
              failure, timeout, invalid ZIP, ...). The error is printed rather
              than raised.
        Whenever output_dir is None the temporary directory has already been
        removed, so nothing is left behind on disk.
    """
    # Create a temporary directory to store the extracted files
    output_dir = tempfile.mkdtemp()
    succeeded = False

    try:
        # Keep the PDF open only for the duration of the upload
        with open(pdf_path, 'rb') as pdf_file:
            files = {'file': (os.path.basename(pdf_path), pdf_file, 'application/pdf')}

            print(f"Uploading {pdf_path} to {api_url}...")
            # Without a timeout an unresponsive server would block the notebook forever
            response = requests.post(api_url, files=files, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: API returned status code {response.status_code}")
            print(f"Response: {response.text}")
            return response.status_code, None

        # Save the ZIP file temporarily
        zip_path = os.path.join(output_dir, "tables.zip")
        with open(zip_path, 'wb') as f:
            f.write(response.content)

        # Extract the ZIP file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)

        # Remove the ZIP file after extraction
        os.remove(zip_path)

        print(f"Successfully downloaded and extracted tables to {output_dir}")
        succeeded = True
        return response.status_code, output_dir

    except Exception as e:
        # Deliberately broad: the notebook reports failures via the return value
        print(f"Error: {e}")
        return -1, None

    finally:
        # On every failure path the caller receives None for output_dir and
        # therefore cannot clean up; remove the directory here instead.
        if not succeeded:
            shutil.rmtree(output_dir, ignore_errors=True)

## Call the API and Print Output

Now let's call the API with our PDF file and display the results.

In [None]:
# Call the function to upload the PDF and get tables
status_code, output_dir = upload_pdf_and_get_tables(pdf_path, api_endpoint)

if status_code == 200 and output_dir:
    # os.listdir returns entries in arbitrary order; sort the names so the
    # numbered listing and the "first table" preview are the same on every run.
    csv_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.csv'))

    print(f"Found {len(csv_files)} CSV files:")
    for i, csv_file in enumerate(csv_files, start=1):
        print(f"{i}. {csv_file}")

    # Display the first table (in sorted name order) if available
    if csv_files:
        first_csv = os.path.join(output_dir, csv_files[0])
        df = pd.read_csv(first_csv)
        print("\nPreview of the first table:")
        display(df.head())
else:
    print("Failed to retrieve tables from the API")

## Cleanup

Finally, let's clean up the temporary files we created.

In [None]:
# Delete the extraction directory, if the API call produced one and it is
# still on disk; otherwise just report that there is nothing to remove.
if not (output_dir and os.path.exists(output_dir)):
    print("No temporary directory to clean up")
else:
    shutil.rmtree(output_dir)
    print(f"Cleaned up temporary directory: {output_dir}")