# olmOCR Google Colab Setup
# This notebook installs and runs olmOCR on Google Colab

In [None]:
# 1. Install system dependencies
!apt-get update

# Accept Microsoft fonts EULA non-interactively
!echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | sudo debconf-set-selections

# Install the required packages - use -y flag to avoid prompts
!apt-get install -y poppler-utils ttf-mscorefonts-installer fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools ghostscript

# Don't fail if Microsoft fonts aren't found in cache
!fc-cache -f -v
print("Font cache updated")

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [None]:
# 2. Install olmOCR
!pip install git+https://github.com/allenai/olmocr.git

Collecting git+https://github.com/allenai/olmocr.git
  Cloning https://github.com/allenai/olmocr.git to /tmp/pip-req-build-ne3siuzk
  Running command git clone --filter=blob:none --quiet https://github.com/allenai/olmocr.git /tmp/pip-req-build-ne3siuzk
  Resolved https://github.com/allenai/olmocr.git to commit b62ccc25ddaa547de778d8d72fe1e2d41ab8a0a3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# 3. Install sglang with flashinfer (for GPU acceleration)
# Note: These versions may need to be updated depending on your Colab environment
!pip install sgl-kernel==0.0.3.post1 --force-reinstall --no-deps
try:
    !pip install "sglang[all]==0.4.2" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
except:
    print("Failed to install sglang with flashinfer. Installing without GPU acceleration.")
    !pip install "sglang[all]==0.4.2"

Collecting sgl-kernel==0.0.3.post1
  Using cached sgl_kernel-0.0.3.post1-cp39-abi3-manylinux2014_x86_64.whl.metadata (13 kB)
Using cached sgl_kernel-0.0.3.post1-cp39-abi3-manylinux2014_x86_64.whl (9.1 MB)
Installing collected packages: sgl-kernel
  Attempting uninstall: sgl-kernel
    Found existing installation: sgl-kernel 0.0.3.post1
    Uninstalling sgl-kernel-0.0.3.post1:
      Successfully uninstalled sgl-kernel-0.0.3.post1
Successfully installed sgl-kernel-0.0.3.post1
Looking in links: https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/


In [None]:
# Install required libraries
!pip install zstandard tqdm webdavclient3 -q

In [None]:
# Install pikepdf.
# NOTE(review): no other cell visible in this notebook imports pikepdf directly;
# confirm it is still required before keeping this cell.
!pip install pikepdf



In [None]:
# Install required libraries (if not already installed)
!pip install webdavclient3 tqdm

import os
from webdav3.client import Client
from google.colab import userdata  # For accessing Colab secrets
from tqdm import tqdm

# --- 1. Set Up Workspace Directories ---
# Updated workspace to "olmocr_workspace_rest"
drive_workspace = "/content/drive/MyDrive/olmocr_workspace_rest"
drive_pdfs_dir = os.path.join(drive_workspace, "pdfs_to_process")
os.makedirs(drive_workspace, exist_ok=True)
os.makedirs(drive_pdfs_dir, exist_ok=True)
print(f"Workspace directory created: {drive_workspace}")

# --- 2. Retrieve WebDAV Credentials from Colab Secrets ---
webdav_username = userdata.get('WEBDAV_USERNAME')
webdav_password = userdata.get('WEBDAV_PASSWORD')
webdav_url      = userdata.get('WEBDAV_URL')  # e.g., "https://files.chao.eu/remote.php/dav/files/admin/"
if not (webdav_username and webdav_password and webdav_url):
    raise ValueError("Missing one or more WebDAV secrets: WEBDAV_USERNAME, WEBDAV_PASSWORD, WEBDAV_URL")

# --- 3. Configure the WebDAV Client ---
options = {
    'webdav_hostname': webdav_url,
    'webdav_login':    webdav_username,
    'webdav_password': webdav_password
}
client = Client(options)
# Remove the insecure setting so that certificate verification happens
# client.verify = False  # Do not disable verification for production!
print("Connected to WebDAV.")

# --- 4. Define the Remote Directory Containing PDFs ---
# This should correspond to the folder from which you want to download PDFs.
remote_pdf_dir = "DT/olmocr_backups/missing_pdfs"

# --- 5. List the PDF Files in the Remote Directory ---
try:
    remote_files = client.list(remote_pdf_dir)
    # Filter out only PDF files (case-insensitive)
    pdf_files = [f for f in remote_files if f.lower().endswith('.pdf')]
    print(f"Found {len(pdf_files)} PDF files in remote directory '{remote_pdf_dir}'.")
except Exception as e:
    print(f"Error listing remote directory: {e}")
    pdf_files = []

# --- 6. Download the PDF Files to the "olmocr_workspace_missing" Directory ---
if pdf_files:
    pbar = tqdm(total=len(pdf_files), desc="Downloading PDFs", unit="file")
    for file in pdf_files:
        # If the file path doesn't start with the remote directory, prepend it.
        if not file.startswith(remote_pdf_dir):
            remote_file_path = os.path.join(remote_pdf_dir, file)
        else:
            remote_file_path = file
        local_file_path = os.path.join(drive_pdfs_dir, os.path.basename(file))
        try:
            client.download_file(remote_file_path, local_file_path)
            pbar.update(1)
        except Exception as e:
            print(f"Error downloading {file}: {e}")
    pbar.close()
    print(f"Downloaded {len(pdf_files)} PDF files to {drive_pdfs_dir}.")
else:
    print("No PDF files found in remote directory.")


Workspace directory created: /content/drive/MyDrive/olmocr_workspace_rest
Connected to WebDAV.
Found 6 PDF files in remote directory 'DT/olmocr_backups/missing_pdfs'.


Downloading PDFs:  67%|██████▋   | 4/6 [00:21<00:10,  5.24s/file]

In [None]:
!python -m olmocr.pipeline /content/drive/MyDrive/olmocr_workspace_missing --pdfs /content/drive/MyDrive/olmocr_workspace_rest/pdfs_to_process/*.pdf --workers 1

INFO:numexpr.utils:NumExpr defaulting to 12 threads.
ERROR:olmocr.check:pdftoppm is not installed.
ERROR:olmocr.check:Check the README in the https://github.com/allenai/olmocr/blob/main/README.md for installation instructions
