<a href="https://colab.research.google.com/github/fxrdhan/DeepseekOCR/blob/main/DeepseekOCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Upload file (must be its own cell)
import platform, torch
from google.colab import files

# Report the interpreter version and whether torch can see a CUDA device.
print("Python", platform.python_version(), "| CUDA?", torch.cuda.is_available())
# The upload result is iterated below for a file name
# (presumably a dict keyed by uploaded filename — confirm against the Colab docs).
up = files.upload()               # choose your file here
# Stop the cell when the upload result is empty.
assert up, "No file uploaded."
# Only the first name in the upload result is kept; any others are ignored.
FNAME = next(iter(up))
print("FNAME =", FNAME)

Python 3.12.12 | CUDA? True


Saving image.png to image.png
FNAME = image.png


In [None]:
# ONE-CELL: pick latest uploaded image automatically → run DeepSeek-OCR
%pip -q install "transformers==4.46.3" "tokenizers==0.20.3" einops addict easydict pillow

import os, glob, time, torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Fail fast when no CUDA device is visible; the model is moved to "cuda" further down in this cell.
assert torch.cuda.is_available(), "GPU is OFF. Runtime → Change runtime type → GPU, then rerun."

# 1) Find most recent image in /content if FNAME is missing
def pick_latest_image(directory="/content", extensions=(".png", ".jpg", ".jpeg", ".webp")):
    """Return the path of the most recently modified image in *directory*.

    Args:
        directory: Folder to search (not recursive). Defaults to "/content".
        extensions: File suffixes that count as images. Matching is
            case-insensitive, so "IMG.PNG" is found as well as "img.png".

    Returns:
        The path (``directory`` joined with the file name) of the matching
        regular file with the newest modification time.

    Raises:
        FileNotFoundError: If the directory is missing or holds no matching file.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    try:
        with os.scandir(directory) as entries:
            candidates = [
                entry.path
                for entry in entries
                # Dotfiles are skipped, mirroring what a "*.png"-style glob would match.
                if not entry.name.startswith(".")
                and entry.name.lower().endswith(suffixes)
                and entry.is_file()
            ]
    except FileNotFoundError:
        # A missing directory is reported the same way as an empty one.
        candidates = []
    if not candidates:
        raise FileNotFoundError(
            f"No image file found in {directory}. If you just uploaded, it’s saved in {directory}; try again."
        )
    # Only the newest file is needed, so a full sort is unnecessary.
    return max(candidates, key=os.path.getmtime)

# Prefer the file named by FNAME (set by the upload cell) when it is a string
# pointing at an existing file; otherwise fall back to the newest image found
# by pick_latest_image().  globals().get() already covers the case where FNAME
# was never defined, so no blanket try/except is needed to hide a NameError.
img_path = globals().get("FNAME")
if not (isinstance(img_path, str) and os.path.isfile(img_path)):
    img_path = pick_latest_image()

print("Using image:", img_path)

# 2) Optional: shrink giant screenshots for speed
# Open through a context manager so the source file handle is released as soon
# as the pixels have been copied into a separate RGB image.
with Image.open(img_path) as _src:
    img = _src.convert("RGB")
if max(img.size) > 2000:
    # Scale so the longer side becomes 2000 px, keeping the aspect ratio.
    s = 2000 / max(img.size)
    # Clamp each side to at least 1 px: for extreme aspect ratios int() would
    # truncate the short side to 0, which resize() does not accept.
    img = img.resize((max(1, int(img.width*s)), max(1, int(img.height*s))))
    img_path_proc = "/content/_shrunk.png"
    img.save(img_path_proc, optimize=True)
else:
    # Small enough already: hand the original file to the OCR step untouched.
    img_path_proc = img_path

# 3) Tokenizer + model setup (eager attention). Both are fetched with
#    trust_remote_code=True, i.e. code shipped in the model repo is executed.
model_id = "deepseek-ai/DeepSeek-OCR"
tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tok.pad_token is None:
    if tok.eos_token is not None:
        tok.pad_token = tok.eos_token

# Time the load + move to GPU in bfloat16 + switch to eval mode.
t0 = time.time()
model = AutoModel.from_pretrained(
    model_id,
    use_safetensors=True,
    trust_remote_code=True,
    attn_implementation="eager",  # required for this arch
).to(device="cuda", dtype=torch.bfloat16).eval()
print(f"Model loaded in {time.time()-t0:.1f}s")

# 4) Settings for the fast screenshot pass (no tiling)
prompt = "<image>\nFree OCR."
outdir = "/content/out"
os.makedirs(outdir, exist_ok=True)  # make sure the results folder exists

@torch.inference_mode()
def run_ocr(path):
    """Run model.infer on the image at *path* and return its result.

    Uses the notebook globals ``model``, ``tok``, ``prompt`` and ``outdir``.
    Prints the image's base name together with the elapsed wall-clock seconds.
    """
    started = time.time()
    # Fixed settings for the quick pass: reduced sizes, tiling off, results saved.
    infer_kwargs = dict(
        prompt=prompt,
        image_file=path,
        output_path=outdir,
        base_size=768,
        image_size=512,
        crop_mode=False,
        save_results=True,
        test_compress=False,
    )
    result = model.infer(tok, **infer_kwargs)
    print(f"[OK] {os.path.basename(path)} in {time.time()-started:.1f}s")
    return result

# Run OCR on the prepared image, report the output folder, then print the
# value that run_ocr returned.
out = run_ocr(img_path_proc)
print("Saved files in:", outdir)
print(out)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25hUsing image: image.png


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

modeling_deepseekocr.py: 0.00B [00:00, ?B/s]

conversation.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- conversation.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_deepseekv2.py: 0.00B [00:00, ?B/s]

configuration_deepseek_v2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekv2.py
- configuration_deepseek_v2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


deepencoder.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- deepencoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekocr.py
- conversation.py
- modeling_deepseekv2.py
- deepencoder.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.67G [00:00<?, ?B/s]

In [None]:
# Inspect outputs, render Markdown inline, and download the file
import os, glob
from IPython.display import display, Markdown
from google.colab import files as cfiles

outdir = "/content/out"
assert os.path.isdir(outdir), "No /content/out directory found."

# List everything in the output folder, most recently modified first.
outs = sorted(glob.glob(f"{outdir}/*"), key=os.path.getmtime, reverse=True)
print("All output files (newest first):")
for p in outs:
    print(" -", os.path.basename(p))

# Pick the newest markdown-like file. ".mmd" is included because that is the
# extension the OCR results carry elsewhere in this notebook
# (e.g. /content/out/result.mmd); without it this cell finds nothing to render
# until a .md/.txt copy has been created.
md = [p for p in outs if p.lower().endswith((".md", ".mmd", ".markdown", ".txt"))]
assert md, "No .md/.mmd/.txt output found. Check the listed files above."
md_path = md[0]
print("\nRendering:", os.path.basename(md_path))
# Undecodable bytes are dropped rather than raising.
with open(md_path, "r", encoding="utf-8", errors="ignore") as f:
    text = f.read()

display(Markdown(text))        # show in notebook
cfiles.download(md_path)       # trigger download to your device

In [None]:
# View + convert DeepSeek-OCR .mmd → .md, and download
import os, glob, shutil
from IPython.display import Markdown, display
from google.colab import files as cfiles

outdir = "/content/out"
# Collect the .mmd results and order them newest-first by modification time.
mmds = glob.glob(f"{outdir}/*.mmd")
mmds.sort(key=os.path.getmtime, reverse=True)
assert mmds, "No .mmd file found in /content/out."
mmd_path = mmds[0]
# Both copies sit next to the source file and differ only in extension.
md_path = os.path.splitext(mmd_path)[0] + ".md"
txt_path = os.path.splitext(mmd_path)[0] + ".txt"

# Read once (undecodable bytes are dropped), then write the same text to the
# .md target followed by the .txt target.
with open(mmd_path, "r", encoding="utf-8", errors="ignore") as f:
    text = f.read()
for dest in (md_path, txt_path):
    with open(dest, "w", encoding="utf-8") as f:
        f.write(text)

print("Showing:", os.path.basename(md_path))
display(Markdown(text))

# Announce both files first, then trigger the two downloads in the same order.
print("\nDownloads:")
for dest in (md_path, txt_path):
    print(" -", os.path.basename(dest))
for dest in (md_path, txt_path):
    cfiles.download(dest)

In [None]:
# Print the raw text of the fixed file /content/out/result.mmd.
# Decoding is strict here (no errors="ignore"), so invalid UTF-8 raises.
with open("/content/out/result.mmd", "r", encoding="utf-8") as f:
    print(f.read())

In [None]:
# STEP A: Upload a new image (PNG/JPG/JPEG/WEBP). Stores name in FNAME.
from google.colab import files
# Open the Colab upload prompt; the result is iterated below for a file name.
up = files.upload()
# Stop the cell when the upload result is empty.
assert up, "No file uploaded."
# Keep only the first uploaded name; later cells read it from FNAME.
FNAME = next(iter(up))
print("FNAME =", FNAME)

In [None]:
# STEP B: Run DeepSeek-OCR on FNAME and show results inline (no downloads)
%pip -q install "transformers==4.46.3" "tokenizers==0.20.3" einops addict easydict pillow

import os, time, glob, torch
from PIL import Image
from IPython.display import display, Markdown, Image as IPyImage
from transformers import AutoModel, AutoTokenizer

# The model is placed on "cuda" below, so stop right away without a GPU.
assert torch.cuda.is_available(), "GPU is OFF. Runtime → Change runtime type → GPU."

# --- tokenizer/model live in notebook globals; load them only when either is missing
if "tok" in globals() and "model" in globals():
    print("Reusing model already in memory.")
else:
    model_id = "deepseek-ai/DeepSeek-OCR"
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tok.pad_token is None:
        if tok.eos_token is not None:
            tok.pad_token = tok.eos_token
    t0 = time.time()
    model = AutoModel.from_pretrained(
        model_id,
        use_safetensors=True,
        trust_remote_code=True,
        attn_implementation="eager",  # required for this arch
    ).to(device="cuda", dtype=torch.bfloat16).eval()
    print(f"Model loaded in {time.time()-t0:.1f}s")

# --- prep image (optional shrink for giant screenshots)
# Open through a context manager so the source file handle is released as soon
# as the pixels have been copied into a separate RGB image.
with Image.open(FNAME) as _src:
    img = _src.convert("RGB")
if max(img.size) > 2000:
    # Scale so the longer side becomes 2000 px, keeping the aspect ratio.
    s = 2000 / max(img.size)
    # Clamp each side to at least 1 px: for extreme aspect ratios int() would
    # truncate the short side to 0, which resize() does not accept.
    img = img.resize((max(1, int(img.width*s)), max(1, int(img.height*s))))
    img_path = "/content/_current_shrunk.png"
    img.save(img_path, optimize=True)
else:
    # Small enough already: pass the uploaded file through unchanged.
    img_path = FNAME

# --- unique output folder so runs don’t overwrite each other
# The folder name is the current Unix time truncated to whole seconds, so two
# runs started within the same second would share one folder (exist_ok=True
# lets that pass silently).
outdir = f"/content/out/run_{int(time.time())}"
os.makedirs(outdir, exist_ok=True)

# --- OCR (fast settings for screenshots)
prompt = "<image>\nFree OCR."
# Run the call under torch.inference_mode().
with torch.inference_mode():
    # The return value is discarded; the code after this block reads the
    # results back from files in outdir.
    _ = model.infer(
        tok,
        prompt=prompt,
        image_file=img_path,
        output_path=outdir,
        base_size=768,     # smaller canvas = faster
        image_size=512,    # smaller tiles
        crop_mode=False,   # screenshots usually don't need tiling
        save_results=True,
        test_compress=False
    )

# --- render the OCR text inline
# The run folder is searched for *.mmd files; the newest one is displayed.
mmd = glob.glob(f"{outdir}/*.mmd")
mmd.sort(key=os.path.getmtime, reverse=True)
if not mmd:
    print("No .mmd produced — check overlay image for boxes/text regions.")
else:
    with open(mmd[0], "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    print("\n--- OCR TEXT (Markdown) ---")
    display(Markdown(text))

# --- the overlay image is displayed only when it exists in the run folder
boxes = os.path.join(outdir, "result_with_boxes.jpg")
if os.path.isfile(boxes):
    print("\n--- Detected regions overlay ---")
    display(IPyImage(filename=boxes))

print("\nOutput folder:", outdir)

In [None]:
# STEP A: Upload complex image (PNG/JPG/JPEG/WEBP). Stores name in FNAME.
from google.colab import files
# Open the Colab upload prompt; the result is iterated below for a file name.
up = files.upload()
# Stop the cell when the upload result is empty.
assert up, "No file uploaded."
# Keep only the first uploaded name; later cells read it from FNAME.
FNAME = next(iter(up))
print("FNAME =", FNAME)

In [None]:
# STEP C: DeepSeek-OCR complex-mode (bigger canvas + tiling), show results inline
%pip -q install "transformers==4.46.3" "tokenizers==0.20.3" einops addict easydict pillow

import os, time, glob, torch
from PIL import Image
from IPython.display import display, Markdown, Image as IPyImage
from transformers import AutoModel, AutoTokenizer

# The model is placed on "cuda" below, so stop right away without a GPU.
assert torch.cuda.is_available(), "GPU is OFF. Runtime → Change runtime type → GPU."

# --- tokenizer/model live in notebook globals; load them only when either is missing
if "tok" in globals() and "model" in globals():
    print("Reusing model already in memory.")
else:
    model_id = "deepseek-ai/DeepSeek-OCR"
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tok.pad_token is None:
        if tok.eos_token is not None:
            tok.pad_token = tok.eos_token
    t0 = time.time()
    model = AutoModel.from_pretrained(
        model_id,
        use_safetensors=True,
        trust_remote_code=True,
        attn_implementation="eager",  # required by this model
    ).to(device="cuda", dtype=torch.bfloat16).eval()
    print(f"Model loaded in {time.time()-t0:.1f}s")

# --- prep image (keep more detail; only shrink if truly massive)
# Open through a context manager so the source file handle is released as soon
# as the pixels have been copied into a separate RGB image.
with Image.open(FNAME) as _src:
    img = _src.convert("RGB")
if max(img.size) > 4000:       # only downscale *very* large images
    # Scale so the longer side becomes 4000 px, keeping the aspect ratio.
    s = 4000 / max(img.size)
    # Clamp each side to at least 1 px: for extreme aspect ratios int() would
    # truncate the short side to 0, which resize() does not accept.
    img = img.resize((max(1, int(img.width*s)), max(1, int(img.height*s))))
    img_path = "/content/_current_shrunk_complex.png"
    img.save(img_path, optimize=True)
else:
    # Within the limit: pass the uploaded file through unchanged.
    img_path = FNAME

# --- separate output folder per run
# The folder name is the current Unix time truncated to whole seconds, so two
# runs started within the same second would share one folder (exist_ok=True
# lets that pass silently).
outdir = f"/content/out/run_complex_{int(time.time())}"
os.makedirs(outdir, exist_ok=True)

# --- complex OCR: larger canvas + tiles + test_compress enabled
#     (roughly the “Gundam-ish” multi-tiling behavior from their README)
prompt = "<image>\n<|grounding|>Convert the document to markdown."

# Run the call under torch.inference_mode(); the return value is discarded and
# the code after this block reads the results back from files in outdir.
with torch.inference_mode():
    _ = model.infer(
        tok,
        prompt=prompt,
        image_file=img_path,
        output_path=outdir,
        base_size=1024,    # bigger canvas for small text
        image_size=640,    # tile size
        crop_mode=True,    # enable tiling for dense/long pages
        save_results=True,
        test_compress=True # NOTE(review): effect is not visible from this notebook; the name suggests a compression-ratio report rather than a fidelity pass — confirm in modeling_deepseekocr.py
    )

# --- render the Markdown inline
# The run folder is searched for *.mmd files; the newest one is displayed.
mmd = glob.glob(f"{outdir}/*.mmd")
mmd.sort(key=os.path.getmtime, reverse=True)
if not mmd:
    print("No .mmd produced — check overlay image for detected boxes.")
else:
    with open(mmd[0], "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    print("\n--- OCR TEXT (Markdown) ---")
    display(Markdown(text))

# --- the overlay (detected regions) is displayed only when the file exists
boxes = os.path.join(outdir, "result_with_boxes.jpg")
if os.path.isfile(boxes):
    print("\n--- Detected regions overlay ---")
    display(IPyImage(filename=boxes))

print("\nOutput folder:", outdir)