In [None]:
import sys
import torch, transformers, pytesseract, cv2, pdf2image

print("Python version:", sys.version)
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("Pytesseract:", pytesseract.get_tesseract_version())
print("OpenCV:", cv2.__version__)
print("pdf2image:", pdf2image.__version__)

In [8]:
!{sys.executable} -m pip install opencv-python


Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (19 kB)
Collecting numpy<2.3.0,>=2 (from opencv-python)
  Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl.metadata (60 kB)
Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl (39.0 MB)
   ---------------------------------------- 0.0/39.0 MB ? eta -:--:--
   ------ --------------------------------- 6.8/39.0 MB 42.0 MB/s eta 0:00:01
   ---------------- ----------------------- 16.3/39.0 MB 44.5 MB/s eta 0:00:01
   -------------------------- ------------- 25.7/39.0 MB 45.2 MB/s eta 0:00:01
   ------------------------------------ --- 35.7/39.0 MB 46.3 MB/s eta 0:00:01
   ---------------------------------------- 39.0/39.0 MB 43.5 MB/s  0:00:00
Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   ---------------------------------- ----- 10.7/12.6 MB 51.4 MB/s eta 0:00:01
   ------------------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.


In [9]:
!{sys.executable} -m pip install pdf2image


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [10]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"   # silence symlink warning on Windows

from PIL import Image
import pytesseract
from pytesseract import Output
import cv2
import numpy as np
from pdf2image import convert_from_path
import torch

In [14]:
pip install huggingface_hub[hf_xet]


Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.9-cp37-abi3-win_amd64.whl.metadata (4.7 kB)
Downloading hf_xet-1.1.9-cp37-abi3-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 2.8/2.8 MB 23.3 MB/s  0:00:00
Installing collected packages: hf-xet
Successfully installed hf-xet-1.1.9
Note: you may need to restart the kernel to use updated packages.


In [15]:
from transformers import logging
logging.set_verbosity_error()


In [16]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer, pipeline

In [17]:
TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # adapt if needed
pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

CAPTION_MODEL = "nlpconnect/vit-gpt2-image-captioning"   # lightweight caption model
SUMMARIZER_MODEL = "google/flan-t5-small"                # lightweight local summarizer (CPU OK)
# If you prefer OpenAI for summarization, see function summarize_with_openai below.

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
caption_model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL)
feature_extractor = ViTImageProcessor.from_pretrained(CAPTION_MODEL)
caption_tokenizer = AutoTokenizer.from_pretrained(CAPTION_MODEL)
if DEVICE == "cuda":
    caption_model.to("cuda")

# Load summarizer pipeline (local fallback)
summarizer = pipeline("text2text-generation", model=SUMMARIZER_MODEL, device=0 if DEVICE=="cuda" else -1)


# ---------- helpers ----------
def image_from_path_or_pil(x):
    if isinstance(x, str):
        img = Image.open(x).convert("RGB")
    elif isinstance(x, Image.Image):
        img = x.convert("RGB")
    else:
        raise ValueError("Input must be filepath or PIL.Image")
    return img

def pdf_to_images(pdf_path, dpi=200, poppler_path=None):
    # poppler_path needed on Windows if poppler not in PATH
    images = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    return images

def preprocess_for_ocr(pil_img):
    # basic denoise + threshold for better OCR
    arr = np.array(pil_img.convert("RGB"))
    gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
    # Gaussian blur then Otsu threshold
    blur = cv2.GaussianBlur(gray, (5,5), 0)
    _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(thresh)

def ocr_image_get_text_and_conf(pil_img, lang="eng"):
    """
    Returns (text, avg_confidence_0_to_1)
    """
    pre = preprocess_for_ocr(pil_img)
    data = pytesseract.image_to_data(pre, output_type=Output.DICT, lang=lang)
    words = []
    confs = []
    for i, w in enumerate(data["text"]):
        if w and w.strip():
            words.append(w)
            try:
                c = float(data["conf"][i])
                if c >= 0:
                    confs.append(c)
            except:
                pass
    text = " ".join(words).strip()
    avg_conf = (sum(confs)/len(confs))/100.0 if confs else 0.0
    return text, avg_conf

def generate_caption(pil_img, max_length=40):
    img_for_model = pil_img.convert("RGB")
    pixel_values = feature_extractor(images=img_for_model, return_tensors="pt").pixel_values
    if DEVICE == "cuda":
        pixel_values = pixel_values.to("cuda")
    output_ids = caption_model.generate(pixel_values, max_length=max_length, num_beams=4)
    caption = caption_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    return caption

def build_prompt(ocr_text, caption):
    prompt = f"""OCR:
{ocr_text}

Caption:
{caption}

INSTRUCTION:
Produce a concise 3-sentence professional summary.
- Sentence1: describe the main scene (what's visible).
- Sentence2: summarize important textual content (numbers, dates, amounts).
- Sentence3: recommended action or caveat.
Also append a confidence score (0-1) on its own line and list data sources used (OCR/caption).
"""
    return prompt

def summarize_with_local_model(prompt):
    # Uses local Flan-T5 pipeline; returns string
    out = summarizer(prompt, max_length=200, do_sample=False)
    if isinstance(out, list):
        text = out[0].get("generated_text") or out[0].get("text") or str(out[0])
    else:
        text = str(out)
    return text.strip()

# Optional: better/higher-quality summarization using OpenAI (if you have an API key)
def summarize_with_openai(prompt):
    try:
        import openai
    except ImportError:
        raise RuntimeError("openai package not installed. pip install openai")

    openai.api_key = os.getenv("OPENAI_API_KEY")
    if not openai.api_key:
        raise RuntimeError("Set OPENAI_API_KEY environment variable for OpenAI summarization")

    resp = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role":"system","content":"You are a concise assistant that returns exactly 3 sentences plus a confidence and sources line."},
            {"role":"user","content":prompt}
        ],
        temperature=0.0,
        max_tokens=300,
    )
    return resp.choices[0].message.content.strip()

def combine_page_summaries(page_summaries):
    """
    page_summaries: list of dicts: { 'summary_text':..., 'ocr_conf':..., 'caption':... }
    Produce a single consolidated 3-sentence summary and compute final confidence.
    """
    combined_text = "\n\n".join([f"Page {i+1} OCR:{p['ocr_text']}\nCaption:{p['caption']}" 
                                 for i,p in enumerate(page_summaries)])
    prompt = "Consolidate the following page-level context into a single concise 3-sentence professional summary using the same format (S1 scene, S2 textual content, S3 action/caveat). Then append a confidence (0-1) and data sources.\n\n" + combined_text
    consolidated = summarize_with_local_model(prompt)
    # simple numeric confidence aggregation
    avg_ocr_conf = sum(p['ocr_conf'] for p in page_summaries)/len(page_summaries) if page_summaries else 0.0
    caption_score = sum(min(1, len(p['caption'])/50) for p in page_summaries)/len(page_summaries) if page_summaries else 0.0
    final_conf = 0.7*avg_ocr_conf + 0.3*caption_score
    return consolidated, float(final_conf)


In [20]:
# Path to your image
image_path = "sample.png"

# Load image
img = image_from_path_or_pil(image_path)

# Run OCR
ocr_text, ocr_conf = ocr_image_get_text_and_conf(img)

# Run captioning
caption = generate_caption(img)

# Build prompt
prompt = build_prompt(ocr_text, caption)

# Summarize
summary = summarize_with_local_model(prompt)

# print("OCR text:", ocr_text[:200], "...")
print("Caption:", caption)
print("Summary:\n", summary)


Caption: a collage of photos showing a newspaper advertisement
Summary:
 Describe the main scene (what's visible). - Sentence2 Describe the main scene (what's visible). - Sentence3 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4 Describe the main scene (what's visible). - Sentence4


In [None]:
# Path to your PDF
pdf_path = "document.pdf"

# Convert each page to images
pages = pdf_to_images(pdf_path, dpi=200, poppler_path=r"C:\path\to\poppler\bin")  # on Windows
# on Linux/Mac, just use: pdf_to_images(pdf_path)

page_summaries = []
for i, page_img in enumerate(pages):
    ocr_text, ocr_conf = ocr_image_get_text_and_conf(page_img)
    caption = generate_caption(page_img)
    prompt = build_prompt(ocr_text, caption)
    summary_text = summarize_with_local_model(prompt)
    
    page_summaries.append({
        "ocr_text": ocr_text,
        "ocr_conf": ocr_conf,
        "caption": caption,
        "summary_text": summary_text
    })

# Combine into one final summary
final_summary, confidence = combine_page_summaries(page_summaries)

print("Final summary:\n", final_summary)
print("Confidence:", confidence)
