### Test_Rag_PH

The task involves the following:
* Extract the textual and graphical information from the PDF pages.
* Convert the extracted graphical data (such as charts or graphs) into a structured, queryable format.
* Implement a system where users can ask questions and receive meaningful responses based on the extracted data.

In [1]:
import importlib.util
import cv2
import numpy as np
from pathlib import Path
from PIL import Image
import torch
import pytesseract
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

try:
    from sklearn.cluster import DBSCAN
    _have_dbscan = True
except Exception:
    _have_dbscan = False

# Full English month names in calendar order (index 0 is January).
MONTH_NAMES = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
# Lower-cased month name -> zero-based month index.
month_name_to_idx = dict(zip(map(str.lower, MONTH_NAMES), range(len(MONTH_NAMES))))

def get_device():
    """
    Pick the torch device to run the model on.

    Preference order: CUDA, then Apple MPS, then CPU.

    Returns:
      torch.device for the first backend that reports itself as available.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps does not exist on older torch builds; look it up
    # defensively so those builds fall through to CPU instead of raising.
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available() and mps.is_built():
        return torch.device("mps")
    return torch.device("cpu")

def load_models(device):
    """
    Load the DePlot processor and model from the "google/deplot" checkpoint.

    The model is moved to `device`. Returns a (processor, model) tuple.
    """
    checkpoint = "google/deplot"
    processor = Pix2StructProcessor.from_pretrained(checkpoint)
    model = Pix2StructForConditionalGeneration.from_pretrained(checkpoint)
    return processor, model.to(device)

def _project_root():
    """
    Resolve a sensible project root in a notebook:
      - prefer __file__ when available (script run)
      - fallback to current working directory when running inside a notebook
    """
    try:
        return Path(__file__).resolve().parent
    except Exception:
        return Path.cwd().resolve()

def _ensure_markdown_generated(md_path):
    """
    If requested markdown doesn't exist, attempt to run MinerU demo.parse_doc to produce it.
    Looks for PDF(s) under MinerU/demo/pdfs and calls parse_doc -> output dir matching the demo layout.
    """
    md_path = Path(md_path)
    if md_path.exists():
        return True

    project_root = _project_root()
    demo_py = project_root / "MinerU" / "demo" / "demo.py"
    demo_output_dir = project_root / "MinerU" / "demo" / "output"
    demo_pdfs_dir = project_root / "MinerU" / "demo" / "pdfs"

    if not demo_py.exists():
        print("demo.py not found at", demo_py, "- cannot auto-generate markdown.")
        return False

    pdfs = list(demo_pdfs_dir.glob("*.pdf")) if demo_pdfs_dir.exists() else []
    if not pdfs:
        print("No PDFs found under", demo_pdfs_dir, "— cannot run demo.parse_doc.")
        return False

    try:
        spec = importlib.util.spec_from_file_location("mineru_demo", str(demo_py))
        demo_mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(demo_mod)
        demo_mod.parse_doc([p for p in pdfs], output_dir=str(demo_output_dir), backend="pipeline")
        print("Ran demo.parse_doc; waiting for markdown generation.")
        return md_path.exists()
    except Exception as e:
        print("Failed to run demo.parse_doc:", e)
        return False

def _resolve_md_path_input(md_input):
    """
    Resolve a user-provided md_input:
      - if it's an existing path, return it
      - if it's just a filename (with or without .md), resolve to:
          MinerU/demo/output/<stem>/auto/<stem>.md
    """
    inp = str(md_input)
    p = Path(inp).expanduser()
    # if looks like a path or exists, use it
    if p.exists() or ("/" in inp) or ("\\" in inp) or inp.startswith("."):
        return p.resolve()
    # otherwise treat as filename and build demo output path
    stem = Path(inp).stem
    project_root = _project_root()
    demo_output_dir = project_root / "MinerU" / "demo" / "output"
    candidate = demo_output_dir / stem / "auto" / f"{stem}.md"
    return candidate

The chart in the PDF only labels the starting and ending values along the horizontal axis, although ticks are provided in between. 

Converting this chart into structured data requires that each tick has a corresponding label. 

Below is a classic computer-vision pipeline (OpenCV plus Tesseract OCR, no learned models) that detects the tick positions in the chart image.

In [2]:
def detect_ticks_classic(img_bgr, *,
                         axis_search_frac=(0.55, 0.95),
                         axis_min_length_frac=0.35,
                         band_half_h_frac=0.02,
                         tick_min_gap_frac=0.03,
                         ocr_label_h_frac=0.12,
                         edge_blur=3):
    """
    Classic CV pipeline to find X-axis tick x positions and labels without foundation models.

    Steps:
      1. Find the y of the horizontal axis inside a vertical search window.
      2. Take a thin band around that y and flag columns with a strong horizontal gradient.
      3. Merge neighbouring flagged columns into one x per tick.
      4. OCR a strip just below the axis and merge neighbouring words into label clusters.
      5. Attach the nearest label cluster (if close enough) to each tick.

    Args:
      img_bgr: image array; converted with COLOR_BGR2GRAY, so BGR channel order is assumed.
      axis_search_frac: (top, bottom) fractions of image height bounding the axis search window.
      axis_min_length_frac: minimum Hough line length as a fraction of image width.
      band_half_h_frac: half-height of the tick band as a fraction of image height (min 2 px).
      tick_min_gap_frac: a candidate column within this fraction of image width of the
        previous candidate is merged into the same tick.
      ocr_label_h_frac: height of the OCR strip below the axis as a fraction of image height.
      edge_blur: Gaussian kernel size (forced odd) applied to the grayscale image;
        falsy or <= 1 disables blurring.

    Returns:
      list of {"x": int, "label": str|None, "bbox": (x0,y0,x1,y1)}, sorted by x.
      "bbox" is the matched label cluster's box, or a 6x6 px box centred on the tick at the
      axis row when no label was matched. Returns [] when no axis or no tick candidate is found.
    Notes:
      - All *_frac params are relative to image width/height (robust across sizes).
      - Uses Hough (fast) with a projection fallback, then vertical projection + simple peak clustering.
    """
    h, w = img_bgr.shape[:2]
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    if edge_blur and edge_blur > 1:
        # `| 1` forces an odd kernel size, which GaussianBlur requires.
        # Note: the blurred image is also what the tick band (step 2) is cut from.
        gray = cv2.GaussianBlur(gray, (edge_blur|1, edge_blur|1), 0)

    # 1) search lower area for a dominant horizontal axis
    y0 = int(h * axis_search_frac[0])
    y1 = int(h * axis_search_frac[1])
    roi = gray[y0:y1, :]
    edges = cv2.Canny(roi, 50, 150)

    axis_y = None
    min_len = int(w * axis_min_length_frac)
    lines = cv2.HoughLinesP(edges, rho=1, theta=np.pi/180, threshold=60,
                            minLineLength=min_len, maxLineGap=int(w*0.02))
    if lines is not None and len(lines):
        # NOTE(review): segments are not filtered by orientation, so any long
        # non-horizontal segment in the ROI also contributes to the median.
        ys = []
        for x1_, y1_, x2_, y2_ in lines.reshape(-1,4):
            ys.append(int((y1_ + y2_)/2))
        # ROI-relative median row -> full-image row
        axis_y = int(np.median(ys)) + y0
    else:
        # fallback: horizontal projection (sum of edges) in ROI
        proj = edges.sum(axis=1)
        if proj.max() > 0:
            rel = int(np.argmax(proj))
            axis_y = rel + y0

    if axis_y is None:
        return []

    # 2) Build narrow band around axis and compute vertical projection to find tick candidates
    band_h = max(2, int(h * band_half_h_frac))
    band_top = max(0, axis_y - band_h)
    band_bot = min(h, axis_y + band_h)
    band = gray[band_top:band_bot, :]

    # use Sobel/edges to emphasize vertical ticks
    # (dx=1, dy=0: responds to intensity changes along x, i.e. vertical strokes)
    sob = cv2.Sobel(band, cv2.CV_16S, 1, 0, ksize=3)
    sob = cv2.convertScaleAbs(sob)
    proj_v = sob.sum(axis=0).astype(np.float32)
    # threshold relative to the mean column response (no normalisation is applied)
    thr = max(1.0, proj_v.mean() * 1.2)
    peaks = np.where(proj_v > thr)[0]

    if peaks.size == 0:
        # fallback: connected components on thresholded band
        _, bw = cv2.threshold(band, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        cc = cv2.connectedComponentsWithStats(bw, 8, cv2.CV_32S)
        stats = cc[2]
        centers = []
        # stats[0] is skipped (background label in OpenCV's connected-components output)
        for s in stats[1:]:
            x, y, ww, hh, area = s
            # keep narrow tall components relative to band height
            if hh >= max(3, band.shape[0]*0.4) and ww <= max(3, w*0.02):
                centers.append(int(x + ww/2))
        peaks = np.array(sorted(set(centers)), dtype=int)

    if peaks.size == 0:
        return []

    # 3) cluster contiguous peak indices into single tick centers (gap clustering)
    # A column joins the current cluster when it lies within gap_px of the
    # previous column; each cluster is reduced to its median column.
    gap_px = max(1, int(w * tick_min_gap_frac))
    clusters = []
    cur = [peaks[0]]
    for p in peaks[1:]:
        if p - cur[-1] <= gap_px:
            cur.append(p)
        else:
            clusters.append(int(np.median(cur)))
            cur = [p]
    clusters.append(int(np.median(cur)))
    tick_xs = [int(c) for c in clusters]

    # 4) OCR labels below axis and associate by proximity
    label_h = int(h * ocr_label_h_frac)
    ly0 = axis_y + 2
    ly1 = min(h, axis_y + 2 + label_h)
    if ly0 >= h or ly0 >= ly1:
        # no label area
        labels = []
    else:
        crop = img_bgr[ly0:ly1, :]
        # reverse the channel axis (BGR -> RGB) for PIL
        pil = Image.fromarray(crop[:,:,::-1])
        ocr = pytesseract.image_to_data(pil, output_type=pytesseract.Output.DICT)
        labels = []
        for i, txt in enumerate(ocr['text']):
            t = txt.strip()
            if not t:
                continue
            left = ocr['left'][i]
            top = ocr['top'][i]
            width = ocr['width'][i]
            # The crop spans the full image width, so x needs no offset;
            # y values are shifted by ly0 back into full-image coordinates.
            cx = left + width/2
            cy = top + ocr['height'][i]/2 + ly0
            labels.append({"text": t, "cx": cx, "cy": cy, "bbox": (left, top+ly0, left+width, top+ly0+ocr['height'][i])})

    # Cluster/merge OCR boxes horizontally to make label candidates (DBSCAN if available)
    # Both branches cluster on the x centre only and produce
    # {"text": joined words, "cx": mean centre, "bbox": union box}.
    label_clusters = []
    if labels:
        xs = np.array([l['cx'] for l in labels]).reshape(-1,1)
        if _have_dbscan:
            db = DBSCAN(eps=max(6, w*0.02), min_samples=1).fit(xs)
            for cid in np.unique(db.labels_):
                members = [labels[i] for i,lab in enumerate(db.labels_) if lab==cid]
                tx = " ".join([m['text'] for m in members])
                cx = np.mean([m['cx'] for m in members])
                bb0 = min([m['bbox'][0] for m in members])
                bb1 = min([m['bbox'][1] for m in members])
                bb2 = max([m['bbox'][2] for m in members])
                bb3 = max([m['bbox'][3] for m in members])
                label_clusters.append({"text": tx, "cx": cx, "bbox": (bb0, bb1, bb2, bb3)})
        else:
            # simple gap-based cluster
            labels_sorted = sorted(labels, key=lambda x: x['cx'])
            cur = [labels_sorted[0]]
            for lb in labels_sorted[1:]:
                if lb['cx'] - cur[-1]['cx'] <= max(6, w*0.02):
                    cur.append(lb)
                else:
                    tx = " ".join([m['text'] for m in cur])
                    cx = np.mean([m['cx'] for m in cur])
                    label_clusters.append({"text": tx, "cx": cx, "bbox": (min([m['bbox'][0] for m in cur]), min([m['bbox'][1] for m in cur]), max([m['bbox'][2] for m in cur]), max([m['bbox'][3] for m in cur]))})
                    cur = [lb]
            # flush the last open cluster
            if cur:
                tx = " ".join([m['text'] for m in cur])
                cx = np.mean([m['cx'] for m in cur])
                label_clusters.append({"text": tx, "cx": cx, "bbox": (min([m['bbox'][0] for m in cur]), min([m['bbox'][1] for m in cur]), max([m['bbox'][2] for m in cur]), max([m['bbox'][3] for m in cur]))})

    # 5) Associate tick xs to nearest label cluster
    # (accepted when within 0.6 x the median inter-tick gap, and never less than 8 px)
    if len(tick_xs) > 1:
        inter_gap = np.median(np.diff(sorted(tick_xs)))
    else:
        # single tick: no spacing to measure, use 10% of the image width
        inter_gap = w * 0.1
    max_assoc_dist = max(8, inter_gap * 0.6)

    results = []
    # NOTE(review): matching is per tick, so several ticks can receive the same label cluster.
    for tx in tick_xs:
        assoc = None
        if label_clusters:
            dists = [abs(tx - lc['cx']) for lc in label_clusters]
            best_i = int(np.argmin(dists))
            if dists[best_i] <= max_assoc_dist:
                assoc = label_clusters[best_i]['text']
                bbox = label_clusters[best_i]['bbox']
            else:
                # nearest label is too far away: report a small box around the tick itself
                assoc = None
                bbox = (tx-3, axis_y-3, tx+3, axis_y+3)
        else:
            assoc = None
            bbox = (tx-3, axis_y-3, tx+3, axis_y+3)
        results.append({"x": int(tx), "label": assoc, "bbox": tuple(map(int, bbox))})

    # sort by x
    results = sorted(results, key=lambda r: r['x'])
    return results


Once the ticks have been detected, each one must be given a label.

1. With the starting and ending months known, the month numbers in between are deducible.
2. These month numbers must be aligned with the horizontal position of the tick. 
3. Any existing labels must be replaced by the labels generated from the tick positions.

In [3]:
def annotate_ticks(img_bgr, ticks, color=(0,0,255), erase_pad=6, sample_h=12):
    """
    Draw replacement labels while erasing any original X-axis labels/marks.
    - Erase original label areas (using 'bbox' or 'tick_bbox') but do NOT erase the detected axis line.
    - Sample a small patch above the label bbox to estimate background color.
    - Draw replacement labels so that the first character of each label
      starts exactly at the tick x coordinate (left-aligned).
    - Put a common baseline close to the horizontal axis so all labels are horizontally aligned.

    Args:
      img_bgr: image array; pixels are reshaped to (-1, 3) when sampling the background,
        so three channels are assumed.
      ticks: list of dicts (iterated several times). Keys read here: "x" (tick x position),
        "label" (text to draw; falls back to str(x) when missing/blank),
        "bbox" or "tick_bbox" ((x0, y0, x1, y1) region to erase), and optional "y_axis".
      color: NOTE(review): not referenced in the body; label text is always drawn in (0,0,0).
      erase_pad: pixels added on every side of each erased box.
      sample_h: height in pixels of the strip sampled to estimate the background color.

    Returns:
      An annotated copy of the image; img_bgr itself is left untouched.
    """
    out = img_bgr.copy()
    h, w = out.shape[:2]

    # infer axis_y from provided ticks (prefer explicit y_axis)
    axis_candidates = [int(t.get("y_axis")) for t in ticks if t.get("y_axis") is not None]
    axis_y = int(np.median(axis_candidates)) if axis_candidates else None

    # If axis_y still unknown, try to infer it from label/tick bboxes:
    if axis_y is None:
        bboxes = [t.get("bbox") or t.get("tick_bbox") for t in ticks if (t.get("bbox") or t.get("tick_bbox"))]
        if bboxes:
            # labels' top coord (bbox[1]) is usually just below axis -> axis slightly above that
            tops = [int(b[1]) for b in bboxes]
            axis_y = max(0, min(tops) - 4)  # small offset above the top of label bboxes
        else:
            axis_y = None

    # First pass: erase original label areas (per-tick) but never erase the axis line.
    for t in ticks:
        bbox = t.get("bbox") or t.get("tick_bbox")
        if not bbox:
            continue
        x0, y0, x1, y1 = map(int, bbox)
        # expand bbox a little to remove nearby artifacts / vertical grid fragments
        x0e = max(0, x0 - erase_pad)
        x1e = min(w, x1 + erase_pad)
        y0e = max(0, y0 - erase_pad)
        y1e = min(h, y1 + erase_pad)

        # ensure we do not erase the horizontal axis line (if known)
        if axis_y is not None:
            # if erase rect would cross or touch the axis, clamp it to start strictly below the axis
            if y0e <= axis_y:
                y0e = axis_y + 1
            # if after clamping there's nothing to erase, skip
            if y0e >= y1e:
                continue

        # try sampling a small strip just above the label bbox to estimate background color
        # but avoid sampling across the axis
        sy1 = y0e
        sy0 = max(0, sy1 - sample_h)
        if axis_y is not None and sy0 <= axis_y < sy1:
            # move sampling region to be above axis (or just below if not possible)
            sy0 = max(0, axis_y - sample_h)
            sy1 = axis_y
            if sy0 >= sy1:
                # no room above the axis: sample the bottom of the erase box instead
                sy0 = max(0, y1e - sample_h)
                sy1 = y1e

        sample = out[sy0:sy1, x0e:x1e] if sy1 > sy0 and x1e > x0e else None
        if sample is not None and sample.size:
            # per-channel median of the sampled strip is used as the fill color
            med = np.median(sample.reshape(-1, 3), axis=0).astype(int)
            bg_color = (int(med[0]), int(med[1]), int(med[2]))
        else:
            # nothing to sample: fall back to white
            bg_color = (255, 255, 255)

        cv2.rectangle(out, (x0e, y0e), (x1e, y1e), bg_color, thickness=-1)

    # Also erase a continuous band spanning all label bboxes to remove residuals,
    # but make sure band does not include the axis line.
    # (This runs whenever at least one bbox is present; there is no switch to disable it.)
    try:
        bboxes = [t.get("bbox") or t.get("tick_bbox") for t in ticks if (t.get("bbox") or t.get("tick_bbox"))]
        if bboxes:
            lefts = [int(b[0]) for b in bboxes]
            rights = [int(b[2]) for b in bboxes]
            tops = [int(b[1]) for b in bboxes]
            bots = [int(b[3]) for b in bboxes]
            band_x0 = max(0, min(lefts) - erase_pad)
            band_x1 = min(w, max(rights) + erase_pad)
            band_y0 = max(0, min(tops) - erase_pad)
            band_y1 = min(h, max(bots) + erase_pad)

            # ensure band does not include axis_y
            if axis_y is not None and band_y0 <= axis_y <= band_y1:
                band_y0 = axis_y + 1
                if band_y0 >= band_y1:
                    # raised only to jump to the except below, which skips the band erase
                    raise ValueError("band would erase axis; skip band erase")

            sy0b = max(0, band_y0 - sample_h)
            sample_band = out[sy0b:band_y0, band_x0:band_x1] if band_y0 > sy0b and band_x1 > band_x0 else None
            if sample_band is not None and sample_band.size:
                med = np.median(sample_band.reshape(-1, 3), axis=0).astype(int)
                bg_color = (int(med[0]), int(med[1]), int(med[2]))
            else:
                bg_color = (255,255,255)
            cv2.rectangle(out, (band_x0, band_y0), (band_x1, band_y1), bg_color, thickness=-1)
    except Exception:
        # if any geometry fails or would erase axis, ignore band erase
        pass

    # Determine a common baseline (y) for all labels — place it just below the axis if axis available,
    # otherwise use the bottoms of original bbox regions.
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    thickness = 1

    if axis_y is not None:
        # baseline nominally a short distance below axis
        baseline_y = min(h - 6, axis_y + 16)
    else:
        # NOTE(review): axis_y is only None here when no tick carries "y_axis", "bbox"
        # or "tick_bbox", so `bottoms` looks like it always ends up empty — confirm.
        bottoms = []
        for t in ticks:
            bbox = t.get("bbox") or t.get("tick_bbox")
            if bbox:
                bottoms.append(int(bbox[3]))
            elif t.get("y_axis") is not None:
                bottoms.append(int(t.get("y_axis")) + 10)
        if bottoms:
            baseline_y = min(h - 6, max(bottoms) + 14)
        else:
            baseline_y = h - 20
    # never place the baseline within the top 12 rows of the image
    baseline_y = max(12, int(baseline_y))

    # Second pass: draw replacement labels (left-aligned so first char sits at tick x),
    # using the common baseline_y and ensuring text never overlaps the axis.
    for t in ticks:
        # blank/missing label -> draw the tick's x coordinate as text
        lbl = (t.get("label") or "").strip() or str(t.get("x", ""))
        x = int(t.get("x", 0))

        # compute text size
        (text_w, text_h), baseline = cv2.getTextSize(lbl, font, font_scale, thickness)

        # left-align text: first character x coordinate == tick x
        x_text = x
        # clamp so the whole text stays inside the image; near the right edge this
        # shifts the text left of the tick
        x_text = max(2, min(x_text, w - text_w - 2))

        # determine y_text baseline: keep common baseline but ensure it's below axis + text height
        y_text = int(baseline_y)
        if axis_y is not None:
            min_ok = axis_y + text_h + 6
            # NOTE(review): both branches below perform the same assignment.
            if y_text - text_h <= axis_y:
                y_text = max(y_text, min_ok)
            else:
                # still ensure minimum clearance
                y_text = max(y_text, min_ok)
        else:
            y_text = y_text

        y_text = min(h - 4, y_text)

        # background rectangle for readability
        pad = 2
        rect_x0 = x_text - pad
        rect_y0 = y_text - text_h - pad
        rect_x1 = x_text + text_w + pad
        rect_y1 = y_text + baseline + pad
        rect_x0 = max(0, rect_x0)
        rect_y0 = max(0, rect_y0)
        rect_x1 = min(w, rect_x1)
        rect_y1 = min(h, rect_y1)

        # sample bg color just above text rect to blend, avoid sampling axis
        sy0 = max(0, rect_y0 - sample_h)
        sy1 = rect_y0
        if axis_y is not None and sy0 <= axis_y < sy1:
            sy0 = max(0, axis_y - sample_h)
            sy1 = axis_y
            if sy0 >= sy1:
                sy0 = max(0, rect_y1 - sample_h)
                sy1 = rect_y1

        sample = out[sy0:sy1, rect_x0:rect_x1] if sy1 > sy0 and rect_x1 > rect_x0 else None
        if sample is not None and sample.size:
            med = np.median(sample.reshape(-1, 3), axis=0).astype(int)
            bg_color = (int(med[0]), int(med[1]), int(med[2]))
        else:
            bg_color = (255,255,255)

        cv2.rectangle(out, (rect_x0, rect_y0), (rect_x1, rect_y1), bg_color, thickness=-1)
        # text color is hard-coded to black
        cv2.putText(out, lbl, (int(x_text), int(y_text)), font, font_scale, (0,0,0), thickness, cv2.LINE_AA)

    return out

def detect_month_index_from_text(s):
    """
    Map a text fragment to a zero-based month index (0 = January ... 11 = December).

    A standalone number 1-12 anywhere in the text takes precedence. Otherwise the
    first full month name (in calendar order) that occurs as a substring of the
    lower-cased text is used.

    Args:
      s: text to inspect; None or "" yields None.

    Returns:
      int in 0..11, or None when neither a month number nor a month name is found.
    """
    # Imported locally so the function does not depend on some other notebook
    # cell having imported `re` before it is called.
    import re

    if not s:
        return None
    lowered = s.lower()

    number = re.search(r'\b(1[0-2]|[1-9])\b', lowered)
    if number:
        # The pattern only admits 1..12, so no range check or int() guard is needed.
        return int(number.group(0)) - 1

    for name, idx in month_name_to_idx.items():
        if name in lowered:
            return idx
    return None

def fallback_find_months_in_band(img_bgr, axis_y=None, search_frac=(0.65,0.99)):
    """
    OCR a horizontal band of the image and collect the tokens that denote a month.

    Args:
      img_bgr: image array in BGR channel order (channels are reversed before OCR).
      axis_y: row of the X axis, if known. The band then starts 6 px above it
        (clamped to the image); otherwise it starts at search_frac[0] * height.
      search_frac: (top, bottom) fractions of image height. The bottom fraction
        always bounds the band; the top fraction is used only when axis_y is None.

    Returns:
      (found, (ly0, ly1)): `found` is a list of
      {"x": int, "label": str, "month_idx": int, "abs_x": int} dicts, one per OCR
      token recognised by detect_month_index_from_text ("x" and "abs_x" are both
      the token's horizontal centre; the band spans the full image width).
      (ly0, ly1) are the band's top and bottom rows. `found` is empty when the
      band has no rows.
    """
    h, w = img_bgr.shape[:2]
    # `is not None` so an axis at row 0 is honoured; clamp at 0 so a small axis_y
    # cannot produce a negative (wrap-around) slice start.
    if axis_y is not None:
        ly0 = max(0, min(h - 1, axis_y - 6))
    else:
        ly0 = int(h * search_frac[0])
    ly1 = min(h, int(h * search_frac[1]))

    if ly0 >= ly1:
        # Empty band: there is nothing to OCR.
        return [], (ly0, ly1)

    band = img_bgr[ly0:ly1, :]
    pil = Image.fromarray(band[:,:,::-1])
    ocr = pytesseract.image_to_data(pil, output_type=pytesseract.Output.DICT)

    found = []
    for i, txt in enumerate(ocr['text']):
        t = txt.strip()
        if not t:
            continue
        # detect_month_index_from_text already covers both month numbers and
        # month-name substrings, so no second substring pass is needed here.
        idx = detect_month_index_from_text(t)
        if idx is None:
            continue
        cx = int(ocr['left'][i] + ocr['width'][i] / 2)
        found.append({"x": cx, "label": t, "month_idx": idx, "abs_x": cx})
    return found, (ly0, ly1)



### Convert charts to table

In [5]:
from pathlib import Path
from PIL import Image
import re

from chart_to_structured import preprocess_for_deplot, run_deplot

def process_markdown(md_path, out_path=None, overwrite=False, save_debug=False):
    """
    Process markdown keeping only two image-related items:
      - Replace the single image that appears immediately before the phrase
        "Your annual electricity use" with deplot-extracted table (preprocess_for_deplot -> run_deplot).
        The deplot extracted text is post-processed: 'Year' -> 'kWh' and numeric first-column values
        (1-12) are replaced by month names.
      - Ensure the image link that appears immediately after the phrase
        "than similar nearby homes" is present right after that phrase (inserted if necessary).
        That particular image is NOT preprocessed/deplot'ed — it is inserted as-is.
    All other lines containing image links are deleted.
    """
    md_path = _resolve_md_path_input(md_path)
    md_path = Path(md_path)
    if not md_path.exists():
        ok = _ensure_markdown_generated(md_path)
        if not ok:
            print("Markdown not found and demo generation failed; aborting.")
            return

    text = md_path.read_text(encoding="utf-8")
    img_matches = list(re.finditer(r'!\[.*?\]\(([^)]+)\)', text))
    if not img_matches:
        print("No image links found.")
        # still write cleaned output (no images to remove)
        out_file = md_path if overwrite else (Path(out_path) if out_path else md_path.with_name(md_path.stem + "_processed.md"))
        out_file.write_text(text, encoding="utf-8")
        print("Wrote:", out_file)
        return

    # Identify phrase locations in original text
    phrase1 = "Your annual electricity use"
    phrase2 = "than similar nearby homes"
    orig_phrase1_pos = text.lower().find(phrase1.lower())
    orig_phrase2_pos = text.lower().find(phrase2.lower())

    # determine the target image before phrase1 (if any)
    target_before_m = None
    if orig_phrase1_pos != -1:
        candidates = [m for m in img_matches if m.end() <= orig_phrase1_pos]
        if candidates:
            target_before_m = max(candidates, key=lambda m: m.start())

    # determine the image immediately after phrase2 (if any)
    after_phrase2_m = None
    if orig_phrase2_pos != -1:
        post_candidates = [m for m in img_matches if m.start() >= orig_phrase2_pos]
        if post_candidates:
            after_phrase2_m = min(post_candidates, key=lambda m: m.start())

    # Build cleaned text by removing all lines that contain image links,
    # except we do NOT keep the original lines for the two special images
    # (they will be handled/inserted explicitly).
    lines = text.splitlines()
    cleaned_lines = []
    # prepare canonical markdown snippets for comparison
    target_before_md = text[target_before_m.start():target_before_m.end()] if target_before_m else None
    after_phrase2_md = text[after_phrase2_m.start():after_phrase2_m.end()] if after_phrase2_m else None

    img_re = re.compile(r'!\[.*?\]\(([^)]+)\)')
    for ln in lines:
        m = img_re.search(ln)
        if not m:
            cleaned_lines.append(ln)
            continue
        # line contains an image link
        # drop any image lines; the after-phrase2 image will be inserted later if needed,
        # and the target-before image will be replaced by the extracted block.
        continue

    new_text = "\n".join(cleaned_lines)

    device = get_device()
    print("Device:", device)
    deplot_proc, deplot_model = load_models(device)


    # --- process target_before image: run preprocess_for_deplot -> deplot, then insert extracted block
    if target_before_m:
        img_rel = target_before_m.group(1).strip()
        # only attempt processing for local images
        if img_rel.startswith("http://") or img_rel.startswith("https://"):
            print(f"Target-before image is remote ({img_rel}) — will not run deplot; no extracted block inserted.")
        else:
            img_path = (md_path.parent / img_rel).resolve()
            if not img_path.exists():
                print(f"Missing target-before image: {img_path} — skipping extraction.")
            else:
                try:
                    pil = Image.open(img_path).convert("RGB")
                except Exception as e:
                    print(f"Cannot open {img_path}: {e}. Skipping extraction.")
                    pil = None

                if pil is not None:
                    device = get_device()
                    deplot_proc = deplot_model = None
                    models_loaded = False
                    try:
                        from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
                        deplot_proc = Pix2StructProcessor.from_pretrained("google/deplot")
                        deplot_model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot").to(device)
                        models_loaded = True
                    except Exception as e:
                        print("Failed to load deplot model:", e)
                        models_loaded = False

                    extracted = None
                    if models_loaded:
                        try:
                            prepped_pil, did_pre = preprocess_for_deplot(pil, img_path=img_path, save_debug=save_debug)
                            extracted = run_deplot(prepped_pil, deplot_proc, deplot_model, device)
                            if (not extracted or len(extracted.strip()) < 8) and did_pre:
                                print("deplot returned little/empty on annotated image — retrying with original image.")
                                extracted = run_deplot(pil, deplot_proc, deplot_model, device)
                        except Exception as e:
                            print(f"deplot failed for {img_rel}: {e}. Leaving no extracted block.")
                            extracted = None

                        # post-process extracted text:
                        def transform_extracted_text(txt):
                            # replace Year -> kWh (word-boundary, case-insensitive)
                            txt = re.sub(r'\bYear\b', 'kWh', txt, flags=re.IGNORECASE)
                            MONTHS = ["January","February","March","April","May","June","July","August","September","October","November","December"]

                            out_lines = []
                            for ln in txt.splitlines():
                                if not ln.strip():
                                    out_lines.append(ln)
                                    continue

                                # try markdown table row with pipes
                                if '|' in ln:
                                    parts = ln.split('|')
                                    # find first non-empty cell index
                                    first_idx = None
                                    for i, cell in enumerate(parts):
                                        if cell.strip() != '':
                                            first_idx = i
                                            break
                                    if first_idx is not None:
                                        first = parts[first_idx].strip()
                                        mnum = re.match(r'^(\d{1,2})\b', first)
                                        if mnum:
                                            n = int(mnum.group(1))
                                            if 1 <= n <= 12:
                                                parts[first_idx] = ' ' + MONTHS[n-1] + ' '
                                                out_lines.append('|'.join(parts))
                                                continue
                                    # if no replacement done, fall through to append original
                                    out_lines.append(ln)
                                    continue

                                # CSV-like (comma separated)
                                if ',' in ln:
                                    cells = [c for c in ln.split(',')]
                                    first = cells[0].strip()
                                    mnum = re.match(r'^(\d{1,2})\b', first)
                                    if mnum:
                                        n = int(mnum.group(1))
                                        if 1 <= n <= 12:
                                            cells[0] = MONTHS[n-1]
                                            out_lines.append(','.join(cells))
                                            continue
                                    out_lines.append(ln)
                                    continue

                                # whitespace separated / plain row: replace leading integer token if present
                                toks = ln.lstrip()
                                leading_ws = ln[:len(ln)-len(toks)]
                                toks_split = toks.split()
                                if toks_split:
                                    mnum = re.match(r'^(\d{1,2})\b', toks_split[0])
                                    if mnum:
                                        n = int(mnum.group(1))
                                        if 1 <= n <= 12:
                                            # replace only the first token while preserving original leading whitespace and remainder
                                            replaced = re.sub(r'^(\s*)\d{1,2}\b', r'\1' + MONTHS[n-1], ln, count=1)
                                            out_lines.append(replaced)
                                            continue
                                out_lines.append(ln)
                            return '\n'.join(out_lines)

                        processed_extracted = transform_extracted_text(extracted)
                        replacement = f"\n\n```\\n{processed_extracted}\\n```\\n\\n"
                    else:
                        replacement = f"\n\n```\\n_deplot produced no textual output_\\n```\\n\\n"

                    # Insert into cleaned text at the location that follows the line after the original image line in the ORIGINAL markdown.
                    # Strategy: locate the "line after" content in the original 'text', then find that same line in new_text and insert after it.
                    try:
                        orig = text  # original markdown (before cleaning)
                        # compute the end of the original image line
                        img_line_end = orig.find('\n', target_before_m.end())
                        if img_line_end == -1:
                            # no following line => append at end of cleaned text
                            new_text = new_text + replacement
                            print("No following line after original image; appended extracted block.")
                        else:
                            # start of next line
                            next_line_start = img_line_end + 1
                            next_line_end = orig.find('\n', next_line_start)
                            if next_line_end == -1:
                                next_line = orig[next_line_start:].rstrip('\n')
                            else:
                                next_line = orig[next_line_start:next_line_end]

                            if next_line:
                                # find that line in the cleaned/filtered new_text
                                pos = new_text.find(next_line)
                                if pos != -1:
                                    insert_pos = pos + len(next_line)
                                    new_text = new_text[:insert_pos] + replacement + new_text[insert_pos:]
                                    print("Inserted extracted block after the original image's following line.")
                                else:
                                    # fallback: insert before phrase1 if present, else prepend
                                    if phrase1.lower() in new_text.lower():
                                        insert_pos = new_text.lower().find(phrase1.lower())
                                        new_text = new_text[:insert_pos] + replacement + new_text[insert_pos:]
                                        print("Could not find original following line in cleaned text; inserted before phrase1 as fallback.")
                                    else:
                                        new_text = replacement + new_text
                                        print("Could not find original following line in cleaned text; prepended extracted block as fallback.")
                            else:
                                # next_line empty -> append
                                new_text = new_text + replacement
                                print("Following line empty; appended extracted block.")
                    except Exception as e:
                        print("Error inserting extracted block at desired location:", e)
                        # fallback to insert before phrase or prepend
                        if phrase1.lower() in new_text.lower():
                            insert_pos = new_text.lower().find(phrase1.lower())
                            new_text = new_text[:insert_pos] + replacement + new_text[insert_pos:]
                        else:
                            new_text = replacement + new_text
                        print("Fallback insertion performed.")
    else:
        print(f'No image found immediately before phrase "{phrase1}" — no extraction performed.')

    # --- ensure the image link immediately after phrase2 is processed with deplot (insert extracted block)
    if after_phrase2_m and after_phrase2_md:
        img_rel2 = after_phrase2_m.group(1).strip()
        # only attempt processing for local images
        if img_rel2.startswith("http://") or img_rel2.startswith("https://"):
            print(f"Image after phrase2 is remote ({img_rel2}) — skipping deplot; no insertion.")
        else:
            img_path2 = (md_path.parent / img_rel2).resolve()
            if not img_path2.exists():
                print(f"Missing image after phrase2: {img_path2} — skipping insertion.")
            else:
                try:
                    pil2 = Image.open(img_path2).convert("RGB")
                except Exception as e:
                    print(f"Cannot open {img_path2}: {e}. Skipping insertion.")
                    pil2 = None

                if pil2 is not None:
                    device = get_device()
                    deplot_proc2 = deplot_model2 = None
                    models_loaded2 = False
                    try:
                        from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
                        deplot_proc2 = Pix2StructProcessor.from_pretrained("google/deplot")
                        deplot_model2 = Pix2StructForConditionalGeneration.from_pretrained("google/deplot").to(device)
                        models_loaded2 = True
                    except Exception as e:
                        print("Failed to load deplot model for phrase2 image:", e)
                        models_loaded2 = False

                    extracted2 = None
                    if models_loaded2:
                        try:
                            prepped_pil2, did_pre2 = preprocess_for_deplot(pil2, img_path=img_path2, save_debug=save_debug)
                            extracted2 = run_deplot(prepped_pil2, deplot_proc2, deplot_model2, device)
                            if (not extracted2 or len(extracted2.strip()) < 8) and did_pre2:
                                print("deplot returned little/empty on annotated image — retrying with original image.")
                                extracted2 = run_deplot(pil2, deplot_proc2, deplot_model2, device)
                            #extracted2 = run_deplot(pil2, deplot_proc2, deplot_model2, device)
                        except Exception as e:
                            print(f"deplot failed for {img_rel2}: {e}. Leaving no extracted block.")
                            extracted2 = None

                    if extracted2:
                        # For the phrase2 insertion we also perform Year->kWh replacement for consistency
                        #extracted2 = re.sub(r'\bYear\b', 'kWh', extracted2, flags=re.IGNORECASE)
                        replacement2 = f"\n\n```\\n{extracted2}\\n```\\n\\n"
                    else:
                        replacement2 = f"\n\n```\\n_deplot produced no textual output_\\n```\\n\\n"

                    # insert extracted block immediately after phrase2 (or append if phrase2 not found)
                    if phrase2.lower() in new_text.lower():
                        phrase2_pos = new_text.lower().find(phrase2.lower())
                        insert_pos = phrase2_pos + len(phrase2)
                        new_text = new_text[:insert_pos] + replacement2 + new_text[insert_pos:]
                        print(f"Inserted extracted block for image after phrase '{phrase2}': {img_rel2}")
                    else:
                        new_text = new_text + replacement2
                        print(f"Phrase '{phrase2}' not found; appended its extracted block at document end.")
    else:
        print(f"No image found immediately after phrase '{phrase2}' — nothing to insert for that phrase.")

    # final: write cleaned-and-processed output
    if overwrite:
        out_file = md_path
    else:
        out_file = Path(out_path) if out_path else md_path.with_name(md_path.stem + "_processed.md")
    out_file.write_text(new_text, encoding="utf-8")
    print("Wrote:", out_file)

## Generate the context from the PDF

In [6]:
# Run the pipeline on the "test_info_extract" document.
# With overwrite=False and no explicit out_path, the result is written to
# "<stem>_processed.md" alongside the input markdown; save_debug=True is
# forwarded to the deplot preprocessing step.
process_markdown(
    "test_info_extract",
    out_path=None,
    overwrite=False,
    save_debug=True,
)

[32m2025-11-07 01:47:17.503[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.pipeline_analyze[0m:[36mdoc_analyze[0m:[36m128[0m - [1mBatch 1/1: 2 pages/2 pages[0m
[32m2025-11-07 01:47:17.526[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.model_init[0m:[36m__init__[0m:[36m208[0m - [1mDocAnalysis init, this may take some times......[0m


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2025-11-07 01:47:27.215[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.model_init[0m:[36m__init__[0m:[36m270[0m - [1mDocAnalysis init done![0m
[32m2025-11-07 01:47:27.216[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.pipeline_analyze[0m:[36mcustom_model_init[0m:[36m65[0m - [1mmodel init cost: 9.689947128295898[0m
Layout Predict: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]
MFD Predict: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]
MFR Predict: 100%|██████████| 2/2 [00:00<00:00,  5.37it/s]
Table-ocr det: 0it [00:00, ?it/s]
Table-wireless Predict: 0it [00:00, ?it/s]
OCR-det Predict:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

OCR-det Predict: 100%|██████████| 2/2 [00:10<00:00,  5.09s/it]
[32m2025-11-07 01:47:41.462[0m | [34m[1mDEBUG   [0m | [36mmineru_demo[0m:[36mdo_parse[0m:[36m139[0m - [34m[1mCould not open image-list entry, keeping entry: <class 'dict'>[0m
[32m2025-11-07 01:47:41.463[0m | [34m[1mDEBUG   [0m | [36mmineru_demo[0m:[36mdo_parse[0m:[36m139[0m - [34m[1mCould not open image-list entry, keeping entry: <class 'dict'>[0m
Processing pages:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Processing pages: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
[32m2025-11-07 01:47:43.564[0m | [1mINFO    [0m | [36mmineru_demo[0m:[36m_process_output[0m:[36m245[0m - [1mlocal output dir is /Users/earl/Documents/test_rag_ph/MinerU/demo/output/test_info_extract/auto[0m


Ran demo.parse_doc; waiting for markdown generation.
Device: mps
Inserted extracted block after the original image's following line.
Inserted extracted block for image after phrase 'than similar nearby homes': images/cc50f95fa5997e6351965fb16e335b4d1038587ef538831d6853fea680a4f593.jpg
Wrote: /Users/earl/Documents/test_rag_ph/MinerU/demo/output/test_info_extract/auto/test_info_extract_processed.md


### Interact with the PDF

1. The processed markdown generated from the PDF is passed as context to the LLM `qwen3:4b`, served locally by `ollama`.
2. To ask a question, edit the string assigned to `ask` in the cell below.

In [7]:
import requests
import json
from pathlib import Path

# Locate the processed markdown written by process_markdown(): same directory
# as the resolved markdown file, with "_processed.md" appended to its stem.
md_path_processed = Path(_resolve_md_path_input("test_info_extract"))
md_path_processed = md_path_processed.with_name(md_path_processed.stem + "_processed.md")

# The file is written as UTF-8, so read it back as UTF-8 explicitly instead of
# relying on the platform's default encoding.
content = md_path_processed.read_text(encoding="utf-8")

############# TYPE YOUR QUESTION BELOW #############
ask = 'In which month does the electricity usage peak?'
###################################################

# Keep a blank line between the document and the question; otherwise the
# question is glued onto the last line of the markdown context.
prompt = f"{content}\n\n{ask}"

# Stream the answer from the local Ollama server. The context manager releases
# the connection even if parsing fails part-way through the stream.
with requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "qwen3:4b",
        "prompt": prompt,
        "stream": True,
    },
    stream=True,
    timeout=(10, 600),  # (connect, read) seconds; requests never times out by default
) as response:
    # Fail loudly on HTTP errors instead of silently printing nothing.
    response.raise_for_status()

    # The stream is newline-delimited JSON; each record carries a text fragment
    # under "response", or an "error" message if generation failed.
    for raw_line in response.iter_lines():
        if not raw_line:
            continue
        data = json.loads(raw_line.decode("utf-8"))
        if "error" in data:
            raise RuntimeError(f"Ollama returned an error: {data['error']}")
        if "response" in data:
            print(data["response"], end="", flush=True)

# Terminate the streamed answer with a newline.
print()


Based on the **monthly electricity usage data** provided in your Home Energy Report (specifically the "Your annual electricity use compared with similar and efficient homes" table), **December** shows the peak electricity usage for your home.  

Here's the clear breakdown of your monthly usage (in kWh) for the user ("You"):

| Month       | Your Usage (kWh) |
|-------------|------------------|
| April       | 115              |
| May         | 141              |
| June        | 160              |
| July        | 174              |
| August      | 168              |
| September   | 150              |
| October     | 160              |
| November    | 170              |
| **December**| **181**          |
| January     | 160              |
| February    | 126              |
| March       | 114              |

### Why December is the peak:
- **December has the highest value at 181 kWh** (the only month exceeding 174 kWh).
- This is higher than all other months, including July (174 kWh) and

## Limitations and Improvements

1. So far, this solution has only been tested locally on an Apple Silicon M3 machine.
2. The chart-to-table conversion has substantial room for improvement, especially across diverse chart types (e.g., bar charts, line charts, scatterplots).
3. We utilize a relatively small (**4B parameters**) LLM for efficiency. Quantized models may be faster, but larger ones may provide better responses.