# Table Detection, Extraction and Summarization Pipeline

This notebook: 
1. Detects if an image contains a table using TATR detection model.
2. If a table is present, extracts and OCRs its cells then summarizes.
3. If no table, performs full-image OCR.

In [None]:
# Install required packages
!pip install transformers torch paddleocr opencv-python numpy pandas




[notice] A new release of pip available: 22.3 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [73]:
import os
import json
import cv2
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import (
    TableTransformerForObjectDetection,
    DetrImageProcessor,
    T5ForConditionalGeneration,
    AutoTokenizer
)
from paddleocr import PaddleOCR

## 1. Table Detection Function

In [74]:
def is_table_image(image_path, detect_thresh=0.5):
    '''Detect if the image contains a table region using TATR detection model'''
    MODEL_DET = "microsoft/table-transformer-structure-recognition-v1.1-all"
    proc_det = DetrImageProcessor.from_pretrained(MODEL_DET)
    model_det = TableTransformerForObjectDetection.from_pretrained(MODEL_DET).eval()
    # 2) read & preprocess
    img_bgr = cv2.imread(image_path)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    # 3) run it through TATR
    inputs = proc_det(images=img_rgb, return_tensors="pt", size={"height": img_rgb.shape[0], "width": img_rgb.shape[1]})
    outputs = model_det(**inputs)

    # 4) post‐process detections
    target_sizes = torch.tensor([img_rgb.shape[:2]])
    results = proc_det.post_process_object_detection(
        outputs, threshold=detect_thresh, target_sizes=target_sizes
    )[0]
    
        # 5) check for any 'table row' / 'table column' labels
    id2label = model_det.config.id2label
    highest_confidence = 0.0
    is_table = False
    
    for score, lbl in zip(results["scores"], results["labels"]):
        if score < detect_thresh:
            continue
        name = id2label[int(lbl.item())]
        if name in ("table row", "table column"):
            confidence = float(score.item())
            highest_confidence = max(highest_confidence, confidence)
            is_table = True

    return is_table, highest_confidence

## 2. Table Extraction Function

In [75]:
def extract_table_from_image(image_path, detect_thresh=0.9):
    '''Extract table structure and content from an image using TATR structure-recognition'''
    MODEL_NAME = 'microsoft/table-transformer-structure-recognition-v1.1-all'
    processor = DetrImageProcessor.from_pretrained(MODEL_NAME)
    model = TableTransformerForObjectDetection.from_pretrained(MODEL_NAME)
    model.eval()
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    img_bgr = cv2.imread(image_path)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    inputs  = processor(images=img_rgb, return_tensors="pt", size={"height": img_rgb.shape[0], "width": img_rgb.shape[1]})
    outputs = model(**inputs)
    target_sizes = torch.tensor([img_rgb.shape[:2]])
    results = processor.post_process_object_detection(
        outputs, threshold=detect_thresh, target_sizes=target_sizes
    )[0]
    rows, columns = [], []
    for score, label, box in zip(results['scores'], results['labels'], results['boxes']):
        if score < detect_thresh: continue
        xmin, ymin, xmax, ymax = map(int, box.tolist())
        obj = model.config.id2label[int(label.item())]
        if obj == 'table row': rows.append((ymin, ymax))
        elif obj == 'table column': columns.append((xmin, xmax))
    rows = sorted(rows, key=lambda r: r[0])
    columns = sorted(columns, key=lambda c: c[0])
    cell_bboxes=[]
    for i, (y0,y1) in enumerate(rows):
        for j,(x0,x1) in enumerate(columns):
            cell_bboxes.append({'row':i,'col':j,'bbox':{'xmin':x0,'ymin':y0,'xmax':x1,'ymax':y1}})
    data = [['' for _ in columns] for _ in rows]
    for c in cell_bboxes:
        r,cidx=c['row'],c['col']
        b=c['bbox']
        crop=img_bgr[b['ymin']:b['ymax'],b['xmin']:b['xmax']]
        res = ocr.ocr(crop, cls=True)
        text = ""
        if res:
            text_lines = []
            for block in res:
                if block:
                    text_lines.extend([line[1][0] for line in block if line])
            text = " ".join(text_lines).strip()
        data[r][cidx] = text

    return data

## 3. Full Image OCR Function

In [76]:
def ocr_full_image(image_path):
    '''Extract all text from an image using PaddleOCR'''
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    img = cv2.imread(image_path)
    res = ocr.ocr(img, cls=True)
    if res is None:
        return ''
    lines = [ln[1][0] for blk in res for ln in blk]
    return ''.join(lines)

## 4. Table Text Formatting & Summarization

In [77]:
def format_table_as_text(data):
    header=[h.strip() for h in data[0]]
    rows=data[1:]
    lines=["Table Contents:"]
    for row in rows:
        if not any(cell.strip() for cell in row): continue
        label=row[0].strip()
        parts=[]
        for i,val in enumerate(row[1:],start=1):
            if (v:=val.strip()): parts.append(f"{header[i]} {v}")
        lines.append(f"- {label}: "+'; '.join(parts) if parts else f"- {label}")
    return ''.join(lines)

def format_table_for_flant5(data):
    body=format_table_as_text(data)
    return 'Summarize the following table: '+body

## 5. Summarization Function

In [78]:
def summarize_text(text, model_name="google/flan-t5-xl", max_length=256, min_length=64):
    tokenizer=AutoTokenizer.from_pretrained(model_name)
    model=T5ForConditionalGeneration.from_pretrained(model_name).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    inputs=tokenizer(text,return_tensors="pt",truncation=True,max_length=4096).to(model.device)
    ids = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=max_length, min_length=min_length, num_beams=4, length_penalty=2.0, early_stopping=True)
    return tokenizer.decode(ids[0],skip_special_tokens=True)

## 6. Combined Pipeline Function

In [79]:
def process_table_image(image_path, model_name="google/flan-t5-xl"):
    is_table, confidence = is_table_image(image_path)
    if is_table:
        data=extract_table_from_image(image_path)
        if data and len(data) > 0:
            table_text = format_table_for_flant5(data)
            summary = summarize_text(table_text, model_name)
            return {
                'type': 'table',
                'table_data': data,
                'table_text': table_text,
                'summary': summary
            }
        else:
            # If no table data was extracted despite detection, fall back to OCR
            txt = ocr_full_image(image_path)
            return {
                'type': 'text',
                'ocr_text': txt
            }
    else:
        txt=ocr_full_image(image_path)
        return {
            'type':'text',
            'ocr_text':txt
        }

## 7. Save Results Function

In [80]:
def save_results_to_json(result, output_file="table_summary_result.json"):
    res_copy=result.copy()
    res_copy['timestamp']=datetime.now().isoformat()
    with open(output_file,'w',encoding='utf-8') as f: json.dump(res_copy,f,indent=2,ensure_ascii=False)
    return output_file

## 8. Example Usage

In [81]:
# Test with a table image:
res=process_table_image("client_table_1.png")
print(res)
# Test with a non-table image:
res2=process_table_image("figure_example.png")
print(res2)
# Save:
print(save_results_to_json(res))

[2025/05/08 03:26:25] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\looya/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\looya/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320',

Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.16s/it]


{'type': 'table', 'table_data': [["(Amt in $'000)", 'Notes', 'Book value', 'Estimated ealisable value ("ERV")', 'Liquidation Scenario'], ['ASSETS', '', '', '', ''], ['Sundry debtors', 'a', '9,760', '5,548', '666'], ['Cash in bank', 'b', '0', '0', ''], ['Other assets', 'C', '1,438', '107', ''], ['Contingent assets', 'p', '', '28', ''], ['Total assets', '', '11,198', '5,683', '666'], ['LIABILITIES', '', '', '', ''], ['Total creditors', '-', '(335)', '(7,312)', '(9,940'], ['Contingent', 'f', '', '(9,707)', '(9,633'], ['liabilities', '', '', '', ''], ['Total liabilities', '', '', '(17,019)', '(19,573)'], ['Estimated', '', '', '(11,336)3', '(18,907)'], ['deficiency', '', '', '', '']], 'table_text': 'Summarize the following table: Table Contents:- ASSETS- Sundry debtors: Notes a; Book value 9,760; Estimated ealisable value ("ERV") 5,548; Liquidation Scenario 666- Cash in bank: Notes b; Book value 0; Estimated ealisable value ("ERV") 0- Other assets: Notes C; Book value 1,438; Estimated ealis