In [0]:
# Fetch the pandoc 3.5 linux-amd64 release tarball, stream it through tar (wget writes
# quietly to stdout) and unpack it under /usr/local/bin. The second command then moves
# the extracted `pandoc` executable out of pandoc-3.5/bin into /usr/local/bin itself
# (presumably so it is found on PATH by the Python `pandoc` wrapper installed below).
!wget -qO- https://github.com/jgm/pandoc/releases/download/3.5/pandoc-3.5-linux-amd64.tar.gz | tar xvz -C /usr/local/bin
!mv /usr/local/bin/pandoc-3.5/bin/pandoc /usr/local/bin

In [0]:
# Install the Python packages this notebook needs into the notebook environment, then
# restart the Python process via the Databricks `dbutils` helper (presumably so the
# freshly installed packages become importable).
# NOTE(review): none of the packages are version-pinned.
%pip install pymupdf python-Levenshtein pandoc
dbutils.library.restartPython()

In [0]:
from huggingface_hub import hf_hub_download
import re
from PIL import Image

from transformers import NougatProcessor, VisionEncoderDecoderModel
from datasets import load_dataset
import torch

In [0]:
# Processor (image preprocessing + tokenizer) and model both come from the
# "facebook/nougat-base" checkpoint.
processor = NougatProcessor.from_pretrained("facebook/nougat-base")
model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Kept as the cell's last expression so the moved model is echoed as cell output.
model.to(device)

In [0]:
# Print the GPU status table from the NVIDIA command-line tool (manual sanity check).
!nvidia-smi

In [0]:
import fitz

# Alternative input kept for reference; swap the two assignments to use it instead.
# _pdf_path = "/Volumes/field_ai_examples/alphaleger/financebench/pdf/3M_2015_10K.pdf"
_pdf_path = "/Volumes/felixflory/ey_dbs_workshop_2024_10/raw_data/project_churches/Project Churches - FINAL Red Flag Report 181121.pdf"
# Open the PDF with PyMuPDF (`fitz`); `pdf_document` is iterated page by page in a later cell.
pdf_document = fitz.open(_pdf_path)

In [0]:
# Render every page of `pdf_document` to a PNG under ./images, one file per page,
# named after `page.number`. This only has to run once per PDF; the inference cell
# further down reads the saved PNGs from disk.
from pathlib import Path

# Create the output directory up front so the save below cannot fail on a missing
# folder when the notebook is run from a fresh checkout.
Path("images").mkdir(parents=True, exist_ok=True)

for page in pdf_document:  # iterate through the pages
    pix = page.get_pixmap(dpi=600)  # render page to an image
    pix.save("images/page-%i.png" % page.number)  # store image as a PNG

In [0]:
# prepare PDF image for the model
# (page-12.png presumably comes from the page-rendering cell above — confirm the
# workspace path matches where that cell wrote its images)
filepath = "/Workspace/Users/felix.flory@databricks.com/git/ey-dbs-cookbook-2024-10/entity_extraction/images/page-12.png"
image = Image.open(filepath)
pixel_values = processor(image, return_tensors="pt").pixel_values

# generate transcription (capped at 2000 newly generated tokens)
outputs = model.generate(
    pixel_values.to(device),  # inputs must live on the same device as the model
    min_length=1,
    max_new_tokens=2000,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],  # never emit the tokenizer's unknown token
)

# Only one image was passed in, so keep the first (only) decoded string.
decoded = processor.batch_decode(outputs, skip_special_tokens=True)[0]

In [0]:
# Run the processor's post-processing over the decoded text (markdown fixing is
# explicitly switched off here).
sequence = processor.post_process_generation(decoded, fix_markdown=False)
# note: repr() is used here so that the \n characters are shown literally; use a plain print(sequence) to see real line breaks
print(repr(sequence))

In [0]:
# Plain print of the post-processed text, i.e. with line breaks rendered rather than escaped.
print(sequence)

$$
\begin{tabular}{l r r r r r r r r r}
& \multicolumn{4}{c}{2015} & \multicolumn{3}{c}{2014} & \multicolumn{2}{c}{2015 vs 2014} \\
\cline{2-10}
& Net & \% of & Oper. & Net & \% of & Oper. & Net & Oper. & Net \\
(Dollars in millions) & Sales & Total & Income & Sales & Total & Income & Sales & Income & Sales \\
\hline
\textbf{Business Segments} & & & & & & & & & \\
Industrial & \$ & 10,328 & 34.1\% & \$ & 2,263 & \$ & 10,990 & 34.5\% & (6.0)\% \\
Safety and Graphics & & 5,515 & 18.2\% & 1,305 & & 5,732 & 18.0\% & 1,296 & (3.8)\% \\
Health Care & & 5,420 & 17.9\% & 1,724 & 5,572 & 17.5\% & 1,724 & (2.7)\% & --\% \\
Electronics and Energy & & 5,240 & 17.2\% & 1,802 & 5,604 & 17.6\% & 1,115 & (6.8)\% & (1.1)\% \\
Consumer & & 4,422 & 14.6\% & 1,046 & 4,523 & 14.2\% & 995 & (2.2)\% & 5.2\% \\
Corporate and Unallocated & & -- & --\% & (355) & & -- & --\% & (251) & \\
Elimination of Dual Credit & & (632) & (2.0)\% & (139) & (604) & (1.8)\% & (133) & --\% & --\% \\
\hline
Total Company & \$ & \textbf{30,274} & \textbf{100.0\%} & \$ & 6,946 & \$ & 31,821 & 100.0\% & (4.9)\% \\
\end{tabular}
$$

$$c = \\pm\\sqrt{a^2 + b^2}$$

In [0]:
# Install panflute, which the table-conversion cell below imports.
# NOTE(review): unpinned, and unlike the first install cell this one is not followed by a Python restart.
%pip install panflute

In [0]:
import pandoc
from panflute import *

def _strip_column_spec(body):
    """Drop the optional ``[pos]`` and the ``{column spec}`` that open a tabular body.

    Braces are counted rather than regex-matched because a column spec may nest
    them (e.g. ``p{3cm}``). When no balanced spec is found, everything up to the
    end of the first line is discarded instead, so stray preamble text is never
    mistaken for a table row.
    """
    pos = 0
    end = len(body)

    while pos < end and body[pos] in ' \t':
        pos += 1

    # Optional vertical-position argument such as [t] or [b].
    if pos < end and body[pos] == '[':
        closing = body.find(']', pos)
        if closing != -1:
            pos = closing + 1
            while pos < end and body[pos] in ' \t':
                pos += 1

    if pos < end and body[pos] == '{':
        depth = 0
        for idx in range(pos, end):
            if body[idx] == '{':
                depth += 1
            elif body[idx] == '}':
                depth -= 1
                if depth == 0:
                    return body[idx + 1:]

    first_newline = body.find('\n')
    return '' if first_newline == -1 else body[first_newline + 1:]


def latex_to_markdown_table(latex_str):
    r"""Convert the rows of LaTeX ``tabular`` environments into a Markdown pipe table.

    Everything outside ``\begin{tabular}`` ... ``\end{tabular}`` is ignored. If
    the input contains several tabular environments their rows are concatenated
    into one table, and an unterminated environment runs to the end of the input.

    Inside an environment:

    * rows end at ``\\`` or at a newline, so several rows may share one line;
    * rule commands (``\hline``, ``\cline{..}``, ``\toprule``, ``\midrule``,
      ``\bottomrule``) are removed without discarding row content that sits on
      the same line;
    * cells are split on ``&``, but not on the escaped form ``\&``;
    * short rows are padded with empty cells to the widest row, because a pipe
      table's header and separator fix the column count for every row.

    Cell contents are otherwise copied verbatim: LaTeX markup such as
    ``\textbf{..}`` or ``\multicolumn{..}`` is not interpreted.

    Args:
        latex_str: Text that may contain one or more ``tabular`` environments.

    Returns:
        The Markdown table as a newline-joined string (first row is the
        header), or ``""`` when no table rows were found.
    """
    import re  # function-scope import keeps this cell runnable on its own

    begin_tag = '\\begin{tabular}'
    end_tag = '\\end{tabular}'
    row_break = re.compile(r'\\\\|\n')
    cell_break = re.compile(r'(?<!\\)&')
    rule_command = re.compile(
        r'\\(?:hline|toprule|midrule|bottomrule)(?![A-Za-z])|\\cline\s*\{[^}]*\}'
    )

    rows = []
    cursor = 0
    while True:
        start = latex_str.find(begin_tag, cursor)
        if start == -1:
            break
        body_start = start + len(begin_tag)
        stop = latex_str.find(end_tag, body_start)
        if stop == -1:
            # Unterminated environment (e.g. truncated output): read to the end.
            body = latex_str[body_start:]
            cursor = len(latex_str)
        else:
            body = latex_str[body_start:stop]
            cursor = stop + len(end_tag)

        # Rows are split before rules are stripped so that the row terminator
        # "\\" can never be misread as the backslash of a following command.
        for chunk in row_break.split(_strip_column_spec(body)):
            chunk = rule_command.sub(' ', chunk)
            if not chunk.strip():
                continue
            rows.append([cell.strip() for cell in cell_break.split(chunk)])

    if not rows:
        return ""

    width = max(len(row) for row in rows)
    padded = [row + [''] * (width - len(row)) for row in rows]

    def as_line(cells):
        return "| " + " | ".join(cells) + " |"

    markdown_lines = [as_line(padded[0]), as_line(['---'] * width)]
    markdown_lines.extend(as_line(row) for row in padded[1:])
    return "\n".join(markdown_lines)

# Small sample tabular used to exercise the converter.
test_str = r"""\begin{tabular}{|l|l|}\hline
Age & Frequency \\ \hline
18--25  & 15 \\
26--35  & 33 \\
36--45  & 22 \\ \hline
\end{tabular}
"""

# Step 1: LaTeX -> Markdown pipe table.
markdown_table = latex_to_markdown_table(test_str)
print("Markdown table:", markdown_table, sep="\n")

# Step 2: Markdown -> HTML, parsing and re-serialising with pandoc.
doc = pandoc.read(markdown_table, format="markdown")
html = pandoc.write(doc, format="html")
print("\nHTML output:", html, sep="\n")

In [0]:
# Serialise a pandoc document as strict Markdown with the raw_html extension enabled.
# NOTE(review): `out` is not assigned in any visible cell of this notebook, so this cell
# would raise NameError on a fresh kernel — TODO confirm which document was intended.
pandoc.write(out, format="markdown_strict+raw_html")

In [0]:
from pandoc.types import *

# Hand a sample tabular to pandoc as a raw LaTeX block and echo the result as the cell output.
# The literal is a normal (non-raw) string, so every LaTeX backslash has to be doubled —
# including the one in "\\begin": a single backslash before `b` is the backspace escape and
# would corrupt the start of the environment.
pandoc.write(RawBlock(Format("latex"), "\\begin{tabular}{l r r r r r r r r r}  & & & 2015 & & & & & & 2015 vs 2014 & & \\% change \\\\ \\cline{2-11}  & Net & \\% of & Oper. & Net & \\% of & Oper. & Net & Oper. & Net & Oper. \\\\ (Ballars in millieas) & Sales & Total & Income & Sales & Total & Income & Sales & Income & Sales & Income \\\\ \\hline\n**Business Segments** & & & & & & & & & & \\\\ Industrial & \\$ & 10,328 & 34.1 \\% & \\$ & 2,263 & \\$ & 10,990 & 34.5 \\% & \\$ & 2,389 & (6.0)\\% & (5.3)\\% \\\\ Safety and Graphics & & 5,515 & 18.2 \\% & 1,305 & & 5,732 & 18.0 \\% & 1,296 & (3.8)\\% & 0.7 \\% \\\\ Health Care & & 5,420 & 17.9 \\% & 1,724 & 5,572 & 17.5 \\% & 1,724 & (2.7)\\% & \\(-\\)\\% \\\\ Electronics and Energy & & 5,240 & 17.2 \\% & 1,802 & 5,604 & 17.6 \\% & 1,115 & (6.8)\\% & (1.1)\\% \\\\ Consumer & & 4,422 & 14.6 \\% & 1,046 & 4,523 & 14.2 \\% & 995 & (2.2)\\% & 5.2 \\% \\\\ Corporate and Unallocated & & & \\(\\;\\) & \\(\\;\\) \\% & (355) & & \\(\\;\\) & \\(\\;\\) \\(-\\)\\% & (251) & \\\\ Elimination of Dual Credit & & (632) & (2.0)\\% & (139) & (604) & (1.8)\\% & (133) & \\(-\\)\\% & \\(-\\)\\% \\\\ Total Company & \\$ & **30,274** & **100.0 \\%** & \\$ & 6,946 & \\$ & 31,821 & 100.0 \\% & \\$ & 7,135 & (4.9)\\% & (2.6)\\% \\\\ \\end{tabular}"))