In [None]:
# from grobid_client.grobid_client import GrobidClient

# client = GrobidClient(config_path="./config.json", check_server=False)
# client.process("processFulltextDocument", "./test_pdf", output="./test_out/", 
#                     force=True,verbose=True)

2fffe3ce-3a9d-47f9-970a-c5de6a57f3ce.pdf
1 files to process in current batch


In [5]:
from bs4 import BeautifulSoup
import json

def extract_figures_from_tei(tei_file_path):
    """
    Extract figures from TEI XML and return structured data
    
    Args:
        tei_file_path (str): Path to the TEI XML file
        
    Returns:
        list: Extracted figures data
    """
    with open(tei_file_path, 'r', encoding='utf-8') as tei:
        soup = BeautifulSoup(tei, 'lxml-xml')
    
    figures = soup.find_all('figure')
    result = []
    
    # Track the order of figures in the XML
    order_index = 0
    
    for figure in figures:
        figure_obj = {}
        
        # Add order index to track position in the original XML
        figure_obj['order_index'] = order_index
        order_index += 1
        
        # Extract figure ID if available
        figure_id = figure.get('xml:id')
        if figure_id:
            figure_obj['figure_id'] = figure_id
        
        # Extract the heading/caption
        head = figure.find('head')
        if head:
            head_text = head.get_text()
            clean_head_text = ' '.join(head_text.split())
            figure_obj['head'] = clean_head_text
            
            # Get label if available
            label = figure.find('label')
            if label:
                figure_obj['label'] = label.get_text().strip()
        
        # # Extract figure description
        # fig_desc = figure.find('figDesc')
        # if fig_desc:
        #     desc_text = fig_desc.get_text()
        #     clean_desc_text = ' '.join(desc_text.split())
        #     figure_obj['description'] = clean_desc_text
        
        # # Extract graphic information if available
        # graphic = figure.find('graphic')
        # if graphic:
        #     figure_obj['graphic'] = {
        #         'url': graphic.get('url', ''),
        #         'coords': graphic.get('coords', ''),
        #         'type': graphic.get('type', '')
        #     }
        
        result.append(figure_obj)
    
    return result



In [9]:
import pprint

# Example usage in notebook:
tei_path = "./test.grobid.tei.xml"
figures = extract_figures_from_tei(tei_path)
print(f"Found {len(figures)} figures")
pp = pprint.PrettyPrinter(indent=4)
print("\nFirst 5 figures:")
pp.pprint(figures)

Found 43 figures

First 5 figures:
[   {   'figure_id': 'fig_0',
        'head': 'Table A. 1 :',
        'label': '1',
        'order_index': 0},
    {'figure_id': 'fig_1', 'order_index': 1},
    {   'figure_id': 'fig_2',
        'head': 'Figure 1 :',
        'label': '1',
        'order_index': 2},
    {'figure_id': 'fig_3', 'order_index': 3},
    {   'figure_id': 'fig_4',
        'head': 'Figure 3 :',
        'label': '3',
        'order_index': 4},
    {'figure_id': 'fig_6', 'order_index': 5},
    {   'figure_id': 'fig_7',
        'head': 'Figure A. 1 :',
        'label': '1',
        'order_index': 6},
    {   'figure_id': 'fig_8',
        'head': 'Figure A. 2 :',
        'label': '2',
        'order_index': 7},
    {   'figure_id': 'fig_9',
        'head': 'Figure A. 3 :',
        'label': '3',
        'order_index': 8},
    {'figure_id': 'fig_10', 'head': 'Figure', 'order_index': 9},
    {'order_index': 10},
    {'order_index': 11},
    {'order_index': 12},
    {'order_index': 13

In [None]:
import os
import uuid
from pathlib import Path

import fitz  # PyMuPDF

pdf_document = fitz.open("./test.pdf")



# Create directory for extracted images if it doesn't exist
output_dir = Path("extracted_images")
output_dir.mkdir(exist_ok=True)

# Process all pages in the PDF
extracted_image_paths = []

for page_num in range(len(pdf_document)):
    # Get the page
    page = pdf_document[page_num]
    
    # Extract images from the page
    image_list = page.get_images(full=True)
    
    # Process each image on the page
    for img_index, img in enumerate(image_list):
        try:
            xref = img[0]  # image reference
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            
            # Convert extension to standard format if needed
            if image_ext.lower() == "jpeg":
                file_ext = "jpeg"
            else:
                file_ext = image_ext.lower()
            
            # Create a filename for the image
            image_filename = f"page{page_num+1}_img{img_index+1}.{file_ext}"
            image_path = output_dir / image_filename
            
            # Save the image
            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)
                
            # Add to the list of extracted paths
            extracted_image_paths.append(str(image_path))
            
        except Exception as e:
            print(f"Error extracting image {img_index} from page {page_num+1}: {e}")

print(f"Extracted {len(extracted_image_paths)} images from the PDF")

Extracted 78 images from the PDF


In [5]:
import pandas as pd 
pdf_url = "https://arxiv.org/pdf/2106.00676.pdf" # link/to/your/paper.pdf 
relative_coordinates = True # whether returning relative coordinates or not 
# Read the data from the API
parsed = pd.read_csv(f"http://34.131.181.227:8080/parse/?pdf_url={pdf_url}&relative_coordinates={relative_coordinates}")

# Save the parsed data to a CSV file
parsed.to_csv('parsed_document.csv', index=False)
print(f"Saved parsed data to 'parsed_document.csv' with {len(parsed)} rows")

Saved parsed data to 'parsed_document.csv' with 228 rows


In [6]:
# import layoutparser as lp
# page_tokens, page_images = lp.load_pdf("test.pdf", load_images=True)
# for page_id in range(len(page_images)):
#     cur_page_w, cur_page_h = page_images[page_id].size
#     tdf = (parsed[parsed['page']==page_id][["x1", "y1", "x2", "y2"]])
#     tdf['x1'] *= cur_page_w
#     tdf['x2'] *= cur_page_w
#     tdf['y1'] *= cur_page_h
#     tdf['y2'] *= cur_page_h
#     tdf = tdf.rename(columns={"x1":"x_1", "y1":"y_1", "x2":"x_2", "y2":"y_2"})
#     display(
#         lp.draw_box(
#         page_images[page_id],
#         lp.load_dataframe(tdf,block_type="rectangle")
#         )
#     )


In [7]:
print(parsed)

     page       type                                               text  \
0       0      Title  VILA: Improving Structured Content Extraction ...   
1       0     Author  Zejiang Shen 1 Kyle Lo 1 Lucy Lu Wang 1 Bailey...   
2       0   Abstract  Abstract Accurately extracting structured cont...   
3       0    Section                                     1 Introduction   
4       0  Paragraph  Scientiﬁc papers are usually distributed in Po...   
..    ...        ...                                                ...   
223    16    Caption  Table 9: Prediction F1 breakdown for all model...   
224    16    Section                               Correct Label Errors   
225    16  Paragraph  Given the VILA struc- tures, we can easily cor...   
226    16  Paragraph  “paragraph”.Weupdate our methods for several r...   
227    16   Footnote  15 We randomly sample 30 pages from both the t...   

           x1        y1        x2        y2 block_type  block_id  
0    0.138936  0.083545  0.86279

In [9]:
import requests, io
f = open("test.pdf", 'rb')
files = {"pdf_file": (f.name, f, "multipart/form-data")}
r = requests.post('http://34.131.181.227:8080/parse', files=files)
parsed = pd.read_csv(io.StringIO(r.content.decode('utf-8')))

In [10]:
parsed.to_csv('parsed_document2.csv', index=False)