Note: As of 2024-04-26, detectron2 must be installed from a pinned commit (see [detectron2 issue #5010](https://github.com/facebookresearch/detectron2/issues/5010)), using the following command:

```
python3 -m pip install -U 'git+https://github.com/facebookresearch/detectron2.git@ff53992b1985b63bd3262b5a36167098e3dada02'
```

In [1]:
import os
import glob
import pdf2image
import layoutparser as lp
from PIL import Image
import sqlite3
import cv2

In [2]:
import torch

# Report which accelerator backends PyTorch can see in this environment.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    # Smoke test: allocate a one-element tensor on the MPS device and show it.
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

# Also show whether a CUDA device is usable.
print(torch.cuda.is_available())

tensor([1.], device='mps:0')
False


In [2]:
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message rather than on the slice below.
image_path = "../samples/sample_01_inline.png"
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# Reverse the channel axis to convert the image from BGR (cv2's default
# loading style) to RGB.
image = image[..., ::-1]

Note: To load the model correctly, you need to follow these steps:

1. Call `lp.Detectron2LayoutModel` once so that it downloads the model weights:
``` 
model = lp.Detectron2LayoutModel(
    'lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config',
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
```
This call will fail with an error saying that the `model_final.pth` file cannot be found. The file was in fact downloaded, just under a different name, so two more steps are needed.

2. Rename the downloaded file (it will be called `model_final.pth?dl=1`) to `model_final.pth`. This file will be located in the `.torch` directory of your home folder. It's easiest to simply rename it in place. In my case, this required:
```
> mv ~/.torch/iopath_cache/s/57zjbwv6gh3srry/model_final.pth\?dl=1 ~/.torch/iopath_cache/s/57zjbwv6gh3srry/model_final.pth
```
3. Update the model command to find this file directly (otherwise it will try to download the file again):

```
model = lp.Detectron2LayoutModel('lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config',
                                 model_path="/Users/kellycaylor/.torch/iopath_cache/s/57zjbwv6gh3srry/model_final.pth",
                                 extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
                                 label_map={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"})
 ```

 This should now work. 

 WARNING: It can take a _very_ long time to load the model!

In [8]:
# Load the deep layout model from the layoutparser API.
# For all the supported models, please check the Model Zoo page:
# https://layout-parser.readthedocs.io/en/latest/notes/modelzoo.html
#
# The weights path is expanded from "~" so the cell is not tied to one
# user's home directory; it points at the manually renamed model_final.pth.
model = lp.Detectron2LayoutModel(
    'lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config',
    model_path=os.path.expanduser(
        "~/.torch/iopath_cache/s/57zjbwv6gh3srry/model_final.pth"),
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})

In [None]:

def initialize_db(db_path):
    """Open the SQLite database at ``db_path`` and ensure its tables exist.

    Two tables are created when missing: ``documents`` (filename,
    page_number, table_count) and ``tables`` (filename, page_number, bbox,
    image_path). The schema changes are committed before returning.

    Returns:
        The open ``sqlite3.Connection``.
    """
    connection = sqlite3.connect(db_path)
    schema_statements = (
        '''CREATE TABLE IF NOT EXISTS documents
                 (filename TEXT, page_number INTEGER, table_count INTEGER)''',
        '''CREATE TABLE IF NOT EXISTS tables
                 (filename TEXT, page_number INTEGER, bbox TEXT, image_path TEXT)''',
    )
    cursor = connection.cursor()
    for statement in schema_statements:
        cursor.execute(statement)
    connection.commit()
    return connection


In [None]:

def convert_pdf_to_images(pdf_path, dpi=200):
    """Render the PDF at ``pdf_path`` into images via pdf2image.

    ``dpi`` is forwarded unchanged to ``pdf2image.convert_from_path``, whose
    return value is handed back to the caller as-is.
    """
    rendered_pages = pdf2image.convert_from_path(pdf_path, dpi=dpi)
    return rendered_pages


In [None]:

def detect_tables(images, model, table_label="Table"):
    """Run layout detection on every page and keep only the table regions.

    Args:
        images: Page images in page order. PIL images are passed to the model
            unchanged; anything else is first wrapped with ``Image.fromarray``.
        model: Preloaded layout detection model exposing ``detect(image)``.
            The detected blocks are expected to carry a ``type`` label (as
            layoutparser blocks do when the model has a ``label_map``).
        table_label: Value of ``block.type`` that identifies a table.

    Returns:
        A list with one ``(page_index, table_blocks)`` tuple per page, where
        ``table_blocks`` is a (possibly empty) list of the blocks labelled
        ``table_label`` on that page.
    """
    tables = []
    for page_index, page in enumerate(images):
        # Avoid copying pages that are already PIL images.
        if not isinstance(page, Image.Image):
            page = Image.fromarray(page)
        layout = model.detect(page)
        # The detector also returns text, titles, lists and figures; callers
        # of this function only want the tables.
        table_blocks = [block for block in layout if block.type == table_label]
        tables.append((page_index, table_blocks))
    return tables


In [None]:

def save_table_images(tables, base_filename, output_dir, images=None):
    """Crop every detected region out of its page and save it as a PNG.

    Args:
        tables: Iterable of ``(page_number, blocks)`` pairs. Each block must
            expose ``coordinates`` as the corner tuple ``(x_1, y_1, x_2, y_2)``.
        base_filename: Prefix used for the generated image file names.
        output_dir: Directory the PNG files are written to.
        images: Page images indexable by ``page_number``; each must provide
            ``crop(box)`` returning an object with ``save(path)``. When
            omitted, a module-level ``images`` variable is used if one exists
            (the behaviour of the earlier version of this function).

    Yields:
        ``(page_number, str(block), image_path)`` for every saved region.

    Raises:
        ValueError: If no page images are supplied and no module-level
            ``images`` variable exists.
    """
    if images is None:
        images = globals().get("images")
        if images is None:
            raise ValueError(
                "save_table_images() needs the page images; pass images=...")
    for page_number, layouts in tables:
        page_image = images[page_number]
        for idx, bbox in enumerate(layouts):
            # The coordinates are two corners, which is already the box format
            # crop() expects; they are not (x, y, width, height).
            x_1, y_1, x_2, y_2 = bbox.coordinates
            cropped_image = page_image.crop((x_1, y_1, x_2, y_2))
            img_path = os.path.join(output_dir, f"{base_filename}_page_{page_number}_table_{idx}.png")
            cropped_image.save(img_path)
            yield page_number, str(bbox), img_path


def process_files(pdf_files, output_dir, db_conn):
    """Extract table images from each PDF and record them in the database.

    For every PDF the pages are rendered, run through the notebook-level
    ``model``, cropped to the detected regions, and the results are written
    to the ``documents`` and ``tables`` tables.

    Args:
        pdf_files: Iterable of PDF file paths.
        output_dir: Directory for the cropped images (created if missing).
        db_conn: Open ``sqlite3.Connection`` with the expected schema.
    """
    os.makedirs(output_dir, exist_ok=True)
    for pdf_file in pdf_files:
        base_filename = os.path.basename(pdf_file)
        images = convert_pdf_to_images(pdf_file)
        tables = detect_tables(images, model)  # model needs to be defined or loaded
        table_details = list(
            save_table_images(tables, base_filename, output_dir, images=images))
        # One transaction per PDF: committed on success, rolled back if any
        # insert fails, so a document is never recorded half-way.
        with db_conn:
            # NOTE: the page_number column of `documents` receives the page count.
            db_conn.execute('INSERT INTO documents (filename, page_number, table_count) VALUES (?, ?, ?)',
                            (base_filename, len(images), len(table_details)))
            db_conn.executemany(
                'INSERT INTO tables (filename, page_number, bbox, image_path) VALUES (?, ?, ?, ?)',
                [(base_filename, page_number, bbox, img_path)
                 for page_number, bbox, img_path in table_details])


In [None]:

# Locations of the input PDFs, the cropped table images and the SQLite file.
input_dir = '../samples'
output_dir = '../samples/output'
db_path = '../samples/tables.db'

# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)


In [None]:

# Collect the sample PDFs to process and open the results database.
pdf_pattern = os.path.join(input_dir, "sample*.pdf")
pdf_files = glob.glob(pdf_pattern)
db_conn = initialize_db(db_path)


In [None]:

# Layout model used by process_files(). layoutparser does not provide a
# LayoutLMv2 wrapper, so use its PubLayNet Detectron2 detector, whose label
# map includes "Table". The model loaded earlier in the notebook is reused
# when it is still in memory, because loading it is slow.
if "model" not in globals():
    model = lp.Detectron2LayoutModel(
        'lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config',
        model_path=os.path.expanduser(
            "~/.torch/iopath_cache/s/57zjbwv6gh3srry/model_final.pth"),
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"})


In [None]:

# Run the extraction pipeline over every collected PDF.
process_files(pdf_files=pdf_files, output_dir=output_dir, db_conn=db_conn)