In [None]:
# Install the dependencies listed in ../requirements.txt (per-user install).
!python -m pip install --user -r ../requirements.txt

In [None]:
# Install the headless build of OpenCV (per-user install).
!python -m pip install --user  opencv-python-headless

In [None]:
# Install layoutparser together with its optional extras.
# NOTE(review): the first command is the same as the one in the previous cell.
!python -m pip install --user  opencv-python-headless
!python -m pip install --user layoutparser # Install the base layoutparser library
!python -m pip install --user  "layoutparser[layoutmodels]" # Install DL layout model toolkit
!python -m pip install --user "layoutparser[ocr]" # Install OCR toolkit

In [None]:
# Install pyarrow -- presumably the engine used by df.to_parquet() at the end of the notebook; confirm.
!python -m pip install --user pyarrow

In [None]:
# Install detectron2 straight from its GitHub repository (default branch, no version pinned).
!python -m pip install --user 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
# Refresh the apt package index before installing system libraries.
!sudo apt-get update

In [None]:
# Install OpenGL and GLib system libraries without prompting.
# NOTE(review): presumably needed by OpenCV at import time -- confirm.
!sudo apt install --yes libgl1 libgl1-mesa-glx libglib2.0-0

In [None]:
# Keep Pillow at 9.5.0 or older.
# NOTE(review): presumably because newer Pillow releases removed APIs that
# layoutparser / detectron2 still call -- confirm before lifting the pin.
# NOTE(review): this cell uses `pip` directly while the others use
# `python -m pip install --user`; check both target the same environment.
!pip install "Pillow<=9.5.0"

* Restart the kernel here so that the packages installed above can be imported.

In [None]:
import layoutparser as lp
import cv2
import numpy as np
import pdf2image
import os
from PIL import Image
import json
import pandas as pd

In [None]:
# Input location. The folder must end with "/" because file paths are built
# by plain string concatenation further down.
PATH_OF_THE_PDF = "echantillon/"
NAME_OF_THE_PDF_TO_READ = "T06319001085-42943397200016.pdf"

# Language code handed to the Tesseract OCR agent.
TESSERACT_LANGUAGE = "fra"

# Display toggles used by the single-PDF cell.
SHOW_IMAGE = False           # draw the detected layout boxes on each page
SHOW_ZOOM_TITLE = True       # open every cropped title image
PRINT_DETECTED_TITLE = True  # print the OCR text of every title

In [None]:
# Layout detector: a Detectron2 model loaded through layoutparser from the
# config and weight files stored one directory above the notebook.
model = lp.models.Detectron2LayoutModel(
    model_path="../model_final.pth",
    config_path="../config.yaml",
    # Class id -> label name; the cells below only keep "Title" blocks.
    label_map={0: "Figure", 1: "List", 2: "Table", 3: "Text", 4: "Title"},
    # Detectron2 override: test-time score threshold of the ROI heads.
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
)

In [None]:
def pdf_to_img(pdf_file):
    """Render the pages of a PDF as colour images.

    Args:
        pdf_file: Path of the PDF file to convert.

    Returns:
        The result of ``pdf2image.convert_from_path`` for ``pdf_file``
        (iterated page by page by the cells below).
    """
    # os.cpu_count() returns None when the number of CPUs cannot be
    # determined; fall back to a single worker instead of passing None on.
    worker_count = os.cpu_count() or 1
    return pdf2image.convert_from_path(
        pdf_file, grayscale=False, thread_count=worker_count
    )

In [None]:
# OCR the "Title" blocks of one PDF, page by page.
# Text found on the first page goes to list_of_titles, text from the
# following pages goes to list_of_subtitles.
all_images = pdf_to_img(PATH_OF_THE_PDF+NAME_OF_THE_PDF_TO_READ)
list_of_titles = []
list_of_subtitles = []

# The OCR agent does not depend on the page: build it once, not per page.
ocr_agent = lp.TesseractAgent(languages=TESSERACT_LANGUAGE)

for i, one_image in enumerate(all_images):
    one_image_np = np.asarray(one_image)
    layout = model.detect(one_image_np)
    if SHOW_IMAGE:
        lp.draw_box(one_image_np, layout, box_width=3, show_element_type=True).show()

    title_blocks = lp.Layout([b for b in layout if b.type == "Title"])
    # Visit the blocks ordered by coordinates[1]
    # (presumably the top edge, i.e. reading order -- confirm).
    for block in title_blocks.sort(key=lambda x: x.coordinates[1]):
        # Padding each image segment can help improve OCR robustness.
        segment_image = block.pad(left=20, right=20, top=20, bottom=20).crop_image(one_image_np)
        if SHOW_ZOOM_TITLE:
            Image.fromarray(segment_image).show()

        # NOTE(review): the second replace deletes double spaces instead of
        # collapsing them, which glues words together when an OCR line ends
        # with a space. Behaviour kept as-is; confirm whether " " was meant.
        text = ocr_agent.detect(segment_image).replace("\n"," ").replace("  ","")
        block.set(text=text, inplace=True)
        if i == 0:
            list_of_titles.append(text)
        else:
            list_of_subtitles.append(text)
        if PRINT_DETECTED_TITLE:
            print(text)

# Single-page document: only the first detected block is kept as the title,
# the remaining blocks of that page are treated as subtitles.
if len(all_images) == 1 and list_of_titles:
    list_of_subtitles = list_of_titles[1:] + list_of_subtitles
    list_of_titles = [list_of_titles[0]]

In [None]:
# Remove the form-feed character (\x0c) from every OCR'd string.
list_of_titles = [title.replace("\x0c", "") for title in list_of_titles]
list_of_subtitles = [subtitle.replace("\x0c", "") for subtitle in list_of_subtitles]

In [None]:
# Display the cleaned title list extracted from the first page.
list_of_titles

# Write result files

In [None]:
# Create the output folder. exist_ok=True keeps this cell re-runnable:
# a plain `mkdir result` fails once the folder already exists.
os.makedirs("result", exist_ok=True)

In [None]:
# Path of the Parquet file that receives one row per processed PDF.
OUTPUT_FILE_PARQUET = "Dares_accords_docx_sommaire_result.parquet"

In [None]:
def _extract_titles(pdf_path, ocr_agent):
    """Detect and OCR the "Title" blocks of one PDF.

    Each page is converted to an image and passed to the layout ``model``.
    Every block labelled "Title" is cropped with 20 px of padding and read
    with ``ocr_agent``. Within a page, blocks are visited ordered by
    ``coordinates[1]`` (presumably the top edge -- confirm).

    Args:
        pdf_path: Path of the PDF file to process.
        ocr_agent: OCR agent exposing ``detect(image) -> str``.

    Returns:
        ``(titles, subtitles)``: the text found on the first page and on the
        following pages. For a single-page PDF only the first detected block
        is kept as title; the other blocks become subtitles. Form-feed
        characters (``\\x0c``) are removed from every string.
    """
    titles, subtitles = [], []
    pages = pdf_to_img(pdf_path)
    for page_number, page in enumerate(pages):
        page_np = np.asarray(page)
        layout = model.detect(page_np)
        title_blocks = lp.Layout([b for b in layout if b.type == "Title"])
        for block in title_blocks.sort(key=lambda b: b.coordinates[1]):
            # Padding each image segment can help improve OCR robustness.
            segment_image = block.pad(left=20, right=20, top=20, bottom=20).crop_image(page_np)
            # NOTE(review): the second replace deletes double spaces instead
            # of collapsing them, which glues words together when an OCR line
            # ends with a space. Behaviour kept as-is; confirm the intent.
            text = ocr_agent.detect(segment_image).replace("\n", " ").replace("  ", "")
            (titles if page_number == 0 else subtitles).append(text)

    if len(pages) == 1 and titles:
        subtitles = titles[1:] + subtitles
        titles = titles[:1]

    titles = [t.replace("\x0c", "") for t in titles]
    subtitles = [s.replace("\x0c", "") for s in subtitles]
    return titles, subtitles


# Process every PDF of PATH_OF_THE_PDF: write one text summary per file in
# result/ and collect one row per file for the Parquet output.
os.makedirs("result", exist_ok=True)  # do not rely on an earlier cell having run

# The OCR agent is the same for all pages and files: build it once.
ocr_agent = lp.TesseractAgent(languages=TESSERACT_LANGUAGE)

records = []
# sorted() makes the row order reproducible (os.listdir order is arbitrary).
for file in sorted(os.listdir(PATH_OF_THE_PDF)):
    if not file.endswith(".pdf"):
        continue

    # NOTE(review): `num_dossier` was never assigned in this notebook, so the
    # cell raised NameError on a fresh kernel. The file name without its
    # extension is used here; confirm this is the expected identifier
    # (e.g. whether only the part before "-" should be kept).
    num_dossier = os.path.splitext(file)[0]

    list_of_titles, list_of_subtitles = _extract_titles(PATH_OF_THE_PDF + file, ocr_agent)
    contenu_fichier = " ".join(list_of_titles) + "\n-----\n" + "\n".join(list_of_subtitles)

    # Explicit UTF-8 so accented characters are written the same way on every
    # platform.
    with open(f"result/{num_dossier}.txt", "w", encoding="utf-8") as file_to_write:
        file_to_write.write(contenu_fichier)

    records.append({"num_dossier": num_dossier, "sommaire_result": contenu_fichier})

# Build the frame once from the collected rows instead of concatenating inside
# the loop; passing `columns` keeps the schema even when no PDF was found.
df = pd.DataFrame(
    records, columns=["num_dossier", "sommaire_result"], dtype=object
).set_index("num_dossier")
df.to_parquet(OUTPUT_FILE_PARQUET)