## Install dependencies

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to the editable-installed
# project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Install the local extraction package in editable mode, plus a pinned release
# of the document exporter, into the kernel's own environment (%pip, not !pip).
# NOTE(review): pymupdf4llm is imported further down but is not listed here --
# presumably it arrives as a dependency of the extraction package; confirm.
%pip install -e ../../extraction aicacia-document-exporter==0.1.5

Obtaining file:///C:/Users/grast/IdeaProjects/aicacia/extraction
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: aicacia_extraction
  Building editable for aicacia_extraction (pyproject.toml): started
  Building editable for aicacia_extraction (pyproject.toml): finished with status 'done'
  Created wheel for aicacia_extraction: filename=aicacia_extraction-0.0.1-0.editable-py3-none-any.whl size=1466 sha256=ac8a45416eb9ba06c91bd8022485f3dd9fc62f88e6c9df505dd4588852b211c7
  Stored in directory: C:\Users\grast\

## Imports

In [3]:
import os
import time
import json
import glob
from pathlib import Path
from subprocess import Popen, PIPE, STDOUT
from urllib.request import urlretrieve

from aicacia_document_exporter.Document import Document
from aicacia_document_exporter.PreprocessingModel import PreprocessingModel
from aicacia_document_exporter.SimpleFileDocumentExporter import SimpleFileDocumentExporter

# from marker.settings import settings
# from marker.models import load_all_models
# from marker.convert import convert_single_pdf
# from marker.output import save_markdown

import pymupdf4llm

from aicacia_extraction.sources.wri import WriSimpleExtractor

## Configuration

In [4]:
# Output locations, all relative to the notebook's working directory.
data_dir = '../data/db'          # metadata database lives here
pdf_dir = '../data/pdf'          # downloaded PDFs are staged here
md_dir = '../data/markdown'      # converted Markdown files are written here
db_file_name = 'wri_metadata.db'

# Batch size handed to the document exporter.
batch_size = 20

# Disabled marker configuration (marker is not the active converter):
# settings.EXTRACT_IMAGES = False
# os.environ["EXTRACT_IMAGES"] = "False"
# model_lst = load_all_models()

# Create every output directory up front; existing directories are left alone.
for _output_dir in (data_dir, pdf_dir, md_dir):
    os.makedirs(_output_dir, exist_ok=True)

## Define preprocessing model

In [5]:
class Preprocess(PreprocessingModel):
    """Preprocessing step that attaches a Markdown rendering to each document.

    For every document that lists a PDF source, the PDF is downloaded into
    ``pdf_dir``, converted to Markdown with ``pymupdf4llm``, written to
    ``md_dir`` as ``<doc.id>.md`` and stored on ``doc.raw_content``.
    Documents without a PDF source are left untouched.

    A marker-based conversion path (``convert_single_pdf`` / the ``marker``
    CLI) was previously sketched here but is not in use.
    """

    def preprocess_batch(self, docs: list[Document]):
        """Download and convert the PDF source of each document in ``docs``.

        Args:
            docs: Documents to enrich in place. ``doc.sources`` is expected to
                hold JSON strings; those containing ``'application/pdf'`` are
                parsed and their ``'link'`` field is used as the download URL.

        Side effects:
            * Sets ``doc.metadata['file_path']`` to the staged PDF path. Note
              that the staged PDF is deleted before this method returns, so the
              path only records where the file was during processing.
            * Sets ``doc.raw_content`` to the UTF-8 encoded Markdown.
            * Writes ``<md_dir>/<doc.id>.md``.
            * Removes every (non-hidden) file in ``pdf_dir`` when done.

        Raises:
            Any exception raised while downloading, parsing or converting is
            propagated unchanged; the staged PDFs are still cleaned up first.
        """
        try:
            for doc in docs:
                pdf_sources = [source for source in doc.sources if 'application/pdf' in source]
                if not pdf_sources:
                    continue

                # Use the first PDF source. NOTE(review): the original comment
                # called this "the latest available PDF" -- that depends on the
                # ordering produced by the extractor; confirm.
                pdf_source = json.loads(pdf_sources[0])['link']
                file_path = f'{pdf_dir}/{doc.id}.pdf'

                print(f'Downloading PDF source for "{doc.title}"')
                urlretrieve(pdf_source, file_path)
                doc.metadata['file_path'] = file_path

                print(f'Converting PDF to Markdown for "{doc.title}"')
                md_bytes = pymupdf4llm.to_markdown(file_path, table_strategy='lines', ignore_code=True).encode()
                Path(f'{md_dir}/{doc.id}.md').write_bytes(md_bytes)
                doc.raw_content = md_bytes
        finally:
            # Always clear the staging directory, even when a download or a
            # conversion fails part-way through the batch; otherwise the
            # already-downloaded PDFs would be left behind on disk.
            for staged_file in glob.glob(f'{pdf_dir}/*'):
                os.remove(staged_file)

## Run extraction (first 3 pages)

In [6]:
# Wall-clock timestamp taken before any extraction work starts.
start_time = time.time()

extractor = WriSimpleExtractor()
db_path = f'{data_dir}/{db_file_name}'

# The exporter is used as a context manager; each extracted document is handed
# to it individually and the exporter receives the configured batch_size.
with SimpleFileDocumentExporter(
    db_path,
    batch_size=batch_size,
    preprocessing_model=Preprocess(),
) as exporter:
    # Start at page 0 and stop after 3 pages.
    documents = extractor.extract(start_page=0, page_limit=3)
    for doc in documents:
        exporter.insert([doc])

end_time = time.time()
elapsed = end_time - start_time

print(f'Finished extraction! Elapsed time: {elapsed} sec')

Extracting 20 docs from page #0...
Extracted 1 of 20
Extracted 2 of 20
Extracted 3 of 20
Extracted 4 of 20
Extracted 5 of 20
Extracted 6 of 20
Extracted 7 of 20
Extracted 8 of 20
Extracted 9 of 20
Extracted 10 of 20
Extracted 11 of 20
Extracted 12 of 20
Extracted 13 of 20
Extracted 14 of 20
Extracted 15 of 20
Extracted 16 of 20
Extracted 17 of 20
Extracted 18 of 20
Extracted 19 of 20
Downloading PDF source for "Sustainable Behavior in Climate Pledges: An Analysis of Top Emitters’ Strategies"
Converting PDF to Markdown for "Sustainable Behavior in Climate Pledges: An Analysis of Top Emitters’ Strategies"
Processing ../data/pdf/387da84b-99e7-486a-9f44-08df66d8e940.pdf...
Downloading PDF source for "The role of faith-based organizations in tackling food loss and waste in Rwanda: A preliminary study"
Converting PDF to Markdown for "The role of faith-based organizations in tackling food loss and waste in Rwanda: A preliminary study"
Processing ../data/pdf/9a0aa590-486a-4101-a522-9a5dbe5ab51