## Install dependencies

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%pip install -e ../../extraction aicacia-document-exporter==0.1.5

## Imports

In [None]:
import os
import time
import json
import glob
from pathlib import Path
from subprocess import Popen, PIPE, STDOUT
from urllib.request import urlretrieve

from aicacia_document_exporter.Document import Document
from aicacia_document_exporter.PreprocessingModel import PreprocessingModel
from aicacia_document_exporter.SimpleFileDocumentExporter import SimpleFileDocumentExporter

# from marker.settings import settings
# from marker.models import load_all_models
# from marker.convert import convert_single_pdf
# from marker.output import save_markdown

from aicacia_extraction.sources.wri import WriSimpleExtractor

## Configuration

In [None]:
# Working locations, all relative to the notebook's working directory.
data_dir = '../data/db'            # directory that holds the exported database
db_file_name = 'wri_metadata.db'   # database file name inside data_dir
pdf_dir = '../data/pdf'            # staging directory for downloaded PDFs
md_dir = '../data/markdown'        # output directory for converted Markdown

# Batch size handed to the document exporter.
batch_size = 20

# Set through the environment instead of marker's in-process `settings`
# object; presumably picked up by the marker CLI to skip image extraction
# -- TODO confirm against the installed marker version.
os.environ["EXTRACT_IMAGES"] = "False"

# Create every working directory up front; existing directories are fine.
Path(data_dir).mkdir(parents=True, exist_ok=True)
Path(pdf_dir).mkdir(parents=True, exist_ok=True)
Path(md_dir).mkdir(parents=True, exist_ok=True)

# The in-process marker models (load_all_models) are not loaded here.

## Define preprocessing model

In [None]:
class Preprocess(PreprocessingModel):
    """Fetch each document's PDF and convert the batch to Markdown via marker.

    PDFs are staged in ``pdf_dir``, converted by a single ``marker`` CLI run
    into ``md_dir``, and the resulting Markdown and marker metadata are
    attached to the documents in place.
    """

    def preprocess_batch(self, docs: list[Document]) -> None:
        """Enrich ``docs`` in place with Markdown converted from their PDFs.

        Documents that have no ``application/pdf`` source, or for which no
        result directory exists under ``md_dir`` after conversion, do not get
        ``raw_content`` set by this method.

        Args:
            docs: The batch of documents to preprocess.
        """
        for doc in docs:
            self._download_pdf(doc)

        # Nothing staged (and nothing left over in pdf_dir): skip marker.
        if not os.listdir(pdf_dir):
            return

        self._convert_staged_pdfs()

        for doc in docs:
            self._attach_markdown(doc)

    @staticmethod
    def _download_pdf(doc: Document) -> None:
        """Download the first PDF source of ``doc`` into ``pdf_dir``, if any."""
        pdf_sources = [source for source in doc.sources if 'application/pdf' in source]
        if not pdf_sources:
            return

        # Each source is a JSON string; only the first PDF entry is used
        # (presumably the latest one -- ordering is decided by the extractor).
        pdf_source = json.loads(pdf_sources[0])['link']
        file_path = f'{pdf_dir}/{doc.id}.pdf'

        print(f'Downloading PDF source for "{doc.title}"')
        urlretrieve(pdf_source, file_path)
        # NOTE(review): the staged PDF is deleted once the batch has been
        # converted, so this path no longer exists after preprocess_batch.
        doc.metadata['file_path'] = file_path

    @staticmethod
    def _convert_staged_pdfs() -> None:
        """Run the marker CLI over ``pdf_dir``, then empty the staging directory."""
        print("Converting PDF batch to Markdown...")
        command = ["marker", pdf_dir, md_dir, '--workers', '3']
        with Popen(command, stdout=PIPE, stderr=STDOUT, text=True) as proc:
            for line in proc.stdout:
                # Each line keeps its own newline; don't let print add another.
                print(line, end='')

        # Leaving the `with` block waits for the process, so returncode is set.
        if proc.returncode != 0:
            print(f'marker exited with code {proc.returncode}')

        # Clear the staging directory so the next batch starts empty.
        for staged_file in glob.glob(f'{pdf_dir}/*'):
            os.remove(staged_file)

    @staticmethod
    def _attach_markdown(doc: Document) -> None:
        """Attach marker's Markdown and metadata to ``doc`` when they exist."""
        md_result_dir = Path(f"{md_dir}/{doc.id}")
        if not md_result_dir.is_dir():
            return

        doc.raw_content = md_result_dir.joinpath(f'{doc.id}.md').read_bytes()
        doc.metadata['marker_meta'] = md_result_dir.joinpath(f"{doc.id}_meta.json").read_text(encoding="utf8")
                

## Run extraction (first 3 pages)

In [None]:
# Wall-clock start, used for the elapsed-time report at the end of the cell.
start_time = time.time()

extractor = WriSimpleExtractor()

# Documents are handed to the exporter one at a time; the exporter receives
# batch_size and the Preprocess model defined in this notebook.
with SimpleFileDocumentExporter(
    f'{data_dir}/{db_file_name}',
    batch_size=batch_size,
    preprocessing_model=Preprocess(),
) as exporter:
    # Start at page 0 and stop after 3 pages.
    for doc in extractor.extract(start_page=0, page_limit=3):
        exporter.insert([doc])

end_time = time.time()

print(f'Finished extraction! Elapsed time: {end_time - start_time} sec')