In [14]:
import pandas as pd
# Load the index of e-Laws statutes (title, page link, .doc link, local PDF path)
# and preview the first three rows.
links_csv_path = "../assets/elaws_links.csv"
df = pd.read_csv(links_csv_path)
df.head(n=3)

Unnamed: 0,title,link_to_page,link_to_doc_file,asset_path
0,"Food Safety and Quality Act, 2001, S.O. 2001, ...",https://www.ontario.ca/laws/statute/01f20,https://www.ontario.ca/laws/docs/01f20_e.doc,assets/elaws_pdfs/01f20_e.pdf
1,"Nutrient Management Act, 2002, S.O. 2002, c. 4",https://www.ontario.ca/laws/statute/02n04,https://www.ontario.ca/laws/docs/02n04_e.doc,assets/elaws_pdfs/02n04_e.pdf
2,"Animal Health Act, 2009, S.O. 2009, c. 31",https://www.ontario.ca/laws/statute/09a31,https://www.ontario.ca/laws/docs/09a31_e.doc,assets/elaws_pdfs/09a31_e.pdf


In [11]:
# Set up a non-OCR document converter
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
from docling.chunking import HybridChunker
from src.retrievers import make_text_chunk

# OCR is not needed for these PDFs, so build the PDF pipeline options with it disabled
pipeline_options = PdfPipelineOptions(do_ocr=False)

# Route PDF inputs through the OCR-free pipeline options configured above
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# Hybrid chunker that respects headings and hierarchy, with max_tokens set to 1000
chunker = HybridChunker(max_tokens=1000)

In [15]:
# Process each PDF: convert it, chunk it, and tag every chunk with the
# statute's page URL (link_to_page) so chunks can be traced back to their source.
from pathlib import Path

all_chunks = []   # chunk records from every document that processed cleanly
failed_docs = []  # (asset_path, error message) for every document that raised

# itertuples gives attribute access to the columns without the unused index
# that iterrows also yields.
for row in df.itertuples(index=False):
    try:
        print(f"Processing {row.title}")
        # asset_path is joined onto "../" to locate the PDF
        result = converter.convert(Path("../") / row.asset_path)
        # Build this document's chunks completely before extending all_chunks,
        # so a failure midway never leaves a partial document in the results.
        doc_chunks = [
            make_text_chunk(x, doc_uri=row.link_to_page)
            for x in chunker.chunk(result.document)
        ]
        all_chunks.extend(doc_chunks)
    except Exception as e:
        # Best-effort: one bad PDF should not abort the whole run, but the
        # failure is recorded so an incomplete corpus can be detected afterwards.
        print(f"Error processing {row.asset_path}: {str(e)}")
        failed_docs.append((row.asset_path, str(e)))

if failed_docs:
    print(f"{len(failed_docs)} of {len(df)} documents failed; see failed_docs")

Processing Food Safety and Quality Act, 2001, S.O. 2001, c. 20
Processing Nutrient Management Act, 2002, S.O. 2002, c. 4
Processing Animal Health Act, 2009, S.O. 2009, c. 31
Processing Commodity Board Members Act, R.S.O. 1990, c. C.18
Processing Education Act, R.S.O. 1990, c. E.2
Processing Farm Products Payments Act, R.S.O. 1990, c. F.10
Processing Health Protection and Promotion Act, R.S.O. 1990, c. H.7
Processing Highway Traffic Act, R.S.O. 1990, c. H.8
Processing Milk Act, R.S.O. 1990, c. M.12
Processing Ministry of Agriculture, Food and Rural Affairs Act, R.S.O. 1990, c. M.16
Processing Retail Sales Tax Act, R.S.O. 1990, c. R.31
Processing Farming and Food Production Protection Act, 1998, S.O. 1998, c. 1
Processing Commodity Boards and Marketing Agencies Act, R.S.O. 1990, c. C.19


In [12]:
# Tabulate all collected chunks (one row per chunk) for a quick visual check
chunks_df = pd.DataFrame(data=all_chunks)
chunks_df

Unnamed: 0,doc_uri,pages,doc_refs,headings,captions,text,enriched_text
0,https://www.ontario.ca/laws/statute/01f20,[1],"[#/texts/1, #/texts/2, #/texts/3, #/texts/4, #...",[Français],,1.\nPurposes\n2.\nDefinitions\n3.\nDirectors\n...,\n Headings:Français\n\n ...
1,https://www.ontario.ca/laws/statute/01f20,[1],"[#/texts/30, #/texts/31]","[Consolidation Period: From November 29, 2021 ...",,"Last amendment: 2019, c. 15, Sched. 22, s. 93....",\n Headings:Consolidation Period: F...
2,https://www.ontario.ca/laws/statute/01f20,[1],"[#/texts/40, #/texts/41, #/texts/42, #/texts/4...",[INSPECTIONS AND ORDERS],,INSPECTIONS\nInspectors\nSearch without warran...,\n Headings:INSPECTIONS AND ORDERS\...
3,https://www.ontario.ca/laws/statute/01f20,"[1, 2]","[#/texts/62, #/tables/0]",[ORDERS],,"1\n34., 1 = Hearing by director. 34., 2 = . 35...",\n Headings:ORDERS\n\n C...
4,https://www.ontario.ca/laws/statute/01f20,[2],"[#/texts/68, #/texts/69, #/texts/70, #/texts/71]",[Purposes],,"1 The purposes of this Act are to provide for,...",\n Headings:Purposes\n\n ...
...,...,...,...,...,...,...,...
7139,https://www.ontario.ca/laws/statute/98f01,[7],[#/texts/211],[Technical help],,(3) The Board may appoint one or more persons...,\n Headings:Technical help\n\n ...
7140,https://www.ontario.ca/laws/statute/98f01,[7],[#/texts/213],"[Guidelines, etc.]",,"9 (1) The Minister may issue directives, guid...","\n Headings:Guidelines, etc.\n\n ..."
7141,https://www.ontario.ca/laws/statute/98f01,[7],[#/texts/215],[Adoption by reference],,"(2) For the purposes of subsection (1), the M...",\n Headings:Adoption by reference\n...
7142,https://www.ontario.ca/laws/statute/98f01,[7],[#/texts/217],[Other considerations],,"(3) Despite subsections (1) and (2), the...",\n Headings:Other considerations\n\...


In [12]:
# Tabulate the records held in `chunks`.
# NOTE(review): `chunks` is not defined in any cell visible here, and this cell
# shares execution count 12 with the previous one. It looks like a leftover from
# an earlier run (the output below has no doc_uri column). Confirm where `chunks`
# comes from, or remove this cell, so the notebook survives Restart & Run All.
pd.DataFrame(chunks)

Unnamed: 0,pages,doc_refs,headings,captions,text,enriched_text
0,[1],"[#/texts/1, #/texts/2, #/texts/3, #/texts/4, #...",[Français],,1.\nPurposes\n2.\nDefinitions\n3.\nDirectors\n...,\n Headings:Français\n\n ...
1,[1],"[#/texts/30, #/texts/31]","[Consolidation Period: From November 29, 2021 ...",,"Last amendment: 2019, c. 15, Sched. 22, s. 93....",\n Headings:Consolidation Period: F...
2,[1],"[#/texts/40, #/texts/41, #/texts/42, #/texts/4...",[INSPECTIONS AND ORDERS],,INSPECTIONS\nInspectors\nSearch without warran...,\n Headings:INSPECTIONS AND ORDERS\...
3,"[1, 2]","[#/texts/62, #/tables/0]",[ORDERS],,"1\n34., 1 = Hearing by director. 34., 2 = . 35...",\n Headings:ORDERS\n\n C...
4,[2],"[#/texts/68, #/texts/69, #/texts/70, #/texts/71]",[Purposes],,"1 The purposes of this Act are to provide for,...",\n Headings:Purposes\n\n ...
...,...,...,...,...,...,...
259,[37],[#/texts/1036],[Adoption of instruments],,"(4) A regulation may adopt by reference,...",\n Headings:Adoption of instruments...
260,[37],[#/texts/1038],[Source of instruments],,"(5) An Act, regulation, law, code, formula, s...",\n Headings:Source of instruments\n...
261,[37],[#/texts/1040],[Amendments to instruments],,(6) The power to adopt by reference and requi...,\n Headings:Amendments to instrumen...
262,[37],"[#/texts/1042, #/texts/1043, #/texts/1044]",[Review of this Act],,55 The Minister may conduct a review of this A...,\n Headings:Review of this Act\n\n ...
