In [1]:
!pip install -e ..

Obtaining file:///Users/astoyano/Documents/code/docling-sdg
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: docling-sdg
  Building editable for docling-sdg (pyproject.toml) ... [?25ldone
[?25h  Created wheel for docling-sdg: filename=docling_sdg-1.0.0-0.editable-py3-none-any.whl size=7121 sha256=715a2afd0ac04e750cc69df58c7a2795ec75d15fac564952f0d5a77e046fbe68
  Stored in directory: /private/var/folders/9j/ynpwh37j4sz174pxnpf0pdv00000gn/T/pip-ephem-wheel-cache-tsfs73wm/wheels/8b/6e/59/f1f3f2085f61d792ec72c3fa023bf918460ef2c50de1103d35
Successfully built docling-sdg
Installing collected packages: docling-sdg
  Attempting uninstall: docling-sdg
    Found existing installation: docling-sdg 1.0.0
    Uninstalling docling-sdg-1.0.0:
      Successfully 

In [2]:
!pip install llama-index-node-parser-docling llama-index-readers-docling



In [2]:
import glob

from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader
from docling_sdg.qa.base import QaChunk
from docling.document_converter import DocumentConverter, PdfFormatOption

input_files = glob.glob(f"data/*.pdf")

doc_converter = DocumentConverter()
docs = doc_converter.convert_all(input_files)

In [3]:
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker

chunker = HierarchicalChunker()
dataset = {}

for doc in docs:
    print(f"Docling parsed document {doc.document.name}")

    chunks = list(chunker.chunk(dl_doc=doc.document))
    print(f"Computed {len(chunks)} chunks")
    print(chunks[:1])
    
    qa_chunks = []
    i = 0
    for chunk in chunks:
        if len(chunk.text) > 500:
            qa_chunk = QaChunk(
                meta={"doc_items": [{"self_ref":"#/a","label": "text"}], "chunk_id": str(i),"doc_id": doc.document.name}, 
                text= chunk.text
            )
            qa_chunks.append(qa_chunk)
            i += 1

    print(qa_chunks[0])
    dataset[doc.document.name] = qa_chunks

    print(f"Created dataset {doc.document.name} with {len(qa_chunks)} QA chunks")

Docling parsed document oa_pdf_00_c3_gkl753_PMC1781121
Computed 29 chunks
[DocChunk(text='Yong Li 1,2 , Mario G. Rosso 1,2 , Prisca Viehoever 1 and Bernd Weisshaar 1, *', meta=DocMeta(schema_name='docling_core.transforms.chunker.DocMeta', version='1.0.0', doc_items=[TextItem(self_ref='#/texts/4', parent=RefItem(cref='#/body'), children=[], content_layer=<ContentLayer.BODY: 'body'>, label=<DocItemLabel.TEXT: 'text'>, prov=[ProvenanceItem(page_no=1, bbox=BoundingBox(l=40.819, t=641.28, r=451.91, b=627.535, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>), charspan=(0, 78))], orig='Yong Li 1,2 , Mario G. Rosso 1,2 , Prisca Viehoever 1 and Bernd Weisshaar 1, *', text='Yong Li 1,2 , Mario G. Rosso 1,2 , Prisca Viehoever 1 and Bernd Weisshaar 1, *', formatting=None, hyperlink=None)], headings=['GABI-Kat SimpleSearch: an Arabidopsis thaliana T-DNA mutant database with detailed information for confirmed insertions'], captions=None, origin=DocumentOrigin(mimetype='application/pdf', binary_h

In [4]:
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions

generate_options = GenerateOptions(api_key="fake", project_id="project_id")
generate_options.api_key = "fake"
generate_options.model_id = "mixtral" # for local ollama
gen = Generator(generate_options=generate_options)

In [5]:
for doc, chunks in dataset.items():
    print(f"processing chunks that looks like {chunks[0].text}")
    results = gen.generate_from_chunks(chunks)
    print(f"{doc}: {results}")
    break

processing chunks that looks like Insertional mutagenesis approaches, especially by T-DNA, play important roles in gene function studies of the model plant Arabidopsis thaliana . GABI-Kat SimpleSearch (http://www.GABI-Kat.de) is a Flanking Sequence Tag (FST)-based database for T-DNA insertion mutants generated by the GABI-Kat project. Currently, the database contains . 108 000 mapped FSTs from /C24 64 000 lines which cover 64% of all annotated A.thaliana protein-coding genes. The web interface allows searching for relevant insertions by gene code, keyword, line identifier, GenBank accession number of the FST, and also by BLAST. A graphic display of the genome region around the gene or the FST assists users to select insertion lines of their interests. About 3500 insertions were confirmed in the offspring of the plant from which the original FST was generated, and the seeds of these lines are available from the Nottingham Arabidopsis Stock Centre. The database now also contains addition

In [6]:
import json
import yaml

qnas = {}
chunk_id_to_text = {}
with open("docling_sdg_generated_qac.jsonl", "rt") as f:
    for line in f.readlines():
        entry = json.loads(line)
        chunk_id = entry['chunk_id']
        if chunk_id not in chunk_id_to_text:
            chunk_id_to_text[chunk_id] = entry['context']
        if chunk_id not in qnas:
            qnas[chunk_id] = []
        qnas[chunk_id].append({'question': entry['question'], 'answer': entry['answer']})

def str_presenter(dumper, data):
  if len(data.splitlines()) > 1:  # check for multiline string
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
  return dumper.represent_scalar('tag:yaml.org,2002:str', data)

yaml.add_representer(str, str_presenter)

# to use with safe_dump:
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

data = {'seed_examples': []}
for chunk_id, context in chunk_id_to_text.items():
    data['seed_examples'].append({
        'context': context,
        'questions_and_answers': [
            {
                'question': example['question'],
                'answer': example['answer'],
            } for example in qnas[chunk_id]
        ]
    })

with open('qna.yml', 'w') as yaml_file:
    yaml.dump(data, yaml_file, default_flow_style=False, sort_keys=False)

In [55]:
from docling_sdg.qa.base import initialize_llm
from docling_sdg.qa.base import QaChunk

#chunk = QaChunk(meta={"doc_items": [{"self_ref":"#/CfL","label": "text"}], "chunk_id": "","doc_id": ""}, text="George Washington (February 22, 1732 [O.S. February 11, 1731][a] – December 14, 1799) was a Founding Father and the first president of the United States, serving from 1789 to 1797. As commander of the Continental Army, Washington led Patriot forces to victory in the American Revolutionary War against the British Empire. He is commonly known as the Father of His Country for his role in bringing about American independence.")
#gen.generate_from_chunks([chunk])