In [1]:
!pip install -qq -e ..

# Find PDF files from data/ subdirectory and convert using Docling

In [2]:
import glob

from docling.document_converter import DocumentConverter

# Sort the matches so the documents are always converted in the same order;
# glob.glob returns paths in arbitrary, filesystem-dependent order.
input_files = sorted(glob.glob("data/*.pdf"))

doc_converter = DocumentConverter()
# NOTE(review): if convert_all returns a lazy iterator, `docs` can only be
# consumed once (re-running the chunking cell would see nothing) — confirm.
docs = doc_converter.convert_all(input_files)

# Chunk each document and apply filters to chunks

In [3]:
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_sdg.qa.utils import get_qa_chunks

chunker = HierarchicalChunker()


def _longer_than_500_chars(chunk):
    """Return True when the chunk's text is more than 500 characters long."""
    return len(str(chunk.text)) > 500


# Predicates handed to get_qa_chunks together with each document's chunks.
filters = [_longer_than_500_chars]

# Maps each document name to the list of QA chunks produced for it.
dataset = {}
for doc in docs:
    print(f"Chunking and filtering document {doc.document.name}")

    chunks = [*chunker.chunk(dl_doc=doc.document)]
    qa_chunks = [*get_qa_chunks(doc.document.name, chunks, filters)]
    dataset[doc.document.name] = qa_chunks

    print(f"Created dataset {doc.document.name} with {len(qa_chunks)} QA chunks")

Chunking and filtering document oa_pdf_00_c3_gkl753_PMC1781121
Created dataset oa_pdf_00_c3_gkl753_PMC1781121 with 16 QA chunks
Chunking and filtering document oa_pdf_00_c5_main_PMC467086
Created dataset oa_pdf_00_c5_main_PMC467086 with 9 QA chunks


# Initialize QA generator, supplying details for which model to use

In [4]:
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions

# Build the options for the QA generator. The api_key / project_id values are
# presumably placeholders, since the model is served by a local ollama (see
# the model_id comment below) — TODO confirm.
generate_options = GenerateOptions(api_key="fake", project_id="project_id")
# NOTE(review): this re-assigns the same literal that was already passed to
# the constructor; it looks redundant, but confirm how GenerateOptions handles
# attribute assignment before removing it.
generate_options.api_key = "fake"
generate_options.model_id = "mixtral" # for local ollama
# Generator instance that later cells use to produce QAs from chunks.
gen = Generator(generate_options=generate_options)

# For each set of per-document chunks, generate QAs using the model (this demo stops after the first document)

In [5]:
# Generate QAs from the chunks of the first document that has any.
for doc, chunks in dataset.items():
    if not chunks:
        # Every chunk of this document was removed by the filters, so there is
        # no sample to preview (chunks[0] would raise IndexError) and nothing
        # to generate from.
        print(f"{doc}: no QA chunks, skipping")
        continue
    print(f"processing chunks that looks like:\n{chunks[0].text}")
    results = gen.generate_from_chunks(chunks)
    print(f"{doc}: {results.status}")
    # NOTE(review): stops after one document, presumably to keep the demo
    # short; remove this break to process every document in `dataset`.
    break

processing chunks that looks like:
Insertional mutagenesis approaches, especially by T-DNA, play important roles in gene function studies of the model plant Arabidopsis thaliana . GABI-Kat SimpleSearch (http://www.GABI-Kat.de) is a Flanking Sequence Tag (FST)-based database for T-DNA insertion mutants generated by the GABI-Kat project. Currently, the database contains . 108 000 mapped FSTs from /C24 64 000 lines which cover 64% of all annotated A.thaliana protein-coding genes. The web interface allows searching for relevant insertions by gene code, keyword, line identifier, GenBank accession number of the FST, and also by BLAST. A graphic display of the genome region around the gene or the FST assists users to select insertion lines of their interests. About 3500 insertions were confirmed in the offspring of the plant from which the original FST was generated, and the seeds of these lines are available from the Nottingham Arabidopsis Stock Centre. The database now also contains additio

 62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 10/16 [00:33<00:27,  4.63s/it]Failed parsing JSON from generated question: 
 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 15/16 [01:10<00:07,  7.15s/it]Failed parsing JSON from generated question: 
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:35<00:00,  5.96s/it]

oa_pdf_00_c3_gkl753_PMC1781121: Status.SUCCESS





# Process docling-sdg JSONL output into the InstructLab qna.yaml format and save it to disk as qna.yml

In [6]:
import json
import yaml
from textwrap import wrap

# Group the generated question/answer pairs by the chunk they came from.
#   qnas:             chunk_id -> list of {'question': ..., 'answer': ...}
#   chunk_id_to_text: chunk_id -> context text of the first entry seen for it
qnas = {}
chunk_id_to_text = {}
with open("docling_sdg_generated_qac.jsonl", "rt") as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            continue
        entry = json.loads(line)
        chunk_id = entry['chunk_id']
        # Keep the context of the first entry seen for each chunk.
        chunk_id_to_text.setdefault(chunk_id, entry['context'])
        qnas.setdefault(chunk_id, []).append(
            {'question': entry['question'], 'answer': entry['answer']}
        )


def str_presenter(dumper, data):
    """Represent a string as a YAML scalar, asking for block style on long text.

    * A string that spans more than one line is passed through unchanged with
      ``style='|'``.
    * A single-line string longer than 80 characters is re-wrapped to 80-column
      lines with ``textwrap.wrap`` and then passed with ``style='|'``.
    * Any other string is passed without an explicit style.

    Returns whatever ``dumper.represent_scalar`` returns.
    """
    str_tag = 'tag:yaml.org,2002:str'
    is_multiline = len(data.splitlines()) > 1

    # Short single-line strings need no special treatment.
    if not is_multiline and len(data) <= 80:
        return dumper.represent_scalar(str_tag, data)

    # Long single-line strings are broken into lines before being emitted.
    if not is_multiline:
        data = "\n".join(wrap(data, 80))
    return dumper.represent_scalar(str_tag, data, style='|')

# Register str_presenter as the representer used for str values.
# NOTE(review): this registration is presumably global to the yaml module for
# the rest of the kernel session — confirm if other code dumps YAML.
yaml.add_representer(str, str_presenter)

# to use with safe_dump:
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
    """A yaml.Dumper whose increase_indent never forwards ``indentless=True``."""

    def increase_indent(self, flow=False, indentless=False):
        # The caller's `indentless` value is deliberately ignored and False is
        # forwarded instead — presumably so list items are indented under
        # their parent key in the output.
        return super().increase_indent(flow=flow, indentless=False)

# Assemble one seed example per chunk: its context plus all of its QA pairs.
data = {'seed_examples': []}
for chunk_id, context in chunk_id_to_text.items():
    qa_pairs = [
        {'question': example['question'], 'answer': example['answer']}
        for example in qnas[chunk_id]
    ]
    seed_example = {'context': context, 'questions_and_answers': qa_pairs}
    data['seed_examples'].append(seed_example)

# Write the structure to disk, keeping the keys in insertion order.
with open('qna.yml', 'w') as yaml_file:
    yaml.dump(
        data,
        yaml_file,
        Dumper=IndentedDumper,
        default_flow_style=False,
        sort_keys=False,
        width=80,
    )

print("Done")

Done


# Appendix: How to generate QAs for a specific text string

In [7]:
from docling_sdg.qa.base import QaChunk

# example to see what gets generated for a specific text string

#text = "George Washington (February 22, 1732 [O.S. February 11, 1731][a] – December 14, 1799) was a Founding Father and the first president of the United States, serving from 1789 to 1797. As commander of the Continental Army, Washington led Patriot forces to victory in the American Revolutionary War against the British Empire. He is commonly known as the Father of His Country for his role in bringing about American independence."
#chunk = QaChunk(meta={"doc_items": [{"self_ref":"#/CfL","label": "text"}], "chunk_id": "","doc_id": ""}, text=text)
#gen.generate_from_chunks([chunk])