In [28]:
import json
import os

def json_to_jsonl(input_path, output_path):
    """Convert a JSON file into a JSON Lines file, one record per line.

    The input file is parsed as a single JSON document. When the top level is
    an object that has a ``'data'`` key, the value stored under that key is
    used as the record collection. A list is written element by element; any
    other value (for example a single object) is written as one record.

    Problems with the input (unreadable file, empty file, invalid JSON) are
    reported with ``print`` and the function returns before ``output_path``
    is opened, so no output file is created or overwritten in those cases.

    Args:
        input_path: Path of the UTF-8 encoded JSON file to read.
        output_path: Path of the JSON Lines file to write (overwritten).

    Returns:
        None.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except OSError as e:
        # Report instead of raising, in line with the other input errors below.
        print(f"Error reading {input_path}: {e}")
        return

    if not content:
        print(f"Error: {input_path} is empty.")
        return

    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from {input_path}: {e}")
        return

    # Unwrap the {"data": [...]} envelope when present.
    if isinstance(data, dict) and 'data' in data:
        data = data['data']

    # Iterating a dict would emit only its keys and iterating a string would
    # emit single characters, so anything that is not a list is one record.
    if not isinstance(data, list):
        data = [data]

    with open(output_path, 'w', encoding='utf-8') as f_out:
        for item in data:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f_out.write(json.dumps(item, ensure_ascii=False))
            f_out.write('\n')

In [29]:
import os

def extract_documents_to_markdown(input_path, output_folder):
    """Write the ``Text`` of every document record to ``<DocumentID>.md``.

    Records are loaded from ``input_path``:

    * a path ending in ``.jsonl`` is read line by line; blank lines are
      ignored and lines that are not valid JSON are reported and skipped;
    * any other path is parsed as one JSON document. If the top level is an
      object with a ``'data'`` key, the value under that key is used;
      a value that is not a list is treated as a single record.

    A record is written only when it is an object containing both
    ``'DocumentID'`` and ``'Text'``. Records whose ``Text`` is not a string,
    or whose ``DocumentID`` would not form a plain file name (it contains a
    path component), are reported with ``print`` and skipped, so that every
    file is created directly inside ``output_folder``.

    Args:
        input_path: Path of the ``.jsonl`` or JSON file holding the records.
        output_folder: Directory that receives the markdown files; created
            if it does not exist.

    Returns:
        None.
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Load documents
    documents = []
    if input_path.endswith('.jsonl'):
        # Read JSONL file line by line
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # Skip empty lines
                    try:
                        documents.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        print(f"Error parsing line: {e}")
    else:
        # Read JSON file
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, dict) and 'data' in data:
            documents = data['data']
        else:
            documents = data
        # A lone object (or scalar) is one record; iterating it directly
        # would walk dict keys or fail outright.
        if not isinstance(documents, list):
            documents = [documents]

    # Process each document
    for doc in documents:
        # `in` on a str is a substring test and on a number raises, so only
        # real objects are inspected for the two required keys.
        if not isinstance(doc, dict):
            continue
        if 'DocumentID' not in doc or 'Text' not in doc:
            continue

        document_id = doc['DocumentID']
        text = doc['Text']

        # Check before opening the file so a bad record leaves no empty file.
        if not isinstance(text, str):
            print(f"Skipping document {document_id}: Text is not a string.")
            continue

        # The ID comes from the data file; refuse anything that carries a
        # directory component so the write stays inside output_folder.
        filename = f"{document_id}.md"
        if os.path.basename(filename) != filename:
            print(f"Skipping document {document_id}: DocumentID is not a valid file name.")
            continue

        # Create markdown file with document ID as filename
        markdown_path = os.path.join(output_folder, filename)
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"Created markdown file: {markdown_path}")

In [30]:
input_path = 'documentation.json'
output_path = 'documentation.jsonl'
output_folder = 'documents'

# Step 1: flatten the JSON export into one record per line.
json_to_jsonl(input_path, output_path)

# Step 2: split the records into markdown files, but only when a JSONL file
# is present. os.path.exists only confirms the file is on disk; it does not
# tell whether this run or an earlier one produced it.
if not os.path.exists(output_path):
    print(f"Skipping markdown extraction: {output_path} does not exist.")
else:
    extract_documents_to_markdown(output_path, output_folder)
    print("Document extraction completed!")

Created markdown file: documents/abc919b6-1aaa-4626-bd5d-7ee63e517434.md
Created markdown file: documents/6fd97ae4-9ddc-42cd-b984-0e530a295200.md
Created markdown file: documents/da809602-54e7-462b-9be9-a4e4b18b1f2d.md
Created markdown file: documents/efd000bf-3aa1-4336-bba7-7f5db54040b1.md
Created markdown file: documents/5de910bf-ebc6-4539-9d36-65c7485109b4.md
Created markdown file: documents/5cf67363-9f95-4591-800a-b5786d641094.md
Created markdown file: documents/08b352a6-8a5f-49d5-89f5-d9237121700a.md
Created markdown file: documents/a1963674-1b72-48a6-bbf3-ab9943512077.md
Created markdown file: documents/68eb8f73-e19c-40e1-b6f3-08b8afa75d4a.md
Created markdown file: documents/606462a5-fd2c-4f4c-a0dc-33856b5cd2c4.md
Created markdown file: documents/a0f352f6-5f63-4272-ae68-f156dd2be0df.md
Created markdown file: documents/9f51fbd8-f4c1-4a69-9379-7bda9fec2355.md
Created markdown file: documents/b85ef49c-355e-4e2a-a1e0-eb6f47654327.md
Created markdown file: documents/6d7f69b3-ad17-49a1