In [7]:
# load environment variables (including LLAMA_PARSE_API_KEY, read below) from the local .env file
from dotenv import load_dotenv
load_dotenv()

# bring in deps
from llama_parse import LlamaParse
import os
import json

In [8]:
# Function to read metadata from a JSON file
def load_metadata(json_path):
    """Read the UTF-8 JSON file at ``json_path`` and return the decoded object."""
    with open(json_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    
# Load the metadata file once and display its first entry as a sanity check.
metadata = load_metadata('documents_metadata.json')
metadata[0]

{'source_id': 1,
 'topic': 'Setup your Amazon business',
 'url': 'https://sell.amazon.com/sell#ready-to-sell',
 'title': 'How to sell on Amazon: a guide for beginners',
 'sub_title': 'Get an overview of how to create a selling account, list products, fulfill customer orders, and more. Learn how to sell—whether you’re new to online retail or just new to selling in the Amazon store.'}

In [9]:
# set up parser
# The API key is read from the LLAMA_PARSE_API_KEY environment variable;
# result_type picks the output format ("markdown" and "text" are available).
parser = LlamaParse(result_type="markdown", api_key=os.environ['LLAMA_PARSE_API_KEY'])

In [10]:
# List of PDF file paths
# One PDF path per metadata entry; files are numbered from 1 in metadata order.
pdf_files = [
    f"amazon_seller_central_documents_pdf/{doc_number}.pdf"
    for doc_number, _entry in enumerate(metadata, start=1)
]
print(len(pdf_files))
pdf_files[:3]

15


['amazon_seller_central_documents_pdf/1.pdf',
 'amazon_seller_central_documents_pdf/2.pdf',
 'amazon_seller_central_documents_pdf/3.pdf']

In [11]:
# Function to save markdown content to a file
def save_markdown(content, output_path):
    """Write markdown text to a UTF-8 file, creating its folder if needed.

    Args:
        content: Text to write.
        output_path: Destination file path; an existing file is overwritten.
    """
    # open(..., 'w') raises FileNotFoundError when the target folder is
    # missing, so create it first. A bare filename has no folder component.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(content)

md_directory = "amazon_seller_central_documents_md"

In [13]:
import nest_asyncio
nest_asyncio.apply()

# Parse PDF files and combine with metadata
# Make sure the markdown output folder exists before spending API calls on parsing.
os.makedirs(md_directory, exist_ok=True)

documents = []
for i, (pdf_file, meta) in enumerate(zip(pdf_files, metadata), start=1):

    # load_data returns a list of parsed documents for the file.
    doc = parser.load_data(pdf_file)

    # An empty list would otherwise surface as an opaque IndexError below;
    # fail with the name of the offending file instead.
    if not doc:
        raise RuntimeError(f"LlamaParse returned no content for {pdf_file}")

    # Keep the text of every returned document, not just the first one.
    # NOTE(review): depending on the llama_parse version/settings the result
    # may be split per page; with a single document this equals doc[0].text.
    document_text = "\n\n".join(part.text for part in doc)

    # Save the markdown text to a file
    markdown_path = f"{md_directory}/{i}.md"
    save_markdown(document_text, markdown_path)

    # Combine parsed text with metadata
    document = {
        "source_id": meta["source_id"],
        "topic": meta["topic"],
        "url": meta["url"],
        "title": meta["title"],
        "sub_title": meta["sub_title"],
        "content": document_text
    }

    documents.append(document)

# Save the combined data as a JSON file
# (ensure_ascii=False keeps non-ASCII characters readable in the output).
output_json_path = "rag_dataset.json"
with open(output_json_path, 'w', encoding='utf-8') as file:
    json.dump(documents, file, ensure_ascii=False, indent=4)

print(f"Dataset saved to {output_json_path}")

Started parsing the file under job_id 7e62c265-103a-4bfc-be77-080dc9c89b88
Started parsing the file under job_id fde7b8d6-f482-4e48-9860-7eec6d089590
Started parsing the file under job_id a6d2b299-21fb-4b57-8cf1-abcf0a18fd34
.Started parsing the file under job_id 22f3e0db-b840-4412-9924-b18b7d442da0
.Started parsing the file under job_id f28958a1-a7c8-4c75-8237-2b063ac67663
Started parsing the file under job_id b5a48914-fbd5-45e9-9c09-00f976297a50
Started parsing the file under job_id 1b5732fe-aa37-48a7-9eb8-461f6c2d793f
Started parsing the file under job_id 0a4e9564-1601-4bc7-a20a-436591e8b913
Started parsing the file under job_id c4bde2a6-cab8-4ac1-969e-539d06c03e15
Started parsing the file under job_id dfb290f6-45b9-4c51-afc4-4154705cfac0
Started parsing the file under job_id b99fb847-1225-4981-b4e0-9615df00e72e
Started parsing the file under job_id 25d04273-2908-426f-bfda-4c8a8bb0f6f3
Started parsing the file under job_id 80a38e52-e5c4-4d8a-8ace-f05c26edca6d
Started parsing the fil