# Llama Parse Lab

In [19]:
from llama_cloud_services import LlamaParse
import os
import nest_asyncio

In [5]:
# Read the Llama Cloud credential from the environment (None when the variable is unset).
api_key = os.getenv("LLAMA_CLOUD_API_KEY")

In [6]:
# Collect the client options in one mapping, then build the LlamaParse client from it.
parser_settings = {
    "api_key": api_key,   # can also be set in your env as LLAMA_CLOUD_API_KEY
    "num_workers": 1,     # if multiple files passed, split in `num_workers` API calls
    "verbose": True,
    "language": "en",     # optionally define a language, default=en
}
parser = LlamaParse(**parser_settings)

In [25]:
# Parse a single PDF and keep the result for the cells that follow.
# nest_asyncio is applied before the parse call, as in the original cell
# (presumably so the parser can run inside Jupyter's already-running event loop).
nest_asyncio.apply()

pdf_path = "../data/arXiv_1706_03762v7_Attention_Is_All_You_Need.pdf"
job_result = parser.parse(pdf_path)

# Confirm what kind of object the parser handed back.
print(type(job_result))


Started parsing the file under job_id 2a03f300-24b4-4158-bff8-c82774103cd7
<class 'llama_cloud_services.parse.types.JobResult'>


The result object is a fully typed `JobResult` object. You can interact with it to parse and transform various parts of the result:

## `Case 1`: Get the llama-index markdown documents

In [8]:
# Case 1: one llama-index markdown document per page (split_by_page=True).
markdown_documents = job_result.get_markdown_documents(split_by_page=True)

# Report how many documents came back and the container type, one per line.
print(len(markdown_documents), type(markdown_documents), sep="\n")

15
<class 'list'>


In [9]:
# Metadata of the first page document (list index 0).
first_page_document = markdown_documents[0]
first_page_document.metadata

{'page_number': 1,
 'file_name': '../data/arXiv_1706_03762v7_Attention_Is_All_You_Need.pdf'}

In [10]:
# Markdown text of the first page document (list index 0).
first_page_document = markdown_documents[0]
first_page_document.text

'arXiv:1706.03762v7  [cs.CL]  2 Aug 2023\n\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\n# Attention Is All You Need\n\nAshish Vaswani∗            Noam Shazeer∗            Niki Parmar∗         Jakob Uszkoreit∗\n\nGoogle Brain          Google Brain          Google Research        Google Research\n\navaswani@google.com     noam@google.com         nikip@google.com        usz@google.com\n\nLlion Jones∗              Aidan N. Gomez∗ †                    Łukasz Kaiser∗\n\nGoogle Research          University of Toronto                  Google Brain\n\nllion@google.com          aidan@cs.toronto.edu            lukaszkaiser@google.com\n\nIllia Polosukhin∗ ‡\n\nillia.polosukhin@gmail.com\n\n# Abstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing m

## `Case 2`: Get the llama-index text documents

The `split_by_page` parameter controls how the documents are returned: when it is `True`, each page becomes a separate document; when it is `False`, all the text from the PDF file is returned in a single text document.

In [11]:
# Case 2: plain-text documents; split_by_page=False merges every PDF page into one document.
text_documents = job_result.get_text_documents(split_by_page=False)

# Report how many documents came back and the container type, one per line.
print(len(text_documents), type(text_documents), sep="\n")

1
<class 'list'>


In [12]:
# Show the document structure: echo the list so the notebook renders its repr.
# The list was built with split_by_page=False, so the whole PDF's text sits in
# one Document (the previous cell printed a length of 1).
text_documents

[Document(id_='37eb529b-f018-4695-befe-9c0b8dfb80bb', embedding=None, metadata={'file_name': '../data/arXiv_1706_03762v7_Attention_Is_All_You_Need.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='                                arXiv:1706.03762v7  [cs.CL]  2 Aug 2023\n                            Provided proper attribution is provided, Google hereby grants permission to\n                          reproduce the tables and figures in this paper solely for use in journalistic or\n                                        scholarly works.\n                          Attention Is All You Need\n      Ashish Vaswani∗            Noam Shazeer∗            Niki Parmar∗         Jakob Uszkoreit∗\n             Google Brain          Google Brain          Google Research        Google Research\n         avaswani@google.com     noam@google.com  

In [13]:
# Metadata of the first (index 0) text document.
combined_document = text_documents[0]
combined_document.metadata

{'file_name': '../data/arXiv_1706_03762v7_Attention_Is_All_You_Need.pdf'}

## `Case 3`: Get the image documents

In [20]:
nest_asyncio.apply()

# Directory the extracted images are downloaded to. This argument is optional:
# by default the image bytes are returned in the ImageDocument objects instead.
image_output_dir = "../data/images"

# Case 3: request both screenshot images and object images.
image_documents = job_result.get_image_documents(
    image_download_dir=image_output_dir,
    include_object_images=True,
    include_screenshot_images=True,
)

In [None]:
# Show the ImageDocument structure: echo the list so the notebook renders
# the repr of each image document returned by get_image_documents().
image_documents

[ImageDocument(id_='e1858951-4775-4af2-bda4-f41e7d1ba016', embedding=None, metadata={'page_number': 3, 'file_name': '../data/arXiv_1706_03762v7_Attention_Is_All_You_Need.pdf', 'width': 1520, 'height': 2239, 'x': 196.559, 'y': 72.00200000000001}, excluded_embed_metadata_keys=['width', 'height', 'x', 'y'], excluded_llm_metadata_keys=['width', 'height', 'x', 'y'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=None, image_resource=MediaResource(embeddings=None, data=None, text=None, path=WindowsPath('../data/images/img_p2_1.png'), url=None, mimetype='image/png'), audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 ImageDocument(id_='7595a9c1-9d98-4e72-a1b5-e030595d3eed', embedding=None, metadata={'page_number': 4, 'file_name': '../data/arXiv_1706_03762v7_Attention_Is_All_You_Need.pdf', 'width': 445, 'height': 884, 'x': 174.96, 'y': 94.02500000000003}, excluded_embed_metadata_keys=['width', 'height', 'x', 'y

## `Case 4`: Get everything from each page

In [34]:
# Case 4: walk the raw job result page by page.
# What each page contains will vary based on the parser configuration.
for parsed_page in job_result.pages:
    print(parsed_page)
    

page=1 text='                                arXiv:1706.03762v7  [cs.CL]  2 Aug 2023\n                            Provided proper attribution is provided, Google hereby grants permission to\n                          reproduce the tables and figures in this paper solely for use in journalistic or\n                                        scholarly works.\n                          Attention Is All You Need\n      Ashish Vaswani∗            Noam Shazeer∗            Niki Parmar∗         Jakob Uszkoreit∗\n             Google Brain          Google Brain          Google Research        Google Research\n         avaswani@google.com     noam@google.com         nikip@google.com        usz@google.com\n         Llion Jones∗              Aidan N. Gomez∗ †                    Łukasz Kaiser∗\n         Google Research          University of Toronto                  Google Brain\n         llion@google.com          aidan@cs.toronto.edu            lukaszkaiser@google.com\n                                