In [9]:
import os
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.core.schema import ImageDocument, TextNode
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from typing import List

# NOTE(review): presumably required so the synchronous LlamaParse helpers used
# in later cells can run inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# PDF handed to the parser, and the directory passed as the image download path.
FILE_NAME = "./data/example1/memes.pdf"
IMAGES_DOWNLOAD_PATH = "./data/example1/images/"

# Credentials come from the environment rather than being hardcoded; a missing
# variable raises KeyError here, before any API call is attempted.
LLAMA_CLOUD_API_KEY = os.environ["LLAMA_CLOUD_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [10]:
# Configure the LlamaParse client with the key read from the environment and
# request markdown as the result type.
parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown",
)

# Submit the single input file; the result is indexed by position below.
json_objs = parser.get_json_result(FILE_NAME)

# Fail with an actionable message instead of a bare IndexError when the parse
# produced nothing (e.g. the job failed and the error was only printed).
if not json_objs:
    raise RuntimeError(f"LlamaParse returned no result for {FILE_NAME!r}")

# Per-page records of the first (and only) submitted file.
json_list = json_objs[0]["pages"]

Started parsing the file under job_id 1131bb53-f134-42a4-9c38-84391b24a100


In [11]:
def get_text_nodes(json_list: List[dict]) -> List[TextNode]:
    """Wrap each parsed page in a ``TextNode``.

    Args:
        json_list: Page dictionaries; each must provide a ``"text"`` and a
            ``"page"`` key.

    Returns:
        One ``TextNode`` per entry, in input order, carrying the page text and
        the page identifier under the ``"page"`` metadata key.

    Raises:
        KeyError: If an entry lacks ``"text"`` or ``"page"``.
    """
    nodes = []
    for page_json in json_list:
        nodes.append(
            TextNode(text=page_json["text"], metadata={"page": page_json["page"]})
        )
    return nodes


text_nodes = get_text_nodes(json_list)

def get_image_nodes(json_objs: List[dict], download_path: str) -> List[ImageDocument]:
    """Fetch the images of a parse result and wrap each one as an ``ImageDocument``.

    Relies on the module-level ``parser`` created in an earlier cell.

    Args:
        json_objs: JSON result forwarded unchanged to ``parser.get_images``.
        download_path: Forwarded as the ``download_path`` keyword of
            ``parser.get_images``.

    Returns:
        One ``ImageDocument`` per image dict returned by the parser, built
        from that dict's ``"path"`` entry.
    """
    downloaded = parser.get_images(json_objs, download_path=download_path)
    documents = []
    for image_info in downloaded:
        documents.append(ImageDocument(image_path=image_info["path"]))
    return documents


image_documents = get_image_nodes(json_objs, IMAGES_DOWNLOAD_PATH)

> Image for page 1: [{'name': 'img_p0_1.png', 'height': 371, 'width': 556, 'x': 168.139, 'y': 124.80138916800004, 'original_width': 1599, 'original_height': 1066}]


In [12]:
# Multimodal client; max_new_tokens=300 bounds the length of the reply.
openai_mm_llm = OpenAIMultiModal(
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
    max_new_tokens=300,
)

# Ask how the extracted images relate to the text of the first parsed page.
first_page_text = text_nodes[0].text
relation_prompt = f"How does the image relate to the text: '{first_page_text}'?"
response = openai_mm_llm.complete(
    prompt=relation_prompt,
    image_documents=image_documents,
)

print(response)


The image shows a cat with a relaxed or indifferent expression, accompanied by the text "WHEN YOU REALISE IT'S MONDAY." This relates to the text you provided by conveying a humorous or relatable sentiment about the feeling of facing a Monday, which is often associated with the end of the weekend and the start of the workweek. The phrase "Memes can tell more than 1000 words" suggests that the image effectively communicates this common feeling without needing further explanation.
