In [None]:
import os
import nest_asyncio
from llama_parse import LlamaParse
from llama_index.core.schema import ImageDocument, TextNode
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from typing import List

# Allow nested asyncio event loops. Presumably required because the notebook kernel
# already runs a loop and the parser's blocking calls start their own — TODO confirm.
nest_asyncio.apply()

# Input PDF to parse, and the directory passed as the image download path further down.
FILE_NAME = "./data/example1/memes.pdf"
IMAGES_DOWNLOAD_PATH = "./data/example1/images/"

# Credentials come from the environment; a missing variable raises KeyError right here.
LLAMA_CLOUD_API_KEY = os.environ["LLAMA_CLOUD_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [None]:
# Parse the PDF with LlamaParse, requesting markdown output, and keep the raw JSON result.
parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown",
)

json_objs = parser.get_json_result(FILE_NAME)

# An empty result would otherwise surface as an opaque IndexError on json_objs[0].
# NOTE(review): get_json_result appears to return an empty list when parsing fails
# and errors are ignored — confirm against the installed llama_parse version.
if not json_objs:
    raise RuntimeError(
        f"LlamaParse returned no result for {FILE_NAME!r}; "
        "check the file path, the API key, and the parser log output above."
    )

# Per-page dicts of the first (and here only) parsed document.
json_list = json_objs[0]["pages"]

In [None]:
def get_text_nodes(json_list: List[dict]) -> List[TextNode]:
    """Build one TextNode per page dict.

    Each entry of ``json_list`` must provide a "text" key, used as the node text,
    and a "page" key, stored under the "page" metadata key of the node.
    """
    nodes = []
    for entry in json_list:
        body = entry["text"]
        nodes.append(TextNode(text=body, metadata={"page": entry["page"]}))
    return nodes

# One TextNode per entry of the parsed pages list.
text_nodes = get_text_nodes(json_list)

def get_image_nodes(json_objs: List[dict], download_path: str) -> List[ImageDocument]:
    """Fetch the images of a parse result and wrap each one as an ImageDocument.

    Relies on the notebook-level ``parser``: its ``get_images`` is called with
    ``json_objs`` and ``download_path``, and the "path" entry of every returned
    dict becomes the ``image_path`` of one ImageDocument.
    """
    documents = []
    for image_info in parser.get_images(json_objs, download_path=download_path):
        documents.append(ImageDocument(image_path=image_info["path"]))
    return documents

# Fetch the images of the parse result into IMAGES_DOWNLOAD_PATH, one ImageDocument per image.
image_documents = get_image_nodes(json_objs, IMAGES_DOWNLOAD_PATH)

In [8]:
# Ask GPT-4o how the extracted image(s) relate to the text of the first parsed page.
openai_mm_llm = OpenAIMultiModal(
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
    max_new_tokens=300,
)

# The prompt embeds the first page's text verbatim.
relation_prompt = f"How does the image relate to the text: '{text_nodes[0].text}'?"
response = openai_mm_llm.complete(prompt=relation_prompt, image_documents=image_documents)

print(response)


Started parsing the file under job_id d8e12945-d03d-489a-93bc-71c66b6b366e
> Image for page 1: [{'name': 'img_p0_1.png', 'height': 371, 'width': 556, 'x': 168.139, 'y': 124.80107680000003, 'original_width': 1024, 'original_height': 683}]
The image shows a man smiling while holding a mug and using a laptop. This image is often used in memes to convey a sense of irony or hidden discomfort behind a smile. It relates to the text "Memes can tell more than 1000 words" by illustrating how a simple image can convey complex emotions or narratives, often more effectively than words alone.
