In [1]:
%pip install llama-index-llms-openai
%pip install llama-index-extractors-entity

Note: you may need to restart the kernel to use updated packages.
Collecting llama-index-extractors-entity
  Downloading llama_index_extractors_entity-0.2.0-py3-none-any.whl.metadata (732 bytes)
Collecting huggingface-hub<0.24.0 (from llama-index-extractors-entity)
  Downloading huggingface_hub-0.23.5-py3-none-any.whl.metadata (12 kB)
Collecting span-marker>=1.5.0 (from llama-index-extractors-entity)
  Downloading span_marker-1.5.0-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from huggingface-hub<0.24.0->llama-index-extractors-entity)
  Downloading filelock-3.16.0-py3-none-any.whl.metadata (3.0 kB)
Collecting torch (from span-marker>=1.5.0->llama-index-extractors-entity)
  Downloading torch-2.4.1-cp310-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting accelerate (from span-marker>=1.5.0->llama-index-extractors-entity)
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting transformers>=4.19.0 (from span-marker>=1.5.0->llama-index-extractors-entity)


In [2]:
!pip install llama-index

Collecting llama-index
  Downloading llama_index-0.11.7-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-cli<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_cli-0.3.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-indices-managed-llama-cloud>=0.3.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post3-py3-none-any.whl.metadata (8.5 kB)
Collecting llama-index-multi-modal-llms-openai<0.3.0,>=0.2.0 (from llama-index)
  Downloading llama_index_multi_modal_llms_openai-0.2.0-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-program-openai<0.3.0,>=0.2.0 (from llama-index)
  Downloading llama_index_program_openai-0.2.0-py3-none-any.whl.metadata (766 bytes)
Collecting llama-index-question-gen-openai<0.3.0,>=0.2.0 (from llama-index)
  Downloading llama_index_question_gen_openai-0.2.0-py3

In [37]:
import nest_asyncio

nest_asyncio.apply()

import os
import openai

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

In [5]:
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode

In [6]:
llm = OpenAI(temperature=0.1, model="gpt-4o-2024-08-06", max_tokens=1024)

In [7]:
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter

text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)


class CustomExtractor(BaseExtractor):
    def extract(self, nodes):
        metadata_list = [
            {
                "custom": (
                    node.metadata["document_title"]
                    + "\n"
                    + node.metadata["excerpt_keywords"]
                )
            }
            for node in nodes
        ]
        return metadata_list


extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    # EntityExtractor(prediction_threshold=0.5),
    # SummaryExtractor(summaries=["prev", "self"], llm=llm),
    # KeywordExtractor(keywords=10, llm=llm),
    # CustomExtractor()
]

transformations = [text_splitter] + extractors

  from .autonotebook import tqdm as notebook_tqdm



In [19]:
from llama_index.core import SimpleDirectoryReader
# bring in deps
from llama_parse import LlamaParse
import os
import glob
import nest_asyncio
nest_asyncio.apply()

In [38]:
# set up parser
parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="en",
    fast_mode=True,
)

In [43]:
file_extractor = {".pdf": parser}

input_files = glob.glob('data/*.pdf')
documents = SimpleDirectoryReader(input_files=input_files, file_extractor=file_extractor).load_data()
# Note the uninformative document file name, which may be a common scenario in a production setting
front_pages = documents[0:3]
content = documents[63:69]
documents = front_pages + content
print(documents)

Started parsing the file under job_id 52c6e626-36c2-4a89-8ae7-1acf2409aebb
.Error while parsing the file '<bytes/buffer>': 'markdown'
[]


In [None]:
from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(transformations=transformations)

nodes = pipeline.run(documents=docs)
print(nodes)