In [1]:
import json
import logging
import re
import warnings
from pathlib import Path
from pprint import pprint
from typing import Annotated, Any, Generator, Literal, Type, TypeVar

# Standard imports
import numpy as np
import numpy.typing as npt
import pandas as pd
import polars as pl

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print floats with four digits of precision
np.set_printoptions(precision=4)

# Pandas: raise the display limits (rows, columns, characters per cell)
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: raise the display limits (string length, columns, rows)
pl.Config.set_fmt_str_lengths(n=1_000)
pl.Config.set_tbl_cols(1_000)
pl.Config.set_tbl_rows(200)

# Suppress every warning for the remainder of the session
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
import os

from rich.console import Console
from rich.theme import Theme

# Named colour styles used by the shared Rich console below.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # Bright white
        info="#00FF00",  # Bright green
        warning="#FFD700",  # Bright gold
        error="#FF1493",  # Deep pink
        success="#00FFFF",  # Cyan
        highlight="#FF4500",  # Orange-red
    )
)
# Console instance the rest of the notebook prints through.
console = Console(theme=custom_theme)


def create_path(path: str | Path) -> None:
    """
    Create parent directories for the given path if they don't exist.

    Parameters
    ----------
    path : str | Path
        The file path for which to create parent directories.
    """
    # Convert to Path object if it's a string
    path_obj: Path = Path(path) if isinstance(path, str) else path

    # Get the parent directory and create it if it doesn't exist
    path_obj.parent.mkdir(parents=True, exist_ok=True)


def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    The ancestor is found by walking ``go_up`` levels up from the current
    working directory. Its absolute path is placed first in ``sys.path`` (so
    that modules located there become importable) and is printed.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current
        directory. A value of 0 or less selects the current directory itself.

    Returns:
    --------
    None
    """
    import os
    import sys

    # Anchor on the working directory explicitly. The earlier
    # `os.path.dirname(__name__)` only worked because a module name such as
    # "__main__" has no directory component, which left a cwd-relative path.
    levels: list[str] = [os.pardir] * go_up
    abs_path_prev_directory: str = os.path.abspath(
        os.path.join(os.getcwd(), *levels)
    )

    # Remove earlier copies first so re-running the cell does not pile up
    # duplicate entries; the path still ends up with the highest priority.
    sys.path[:] = [entry for entry in sys.path if entry != abs_path_prev_directory]
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
# Put the directory one level above the working directory on sys.path
# (the helper also prints the resolved path).
go_up_from_current_directory(go_up=1)

# Imported here rather than in the top import cell: presumably `settings`
# lives in that parent directory and only resolves after the sys.path change.
from settings import refresh_settings  # noqa: E402

# Settings object used later in the notebook (e.g. for the API key).
settings = refresh_settings()

/Users/neidu/Desktop/Projects/Personal/RAG-Tutorials


## Extract

### [Simple Loader](https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_documents/#defining-documents)

In [4]:
from llama_index.core import SimpleDirectoryReader

# Folder holding the source files, relative to the notebook's working directory
fp: str = "../data"
# Load the files found in that folder as a list of Document objects
docs = SimpleDirectoryReader(input_dir=fp).load_data()

# console.print(docs)
# Last expression of the cell: displays the loaded documents
docs

[Document(id_='b537fd39-08f3-414b-b670-c50f81b8de90', embedding=None, metadata={'page_label': '1', 'file_name': 'chelsea_transfer_news.pdf', 'file_path': '/Users/neidu/Desktop/Projects/Personal/RAG-Tutorials/notebooks/../data/chelsea_transfer_news.pdf', 'file_type': 'application/pdf', 'file_size': 604575, 'creation_date': '2025-07-25', 'last_modified_date': '2025-07-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Friday 25 July 2025 14:39, UK\nChelsea transfer news, rumours and\ngossip: Live updates and latest on\ndeals, signings, loans and contracts\nLatest Chelsea news\xa0\nSort by: Latest Oldest\nIn full: Chelsea 2025/26 Premier L

In [5]:
len(docs)

22

## Transform

### [Modify The Metadata](https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_documents/#summary)

- Adjust the metadata before it's sent to the embedding model

In [6]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Toy document used to show how the metadata is rendered for each consumer
# (embedding model vs. LLM).
document = Document(
    text="This is a sample document.",
    metadata={
        "file_name": "example.txt",
        "author": "Neidu E.",
        "number": 42,
        "date": "2025-07-25",
        "category": "example",
    },
    # Metadata keys left out of the content built for the embedding model
    excluded_embed_metadata_keys=["file_name"],
    # Metadata keys left out of the content built for the LLM
    excluded_llm_metadata_keys=["file_name", "number"],
    # String placed between metadata entries, then the format of one entry
    metadata_separator="\n",
    metadata_template="{key}:{value}",
    # Overall layout combining the metadata block with the document text
    text_template="Metadata: \n{metadata_str}\n-----\n\nContent: {content}",
)

# Embedding model sees this:
console.print(document.get_content(metadata_mode=MetadataMode.EMBED))

# LLM model sees this:
console.print(document.get_content(metadata_mode=MetadataMode.LLM))

In [7]:
console.print(docs[0].get_content(metadata_mode=MetadataMode.EMBED))

In [8]:
console.print(docs[0])

### Update The Docs' Metadata

In [9]:
for doc in docs:
    # Lay out the rendered metadata and the document text in labelled sections
    doc.text_template = "Metadata: \n{metadata_str}\n-----\n\nContent: {content}"

    # Keep page_label out of the embedding content; skip documents that
    # already exclude it so re-running the cell adds no duplicates
    if "page_label" in doc.excluded_embed_metadata_keys:
        continue
    doc.excluded_embed_metadata_keys.append("page_label")

In [10]:
# Verify the changes
console.print(docs[0].get_content(metadata_mode=MetadataMode.EMBED))

### Extract Information Using An LLM From The Document

- Extract relevant information from the document using an LLM before embedding.
- We'll be extracting the following:
  - A title
  - Potential questions that the document can be used to answer.
- Apply transformations to the documents.

In [11]:
from llama_index.core.llms import ChatMessage
from llama_index.llms.openrouter import OpenRouter

# LLM client used for the chat check and the metadata extractors below.
# The API key is read from the project settings instead of being hard-coded.
llm = OpenRouter(
    api_key=settings.OPENROUTER_API_KEY.get_secret_value(),
    max_tokens=2_048,  # presumably the cap on tokens generated per response — confirm
    context_window=4_096,  # NOTE(review): confirm this matches the chosen model's limit
    model="meta-llama/llama-3.2-3b-instruct",
    temperature=0.1,  # low value; presumably chosen for more repeatable output
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [12]:
# Quick check that the LLM client responds: send one user message and print the reply
message = ChatMessage(role="user", content="Tell me a joke about Tyrion Lannister")
resp = llm.chat([message])
console.print(resp)

### [Data Ingestion](https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/#ingestion-pipeline)

In [15]:
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Split each document into chunks (size 1024, overlap 128) on spaces
text_splitter = SentenceSplitter(separator=" ", chunk_size=1024, chunk_overlap=128)
# LLM-backed extractors that enrich every node's metadata:
# a title (`nodes=5`) and three questions per node
title_extractor = TitleExtractor(llm=llm, nodes=5)
qa_extractor = QuestionsAnsweredExtractor(llm=llm, questions=3)

ingestion_pipeline: IngestionPipeline = IngestionPipeline(
    # Order of execution
    transformations=[
        text_splitter,
        title_extractor,
        qa_extractor,
    ]
)

# `IngestionPipeline.run` names this flag `in_place`. The previous spelling
# `inplace` matched no parameter and was absorbed by `**kwargs` rather than
# setting the flag.
nodes = ingestion_pipeline.run(
    documents=docs,
    in_place=True,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 22/22 [00:00<00:00, 1056.17it/s]
100%|██████████| 22/22 [00:36<00:00,  1.64s/it]
100%|██████████| 22/22 [00:30<00:00,  1.39s/it]


In [16]:
len(nodes)

22

In [17]:
console.print(nodes[0].get_content(metadata_mode=MetadataMode.EMBED))