# Document Analysis Using Vision LLMs

In [1]:
# Built-in library
import json
import logging
import re
import warnings
from pathlib import Path
from pprint import pprint
from typing import Any, Literal, Optional, Union

# Standard imports
import numpy as np
import numpy.typing as npt
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as pltife

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os
    import sys

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)

In [3]:
go_up_from_current_directory(go_up=2)
from settings import refresh_settings  # noqa: E402

settings = refresh_settings()

/Users/neidu/Desktop/Projects/Personal/My_Projects/AI-Tutorials


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x1194a2ea0>

In [5]:
text: str = (
    "Thanks for letting me know that uv add pip fixed the issue! That makes sense. It "
    "indicates you were likely using uv (a fast Python package installer and resolver) "
    "to manage your environment. While uv often replaces pip for package installation, "
    "it seems your setup or workflow required pip to be explicitly added to the "
    "environment managed by uv."
)


docs = nlp(text)

# for idx, row in enumerate(docs.sents):
# print(f"Sentence {idx}:\n{row}")
# print()

sentences: list[str] = [sent.text for sent in docs.sents]
console.log(sentences)

In [6]:
from typing import Generator, Iterable


def create_batches(
    data: list[Iterable[Any]], batch_size: int
) -> Generator[list[Any], None, None]:
    """
    Create batches from a list of data.

    Parameters
    ----------
    data : list[Iterable[Any]]
        The input list of iterables to be batched.
    batch_size : int
        The size of each batch.

    Returns
    -------
    Generator[list[Any], None, None]
        A generator yielding batches of the input data.

    Examples
    --------
    >>> data = [1, 2, 3, 4, 5]
    >>> list(create_batches(data, 2))
    [[1, 2], [3, 4], [5]]
    """
    for i in range(0, len(data), batch_size):
        yield data[i : i + batch_size]


def calculate_chunk_size(total_sentences: int, max_allowed_chunks: int) -> int:
    """
    Calculate the optimal size for each chunk based on the total sentences and maximum allowed chunks.

    Parameters
    ----------
    total_sentences : int
        Total number of sentences to be processed.
    max_allowed_chunks : int
        Maximum number of chunks allowed.

    Returns
    -------
    int
        Optimal chunk size to ensure total chunks doesn't exceed max_allowed_chunks.
    """
    return int(np.ceil(total_sentences / max_allowed_chunks))


def create_overlapping_chunks(
    sentences: list[str], chunk_size: int, overlap_size: int = 2
) -> list[list[str]]:
    """
    Create chunks of sentences with specified overlap between adjacent chunks.

    Parameters
    ----------
    sentences : list[str]
        List of sentences to be divided into chunks.
    chunk_size : int
        Number of sentences in each chunk.
    overlap_size : int, optional
        Number of sentences to overlap between adjacent chunks, by default 2.

    Returns
    -------
    list[list[str]]
        List of sentence chunks with specified overlap between adjacent chunks.

    Notes
    -----
    If the initial chunking produces more than MAX_ALLOWED_CHUNKS (5), the function
    recalculates the chunk size to ensure the total chunks stays within the limit.
    """
    MAX_ALLOWED_CHUNKS: int = 5
    initial_chunks = list(create_batches(sentences, batch_size=chunk_size))

    # Adjust chunk size if we exceed the maximum allowed chunks
    if len(initial_chunks) > MAX_ALLOWED_CHUNKS:
        adjusted_chunk_size = calculate_chunk_size(len(sentences), MAX_ALLOWED_CHUNKS)
        initial_chunks = list(create_batches(sentences, batch_size=adjusted_chunk_size))

    overlapping_chunks: list[list[Any]] = []

    for i in range(len(initial_chunks)):
        try:
            if i < len(initial_chunks) - 1:
                # Combine current chunk with overlap from next chunk
                combined_chunk = np.hstack(
                    (initial_chunks[i], initial_chunks[i + 1][:overlap_size])
                )
                overlapping_chunks.append(combined_chunk)
            else:
                # For the last chunk, just add it as is
                overlapping_chunks.append(np.array(initial_chunks[i]))
        except IndexError:
            overlapping_chunks.append(np.array(initial_chunks[i]))

    return overlapping_chunks

In [7]:
text: str = (
    "My name is Jon Doe. "
    "I live in SF. "
    "I have a 1 year old daughter. "
    "Thanks for letting me know that uv add pip fixed the issue! That makes sense. It "
    "indicates you were likely using uv (a fast Python package installer and resolver) "
    "to manage your environment. "
    "God is love. "
    "G.O.A.T. That guy is a legend!"
)
sentences: list[str] = [sent.text for sent in nlp(text).sents]
console.log(list(create_batches(sentences, batch_size=2)), style="warning")
console.log(create_overlapping_chunks(sentences, chunk_size=2, overlap_size=2))

In [8]:
from typing import Callable, List


def chunk_document(
    sentences: List[str],
    max_tokens_per_chunk: int,
    overlap_size: int = 2,
    get_token_count: Callable[[str], int] = lambda text: len(
        text.split()
    ),  # Simple token counter
) -> List[List[str]]:
    """
    Chunk a document into smaller chunks based on sentence boundaries while respecting
    maximum tokens per chunk constraint.

    Parameters
    ----------
    sentences : List[str]
        List of sentences from the document
    max_tokens_per_chunk : int
        Maximum number of tokens allowed in a chunk
    overlap_size : int, optional
        Number of sentences to overlap between adjacent chunks, default 2
    get_token_count : callable, optional
        Function to count tokens in text, defaults to simple word splitting

    Returns
    -------
    List[List[str]]
        List of chunks, where each chunk is a list of sentences
    """
    if not sentences:
        return []

    # Calculate token count for each sentence
    sentence_token_counts = [get_token_count(sentence) for sentence in sentences]

    # Check if any individual sentence exceeds the token limit
    max_sentence_tokens = max(sentence_token_counts)
    if max_sentence_tokens > max_tokens_per_chunk:
        print(
            f"Warning: Some sentences exceed the token limit ({max_sentence_tokens} > {max_tokens_per_chunk})"
        )
        print(
            "These sentences will be placed in their own chunks, exceeding the token limit."
        )

    # Create chunks based solely on token limit
    chunks = []
    current_chunk = []
    current_token_count = 0

    for sentence, token_count in zip(sentences, sentence_token_counts):
        # If adding this sentence would exceed the limit and we already have sentences in the chunk,
        # finalize the current chunk and start a new one
        if current_token_count + token_count > max_tokens_per_chunk and current_chunk:
            chunks.append(current_chunk)
            current_chunk = []
            current_token_count = 0

        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_token_count += token_count

    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(current_chunk)

    # Create overlapping chunks
    overlapping_chunks = []

    for i in range(len(chunks)):
        if i < len(chunks) - 1:
            # Get overlap from next chunk
            next_chunk_overlap = chunks[i + 1][: min(overlap_size, len(chunks[i + 1]))]

            # Calculate token count with overlap
            current_with_overlap = chunks[i] + next_chunk_overlap
            total_tokens_with_overlap = sum(
                get_token_count(s) for s in current_with_overlap
            )

            # Check if adding overlap exceeds token limit
            if total_tokens_with_overlap <= max_tokens_per_chunk:
                overlapping_chunks.append(current_with_overlap)
            else:
                # If too large, don't add overlap or add partial overlap if possible
                partial_overlap = []
                remaining_tokens = max_tokens_per_chunk - sum(
                    get_token_count(s) for s in chunks[i]
                )

                for overlap_sentence in next_chunk_overlap:
                    overlap_tokens = get_token_count(overlap_sentence)
                    if remaining_tokens >= overlap_tokens:
                        partial_overlap.append(overlap_sentence)
                        remaining_tokens -= overlap_tokens
                    else:
                        break

                if partial_overlap:
                    overlapping_chunks.append(chunks[i] + partial_overlap)
                else:
                    overlapping_chunks.append(chunks[i])
        else:
            # Last chunk, no overlap to add
            overlapping_chunks.append(chunks[i])

    return overlapping_chunks


# Example usage:
if __name__ == "__main__":
    # Example document (list of sentences)
    sample_sentences = [
        "This is sentence one with several tokens.",
        "Here is another sentence that is quite long and has many tokens for demonstration.",
        "This is a short one.",
        "The algorithm needs to handle varying sentence lengths effectively.",
        "Some sentences might have a lot of tokens and be very informative requiring significant processing.",
        "Others might be brief.",
        "We need to ensure that no chunk exceeds the token limit.",
        "At the same time, we want to maintain context between chunks using overlaps.",
        "Overlapping sentences help maintain context between chunks.",
        "This is the last sentence of our example.",
    ]

    # Example constraint
    MAX_TOKENS = 77  # Max tokens per chunk

    # Get chunks
    chunks = chunk_document(sentences=sample_sentences, max_tokens_per_chunk=MAX_TOKENS)

    # Print results
    print(f"Created {len(chunks)} chunks:")
    for i, chunk in enumerate(chunks):
        token_count = sum(len(s.split()) for s in chunk)
        print(f"Chunk {i + 1}: {len(chunk)} sentences, {token_count} tokens")
        print("  " + "\n  ".join(chunk))
        print()

Created 2 chunks:
Chunk 1: 7 sentences, 65 tokens
  This is sentence one with several tokens.
  Here is another sentence that is quite long and has many tokens for demonstration.
  This is a short one.
  The algorithm needs to handle varying sentence lengths effectively.
  Some sentences might have a lot of tokens and be very informative requiring significant processing.
  Others might be brief.
  We need to ensure that no chunk exceeds the token limit.

Chunk 2: 3 sentences, 28 tokens
  At the same time, we want to maintain context between chunks using overlaps.
  Overlapping sentences help maintain context between chunks.
  This is the last sentence of our example.



In [9]:
sample_sentences: list[str] = [
    "My name is Jon Doe. ",
    "I live in SF. ",
    "I have a 1 year old daughter. ",
    "Thanks for letting me know that uv add pip fixed the issue! That makes sense. It "
    "indicates you were likely using uv (a fast Python package installer and resolver) "
    "to manage your environment. ",
    "God is love. ",
    "G.O.A.T. That guy is a legend!",
]

# Example constraints
MAX_TOKENS = 20  # Max tokens per chunk

# Get chunks
chunks = chunk_document(
    sentences=sample_sentences,
    max_tokens_per_chunk=MAX_TOKENS,
    overlap_size=3,
)

# Print results
print(f"Created {len(chunks)} chunks:")
for i, chunk in enumerate(chunks):
    token_count = sum(len(s.split()) for s in chunk)
    print(f"Chunk {i + 1}: {len(chunk)} sentences, {token_count} tokens")
    print("  " + "\n  ".join(chunk))
    print()

These sentences will be placed in their own chunks, exceeding the token limit.
Created 3 chunks:
Chunk 1: 3 sentences, 16 tokens
  My name is Jon Doe. 
  I live in SF. 
  I have a 1 year old daughter. 

Chunk 2: 1 sentences, 33 tokens
  Thanks for letting me know that uv add pip fixed the issue! That makes sense. It indicates you were likely using uv (a fast Python package installer and resolver) to manage your environment. 

Chunk 3: 2 sentences, 9 tokens
  God is love. 
  G.O.A.T. That guy is a legend!



In [10]:
from instructor import AsyncInstructor

from schemas import GeneralResponse, ModelEnum
from utilities.client_utils import get_client

In [11]:
local_client: AsyncInstructor = get_client(is_remote=False)
remote_client: AsyncInstructor = get_client(is_remote=True)

Using Ollama
Using Remote


In [12]:
response: GeneralResponse = await local_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "/no_think Tell me briefly something unique about SF. ",
        }
    ],
    response_model=GeneralResponse,
    model=ModelEnum.BASE_MODEL_LOCAL_2.value,
    max_tokens=500,
)
console.log(response.content)

In [13]:
response: GeneralResponse = await remote_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "/no_think Tell me briefly something unique about SF. ",
        }
    ],
    response_model=GeneralResponse,
    model=ModelEnum.BASE_REMOTE_MODEL_1_7B.value,
    max_tokens=700,
    max_retries=3,
)
console.log(response.content)

In [19]:
response: GeneralResponse = await remote_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "/no_think Tell me briefly something unique about SF. ",
        }
    ],
    response_model=GeneralResponse,
    model=ModelEnum.BASE_REMOTE_MODEL_2_8B.value,
    max_tokens=700,
    max_retries=3,
)
console.log(response.content)