# Basic Text Chunking
- We will be using a basic text splitter for chunks.
- We will be ignoring images and table data in this case.

## Setup

In [1]:
import os

# To get poppler working on MacOS
os.environ['PATH'] += os.pathsep + '/opt/homebrew/opt/poppler/bin:/Users/faizahmed/.pyenv/versions/3.11.8/bin:/opt/miniconda3/condabin:/Users/faizahmed/Library/pnpm:/Users/faizahmed/.pyenv/shims:/Users/faizahmed/.rbenv/shims:/Users/faizahmed/.nvm/versions/node/v18.16.1/bin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/Library/Apple/usr/bin:/Library/TeX/texbin:/Users/faizahmed/.cargo/bin:/Users/faizahmed/.pyenv/versions/3.11.8/bin:/Users/faizahmed/Library/Android/sdk/emulator:/Users/faizahmed/Library/Android/sdk/platform-tools:/opt/homebrew/Cellar/hadoop/3.4.1/libexec/bin:/opt/homebrew/Cellar/hadoop/3.4.1/libexec/sbin'

### Setup Vector Database to store vectors
We will be using Qdrant as our vector database. It supports vector search along with metadata search which makes it convenient for usage. Also open source.

In [2]:
# !pip install qdrant_client
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")


# Testing if the client is working
from qdrant_client.models import Distance, VectorParams

client.create_collection(
    collection_name="test_collection",
    vectors_config=VectorParams(size=4, distance=Distance.DOT),
)

client.delete_collection(collection_name="test_collection")

True

In [3]:
# !pip install langchain_text_splitters

### Setup Langchain and OpenAI API key

In [None]:
# !pip install langchain-openai
MAX_CHUNK_SIZE = 1024
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", chunk_size=MAX_CHUNK_SIZE)

# Testing if the embeddings model is working
# embeddings_model.embed_documents(["Hello, world!"])

## Generate Chunks from PDF files

In this exploration, we will use 1024 size chunks.

In [24]:
from unstructured.partition.pdf import partition_pdf, Element

def generate_elements(file_path) -> list[Element]:
    elements = partition_pdf(
            filename=file_path,
            chunking_strategy="by_title",
            max_characters=MAX_CHUNK_SIZE,
            # Unstructured Helpers
            strategy="auto", 
        #     infer_table_structure=True, 
            # model_name="yolox",
            # extract_images_in_pdf=True,
            # image_output_dir_path="static/pdfImages/"
    )

    return elements

In [25]:
pdf_path = "../../research_papers/mapreduce.pdf"

In [26]:
elements = generate_elements(pdf_path)

In [33]:
elements[0].metadata.page_number
elements[0].text
str(elements[0].__dict__)

"{'text': 'MapReduce: Simplified Data Processing on Large Clusters\\n\\nby Jeffrey Dean and Sanjay Ghemawat\\n\\nAbstract\\n\\nMapReduce is a programming model and an associated implementation for processing\\n\\nand generating large datasets that is amenable to a broad variety of real-world tasks. Users specify the computation in terms of a map and a reduce function, and the under- lying runtime system automatically parallelizes the computation across large-scale clusters of machines, handles machine failures, and schedules inter-machine communication to make effi- cient use of the network and disks. Programmers find the system easy to use: more than ten thousand distinct MapReduce programs have been implemented internally at Google over the past four years, and an average of one hundred thousand MapReduce jobs are executed on Google’s clusters every day, processing a total of more than twenty petabytes of data per day.', 'embeddings': None, '_element_id': '0d109a749dae3ebd8a7d171fe70

## Create embeddings and store to qdrant

In [8]:
embeddings = embeddings_model.embed_documents([element.text for element in elements])

In [9]:
if not client.collection_exists("pdf_collection"):
    client.create_collection(
        collection_name="pdf_collection",
        vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE),
    )

In [10]:
# generate uuid
import uuid
from qdrant_client.models import PointStruct

points = []
for i, element in enumerate(elements[:]):
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=embeddings[i],
        payload={"text": element.text, "page": element.metadata.page_number}
    )
    points.append(point)

client.upsert(collection_name="pdf_collection", points=points)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

## Testing out search

In [11]:
question = "execution steps of map reduce"

In [12]:
question_embedding = embeddings_model.embed_documents([question])[0]

In [13]:
client.search(collection_name="pdf_collection", query_vector=question_embedding, limit=10)

[ScoredPoint(id='6a899300-eac3-4ca4-a201-04c1cf7f1bca', version=1, score=0.6226835, payload={'text': '3.1 Execution Overview\n\nThe map invocations are distributed across multiple machines by auto- matically partitioning the input data into a set of M splits. The input splits can be processed in parallel by different machines. Reduce invo- cations are distributed by partitioning the intermediate key space into R pieces using a partitioning function (e.g., hash(key) mod R). The number of partitions (R) and the partitioning function are specified by the user.\n\nFigure 1 shows the overall flow of a MapReduce operation in our implementation. When the user program calls the MapReduce func- tion, the following sequence of actions occurs (the numbered labels in Figure 1 correspond to the numbers in the following list).\n\n1. The MapReduce library in the user program first splits the input files into M pieces of typically 16-64MB per piece (controllable by the user via an optional parameter).