In [1]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.core import Document
from transformers import AutoTokenizer
import textwrap
import os
import pandas as pd
import numpy as np
import re
from dotenv import load_dotenv


load_dotenv()  # Load environment variables from .env file

True

In [2]:
pd?

[31mType:[39m        module
[31mString form:[39m <module 'pandas' from '/home/rand/projects/qdrant/.venv/lib/python3.12/site-packages/pandas/__init__.py'>
[31mFile:[39m        ~/projects/qdrant/.venv/lib/python3.12/site-packages/pandas/__init__.py
[31mDocstring:[39m  
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-f

In [3]:
# initialize Qdrant Client
client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))

# Initialize the Text Encoder
encoder = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
# ingest data and convert to document list
df = pd.read_csv("books.csv")
df.columns

Index(['isbn13', 'isbn10', 'title', 'subtitle', 'authors', 'categories',
       'thumbnail', 'description', 'published_year', 'average_rating',
       'num_pages', 'ratings_count'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [6]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [7]:
df.dropna(subset=["description", "authors", "categories" ,"title", "published_year", "average_rating", "ratings_count", "thumbnail"]).info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 6215 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6215 non-null   int64  
 1   isbn10          6215 non-null   object 
 2   title           6215 non-null   object 
 3   subtitle        2182 non-null   object 
 4   authors         6215 non-null   object 
 5   categories      6215 non-null   object 
 6   thumbnail       6215 non-null   object 
 7   description     6215 non-null   object 
 8   published_year  6215 non-null   float64
 9   average_rating  6215 non-null   float64
 10  num_pages       6215 non-null   float64
 11  ratings_count   6215 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 631.2+ KB


In [8]:
df = df.dropna(subset=["description", "authors", "categories" ,"title", "published_year", "average_rating", "ratings_count", "thumbnail"])

In [9]:
cols_drop = ["isbn10", "subtitle", "num_pages"]
df = df.drop(columns= cols_drop)
df.columns

Index(['isbn13', 'title', 'authors', 'categories', 'thumbnail', 'description',
       'published_year', 'average_rating', 'ratings_count'],
      dtype='object')

In [10]:
df.rename(
    columns={
        "isbn13": "isbn",
        "published_year": "year",
        "average_rating": "avg_rating",
        "ratings_count": "num_ratings",
        "categories": "genres",
        "thumbnail": "cover_url",
  
    }, inplace=True
)

In [11]:
df.columns

Index(['isbn', 'title', 'authors', 'genres', 'cover_url', 'description',
       'year', 'avg_rating', 'num_ratings'],
      dtype='object')

In [12]:
documents = []
for index, row in df.iterrows():
    documents.append({
        "Title": row['title'],
        "Authors": row['authors'],
        "Year": row['year'],
        "Genres": row['genres'],
        "Average Rating": row['avg_rating'],
        "Number of Ratings": row['num_ratings'],
        "Description": row['description'],
        "Cover URL": row['cover_url'],
        "ISBN": row['isbn'],

    })

documents[:2]

[{'Title': 'Gilead',
  'Authors': 'Marilynne Robinson',
  'Year': 2004.0,
  'Genres': 'Fiction',
  'Average Rating': 3.85,
  'Number of Ratings': 361.0,
  'Description': 'A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, r

In [13]:
#Analyzing token count in the dataset
MAX_TOKENS = 40
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
for doc in documents[:5]:
    tokens = tokenizer.encode(doc["Description"], add_special_tokens=False)
    print(f"Title: {doc['Title']}")
    print(f"Token Count: {len(tokens)}")

    if len(tokens) > MAX_TOKENS:
        print(f"  -> Exceeds max token limit by {len(tokens) - MAX_TOKENS} tokens")
    print()

Title: Gilead
Token Count: 260
  -> Exceeds max token limit by 220 tokens

Title: Spider's Web
Token Count: 249
  -> Exceeds max token limit by 209 tokens

Title: The One Tree
Token Count: 20

Title: Rage of angels
Token Count: 75
  -> Exceeds max token limit by 35 tokens

Title: The Four Loves
Token Count: 60
  -> Exceeds max token limit by 20 tokens



In [17]:
# Implement three Chunking Strategies: Sentence, Token, Semantic
def fixed_size_chunks(text: str, chunk_size: int= 100, overlap: int=20) -> list[str]:
    """Split text into fixed-size chunks with overlap"""
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size - overlap):
        chunk_words = words[i:i + chunk_size]
        if chunk_words: # Ensure chunk is not empty
            chunks.append(" ".join(chunk_words))

    return chunks


def sentence_chunks(text: str, max_sentences: int=3) -> list[str]:
    """Group sentences into chunks"""
    
    sentences = re.split(r'[.!?]+ ', text) # Split text into sentences
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()] # Remove empty sentences

    chunks = []
    for i in range(0, len(sentences), max_sentences):
        chunk_sentences = sentences[i:i + max_sentences]
        if chunk_sentences:
            chunks.append(". ".join(chunk_sentences) + ".") # Add period at the end
    return chunks

def semantic_chunks(text: str, max_tokens: int=100) -> list[str]:
    """Split text into semantic chunks using SentenceTransformer"""
    document = Document(text=text)
    semantic_splitter = SemanticSplitterNodeParser(
        buffer_size=max_tokens,
        breakpoint_percentile_threshold=95,
        embed_model=HuggingFaceEmbedding("sentence-transformers/all-MiniLM-L6-v2")
    )
    nodes = semantic_splitter.get_nodes_from_documents([document])
    return [node.get_text() for node in nodes]
  



In [27]:
documents = documents[:50]

In [28]:
# create collections and process data
collection_name = "books_collection"

if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

# Create a collection with three named vectors
client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "fixed": models.VectorParams(size=384, distance=models.Distance.COSINE),
        "sentence": models.VectorParams(size=384, distance=models.Distance.COSINE),
        "semantic": models.VectorParams(size=384, distance=models.Distance.COSINE),
        }
)


# process and upload data
points = []
point_id = 0

for doc in documents:
    description = doc['Description']

    # Process with each chunking strategy
    strategies = {
        "fixed": fixed_size_chunks(description),
        "sentence": sentence_chunks(description),
        "semantic": semantic_chunks(description),
    }

    for strategy_name, chunks in strategies.items():
        for chunk_idx, chunk in enumerate(chunks):
            # Create vectors for this chunk
            vectors = {strategy_name: encoder.encode(chunk).tolist()}

            # Create payload with metadata
            points.append(
                models.PointStruct(
                    id=point_id,
                    vector=vectors,
                    payload={
                        **doc,
                        "chunk": chunk,
                        "chunk_strategy": strategy_name,
                        "chunk_index": chunk_idx,
                    },
                )
            )
            point_id += 1

client.upload_points(
    collection_name=collection_name,
    points=points,
    parallel=4,
)
print(f"Uploaded {len(points)} points to collection '{collection_name}'")

Uploaded 23 points to collection 'books_collection'
