In [1]:
import io
import os, sys, contextlib
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document 
import re
import pandas as pd
import nltk
#nltk.download('punkt')
#nltk.download('punkt_tab')
from nltk import tokenize
from langchain import FAISS

  from .autonotebook import tqdm as notebook_tqdm


# Guide to Sentence and Passage Chunking
- This guide shows how to implement sentence and passage chunking with `nltk` and `langchain.text_splitter`. It then embeds the chunks (wrapped as LangChain `Document` objects) with a SentenceTransformers model and saves the resulting vectors to a local vector database file using FAISS.
- The examples use a subset of Yelp restaurant reviews.

### Read a subset of YELP restaurant review data (20k reviews)

In [2]:
# Dataset label; reused later when naming the saved embedding index.
dataset = "YELP"
data_path = 'data/YELP/yelp_subset.pkl'
# Columns retained for the rest of the notebook.
columns_to_keep = ['Doc Id', 'review_id', "business_id", "stars", "Doc Text"]

# Load the 20k-review YELP subset and shape it in one non-mutating chain:
#   1. move the existing index into a regular column,
#   2. rename that column to "Doc Id" and 'text' to "Doc Text",
#   3. keep only the columns listed above.
# CAUTION: unpickling executes code embedded in the file, so only load
# pickle files that come from a trusted source.
df_reviews = (
    pd.read_pickle(data_path)
    .reset_index()
    .rename(columns={'index': 'Doc Id', 'text': 'Doc Text'})
)[columns_to_keep]

In [3]:
# Preview the first rows to check the renamed and filtered columns.
df_reviews.head()

Unnamed: 0,Doc Id,review_id,business_id,stars,Doc Text
0,112388,vhETeXa3nM34Hwk3KEFfiA,AQw0B8j9QV1RkFLLFiwkuw,3.0,I will be spending several weekends here in Ca...
1,68092,M09LOjNR1ymX4avcBQfAYQ,rh6O8NtKJUhqZ0G2Pkpj2Q,5.0,Went here once and can't wait to go again! The...
2,40901,w5x1pXvmODU5cYI3PZsSQA,YGgGefpPTFhgthvQvMAGoQ,5.0,"Now I know why Guy featured this place, it was..."
3,19599,LnbFwaD8CEC-OsCMb1YZDA,SZU9c8V2GuREDN5KgyHFJw,4.0,Great place at the end of the wharf. Be prepar...
4,144853,3ZiPH6CHL_cyVNoYP2rt1Q,FQxEfhBd1gMrurP19bhK8w,4.0,"Mmm...I always get the chicken salad sandwich,..."


# Select chunking strategy
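 1. passage_chunking: splits each review into overlapping passages of a fixed character size
 2. sentence_chunking: splits each review into individual sentences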

In [4]:
# Name of the chunking strategy to run: "passage_chunking" or "sentence_chunking".
# The same string is also used as a suffix of the folder the FAISS index is saved to.
chunking_strategy= "passage_chunking"

### Functions for chunking

In [5]:
def passage_chunking(df_reviews, size, overlap):
    """
    Split review documents into overlapping passages measured in characters.

    Each review is wrapped in a LangChain Document and the resulting list is
    handed to RecursiveCharacterTextSplitter.split_documents.

    Inputs:
        - df_reviews: DataFrame containing reviews; the columns "Doc Id" and
          "Doc Text" are read.
        - size: Integer specifying the size of each chunk in characters
          (passed to the splitter as chunk_size).
        - overlap: Integer specifying the overlap between chunks in characters
          (passed to the splitter as chunk_overlap).

    Outputs:
        - The list of chunked Document objects returned by the splitter. Every
          source Document is created with the metadata {"review_id": <Doc Id>}.

    Notes:
        - The metadata key is named "review_id", but its value comes from the
          "Doc Id" column, not from any "review_id" column of the DataFrame.
        - Rows whose "Doc Text" is not a string (e.g. None or NaN) are skipped,
          because Document content must be text; one missing review should not
          abort the whole run.
    """
    # Pair up only the two columns that are needed. This avoids building a
    # full row Series per review, which is what iterrows() does.
    id_text_pairs = zip(df_reviews["Doc Id"], df_reviews["Doc Text"])

    # One Document per usable review, each with its own metadata dict.
    docs = [
        Document(page_content=doc_text, metadata={"review_id": doc_id})
        for doc_id, doc_text in id_text_pairs
        if isinstance(doc_text, str)
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,          # Chunk size in characters.
        chunk_overlap=overlap,    # Overlap between consecutive chunks in characters.
        length_function=len,      # Measure text length in characters.
        is_separator_regex=False  # Treat the separators as literal strings.
    )

    # Split every Document into smaller chunks and return them.
    return text_splitter.split_documents(docs)

def sentence_chunking(df_reviews):
    """
    Split review documents into sentences; each sentence becomes its own Document.

    Inputs:
        - df_reviews: DataFrame containing reviews; the columns "Doc Id" and
          "Doc Text" are read.

    Outputs:
        - A list of Document objects, one per sentence, in review order. Each
          one carries the metadata {"review_id": <Doc Id>} of its review.

    Notes:
        - Sentences are produced by NLTK's sent_tokenize, reached through the
          `tokenize` module imported at the top of the notebook. It needs the
          NLTK "punkt" data (see the commented-out nltk.download calls there).
        - The metadata key is named "review_id", but its value comes from the
          "Doc Id" column, not from any "review_id" column of the DataFrame.
        - Rows whose "Doc Text" is not a string (e.g. None or NaN) are skipped,
          because the tokenizer only accepts text.
    """
    doc_chunks = []

    # Pair up only the two columns that are needed. This avoids building a
    # full row Series per review, which is what iterrows() does.
    for doc_id, doc_text in zip(df_reviews["Doc Id"], df_reviews["Doc Text"]):
        if not isinstance(doc_text, str):
            continue

        # Build a fresh metadata dict for every sentence so that editing one
        # chunk's metadata later can never leak into its sibling chunks.
        doc_chunks.extend(
            Document(page_content=sentence, metadata={"review_id": doc_id})
            for sentence in tokenize.sent_tokenize(doc_text)
        )

    return doc_chunks

### Perform chunking

In [6]:
# Run the chunking function that matches the selected strategy.
if chunking_strategy == "passage_chunking":
    # Passages of 200 characters, overlapping by 20 characters.
    docs = passage_chunking(df_reviews, size=200, overlap=20)
elif chunking_strategy == "sentence_chunking":
    docs = sentence_chunking(df_reviews)
else:
    # Fail here with a clear message rather than letting a None value break
    # the following cells with an unrelated-looking error.
    raise ValueError(
        f"Unknown chunking_strategy {chunking_strategy!r}; "
        "expected 'passage_chunking' or 'sentence_chunking'."
    )

### Print example chunks

In [7]:
# Pick the third distinct document id as the example to inspect.
doc_ids = list(df_reviews["Doc Id"].unique())
doc_id = doc_ids[2]
print("Review Id:", doc_id)
print("\nGenerated chunks:")

# Chunks keep their source document id under the "review_id" metadata key.
example_chunks = [d for d in docs if d.metadata["review_id"] == doc_id]
for chunk in example_chunks:
    print("\u2022 " + chunk.page_content)

print("\n")
# Look up the original, unchunked text of the same review for comparison.
is_example_row = df_reviews["Doc Id"] == doc_id
review_text = df_reviews.loc[is_example_row, "Doc Text"].iloc[0]
print("Full review text:\n" + review_text)
print("\n")

Review Id: 40901

Generated chunks:
• Now I know why Guy featured this place, it was awesome!  Totally home cooked and authentic, and I should know!  Even better than grandma's food (God rest her soul). The posole was awesome, the
• was awesome, the carnitas great, and the tacos are huge.  Order just one and you'll be satisfied, two and your stuffed!  Can't wait to go back next time I'm in Santa Barbara.  The place isn't exactly
• place isn't exactly upscale or great for a first date, or if you're a snob.  But if you want some great Mexican food, that is home cooked and authentic, this is the place.  I'd also try the carne
• also try the carne asada torta (like a steak sandwich), it looked great.


Full review text:
Now I know why Guy featured this place, it was awesome!  Totally home cooked and authentic, and I should know!  Even better than grandma's food (God rest her soul). The posole was awesome, the carnitas great, and the tacos are huge.  Order just one and you'll be satisfied, 

In [8]:
def average_words_per_string(list_of_strings):
    """
    Compute the mean number of whitespace-separated words per string.

    Inputs:
        - list_of_strings: Sized collection of strings.

    Outputs:
        - The total word count divided by the number of strings, or 0 when
          the collection is empty.
    """
    string_count = len(list_of_strings)
    if string_count == 0:
        return 0

    # str.split() without arguments splits on any run of whitespace.
    word_count = sum(len(text.split()) for text in list_of_strings)
    return word_count / string_count
    
# Gather the text of every chunk, then report how many chunks exist and
# how many words they contain on average.
l = [chunk_doc.page_content for chunk_doc in docs]
print("Number of chunks:", len(l))
average = average_words_per_string(l)
print("Average number of words per chunk:", format(average, ".2f"))

Number of chunks: 69890
Average number of words per chunk: 28.15


###  Select embedding model

In [9]:
# NOTE(review): this import sits mid-notebook instead of in the imports cell at the top.
# NOTE(review): the cell output echoes the last line of this cell, which looks like a
# warning about SentenceTransformerEmbeddings. Confirm against the installed LangChain version.
from langchain.embeddings import SentenceTransformerEmbeddings
# Name of the sentence-transformers model used to embed the chunks.
embedd_model="all-MiniLM-L6-v2" # alternative: "all-mpnet-base-v2"
# Embedding wrapper that is handed to the vector store when the index is built.
embedding_function = SentenceTransformerEmbeddings(model_name=embedd_model)

  embedding_function = SentenceTransformerEmbeddings(model_name=embedd_model)


### Save embedding vectors to local vector database using FAISS

In [10]:
# Embed every chunk and build the FAISS index in memory. The captured output
# below ends in KeyboardInterrupt, so this cell had not completed when the
# notebook was saved.
faiss = FAISS.from_documents(docs, embedding_function)

# Target folder: one per dataset / chunking-strategy combination.
save_to = f"data/Embeddings/{dataset}_{chunking_strategy}"
faiss.save_local(save_to, index_name="index")


KeyboardInterrupt

