In [1]:
import io
import os, sys, contextlib
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor
import re
import pandas as pd
from langchain import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


# Guide: Query reviews
To query reviews using the respective chunking strategies and embedding models, follow these steps:
- Ensure that the embeddings have been generated and stored in the `data/Embeddings/` folder.
- A pre-generated embedding store for sentence chunking + all-MiniLM-L6-v2, built from 5k reviews, is included in this repository. Because of GitHub file-size limits, the embeddings for the other chunking methods and embedding models, and for the full set of 20k reviews used in the case study, are not included.
- To generate other embeddings, use the notebooks provided in this repository: `opinion_unit_CREATE_EMBED_STORE.ipynb` and `sentence_passage_chunking_CREATE_EMBED_STORE.ipynb`.

### Read the raw review files
##### (Needed to map the Doc Id of each retrieved chunk to its full review text)

In [2]:
# Source of the raw reviews: a pickled DataFrame with a subset of 20k YELP restaurant reviews.
dataset = "YELP"
data_path = "data/YELP/yelp_subset.pkl"

# Columns needed to link a retrieved chunk back to its review.
columns_to_keep = ["Doc Id", "review_id", "business_id", "stars", "Doc Text"]

# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df_reviews = (
    pd.read_pickle(data_path)
    # Turn the existing index into a regular column (named "index" by default) ...
    .reset_index()
    # ... then rename it to "Doc Id" and the review body from "text" to "Doc Text".
    .rename(columns={"index": "Doc Id", "text": "Doc Text"})
    # Drop every column that is not listed in columns_to_keep.
    .loc[:, columns_to_keep]
)

#### Chunking strategies
1. "sentence_chunking"
2. "passage_chunking"
3. "opinion_units"

In [3]:
# Retrieval configuration: pick the dataset, chunking strategy and embedding model to query.
dataset = "YELP"
chunking_strategy = "sentence_chunking"
embed_model = "all-MiniLM-L6-v2"  # alternative: "all-mpnet-base-v2"

# Folder that holds the stored embeddings for this dataset / chunking strategy.
saved_to = f"data/Embeddings/{dataset}_{chunking_strategy}"

# Embedding wrapper used for the query text.
# NOTE(review): presumably this has to be the model the stored index was built with -- confirm.
embedding_function = SentenceTransformerEmbeddings(model_name=embed_model)

  embedding_function = SentenceTransformerEmbeddings(model_name=embed_model)


In [4]:
# Load the persisted FAISS store from the folder selected above.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled data;
# only point this at embedding folders you created yourself or otherwise trust.
loaded_faiss = FAISS.load_local(
    saved_to,
    embedding_function,
    index_name="index",
    allow_dangerous_deserialization=True,
)

### Functions for printing the search results

In [5]:
def print_chunks(res):
    """
    Print each retrieved chunk together with the id of the document it came from.

    Parameters
    ----------
    res : iterable
        Search hits. Each hit is indexed with ``[0]`` to obtain an object that
        exposes ``metadata["review_id"]`` and ``page_content`` (e.g. the
        ``(document, score)`` pairs produced by the similarity search in this
        notebook). Any further elements of a hit are ignored.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    # The id shown as "Doc ID" is read straight from the chunk metadata, so no
    # lookup in the reviews dataframe is needed here.
    for rank, hit in enumerate(res, start=1):
        chunk = hit[0]
        print("#" + str(rank), "Doc ID:", chunk.metadata["review_id"])
        print("Chunk:", chunk.page_content)
        print("\n")

def print_chunks_and_reviews(res):
    """
    Print each retrieved chunk, its document id and the full review text.

    The review text is looked up in the module-level ``df_reviews`` dataframe:
    the value stored under ``metadata["review_id"]`` of a chunk is matched
    against the ``"Doc Id"`` column and the corresponding ``"Doc Text"`` is
    printed.

    Parameters
    ----------
    res : iterable
        Search hits. Each hit is indexed with ``[0]`` to obtain an object that
        exposes ``metadata["review_id"]`` and ``page_content``. Any further
        elements of a hit are ignored.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    for rank, hit in enumerate(res, start=1):
        chunk = hit[0]
        doc_id = chunk.metadata["review_id"]
        print("#" + str(rank), "Doc ID:", doc_id)
        print("Chunk:", chunk.page_content)

        # Single lookup per hit: rows whose "Doc Id" equals the chunk's id.
        matches = df_reviews.loc[df_reviews["Doc Id"] == doc_id, "Doc Text"]
        if matches.empty:
            # The index and the loaded review subset can be out of sync; keep
            # listing the remaining hits instead of aborting with IndexError.
            print("Full review: [not found in df_reviews]")
        else:
            # If several rows share the id, the first one is used.
            print("Full review:\n" + str(matches.iloc[0]))
        print("\n")

## Query the reviews!

In [6]:
# Free-text query that is matched against the stored chunks.
query = "The pasta was superb!"
# Number of hits to retrieve.
n_reviews = 10
res = loaded_faiss.similarity_search_with_score(query, k=n_reviews)

# Output format: chunks only (default), or swap the two calls below to also
# print the full review text for every chunk.
print_chunks(res)
#print_chunks_and_reviews(res)

#1 Doc ID: 172353
Chunk: The pasta was great!


#2 Doc ID: 170042
Chunk: The pasta was by far my favorite main dish.


#3 Doc ID: 108966
Chunk: Not the best pasta I've ever had but it was quick and delicious.


#4 Doc ID: 110056
Chunk: The pasta is delicious.


#5 Doc ID: 118311
Chunk: The pasta is good.


#6 Doc ID: 74279
Chunk: All the pastas are excellent too.


#7 Doc ID: 131
Chunk: The pasta was delicious, the calzones were phenomenal, and the pizza was so good.


#8 Doc ID: 152950
Chunk: Great pasta and environment.


#9 Doc ID: 98589
Chunk: The pasta... and more pasta!


#10 Doc ID: 176198
Chunk: The food was incredible!


