In [1]:
from langchain import FAISS
import os, sys
from txtai.embeddings import Embeddings
from langchain.docstore.document import Document 
import pandas as pd
import numpy as np
import time
import pickle
import openai
import re
import json
from nltk import tokenize
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Guide to Opinion Units
- This guide shows how to create Opinion Units from a subset of Yelp restaurant reviews using the OpenAI LLM API. It further covers embedding the opinion units using SentenceTransformers and LangChain Documents and saving them to a local vector database file using FAISS.
- To create Opinion Units using the OpenAI LLM API, you need to create your own OpenAI API key.

### Read a subset of YELP restaurant review data (20k reviews)

In [2]:
dataset = "YELP"
data_path = 'data/YELP/yelp_subset.pkl'
# Columns retained for the rest of the notebook
columns_to_keep = ['Doc Id', 'review_id', "business_id", "stars", "Doc Text"]

# Load the subset of YELP restaurant reviews from the pickle file and reshape it in one chain.
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle file you trust.
df_reviews = (
    pd.read_pickle(data_path)
    # Turn the current index into a regular column (named 'index' by reset_index)
    .reset_index()
    # Expose that column as "Doc Id" and the review text as "Doc Text"
    .rename(columns={'index': 'Doc Id', 'text': 'Doc Text'})
    # Keep only the columns in the list
    .loc[:, columns_to_keep]
)

In [3]:
# Preview the first five rows to confirm the renamed / selected columns.
df_reviews.head(n=5)

Unnamed: 0,Doc Id,review_id,business_id,stars,Doc Text
0,112388,vhETeXa3nM34Hwk3KEFfiA,AQw0B8j9QV1RkFLLFiwkuw,3.0,I will be spending several weekends here in Ca...
1,68092,M09LOjNR1ymX4avcBQfAYQ,rh6O8NtKJUhqZ0G2Pkpj2Q,5.0,Went here once and can't wait to go again! The...
2,40901,w5x1pXvmODU5cYI3PZsSQA,YGgGefpPTFhgthvQvMAGoQ,5.0,"Now I know why Guy featured this place, it was..."
3,19599,LnbFwaD8CEC-OsCMb1YZDA,SZU9c8V2GuREDN5KgyHFJw,4.0,Great place at the end of the wharf. Be prepar...
4,144853,3ZiPH6CHL_cyVNoYP2rt1Q,FQxEfhBd1gMrurP19bhK8w,4.0,"Mmm...I always get the chicken salad sandwich,..."


### Create query (prompt) template for prompting the LLM
- The query template consists of: query_string = instructions for creating opinion units + example review + example response from the LLM + the review to analyse
- info on OpenAI API with python: https://gokhang1327.medium.com/getting-started-with-the-openai-api-chatgpt-in-python-d689eecbbd37

In [4]:
# Prompt part 1: the core task (aspect-based sentiment analysis with label, excerpt and polarity).
instructions1 = (
    "Perform aspect-based sentiment analysis for the restaurant review provided as the input. "
    "Return each aspect-sentiment pair with a label and a corresponding excerpt from the text. "
    "Also mark the sentiment of aspects as negative or positive."
)

In [5]:
# Prompt part 2: each aspect-sentiment pair must cover a single, self-contained aspect.
instructions2 = (
    "Aspect-sentiment pairs should not mix opinions on different aspects. "
    "Make sure to include all aspects. "
    "An aspect should be independent and not have to rely on other aspects to be understood."
)

In [6]:
# Prompt part 3: how to label general opinions, and what to leave out.
# The double space after the first sentence is part of the original prompt text.
instructions3 = (
    "If an opinion in the review is about the restaurant or experience in general "
    "then label this aspect as “overall experience”.  "
    "Opinions not related to the restaurant should not be included."
)

In [7]:
# Example review shown to the LLM as the "Example input" of the prompt.
example_input = (
    "I just left Mary’s with my lovely wife. "
    "The gorgeous outdoor patio seating was fantastic with a nice view of the ocean. "
    "We came for brunch and were blown away! "
    "We split a dozen oysters. They were the best I had in my life! FRESH! Delicious! "
    "The avocado toast was excellent as were the crab cakes. "
    "Altogether, we had a great experience. "
    "Almost 5 stars! but the staff could have been a little friendlier and the tables cleaner."
)

In [8]:
# Example answer shown to the LLM as the "Example output" of the prompt: a JSON-style
# list of [aspect label, excerpt, sentiment] triples for the example review.
# The literal is inserted into the prompt verbatim, so its whitespace is left exactly as is.
example_output="""[["Outdoor patio seating", "The gorgeous outdoor patio seating was fantastic with a nice view of the ocean", "positive"], 
["View", "a nice view of the ocean", "positive"],
 ["Brunch", "We came for brunch and were blown away", "positive"], 
["Oysters", "We split a dozen oysters. They were the best I had in my life! FRESH! Delicious!", "positive"], 
["Avocado toast", "the avocado toast was excellent", "positive"], 
["Crab cakes", "the crab cakes were excellent", "positive"],
["Overall experience", "Altogether, we had a great experience. Almost 5 stars!", "positive"], 
["Staff friendliness", "the staff could have been a little friendlier", "negative"], 
["Table cleanliness", "the tables could have been cleaner", "negative"]]"""

### Read your OpenAI API key 
#### !!! You need to have your own OPENAI_API_KEY !!!

In [9]:
from openai import OpenAI

# Prefer the local key_file module; if it does not exist, fall back to the
# OPENAI_API_KEY environment variable (the option the error message below mentions).
try:
    from key_file import OPENAI_API_KEY
except ImportError:
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Kept from the original cell: the key is stored on the OpenAI class and used for the
# check below. create_opinion_units passes OPENAI_API_KEY to the client explicitly.
OpenAI.api_key = OPENAI_API_KEY

# Check if API key is set (None and an empty string both count as missing)
if not OpenAI.api_key:
    print("Error: OpenAI API key not found. Please set the environment variable 'OPENAI_API_KEY' or import from a secure file.")
else:
    # Print only a short prefix followed by a fixed "...". Repeating "..." once per hidden
    # character revealed the key length and produced a very long output line.
    print("OpenAI API key (masked):", OpenAI.api_key[:3] + "...")


OpenAI API key (masked): sk-................................................................................................................................................


### Functions used to create Opinion units and to check their format

In [10]:
def create_query_string(instructions1, instructions2, instructions3, example_input, example_output, review_input):
    """
    Function aim: assemble the query_string (prompt text) sent to the LLM-API

    The prompt is laid out in this order, with blank lines between the parts:
    instructions1, instructions2, instructions3, the example review
    ("Example input: ..."), the example answer ("Example output: " followed by
    the answer on the next line), the review to analyse ("Input: ...") and a
    final "Output: " cue for the model to complete.

    Output: the assembled prompt as a single string
    """
    # One entry per prompt line; empty entries become the blank separator lines.
    layout = (
        "{}", "",
        "{}", "",
        "{}", "",
        "Example input: {}", "",
        "Example output: ",
        "{}", "",
        "Input: {}", "",
        "Output: ",
    )
    prompt_template = "\n".join(layout)
    return prompt_template.format(
        instructions1,
        instructions2,
        instructions3,
        example_input,
        example_output,
        review_input,
    )

def create_opinion_units(query_string, model="gpt-4-turbo", max_tokens=2400, temperature=1.0, client=None):
    """
    Function aim: Call LLM API and generate opinion units

    Input:
        query_string: prompt text, sent as a single "user" message
        model: name of the chat model to use (default "gpt-4-turbo";
               e.g. "gpt-3.5-turbo" is an alternative)
        max_tokens: maximum number of tokens in the response
        temperature: sampling temperature, controlling the randomness of the response
        client: optional, already constructed client exposing
                `chat.completions.create`; when None, a new OpenAI client is
                created from OPENAI_API_KEY

    Output: the completion object returned by the API (the generated text is
            read from `.choices[0].message.content` by the caller)
    """
    # Build a client from the API key unless the caller supplied one to reuse.
    if client is None:
        client = OpenAI(api_key=OPENAI_API_KEY)

    # Generate a completion using the chat model.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": query_string}  # User's message.
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # Return the generated completion.
    return completion

def check_and_parse_ous(data):
    """
    This function does two things:
    1. Checks that the format corresponds with e.g.

        [["Avocado toast", "the avocado toast was excellent", "positive"],
        ["Crab cakes", "the crab cakes were excellent", "positive"]]

    2. Disregards any other text in the LLM response except for the opinion unit JSON
       (the first "[[ ... ]]" span found in the text is used)

    Input: Raw text response from LLM
    Output:
        If correct format: True, ou_df (DataFrame storing the opinion units in the
                           columns 'Aspect', 'Extract' and 'Sentiment')
        If incorrect format: False, False
        A tuple is returned in every case, so the result can always be unpacked.
    """
    # A response without text (e.g. None) cannot contain opinion units
    if not isinstance(data, str):
        return False, False

    # Regular expression to find the list in the string
    match = re.search(r'\[\[.*?\]\]', data, re.DOTALL)
    if match is None:
        return False, False

    # Load the extracted list string to check that it is valid JSON
    try:
        parsed_data = json.loads(match.group(0))
    except json.JSONDecodeError:
        return False, False

    # Check if the outer structure is a list
    if not isinstance(parsed_data, list):
        return False, False

    # Check each element in the list
    for item in parsed_data:
        # Each item should be a list with exactly 3 elements
        if not (isinstance(item, list) and len(item) == 3):
            return False, False
        # Aspect, extract and sentiment must all be strings
        if not all(isinstance(field, str) for field in item):
            return False, False

    ou_df = pd.DataFrame(parsed_data, columns=['Aspect', 'Extract', 'Sentiment'])

    return True, ou_df

### Optional subsampling of dataset or selection of specific reviews

In [14]:
# While experimenting, work on a small sample of the reviews;
# random_state fixes which rows are drawn.
df_reviews = df_reviews.sample(n=3, random_state=23)

# To process specific reviews instead, filter on "Doc Id" rather than sampling, e.g.:
#   specific_rows = error_ids
#   df_reviews = df_reviews[df_reviews["Doc Id"].isin(specific_rows)]
print("Numer of reviews for opinion unit generation:", df_reviews.shape[0])

Numer of reviews for opinion unit generation: 3


### Loop through reviews and create opinion units

In [15]:
# Initialize an empty list to store Document objects of created opinion units
docs = []
# Doc Ids of reviews whose LLM response had an incorrect opinion-unit format
error_ids = []

# number of reviews
total_texts = len(df_reviews.index)
counter = 0  # not used in this cell

print("Creating opinion units...")
# tqdm wraps the row iterator and advances the progress bar once per review
for index, review in tqdm(df_reviews.iterrows(), total=total_texts, desc="Creating opinion-units"):
    # Extract the text of the review
    review_input = review["Doc Text"]

    # create query_string (sent as query/prompt to LLM)
    query_string = create_query_string(instructions1, instructions2, instructions3,
                                       example_input, example_output, review_input)

    # Use the LLM API to create opinion units
    chat_completion_object = create_opinion_units(query_string)

    # Extract the opinion units text from the completion object
    ous_string = chat_completion_object.choices[0].message.content

    # Check if the LLM returned the correct opinion unit format.
    # A response without text, or a parser result that is not a (flag, DataFrame)
    # pair, is treated as a format error instead of crashing the whole run.
    parsed = check_and_parse_ous(ous_string) if isinstance(ous_string, str) else None
    ou_check, ous_df = parsed if parsed is not None else (False, False)
    if not ou_check:
        print("Error in format for review ID:", review["Doc Id"])
        error_ids.append(review["Doc Id"])
        continue

    # Iterate over each row (opinion unit) in the DataFrame ous_df
    for ou in ous_df.itertuples(index=False):
        # ou_text is the opinion unit without the sentiment and Doc Id metadata
        ou_text = ou.Aspect + ": " + ou.Extract

        # Metadata: the review's "Doc Id" (stored under "review_id"), sentiment and aspect
        meta_dict = {"review_id": review["Doc Id"], "sentiment": ou.Sentiment, "Aspect": ou.Aspect}

        # Create a new Document object with the opinion unit text and metadata
        docs.append(Document(page_content=ou_text, metadata=meta_dict))

Creating opinion units...


Creating opinion-units: 100%|█████████████████████| 3/3 [00:18<00:00,  6.33s/it]


### Example output (opinion units) returned from LLM

In [17]:
# Use the third distinct "Doc Id" (position 2) as the worked example.
doc_ids = list(df_reviews["Doc Id"].unique())
doc_id = doc_ids[2]
print("Review Id:", doc_id)
print("\nGenerated opinion units:")

# Collect the opinion units whose metadata points back at the chosen review.
example_ous = list(filter(lambda d: d.metadata["review_id"] == doc_id, docs))
for ou in example_ous:
    bullet_line = "".join(("\u2022 ", ou.page_content, " {sentiment: ", ou.metadata["sentiment"], "}"))
    print(bullet_line)

print("\n")
# Look up the full text of the same review for comparison.
is_selected = df_reviews["Doc Id"] == doc_id
review_text = df_reviews.loc[is_selected, "Doc Text"].values[0]
print("Full review text:\n" + review_text)
print("\n")

Review Id: 8492

Generated opinion units:
• Food presentation: I was happily surprised at the set up of the plate. They separated the meat and cheese into a pile and it was up to you to assemble {sentiment: positive}
• Quantity of food: It was ALOT of food for a very decent price {sentiment: positive}
• Price value: a very decent price {sentiment: positive}
• Overall experience: Will be back for round 2 real soon {sentiment: positive}


Full review text:
I had meant to take a pic of the food I ate, but when my number was called I just went into auto and demolished the plate. I was happily surprised at the set up of he plate. They separated the meat and cheese into a pile and it was up to you to assemble... It was ALOT of food for a very decent price. Will be back for round 2 real soon.




###  Select embedding model

In [18]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Name of the model handed to SentenceTransformerEmbeddings.
embedd_model = "all-MiniLM-L6-v2"
# Embedding object that is later passed to FAISS.from_documents.
embedding_function = SentenceTransformerEmbeddings(model_name=embedd_model)

  embedding_function = SentenceTransformerEmbeddings(model_name=embedd_model)


### Save embedding vectors to local vector database using FAISS

In [19]:
# Target location: data/Embeddings/<dataset>_<chunking strategy>
chunking_strategy = "opinion_units"
save_to = f"data/Embeddings/{dataset}_{chunking_strategy}"

# Build the FAISS vector store from the opinion-unit documents using the
# embedding function, then save it locally under the index name "index".
faiss = FAISS.from_documents(docs, embedding_function)
faiss.save_local(save_to, index_name="index")