In [17]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
# https://www.turing.com/kb/guide-on-word-embeddings-in-nlp
# https://www.pinecone.io/learn/bertopic/
from restaurentpy.pipeline import RunPipeline

df_review = RunPipeline(path='/Volumes/Macintosh HD/AI World/Review Data/', pat='xlsx'). \
    run_pipeline()

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [3]:
df_review[df_review['lang']=='ar'].head()

Unnamed: 0,calendar_date,review_text,review_rating,lang,translate_review,sentiment_score,sentiment_type
1519,September-2023,الاكل جيد اصناف متنوعة من سلطات ووجبات رئيسية ...,4,ar,"the food is good, a variety of salads, main me...",0.308333,Positive
1522,October-2022,مطعم جيد ذو طابع غربي ... أطعمه متنوعه بين الآ...,5,ar,a good restaurant with a western flair... a va...,0.175,Positive
1523,August-2022,ممتاز من جميع النواحي عدا الاسعار العاليه حتى ...,4,ar,"excellent in all aspects, except for the high ...",0.4825,Positive
1525,March-2022,تقييمي لفرع المطعم في امارات فيستفال مول ، الا...,4,ar,my review of the restaurant branch in emirates...,0.070833,Positive
1526,September-2023,من افضل المطاعم العائلية في دبي . من ناحية جود...,5,ar,one of the best family restaurants in dubai. i...,0.475,Positive


In [4]:
df_review['sentiment_type'].value_counts()

sentiment_type
Positive    9101
Negative     714
Neutral      665
Name: count, dtype: int64

In [5]:
documents = list(df_review.translate_review.values)

# Basic approach

In [6]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained BERT model for embedding the documents
model = SentenceTransformer('all-MiniLM-L6-v2')

In [12]:
# User-defined topics (example embeddings)
# Assume we have pre-computed embeddings for the user-defined topics
user_defined_topics = [
    "Food",
    "Staff & Service",
    "Ambience",
    "Price"
]

topic_embeddings = model.encode(user_defined_topics)

# Create a dictionary to store the topic embeddings
topic_embedding_dict = {topic: emb for topic, emb in zip(user_defined_topics, topic_embeddings)}

In [13]:
# Generate embeddings for the documents using the pre-trained model
embeddings = model.encode(documents)

In [14]:
# Custom function to assign topics based on user-defined embeddings
def assign_custom_topics(doc_embeddings, topic_embeddings_dict):
    assigned_topics = []
    for doc_emb in doc_embeddings:
        # Calculate cosine similarity with each user-defined topic
        similarities = {topic: np.dot(doc_emb, topic_emb) / (np.linalg.norm(doc_emb) * np.linalg.norm(topic_emb))
                        for topic, topic_emb in topic_embeddings_dict.items()}
        # Assign the topic with the highest similarity
        assigned_topic = max(similarities, key=similarities.get)
        assigned_topics.append(assigned_topic)
    return assigned_topics

In [15]:
custom_topics = assign_custom_topics(embeddings, topic_embedding_dict)
for doc, topic in zip(documents, custom_topics):
    print(f"Document: '{doc}' is assigned to topic: '{topic}'")

Document: 'love the lighting here and the vibe is amazing here. good service and the seating arrangements are incredible. very well maintained and well organized. this place is best mornings as it gets really crowded at night. so i would suggest, if you prefer less crowds, then come early. the food choices are incredible. there are too many choices and honestly, all of them sounds delicious. so it's better if you know what you want already. would save time... the food is simply incredible. i barely finished my burger. it's so heavy and apparently this is how everyone eats.' is assigned to topic: 'Food'
Document: 'today is my birthday and first time having dinner at cheese cake factory.
the food is amazing and exceeded the expectations.
staffs are very attentive and helpful.
especially ' niro '
a big thanks to niro for his professionalism and kindness.
we really had amazing evening.' is assigned to topic: 'Staff & Service'
Document: 'if you love cheesecake, then cheesecake factory is th

In [16]:
custom_topics

['Food',
 'Staff & Service',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Staff & Service',
 'Food',
 'Food',
 'Staff & Service',
 'Food',
 'Food',
 'Staff & Service',
 'Ambience',
 'Staff & Service',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Ambience',
 'Food',
 'Staff & Service',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Staff & Service',
 'Staff & Service',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Staff & Service',
 'Staff & Service',
 'Food',
 'Food',
 'Food',
 'Staff & Service',
 'Food',
 'Staff & Service',
 'Food',
 'Staff & Service',
 'Staff & Service',
 'Food',
 'Food',
 'Staff & Service',
 'Food',
 'Staff & Service',
 'Food',
 'Staff & Service',
 'Ambience',
 'Ambience',
 'Food',
 'Food',
 'Food',
 'Staff & Service',
 'Food',
 'Staff & Service',
 'Staff & Service',
 'Food',
 'Food',
 'Staff & Service',
 'Food',
 'Food',
 'Food',
 'Food',
 'Food',
 'Staf

In [20]:
df_topic = pd.DataFrame({
    'translate_review': documents,
    'Topic': custom_topics
})
df_topic.head()

Unnamed: 0,translate_review,Topic
0,love the lighting here and the vibe is amazing...,Food
1,today is my birthday and first time having din...,Staff & Service
2,"if you love cheesecake, then cheesecake factor...",Food
3,i always get the same dish; the baja chicken t...,Food
4,one of my favorite restaurants. quality of foo...,Food


In [21]:
df_final = pd.merge(df_review, df_topic, on='translate_review')

In [22]:
df_final.head()

Unnamed: 0,calendar_date,review_text,review_rating,lang,translate_review,sentiment_score,sentiment_type,Topic
0,December-2023,Love the lighting here and the vibe is amazing...,5,en,love the lighting here and the vibe is amazing...,0.462963,Positive,Food
1,November-2023,Today is my birthday and first time having din...,5,en,today is my birthday and first time having din...,0.29625,Positive,Staff & Service
2,November-2023,"If you love Cheesecake, then Cheesecake factor...",5,en,"if you love cheesecake, then cheesecake factor...",0.454265,Positive,Food
3,October-2023,I always get the same dish; the baja chicken t...,5,en,i always get the same dish; the baja chicken t...,0.253125,Positive,Food
4,October-2023,One of my favorite restaurants. Quality of foo...,5,en,one of my favorite restaurants. quality of foo...,0.52875,Positive,Food


In [26]:
# Save the merged DataFrame to a CSV file
df_final.to_csv('inst/output/cheesecake_python.csv', index=False)

In [27]:
df_final['Topic'].value_counts()

Topic
Food               11341
Staff & Service     2077
Ambience            1357
Price                923
Name: count, dtype: int64