In [1]:
# https://www.turing.com/kb/guide-on-word-embeddings-in-nlp
# https://www.pinecone.io/learn/bertopic/
from restaurentpy.pipeline import RunPipeline

import os

# Folder containing the review workbooks. The original machine-specific
# location stays as the default so existing runs behave exactly as before;
# set the REVIEW_DATA_DIR environment variable to run the notebook elsewhere.
REVIEW_DATA_DIR = (
    os.environ.get('REVIEW_DATA_DIR')
    or '/Volumes/Macintosh HD/AI World/Review Data/'
)

# Build the review table with the project pipeline.
# NOTE(review): `pat` presumably selects input files by extension — confirm in restaurentpy.
df_review = RunPipeline(path=REVIEW_DATA_DIR, pat='xlsx').run_pipeline()

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
# Preview the first rows whose `lang` column is tagged 'ar'.
df_review.loc[df_review['lang'].eq('ar')].head()

Unnamed: 0,calendar_date,review_text,review_rating,lang,translate_review
1519,September-2023,الاكل جيد اصناف متنوعة من سلطات ووجبات رئيسية ...,4,ar,"the food is good, a variety of salads, main me..."
1522,October-2022,مطعم جيد ذو طابع غربي ... أطعمه متنوعه بين الآ...,5,ar,a good restaurant with a western flair... a va...
1523,August-2022,ممتاز من جميع النواحي عدا الاسعار العاليه حتى ...,4,ar,"excellent in all aspects, except for the high ..."
1525,March-2022,تقييمي لفرع المطعم في امارات فيستفال مول ، الا...,4,ar,my review of the restaurant branch in emirates...
1526,September-2023,من افضل المطاعم العائلية في دبي . من ناحية جود...,5,ar,one of the best family restaurants in dubai. i...


# Sentiment Analysis

In [3]:
from textblob import TextBlob

def analyze_sentiment(text):
    """Return the TextBlob polarity score of ``text``.

    Args:
        text: The text to score; it is handed to ``TextBlob`` unchanged.

    Returns:
        The ``sentiment.polarity`` value TextBlob computes for the text.
    """
    return TextBlob(text).sentiment.polarity

def categories_sentiment(sentiment_score):
    """Turn a polarity score into a coarse sentiment label.

    Args:
        sentiment_score: Polarity value (-1 to 1, negative to positive).

    Returns:
        "Positive" when the score is above zero, "Neutral" when it equals
        zero, and "Negative" for everything else.
    """
    # Guard clauses: handle the two explicit cases, then fall through.
    if sentiment_score > 0:
        return "Positive"
    if sentiment_score == 0:
        return "Neutral"
    return "Negative"

# Example usage: score one clearly positive sentence as a quick sanity check
# of analyze_sentiment before applying it to the review data.
text = "I love this product! It's amazing."
print("Sentiment:", analyze_sentiment(text))


Sentiment: 0.6125


In [4]:
# Score every translated review, then bucket each score into a label.
sentiment_scores = df_review['translate_review'].apply(analyze_sentiment)
df_review['sentiment_score'] = sentiment_scores
df_review['sentiment_type'] = sentiment_scores.apply(categories_sentiment)

# Show the first rows, now including the two new sentiment columns.
df_review.head()

Unnamed: 0,calendar_date,review_text,review_rating,lang,translate_review,sentiment_score,sentiment_type
0,December-2023,Love the lighting here and the vibe is amazing...,5,en,love the lighting here and the vibe is amazing...,0.462963,Positive
1,November-2023,Today is my birthday and first time having din...,5,en,today is my birthday and first time having din...,0.29625,Positive
2,November-2023,"If you love Cheesecake, then Cheesecake factor...",5,en,"if you love cheesecake, then cheesecake factor...",0.454265,Positive
3,October-2023,I always get the same dish; the baja chicken t...,5,en,i always get the same dish; the baja chicken t...,0.253125,Positive
4,October-2023,One of my favorite restaurants. Quality of foo...,5,en,one of my favorite restaurants. quality of foo...,0.52875,Positive


In [5]:
# Count how many reviews received each sentiment label.
df_review['sentiment_type'].value_counts()

sentiment_type
Positive    9095
Negative     716
Neutral      658
Name: count, dtype: int64

In [6]:
import plotly.graph_objs as go
import pandas as pd

# Parse the calendar_date strings into timestamps for the x-axis.
# NOTE(review): parsing runs value by value; one vectorised pd.to_datetime
# call would be faster if every value shares a single format — confirm first.
df_review['date'] = df_review['calendar_date'].apply(pd.to_datetime)

# One marker per review: parsed date on x, sentiment score on y.
sentiment_scatter = go.Scatter(
    x=df_review['date'],
    y=df_review['sentiment_score'],
    mode='markers',
)
fig = go.Figure(data=sentiment_scatter)

# Title and axis labels.
fig.update_layout(title="Scatter Plot with Date on X-axis", xaxis_title="Date", yaxis_title="Value")

# Render the figure in the notebook.
fig.show()


# Spelling Correction

In [None]:
# from spellchecker import SpellChecker
# spell = SpellChecker()

# # Define a function for spell correction
# def correct_spelling(text):
#     corrected_texts = []
#     if text is not None:  # Check if text is not None
#         for i in text.split(): 
#             corrected_text = spell.correction(i)
#             if corrected_text != None:
#                 corrected_texts.append(corrected_text)
            
#         corrected_texts = ' '.join(corrected_texts)
#         return corrected_texts
#     else:
#         return None
    
# # Example usage:
# text = "I lovee ths prouct Its amazinggg price prices"
# print("Correct Word", correct_spelling(text))

# df_review['spell_corrected'] = df_review['translate_review'].apply(correct_spelling)
# df_review[77:]

# Basic approach

In [None]:
# from bertopic import BERTopic
# from sklearn.feature_extraction.text import CountVectorizer

# # add this to remove stopwords
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# docs = list(df_review.translate_review.values)
# topic_model = BERTopic(
#     vectorizer_model=vectorizer_model,
#     language='english',
#     calculate_probabilities=True,
#     verbose=True
# )
# topics, probs = topic_model.fit_transform(docs)

# topic_model.get_topic_info().head(10)

# topic_model.visualize_hierarchy()

# topic_model.visualize_topics()

## Advanced Approach

In [None]:
# from sentence_transformers import SentenceTransformer
# from umap import UMAP
# from hdbscan import HDBSCAN

# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# # umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
# umap_model = UMAP(n_neighbors=5, 
#                   n_components=2, 
#                   metric='euclidean')
# hdbscan_model = HDBSCAN(min_cluster_size=80, 
#                         min_samples=40,
#                         gen_min_span_tree=True,
#                         prediction_data=True)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.corpus import stopwords

# stopwords = list(stopwords.words('english')) + ['dubai', 'mall', 'factory', 'cheesecake', 'cheesecakes']

# # we add this to remove stopwords that can pollute topics
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

# zeroshot_topic_list = ['price', 'service', 'food', 'ambience']

In [None]:
# from bertopic import BERTopic
# from bertopic.representation import KeyBERTInspired
# docs = list(df_review.translate_review.values)
# model = BERTopic(
#     # nr_topics='auto',
#     umap_model=umap_model,
#     hdbscan_model=hdbscan_model,
#     embedding_model=embedding_model,
#     # embedding_model="thenlper/gte-small",
#     min_topic_size=5,
#     vectorizer_model=vectorizer_model
#     # top_n_words=5,
#     # language='english',
#     # calculate_probabilities=True,
#     # verbose=True,
#     # zeroshot_topic_list=zeroshot_topic_list,
#     # zeroshot_min_similarity=0.95,
#     # representation_model=KeyBERTInspired()
# )
# topics, _ = model.fit_transform(docs)

# User Defined Topics

In [None]:
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# from transformers import BertTokenizer, BertModel
# import numpy as np

In [None]:
# # Define user topics
# user_topics = ["food", "service & staff", "price", "ambience"]

# # Prepare data
# corpus = list(df_review.translate_review.values)  # List of documents

# # Embed documents
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# def get_bert_embeddings(text):
#     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     outputs = model(**inputs)
#     embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
#     return embeddings

# document_embeddings = np.array([get_bert_embeddings(doc) for doc in corpus])

# # Clustering
# clusterer = KMeans(n_clusters=len(user_topics))
# cluster_labels = clusterer.fit_predict(document_embeddings)

# # Assign topics
# topic_assignments = {}
# for i, label in enumerate(cluster_labels):
#     topic_assignments.setdefault(user_topics[label], []).append(corpus[i])
    
# # Evaluation (optional)
# silhouette_avg = silhouette_score(document_embeddings, cluster_labels)
# print("Silhouette Score:", silhouette_avg)

# # Interpretation
# for topic, documents in topic_assignments.items():
#     print("Topic:", topic)
#     for doc in documents:
#         print("-", doc)
#     print()

# LDA

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from spellchecker import SpellChecker

# Corpus for topic modelling: one translated review per document.
documents = list(df_review.translate_review.values)

# Set a seed for reproducibility
random_state = 42

# Model/report sizes, named here instead of being buried in the calls below.
N_TOPICS = 4      # number of LDA topics to fit
N_TOP_WORDS = 5   # words printed per topic

# Brand/location words (and misspelt variants) excluded on top of the NLTK
# English stop words.
# NOTE(review): CountVectorizer lowercases text by default, so the capitalised
# "Dubai" entry probably never matches a token; kept for safety — confirm.
common_words = [
    "cheesecake",
    "chees",
    "cheese", "cheesecakes", "cheesecak",
    "cheescake", "cheescakes",
    "cake",
    "cakes",
    "factory",
    "Dubai", "dubai",
    "mall",
    "habibi",
    "restaurant",
    "restaurent"
]

# Keep the combined list under its own name: rebinding `stopwords` would
# replace the imported NLTK corpus object with a plain list, so any later
# `stopwords.words(...)` call would fail until the import is re-run.
lda_stop_words = list(stopwords.words('english')) + common_words

# Vectorize the text into a document-term count matrix
vectorizer = CountVectorizer(stop_words=lda_stop_words)
X = vectorizer.fit_transform(documents)

# Fit the LDA model
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=random_state)
lda.fit(X)

# Print the top words for each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx + 1}:")
    # argsort is ascending, so reverse it and keep the N_TOP_WORDS largest weights.
    top_words_indices = topic.argsort()[::-1][:N_TOP_WORDS]
    top_words = [feature_names[i] for i in top_words_indices]
    print(" ".join(top_words))

Topic 1:
chicken us time like service
Topic 2:
food good service delicious nice
Topic 3:
place best food one try
Topic 4:
great food service always portions


In [8]:
# Assign topics to documents
document_topics = lda.transform(X)

# Print the topics assigned to each document
for i, doc_topics in enumerate(document_topics):
    print(f"Document {i + 1}:")
    for topic_idx, topic_prob in enumerate(doc_topics):
        print(f"Topic {topic_idx + 1}: {topic_prob:.4f}")

Document 1:
Topic 1: 0.3388
Topic 2: 0.6511
Topic 3: 0.0050
Topic 4: 0.0050
Document 2:
Topic 1: 0.5262
Topic 2: 0.4510
Topic 3: 0.0113
Topic 4: 0.0116
Document 3:
Topic 1: 0.1719
Topic 2: 0.0917
Topic 3: 0.2198
Topic 4: 0.5166
Document 4:
Topic 1: 0.7180
Topic 2: 0.1680
Topic 3: 0.0071
Topic 4: 0.1068
Document 5:
Topic 1: 0.0130
Topic 2: 0.8238
Topic 3: 0.1504
Topic 4: 0.0127
Document 6:
Topic 1: 0.6195
Topic 2: 0.3659
Topic 3: 0.0072
Topic 4: 0.0074
Document 7:
Topic 1: 0.5398
Topic 2: 0.4451
Topic 3: 0.0074
Topic 4: 0.0078
Document 8:
Topic 1: 0.2496
Topic 2: 0.2039
Topic 3: 0.0055
Topic 4: 0.5409
Document 9:
Topic 1: 0.2524
Topic 2: 0.0122
Topic 3: 0.5011
Topic 4: 0.2343
Document 10:
Topic 1: 0.0160
Topic 2: 0.0158
Topic 3: 0.9515
Topic 4: 0.0167
Document 11:
Topic 1: 0.2939
Topic 2: 0.0167
Topic 3: 0.6730
Topic 4: 0.0164
Document 12:
Topic 1: 0.0149
Topic 2: 0.2350
Topic 3: 0.3763
Topic 4: 0.3738
Document 13:
Topic 1: 0.6019
Topic 2: 0.0091
Topic 3: 0.0091
Topic 4: 0.3800
Document

In [None]:
# from gensim import corpora, models

# Tokenize the documents and create a dictionary
# tokenized_docs = [doc.split() for doc in documents]
# dictionary = corpora.Dictionary(tokenized_docs)

# # Convert the documents to a bag-of-words corpus
# corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# # Train the LDA model
# lda_model = models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=10)

# # Print the topics
# for idx, topic in lda_model.print_topics(-1):
#     print(f"Topic {idx}: {topic}")