In [1]:
# https://www.turing.com/kb/guide-on-word-embeddings-in-nlp
# https://www.pinecone.io/learn/bertopic/
from restaurentpy.pipeline import RunPipeline

df_review = RunPipeline(path='/Volumes/Macintosh HD/AI World/Review Data/', pat='xlsx'). \
    run_pipeline()

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
100%|██████████| 5/5 [00:08<00:00,  1.62s/it]


In [3]:
df_review[df_review['lang']=='ar'].head()

Unnamed: 0,calendar_date,review_text,review_rating,lang,translate_review
1519,September-2023,الاكل جيد اصناف متنوعة من سلطات ووجبات رئيسية ...,4,ar,"the food is good, a variety of salads, main me..."
1522,October-2022,مطعم جيد ذو طابع غربي ... أطعمه متنوعه بين الآ...,5,ar,a good restaurant with a western flair... a va...
1523,August-2022,ممتاز من جميع النواحي عدا الاسعار العاليه حتى ...,4,ar,"excellent in all aspects, except for the high ..."
1525,March-2022,تقييمي لفرع المطعم في امارات فيستفال مول ، الا...,4,ar,my review of the restaurant branch in emirates...
1526,September-2023,من افضل المطاعم العائلية في دبي . من ناحية جود...,5,ar,one of the best family restaurants in dubai. i...


# Basic approach

In [None]:
# from bertopic import BERTopic
# from sklearn.feature_extraction.text import CountVectorizer

# # add this to remove stopwords
# vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# docs = list(df_review.translate_review.values)
# topic_model = BERTopic(
#     vectorizer_model=vectorizer_model,
#     language='english',
#     calculate_probabilities=True,
#     verbose=True
# )
# topics, probs = topic_model.fit_transform(docs)

# topic_model.get_topic_info().head(10)

# topic_model.visualize_hierarchy()

# topic_model.visualize_topics()

## Advance Approach

In [4]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
umap_model = UMAP(n_neighbors=5, 
                  n_components=2, 
                  metric='euclidean')
hdbscan_model = HDBSCAN(min_cluster_size=80, 
                        min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stopwords = list(stopwords.words('english')) + ['dubai', 'mall', 'factory', 'cheesecake', 'cheesecakes']

# we add this to remove stopwords that can pollute topcs
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords)

zeroshot_topic_list = ['price', 'service', 'food', 'ambience']

In [18]:
from bertopic import BERTopic
docs = list(df_review.translate_review.values)
model = BERTopic(
    # nr_topics='auto',
    # umap_model=umap_model,
    # hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    # top_n_words=5,
    language='english',
    # calculate_probabilities=True,
    verbose=True,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=1
)
topics, probs = model.fit_transform(docs)

2024-04-30 23:23:23,778 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 328/328 [03:15<00:00,  1.68it/s]
2024-04-30 23:26:39,749 - BERTopic - Embedding - Completed ✓
2024-04-30 23:26:39,750 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-04-30 23:26:39,955 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-04-30 23:26:39,961 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-30 23:26:53,313 - BERTopic - Dimensionality - Completed ✓
2024-04-30 23:26:53,314 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-30 23:26:54,303 - BERTopic - Cluster - Completed ✓
2024-04-30 23:26:54,316 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-30 23:26:55,585 - BERTopic - Representation - Completed ✓
2024-04-30 23:26:55,785 - BERTopic - Zeroshot Step 2 - Clustering documents that were not found in

IndexError: list index out of range

In [7]:
model.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity

In [19]:
model.visualize_barchart()

In [None]:
model.get_document_info(docs=docs)

In [None]:
model.visualize_hierarchy()