# Setup used

Ram = 32 GB <br>
GPU = GeForce 3060ti RTX 6 GB <br>
CPU = AMD Ryzen 9 5900X 12-Core Processor, 3701 MHz, 12 cores, 24 logical cores

All packages where installed inside a virtual conda environment on WSL (Windows Subsystem for Linux)

# Install packages here

# Import packages here

In [1]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import nltk
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import pandas as pd
from cuml.cluster import HDBSCAN # We are using the GPU accelerated version of HDBSCAN
from cuml.manifold import UMAP # We are using the GPU accelerated version of UMAP
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import numpy as np

# Initialize functions here

## Topic Coherence

In [2]:
def calc_topic_coherence(model, frame):

    topics = model.topics_

    # Preprocess Documents
    documents = pd.DataFrame({"Document": frame['content_corrected'],
                            "ID": range(len(frame['content_corrected'])),
                            "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    vectorizer = model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in model.get_topic(topic)]
                for topic in range(len(set(topics))-1)]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence='c_v',
                                    processes=10)

    return coherence_model.get_coherence()

## Topic Diversity

In [3]:
def calc_topic_diversity(model):

    topics = model.topics_

    model_output_test = {}
    model_output_test['topics'] = [[z[0] for z in model.get_topic(x)] for x in set(topics)]
    metric = TopicDiversity(topk=3)

    return metric.score(model_output_test)

# Load data

In [4]:
df_netflix = pd.read_csv(f'preprocessed_data/prep_netflix_v4.csv')
df_youtube = pd.read_csv(f'preprocessed_data/prep_youtube_v4.csv')
df_whatsapp = pd.read_csv(f'preprocessed_data/prep_whatsapp_v4.csv')
df_paypal = pd.read_csv(f'preprocessed_data/prep_paypal_v4.csv')
df_amazon = pd.read_csv(f'preprocessed_data/prep_amazon_v4.csv')

  df_amazon = pd.read_csv(f'preprocessed_data/prep_amazon_v4.csv')


# Model training

## Initialize CountVectorizer fron ngram extraction and stop words removal

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/denis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
vectorizer_model = CountVectorizer(
    stop_words=nltk.corpus.stopwords.words('english'))

## Initialize model

Depending on the dataset, the n_neighbors argument may need to be changed (higher n_neighbors gives a more broader look on the data, and vice versa)

In [8]:
umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', verbose=True, random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True, )

## Training

### Training Netflix

In [38]:
topic_model_netflix = BERTopic(embedding_model='all-mpnet-base-v2', vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, language='english', calculate_probabilities=True, verbose=True)
topics_netflix, probs_netflix = topic_model_netflix.fit_transform(list(df_netflix['content_corrected']))

Batches:   0%|          | 0/3057 [00:00<?, ?it/s]

2023-06-11 14:31:53,981 - BERTopic - Transformed documents to Embeddings
  return _core.array(a, dtype, False, order)


[D] [14:31:54.382036] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:108 n_neighbors=5
[D] [14:31:54.382823] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [14:32:03.435313] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [14:32:03.437326] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [14:32:03.437582] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.0772343, 0.15773, 0.171432, 0.0650024, 0.0347414, 0.085968, 0.0208058, 0.00141865, 0.0160859, 0.0205088, 0.0230265, 0.0622015, 0.0705271, 0.0874891, 0.0287013, 0.0373034, 0.122732, 0.208961, 0.0358205, 0.035893, 0.007653, 0.0438929, 0.0286384, 0.0855503, 0.0893173 ]

[D] [14:32:03.437648] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.214395, 0.322308, 0.331882, 0.451636, 0.257522, 0.522793, 0.268713, 0.385371, 0.390251, 0.470452, 0.414207, 0.48707

2023-06-11 14:32:13,408 - BERTopic - Reduced dimensionality


[D] [14:32:04.576725] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:347 Running transform
[D] [14:32:04.576820] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:349 Building KNN Graph
[D] [14:32:13.386251] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [14:32:13.387067] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [14:32:13.388810] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:443 Performing L1 normalization
[D] [14:32:13.389604] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [14:32:13.392548] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [14:32:13.393115] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:509 Performing optimization


2023-06-11 14:32:42,076 - BERTopic - Clustered reduced embeddings


In [None]:
classes_complete = list(df_netflix['reviewCreatedVersion'])
topics_per_class = topic_model_netflix.topics_per_class(df_netflix['content_corrected'], classes=classes_complete)

In [40]:
topic_model_netflix.get_topic_info().head(50)

Unnamed: 0,Topic,Count,Name
0,-1,48006,-1_movies_shows_good_watch
1,0,1357,0_brightness_dark_bright_adjust
2,1,829,1_best_movies_nice_films
3,2,699,2_login_password_sign_log
4,3,667,3_white_subtitle_subtitles_read
5,4,663,4_reach_1001_sorry_later
6,5,574,5_uninstall_bloatware_delete_pre
7,6,573,6_opening_open_application_launch
8,7,487,7_ui_layout_interface_revert
9,8,456,8_hindi_tamil_dubbed_telugu


In [41]:
topic_model_netflix.reduce_topics(df_netflix['content_corrected'], nr_topics='auto')

2023-06-11 14:34:29,821 - BERTopic - Reduced number of topics from 1560 to 1134


<bertopic._bertopic.BERTopic at 0x7f85f42fb370>

In [None]:
topics_per_class_netflix = topic_model_netflix.topics_per_class(df_netflix['content_corrected'], classes=classes_complete)

In [43]:
freq_netflix = topic_model_netflix.get_topic_info()

In [44]:
freq_netflix.head(50)

Unnamed: 0,Topic,Count,Name
0,-1,48006,-1_movies_shows_good_watch
1,0,1468,0_login_password_sign_log
2,1,1357,1_brightness_dark_bright_adjust
3,2,1270,2_best_movies_nice_great
4,3,1166,3_opening_open_closing_application
5,4,1015,4_ui_layout_interface_scroll
6,5,869,5_crashing_crashes_crash_crashed
7,6,831,6_stars_star_gave_five
8,7,823,7_price_prices_expensive_raising
9,8,727,8_payment_repay_method_card


In [13]:
topic_model_netflix.get_representative_docs(7)

["very good app, however i haven't been able to cast to chromecast from the app.",
 'unable cast to my to using my chromecast.',
 'cannot cast to chromecast']

### Training YouTube

In [20]:
kmeans = KMeans(n_clusters=1500, verbose=True, random_state=42)
topic_model_youtube = BERTopic(embedding_model='all-mpnet-base-v2', vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, language='english', calculate_probabilities=True, verbose=True)
topics_youtube, probs_youtube = topic_model_youtube.fit_transform(list(df_youtube['content_corrected']))

Batches:   0%|          | 0/3726 [00:00<?, ?it/s]

2023-06-11 13:20:21,323 - BERTopic - Transformed documents to Embeddings


[D] [13:20:21.387481] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:108 n_neighbors=5
[D] [13:20:21.388170] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [13:20:32.885184] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [13:20:32.887091] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [13:20:32.887261] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.0493708, 0.0825453, 0.0682144, 0.0357585, 0.0596857, 0.00926733, 0.105324, 0.108179, 0.0149078, 0.0546074, 0.0215907, 0.0630474, 0.0416555, 0.0409136, 0.0543861, 0.0304265, 0.00800323, 0.0591908, 0.134441, 0.0737476, 0.0254669, 0.0379176, 0.0321689, 0.09132, 0.0106606 ]

[D] [13:20:32.887320] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.504383, 0.348172, 0.221043, 0.389728, 0.280698, 0.24754, 0.50026, 0.367438, 0.636706, 0.333158, 0.363716, 0.252

2023-06-11 13:20:45,266 - BERTopic - Reduced dimensionality


[D] [13:20:45.243488] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [13:20:45.244413] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [13:20:45.246158] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:443 Performing L1 normalization
[D] [13:20:45.247318] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [13:20:45.249236] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [13:20:45.250089] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:509 Performing optimization


2023-06-11 13:21:00,302 - BERTopic - Clustered reduced embeddings


In [21]:
freq_youtube = topic_model_youtube.get_topic_info()

In [22]:
freq_youtube.head(50)

Unnamed: 0,Topic,Count,Name
0,-1,60944,-1_app_ads_videos_video
1,0,6018,0_youtube_update_open_working
2,1,5507,1_ads_ad_many ads_many
3,2,5108,2_quality_video quality_resolution_480p
4,3,1795,3_dislike_dislike button_dislikes_button
5,4,1640,4_adds_add_many adds_much adds
6,5,1483,5_comments_section_layout_comment
7,6,1445,6_views_bts_deleting_deleting views
8,7,1439,7_recommendations_recommend_interested_channels
9,8,1162,8_pause_screen_button_buttons


In [38]:
topic_model_youtube.get_representative_docs(23)

['youtube keeps on freezing and deleting views from bts music videos.',
 "youtube is always deleting views from bts music videos! we work hard on streaming and y'all go and delete the views! fix this!!!",
 'stop freezing and deleting views for bts on youtube']

In [39]:
topic_model_youtube.get_topic(23)

[('views', 0.011689638359044924),
 ('bts', 0.009656458440860041),
 ('deleting', 0.007926001810020506),
 ('artists', 0.0074995750580003715),
 ('army', 0.007318133428555628),
 ('artist', 0.005464036808523589),
 ('deletes', 0.004760542701190598),
 ('fans', 0.004613249429276843),
 ('deleted', 0.0037937735906347827),
 ('ms', 0.00366729176421576)]

In [23]:
calc_topic_diversity(topic_model_youtube)

0.6062176165803109

In [11]:
classes_complete = list(df_youtube['reviewCreatedVersion'])
topics_per_class = topic_model_youtube.topics_per_class(df_youtube['content_corrected'], classes=classes_complete)

755it [00:26, 27.99it/s]


In [24]:
topic_model_youtube.reduce_topics(df_youtube['content_corrected'], nr_topics='auto')

2023-06-11 13:31:43,586 - BERTopic - Reduced number of topics from 193 to 118


<bertopic._bertopic.BERTopic at 0x7f66fc3f3340>

In [None]:
classes_complete = list(df_youtube['reviewCreatedVersion'])
topics_per_class = topic_model_youtube.topics_per_class(df_youtube['content_corrected'], classes=classes_complete)

In [27]:
freq_youtube = topic_model_youtube.get_topic_info()

In [28]:
freq_youtube.head(50)

Unnamed: 0,Topic,Count,Name
0,-1,60944,-1_app_ads_video_youtube
1,0,8578,0_ads_ad_many ads_many
2,1,7316,1_youtube_update_open_app
3,2,5108,2_quality_video quality_resolution_480p
4,3,2443,3_comment_comments_section_comment section
5,4,1939,4_dislike_dislike button_dislikes_button
6,5,1640,5_adds_add_many adds_many
7,6,1510,6_stars_star_give_app
8,7,1474,7_shorts_youtube shorts_short_disable
9,8,1445,8_views_bts_deleting_deleting views


In [17]:
calc_topic_diversity(topic_model_youtube)

0.7433333333333333

In [55]:
topic_model_youtube.get_representative_docs(47)

["videos quality is very low give us in hd option those who want to see in 1080p hd video by there choice why u are not giving us option of 1080p quality video please tell us all users want high quality video on youtube those who don't want to see they will not see in high quality and those who don't want low using of mobile they will not see in hd but u have to gave us choice ok good bye",
 'overall good but i feel too many videos are demonitized that should not be, especially true historical events. please let everyone equally utilize the platform. youtube should not be biased or politically motivated. and stop collecting and tracking our personal choices. we should be able to enjoy a road range if topics and know that we are safe and protected while doing so. maliciously exploiting your fellow countrymen and women goes against everything our great nation stands for.',
 'everything ok 1 star due to "quality." the quality or frame rate increases or set high before the video starts. th

### Training WhatsApp

In [None]:
topic_model_whatsapp = BERTopic(embedding_model='all-mpnet-base-v2', vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, language='english', calculate_probabilities=True, verbose=True)
topics_whatsapp, probs_whatsapp = topic_model_whatsapp.fit_transform(list(df_whatsapp['content_corrected']))

In [None]:
classes_complete = list(df_whatsapp['reviewCreatedVersion'])
topics_per_class = topic_model_whatsapp.topics_per_class(df_whatsapp['content_corrected'], classes=classes_complete)

In [25]:
freq_whatsapp = topic_model_whatsapp.get_topic_info().head(50)

In [26]:
freq_whatsapp

Unnamed: 0,Topic,Count,Name
0,-1,34710,-1_voice_call_calls_app
1,0,1136,0_restore_backup_drive_restored
2,1,1077,1_split_fold_screen_tablet
3,2,745,2_quality_blur_blurry_low
4,3,617,3_dark_theme_mode_color
5,4,586,4_stickers_sticker_animated_packs
6,5,530,5_family_communicate_friends_communication
7,6,528,6_notifications_notification_settings_unless
8,7,512,7_gallery_folder_images_photos
9,8,446,8_forward_forwarding_forwarded_20


In [27]:
topic_model_whatsapp.reduce_topics(df_whatsapp['content_corrected'], nr_topics = 'auto')

2023-06-11 14:24:51,369 - BERTopic - Reduced number of topics from 1176 to 792


<bertopic._bertopic.BERTopic at 0x7f85f43b2970>

In [28]:
freq_whatsapp = topic_model_whatsapp.get_topic_info()

In [29]:
freq_whatsapp.head(50)

Unnamed: 0,Topic,Count,Name
0,-1,34710,-1_voice_call_app_video
1,0,1742,0_backup_restore_drive_lost
2,1,1130,1_quality_blur_low_blurry
3,2,1077,2_split_fold_screen_tablet
4,3,1047,3_family_communication_friends_communicate
5,4,925,4_dark_theme_mode_themes
6,5,827,5_notifications_notification_open_settings
7,6,823,6_30_seconds_15_sec
8,7,783,7_banned_account_ban_reason
9,8,729,8_policy_facebook_privacy_signal


In [79]:
topic_model_whatsapp.get_representative_docs(1)

["bad because my microphone (audio and video call) are not working if i call someone he/she can't hear me.please fix it .",
 "from like 3-4 days ago my whatsapp kinda weird. like when i tried to send voice note, it was recorded but i could hear nothing, i thought it was because of microphone permission, but that's not the problem. i tried to record my voice on recording app, but the results were absolutely fine, clear etc.",
 "i don't know why it's not recording the audio on my tablet .on phone i could record my voice and send . can you please help me to fix that"]

### Training PayPal

In [31]:
topic_model_paypal = BERTopic(embedding_model='all-mpnet-base-v2', vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, language='english', calculate_probabilities=True, verbose=True)
topics_paypal, probs_paypal = topic_model_paypal.fit_transform(list(df_paypal['content_corrected']))

Batches:   0%|          | 0/1416 [00:00<?, ?it/s]

2023-06-11 14:27:46,226 - BERTopic - Transformed documents to Embeddings


[D] [14:27:46.258295] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:108 n_neighbors=5
[D] [14:27:46.258957] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [14:27:48.075736] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [14:27:48.077183] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [14:27:48.077289] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.0513363, 0.021544, 0.0475111, 0.0196955, 0.0177832, 0.0191948, 0.00296915, 0.0562973, 0.184814, 0.156624, 0.0992413, 0.022202, 0.0321569, 0.0289845, 0.0238266, 0.101858, 0.0228524, 0.0266123, 0.027761, 0.0192437, 0.0124161, 0.00614095, 0.0329423, 0.048563, 0.0430813 ]

[D] [14:27:48.077350] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.361831, 0.467776, 0.364107, 0.521235, 0.434171, 0.347409, 0.297399, 0.397382, 0.518404, 0.872324, 0.288073, 0.529

2023-06-11 14:27:50,155 - BERTopic - Reduced dimensionality


[D] [14:27:50.142838] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [14:27:50.143883] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [14:27:50.144825] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:443 Performing L1 normalization
[D] [14:27:50.145273] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [14:27:50.145720] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [14:27:50.145969] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:509 Performing optimization


2023-06-11 14:27:53,534 - BERTopic - Clustered reduced embeddings


In [32]:
topic_model_paypal.get_topic_info().head(50)

Unnamed: 0,Topic,Count,Name
0,-1,23095,-1_paypal_account_money_bank
1,0,1158,0_easy_quick_works_fast
2,1,585,1_fingerprint_pin_finger_print
3,2,566,2_games_game_playing_play
4,3,449,3_easy_fast_quick_way
5,4,368,4_link_linked_card_debit
6,5,341,5_free_charge_sending_send
7,6,298,6_password_log_correct_login
8,7,296,7_login_log_browser_parcel
9,8,247,8_code_sign_confirmation_verification


In [33]:
topic_model_paypal.reduce_topics(df_paypal['content_corrected'], nr_topics='auto')

2023-06-11 14:28:15,188 - BERTopic - Reduced number of topics from 819 to 626


<bertopic._bertopic.BERTopic at 0x7f85f3137fa0>

In [34]:
topic_model_paypal.get_topic_info().head(50)

Unnamed: 0,Topic,Count,Name
0,-1,23095,-1_paypal_account_money_bank
1,0,1849,0_easy_quick_fast_convenient
2,1,799,1_link_debit_card_linked
3,2,671,2_login_log_password_correct
4,3,659,3_fingerprint_finger_pin_print
5,4,566,4_games_game_playing_play
6,5,460,5_stars_star_give_rate
7,6,443,6_code_sms_sign_verification
8,7,429,7_crashing_crashes_crash_keeps
9,8,341,8_free_charge_sending_send


In [None]:
classes_complete = list(df_paypal['reviewCreatedVersion'])
topics_per_class = topic_model_paypal.topics_per_class(df_paypal['content_corrected'], classes=classes_complete)

In [90]:
topic_model_paypal.get_representative_docs(4)

['it is very convenient and a fast way to send and receive money',
 "i like it, it's convenient and easy to work through.",
 'very fast & convenient, makes doing business easy.']

### Training Amazon

In [15]:
topic_model_amazon = BERTopic(embedding_model='all-mpnet-base-v2', vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model, language='english', calculate_probabilities=True, verbose=True)
topics_amazon, probs_amazon = topic_model_amazon.fit_transform(list(df_amazon['content_corrected']))

Batches:   0%|          | 0/1649 [00:00<?, ?it/s]

2023-06-11 14:13:31,243 - BERTopic - Transformed documents to Embeddings


[D] [14:13:31.271733] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:108 n_neighbors=5
[D] [14:13:31.274136] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [14:13:39.632055] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [14:13:39.633702] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [14:13:39.634010] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.018105, 0.00855923, 0.122778, 0.0384922, 0.0485811, 0.0388379, 0.0620747, 0.0735893, 0.0512304, 0.037055, 0.0274744, 0.0328927, 0.0213921, 0.0446987, 0.106812, 0.00881052, 0.00516212, 0.0407591, 0.0488567, 0.0347743, 0.0569639, 0.0242438, 0.0396252, 0.0246015, 0.0559511 ]

[D] [14:13:39.634082] /opt/conda/conda-bld/work/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 0.429409, 0.294176, 0.276996, 0.553319, 0.303292, 0.476001, 0.285792, 0.478393, 0.299302, 0.742142, 0.535912, 0

2023-06-11 14:13:48,989 - BERTopic - Reduced dimensionality


[D] [14:13:48.975308] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [14:13:48.976048] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [14:13:48.978074] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:443 Performing L1 normalization
[D] [14:13:48.979047] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [14:13:48.979429] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [14:13:48.980344] /opt/conda/conda-bld/work/cpp/src/umap/runner.cuh:509 Performing optimization


2023-06-11 14:14:06,427 - BERTopic - Clustered reduced embeddings


In [None]:
classes_complete = list(df_amazon['reviewCreatedVersion'])
topics_per_class = topic_model_amazon.topics_per_class(df_amazon['content_corrected'], classes=classes_complete)

In [18]:
topic_model_amazon.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,26288,-1_order_amazon_items_service
1,0,865,0_spanish_english_language_french
2,1,480,1_ads_ad_rings_lord
3,2,467,2_delivery_package_delivered_instructions
4,3,466,3_dark_mode_theme_night
...,...,...,...
912,911,5,911_rite_stored_oneplus_helped
913,912,5,912_claim_mature_comixology_comic
914,913,5,913_september_2019_isvalwa_fading
915,914,5,914_amazonsmile_activate_step_pertaining


In [19]:
topic_model_amazon.reduce_topics(df_amazon['content_corrected'], nr_topics='auto')

2023-06-11 14:15:03,243 - BERTopic - Reduced number of topics from 917 to 682


<bertopic._bertopic.BERTopic at 0x7f85f6262790>

In [20]:
freq_amazon = topic_model_amazon.get_topic_info()

In [21]:
freq_amazon.head(50)

Unnamed: 0,Topic,Count,Name
0,-1,26288,-1_order_amazon_items_service
1,0,1535,0_crashing_crashes_open_crash
2,1,1022,1_prime_day_shipping_paying
3,2,865,2_spanish_english_language_french
4,3,689,3_sign_log_login_password
5,4,569,4_shopping_great_online_love
6,5,526,5_pakistan_country_bangladesh_usa
7,6,509,6_uninstall_delete_space_phone
8,7,500,7_gift_card_cards_account
9,8,480,8_ads_ad_rings_lord


In [96]:
topic_model_amazon.get_representative_docs(15)

['whole page ad for the bad lord of the rings show. take that off.',
 'the stupid forced full screen lord of the rings ad is very annoying to have to see everytime open the app',
 'enough with the ads and forced lord of the rings ad!']

In [99]:
freq_youtube

Unnamed: 0,Topic,Count,Name
0,0,7080,0_ad_ads_commercials_advertisement
1,1,5476,1_audio_black_sound_buffering
2,2,4980,2_voice_music_crashing_songs
3,3,4966,3_open_says_install_400
4,4,4702,4_kids_age_kid_adds
...,...,...,...
1495,1495,1,1495_harder_mention_push_lately
1496,1496,1,1496_peace_mad_forcing_vids
1497,1497,1,1497_foronly_sooó_lasts_dies
1498,1498,1,1498_rlly_responding_plz_crashes


# Save model for further use

In [45]:
topic_model_netflix.save('models/topicmodel_netflix_hdbscan_v1.model')
# topic_model_youtube.save('models/topicmodel_youtube_hdbscan_v1.model')
# topic_model_whatsapp.save('models/topicmodel_whatsapp_hdbscan_v1.model')
# topic_model_paypal.save('models/topicmodel_paypal_hdbscan_v1.model')
# topic_model_amazon.save('models/topicmodel_amazon_hdbscan_v1.model')

# Load model

In [3]:
topic_model_netflix = BERTopic.load('models/topicmodel_netflix_hdbscan_v1.model')
topic_model_youtube = BERTopic.load('models/topicmodel_youtube_hdbscan_v1.model')
topic_model_whatsapp = BERTopic.load('models/topicmodel_whatsapp_hdbscan_v1.model')
topic_model_paypal = BERTopic.load('models/topicmodel_paypal_hdbscan_v1.model')
topic_model_amazon = BERTopic.load('models/topicmodel_amazon_hdbscan_v1.model')

# Analysis of Outliers

In [27]:
print(topic_model_netflix.get_topic_info()['Count'].loc[topic_model_netflix.get_topic_info().index[0]]/len(df_netflix))
print(topic_model_youtube.get_topic_info()['Count'].loc[topic_model_youtube.get_topic_info().index[0]]/len(df_youtube))
print(topic_model_whatsapp.get_topic_info()['Count'].loc[topic_model_whatsapp.get_topic_info().index[0]]/len(df_whatsapp))
print(topic_model_paypal.get_topic_info()['Count'].loc[topic_model_paypal.get_topic_info().index[0]]/len(df_paypal))
print(topic_model_amazon.get_topic_info()['Count'].loc[topic_model_amazon.get_topic_info().index[0]]/len(df_amazon))

0.490743485683326
0.5164818573753166
0.4635664298306533
0.5097446310724613
0.49839795241255097


# Reduction of Outliers

In [4]:
new_topics_netflix = BERTopic.reduce_outliers(topic_model_netflix, df_netflix['content_corrected'], topic_model_netflix.topics_, probabilities=topic_model_netflix.probabilities_, 
                             threshold=0.010, strategy="probabilities")
topic_model_netflix.update_topics(df_netflix['content_corrected'], topics=new_topics_netflix)

In [4]:
new_topics_youtube = BERTopic.reduce_outliers(topic_model_youtube, df_youtube['content_corrected'], topic_model_youtube.topics_, probabilities=topic_model_youtube.probabilities_, 
                             threshold=0.010, strategy="probabilities")
topic_model_youtube.update_topics(df_youtube['content_corrected'], topics=new_topics_youtube)

In [5]:
new_topics_whatsapp = BERTopic.reduce_outliers(topic_model_whatsapp, df_whatsapp['content_corrected'], topic_model_whatsapp.topics_, probabilities=topic_model_whatsapp.probabilities_, 
                             threshold=0.010, strategy="probabilities")
topic_model_whatsapp.update_topics(df_whatsapp['content_corrected'], topics=new_topics_whatsapp)

In [6]:
new_topics_paypal = BERTopic.reduce_outliers(topic_model_paypal, df_paypal['content_corrected'], topic_model_paypal.topics_, probabilities=topic_model_paypal.probabilities_, 
                             threshold=0.010, strategy="probabilities")
topic_model_paypal.update_topics(df_paypal['content_corrected'], topics=new_topics_paypal)

In [7]:
new_topics_amazon = BERTopic.reduce_outliers(topic_model_amazon, df_amazon['content_corrected'], topic_model_amazon.topics_, probabilities=topic_model_amazon.probabilities_, 
                             threshold=0.010, strategy="probabilities")
topic_model_amazon.update_topics(df_amazon['content_corrected'], topics=new_topics_amazon)

# Saving new model versions

In [23]:
topic_model_netflix.save('models/topicmodel_netflix_hdbscan_v2.model')
topic_model_youtube.save('models/topicmodel_youtube_hdbscan_v2.model')
topic_model_whatsapp.save('models/topicmodel_whatsapp_hdbscan_v2.model')
topic_model_paypal.save('models/topicmodel_paypal_hdbscan_v2.model')
topic_model_amazon.save('models/topicmodel_amazon_hdbscan_v2.model')

# Loading new model versions

In [26]:
topic_model_netflix = BERTopic.load('models/topicmodel_netflix_hdbscan_v2.model')
topic_model_youtube = BERTopic.load('models/topicmodel_youtube_hdbscan_v2.model')
topic_model_whatsapp = BERTopic.load('models/topicmodel_whatsapp_hdbscan_v2.model')
topic_model_paypal = BERTopic.load('models/topicmodel_paypal_hdbscan_v2.model')
topic_model_amazon = BERTopic.load('models/topicmodel_amazon_hdbscan_v2.model')

In [34]:
topic_model_netflix.get_topic_info().sort_values(by='Count', ascending=False)

Unnamed: 0,Topic,Count,Name
0,-1,15235,-1_movies_good_you_like
16,15,5067,15_freezes_audio_video_freezing
24,23,2389,23_failure_1001_error_1023
9,8,2070,8_payment_card_repay_method
3,2,2047,2_movies_best_great_nice
...,...,...,...
1085,1084,5,1084_85_88_myth_reflects
1087,1086,5,1086_locked_geez_unlocked_indeed
1091,1090,5,1090_language_filter_voiceover_dubbed
1093,1092,5,1092_minimize_minimizes_min_suck


In [35]:
topic_model_youtube.get_topic_info().sort_values(by='Count', ascending=False)

Unnamed: 0,Topic,Count,Name
0,-1,16310,-1_you_app_premium_like
11,10,8808,10_crashing_freezes_crashes_freezing
2,1,8726,1_ads_ad_advertisements_too
1,0,4940,0_quality_resolution_480p_1080p
6,5,3436,5_loading_working_load_connection
...,...,...,...
1006,1005,5,1005_luminosity_buffoons_interfaces_mon
1070,1069,5,1069_29gb_literary_hung_downloaded
995,994,5,994_exotic_animals_animal_train
1046,1045,5,1045_processor_880_surly_fiction


In [36]:
topic_model_whatsapp.get_topic_info().sort_values(by='Count', ascending=False)

Unnamed: 0,Topic,Count,Name
0,-1,8627,-1_app_and_that_it
7,6,4249,6_30_seconds_videos_video
10,9,3752,9_call_connecting_calls_reconnecting
4,3,2573,3_family_friends_communication_easy
12,11,2247,11_online_seen_offline_last
...,...,...,...
740,739,5,739_ujjawal_shivansh_seducing_terrorist
737,736,5,736_elements_misses_dost_takes
734,733,5,733_welcomed_disc_procedure_spending
717,716,5,716_ave_mkt_tube_posting


In [42]:
df_info_whatsapp = topic_model_whatsapp.get_document_info(df_whatsapp['content_corrected'])

In [45]:
list(df_info_whatsapp[df_info_whatsapp['Topic'] == 6]['Document'])

['worked excellent until the past update. when i try to share a video, it only sends 1 second of it. the ability to share videos quickly was the biggest selling point of the app. without that, theres no point in using this app. message apps are a dime a dozen. fix the video upload issue, there are plenty of apps that do the same thing. now i have to get everyone to jump ship',
 "i've been using this app for a long time now and i like it until recently i have been having issues sending videos i would attach a video and it will load fine but once i send it the person receiving it only gets 1 second of video. i have changed my video quality so it uses less kab/mb but still same thing, i even tried to send a 3 second video and still same issue. i never had this problem before and i don't know how to fix it.",
 "app is wonderful. but even after these much features, you are failed to give an option to export multiple files. it's possible in iphone (ios). if someone send me multiple documents

In [37]:
topic_model_paypal.get_topic_info().sort_values(by='Count', ascending=False)

Unnamed: 0,Topic,Count,Name
2,1,4758,1_link_card_bank_debit
0,-1,4464,-1_you_to_for_account
1,0,3584,0_easy_quick_fast_convenient
30,29,1807,29_ebay_seller_paypal_buyer
8,7,1376,7_crashing_crashes_open_keeps
...,...,...,...
588,587,6,587_sorting_code_vpn_kicking
604,603,6,603_leads_lift_surprised_error
542,541,6,541_button_indication_satisfactory_nervous
590,589,5,589_transverse_congrats_explanatory_supports


In [40]:
topic_model_amazon.get_topic_info().sort_values(by='Count', ascending=False)

Unnamed: 0,Topic,Count,Name
6,5,5244,5_they_delivery_service_delivered
0,-1,4607,-1_to_you_the_but
1,0,3229,0_crashing_crashes_open_keeps
4,3,1907,3_password_sign_log_account
41,40,1705,40_chat_customer_service_refund
...,...,...,...
620,619,6,619_op_preferences_s20be_mam
631,630,5,630_pmone_justomer_number_buddy
673,672,5,672_fa_crashare_oneplus7_lo
640,639,5,639_winning_deception_rigged_cheated


# Example sentences for the most discussed topic for each app review dataset

In [65]:
netflix_info = pd.merge(topic_model_netflix.get_document_info(df_netflix['content_corrected']), df_netflix, left_on='Document', right_on='content_corrected')
topic_0_netflix = netflix_info[netflix_info['Topic'] == 0]

topic_0_netflix[topic_0_netflix['Representative_document'] == True]\
    ['content'].loc[16491]

"Put in my email and password and says incorrect password but on any other device it's fine"

In [56]:
youtube_info = pd.merge(topic_model_youtube.get_document_info(df_youtube['content_corrected']), df_youtube, left_on='Document', right_on='content_corrected')
topic_0_youtube = youtube_info[youtube_info['Topic'] == 0]

topic_0_youtube[topic_0_youtube['Representative_document'] == True]\
    ['content'].loc[48171]

'The new update for video quality is very annoying. It automatically either goes to 720p or goes to 144p and you have to manually set it to 480p/360p everytime. Please do fix this as son as possible.'

In [59]:
whatsapp_info = pd.merge(topic_model_whatsapp.get_document_info(df_whatsapp['content_corrected']), df_whatsapp, left_on='Document', right_on='content_corrected')
topic_0_whatsapp = whatsapp_info[whatsapp_info['Topic'] == 0]

topic_0_whatsapp[topic_0_whatsapp['Representative_document'] == True]\
    ['content'].loc[2809]

'When I trying to restore my messages from google drive backup in whatsapp, I cannot restore. It is continuously showing that preparing to restore messages. But not restoring. Fix this bug.'

In [62]:
paypal_info = pd.merge(topic_model_paypal.get_document_info(df_paypal['content_corrected']), df_paypal, left_on='Document', right_on='content_corrected')
topic_0_paypal = paypal_info[paypal_info['Topic'] == 0]

topic_0_paypal[topic_0_paypal['Representative_document'] == True]\
    ['content'].loc[34554]

'Quick, fast and easy...nothing better.'

In [64]:
amazon_info = pd.merge(topic_model_amazon.get_document_info(df_amazon['content_corrected']), df_amazon, left_on='Document', right_on='content_corrected')
topic_0_amazon = amazon_info[amazon_info['Topic'] == 0]

topic_0_amazon[topic_0_amazon['Representative_document'] == True]\
    ['content'].loc[32496]

"Keeps crashing. Won't even open."

# Evaluation

## Topic Coherence

Topic Coherence is a score that measures how similar words are within a topic. Values between -1 and 1, where the higher a value is the better

In [8]:
with open('results_topic_coherence5.txt', 'a', encoding='utf-8') as f:
    f.write(f'Netflix topics have achieved a topic coherence of: {calc_topic_coherence(topic_model_netflix, df_netflix)}\n')
# Computing time: 11m 31.8s

In [9]:
with open('results_topic_coherence5.txt', 'a', encoding='utf-8') as f:
    f.write(f'YouTube topics have achieved a topic coherence of: {calc_topic_coherence(topic_model_youtube, df_youtube)}\n')
# Computing time: 12m 51.9s

In [10]:
with open('results_topic_coherence5.txt', 'a', encoding='utf-8') as f:
    f.write(f'WhatsApp topics have achieved a topic coherence of: {calc_topic_coherence(topic_model_whatsapp, df_whatsapp)}\n')
# Computing time: 6m 58.5s

In [11]:
with open('results_topic_coherence5.txt', 'a', encoding='utf-8') as f:
    f.write(f'PayPal topics have achieved a topic coherence of: {calc_topic_coherence(topic_model_paypal, df_paypal)}\n')
# Computing time: 3m 24.1s

In [12]:
with open('results_topic_coherence5.txt', 'a', encoding='utf-8') as f:
    f.write(f'Amazon topics have achieved a topic coherence of: {calc_topic_coherence(topic_model_amazon, df_amazon)}\n')

## Topic Diversity

In [7]:
del progress_bar

Saved topic coherence for amazon to file: 100%|██████████| 5/5 [40:34<00:00, 486.83s/it]


In [22]:
# Save result to txt

progress_bar = tqdm(total=5)

with open('results_topic_diversity7_outlierR.txt', 'w', encoding='utf-8') as f:

    progress_bar.set_description('Calculating topic diversity for netflix...')
    f.write(f'Netflix topics have achieved a topic diversity of: {calc_topic_diversity(topic_model_netflix)}\n')
    progress_bar.update(1)
    progress_bar.set_description('Saved topic diversity for netflix to file')
    progress_bar.set_description('Calculating topic diversity for youtube...')
    f.write(f'YouTube topics have achieved a topic diversity of: {calc_topic_diversity(topic_model_youtube)}\n')
    progress_bar.update(1)
    progress_bar.set_description('Saved topic diversity for youtube to file')
    progress_bar.set_description('Calculating topic diversity for whatsapp...')
    f.write(f'WhatsApp topics have achieved a topic diversity of: {calc_topic_diversity(topic_model_whatsapp)}\n')
    progress_bar.update(1)
    progress_bar.set_description('Saved topic diversity for whatsapp to file')
    progress_bar.set_description('Calculating topic diversity for paypal...')
    f.write(f'PayPal topics have achieved a topic diversity of: {calc_topic_diversity(topic_model_paypal)}\n')
    progress_bar.update(1)
    progress_bar.set_description('Saved topic diversity for paypal to file')
    progress_bar.set_description('Calculating topic diversity for amazon...')
    f.write(f'Amazon topics have achieved a topic diversity of: {calc_topic_diversity(topic_model_amazon)}\n')
    progress_bar.update(1)
    progress_bar.set_description('Saved topic diversity for amazon to file')

del progress_bar
    

Saved topic diversity for amazon to file: 100%|██████████| 5/5 [00:00<00:00, 175.70it/s]   


In [18]:
topic_model_netflix.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,48006,-1_movies_shows_good_watch
1,0,1468,0_login_password_sign_log
2,1,1357,1_brightness_dark_bright_adjust
3,2,1270,2_best_movies_nice_great
4,3,1166,3_opening_open_closing_application
...,...,...,...
1129,1128,5,1128_verity_reruns_potential_pays
1130,1129,5,1129_valid_email_entered_explanation
1131,1130,5,1130_80mbspet_3xl_blocky_dick
1132,1131,5,1131_aimlessly_conversations_acknowledge_flawed


In [19]:
topic_model_youtube.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,61576,-1_ads_ad_app_many
1,0,4914,0_quality_resolution_480p_1080p
2,1,3638,1_ads_ad_advertisements_advertisement
3,2,2654,2_pause_tap_swipe_forward
4,3,1856,3_dislike_dislikes_count_removal
...,...,...,...
1093,1092,5,1092_contain_improvise_extend_enables
1094,1093,5,1093_1stars_untill_overheating_redo
1095,1094,5,1094_flagged_valorant_2025_ceases
1096,1095,5,1095_forgive_rameshwaram_simar_unknowingly


In [20]:
topic_model_whatsapp.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,34710,-1_voice_call_app_video
1,0,1742,0_backup_restore_drive_lost
2,1,1130,1_quality_blur_low_blurry
3,2,1077,2_split_fold_screen_tablet
4,3,1047,3_family_communication_friends_communicate
...,...,...,...
787,786,5,786_quo_pending_feeds_24hr
788,787,5,787_frequency_probulam_obidiegwu_beaune
789,788,5,788_idiots_unspam_intelligent_unbann
790,789,5,789_assign_becsue_plleeaassee_ringtone


In [23]:
topic_model_paypal.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,23095,-1_paypal_account_money_bank
1,0,1849,0_easy_quick_fast_convenient
2,1,799,1_link_debit_card_linked
3,2,671,2_login_log_password_correct
4,3,659,3_fingerprint_finger_pin_print
...,...,...,...
621,620,5,620_99_pressed_someing_7you
622,621,5,621_cat_employer_beloved_prefers
623,622,5,622_err_caution_frequent_verifications
624,623,5,623_insulation_fairly_function_open


In [22]:
topic_model_amazon.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,26288,-1_order_amazon_items_service
1,0,1535,0_crashing_crashes_open_crash
2,1,1022,1_prime_day_shipping_paying
3,2,865,2_spanish_english_language_french
4,3,689,3_sign_log_login_password
...,...,...,...
677,676,5,676_caulk_bag_empty_embarrassed
678,677,5,677_destroying_addicting_giveaways_addictive
679,678,5,678_faesbok_gaevirmit_idonnt_bnek
680,679,5,679_navigatable_craptastic_figuring_configuration
