# Setup used

Ram = 32 GB <br>
GPU = GeForce 3060ti RTX 6 GB <br>
CPU = AMD Ryzen 9 5900X 12-Core Processor, 3701 MHz, 12 cores, 24 logical cores

All packages were installed inside a conda virtual environment on WSL (Windows Subsystem for Linux).

# Install packages here

# Import packages here

In [1]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import nltk
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import pandas as pd
from cuml.cluster import HDBSCAN # We are using the GPU accelerated version of HDBSCAN
from cuml.manifold import UMAP # We are using the GPU accelerated version of UMAP
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import numpy as np

# Initialize functions here

## Topic Coherence

In [2]:
def calc_topic_coherence(model, frame, coherence='c_v', processes=10):
    """Compute a gensim coherence score for the topics of a fitted BERTopic model.

    All documents assigned to the same topic are joined into one text, tokenized
    with the analyzer of the model's own vectorizer, and handed to gensim's
    ``CoherenceModel`` together with the top words of every non-outlier topic.

    Args:
        model: Fitted BERTopic model; ``topics_``, ``vectorizer_model``,
            ``get_topic`` and ``_preprocess_text`` are used.
        frame: DataFrame whose ``content_corrected`` column holds the documents,
            in the same order as ``model.topics_``.
        coherence: Coherence measure passed to ``CoherenceModel`` (default ``'c_v'``).
        processes: Number of worker processes passed to ``CoherenceModel``.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    topics = model.topics_

    # One concatenated "document" per topic
    documents = pd.DataFrame({"Document": frame['content_corrected'],
                              "ID": range(len(frame['content_corrected'])),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

    # Tokenize exactly like the vectorizer used by the model. Only the analyzer
    # is needed: the former `get_feature_names()` call was unused and no longer
    # exists in recent scikit-learn releases.
    analyzer = model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Use the topic ids that actually occur (skipping the -1 outlier topic)
    # instead of assuming ids are contiguous and that an outlier topic exists.
    topic_ids = sorted(topic for topic in set(topics) if topic != -1)
    # Empty strings are skipped; presumably BERTopic pads short topics with
    # them (TODO confirm) and they can never be part of the dictionary.
    topic_words = [[word for word, _ in model.get_topic(topic) if word]
                   for topic in topic_ids]

    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=coherence,
                                     processes=processes)

    return coherence_model.get_coherence()

## Topic Diversity

In [3]:
def calc_topic_diversity(model, topk=3):
    """Score the topic diversity of a fitted BERTopic model with OCTIS.

    Args:
        model: Fitted BERTopic model; ``topics_`` and ``get_topic`` are used.
        topk: Number of top words per topic handed to ``TopicDiversity``
            (default 3, the value that was previously hard-coded).

    Returns:
        The value returned by ``TopicDiversity(topk=topk).score(...)``.
    """
    # Every topic id that occurs in the assignments is scored (this includes
    # -1 when present); sorting makes the topic order deterministic.
    topic_ids = sorted(set(model.topics_))

    # OCTIS metrics read the topic word lists from the 'topics' key
    model_output = {
        'topics': [[word for word, _ in model.get_topic(topic_id)]
                   for topic_id in topic_ids]
    }

    return TopicDiversity(topk=topk).score(model_output)

# Load data

In [4]:
# Load the preprocessed review datasets, one CSV per app.
# (Plain string literals: the paths contain no placeholders, so no f-prefix.)
df_netflix = pd.read_csv('preprocessed_data/prep_netflix_v4.csv')
df_youtube = pd.read_csv('preprocessed_data/prep_youtube_v4.csv')
df_whatsapp = pd.read_csv('preprocessed_data/prep_whatsapp_v4.csv')
df_paypal = pd.read_csv('preprocessed_data/prep_paypal_v4.csv')
# NOTE(review): the saved cell output shows a warning raised from this line
# (possibly a mixed-dtype warning) - confirm and set explicit dtypes if so.
df_amazon = pd.read_csv('preprocessed_data/prep_amazon_v4.csv')

  df_amazon = pd.read_csv(f'preprocessed_data/prep_amazon_v4.csv')


# Model training

## Initialize CountVectorizer for n-gram extraction and stop-word removal

In [6]:
# Fetch the NLTK stop-word corpus that the CountVectorizer set up below reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/denis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# English stop words from NLTK are dropped when the vectorizer counts tokens.
english_stop_words = nltk.corpus.stopwords.words('english')
vectorizer_model = CountVectorizer(stop_words=english_stop_words)

## Initialize model

Depending on the dataset, the `n_neighbors` argument may need to be changed (a higher `n_neighbors` gives a broader view of the data, a lower one a more local view).

In [8]:
# GPU (cuML) dimensionality reduction and clustering handed to BERTopic.
# NOTE(review): the very same instances are passed to every BERTopic model
# below; if BERTopic fits them in place, the models share fitted state - confirm.
umap_params = dict(
    n_neighbors=5,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    verbose=True,
    random_state=42,
)
hdbscan_params = dict(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

umap_model = UMAP(**umap_params)
hdbscan_model = HDBSCAN(**hdbscan_params)

## Training

### Training Netflix

In [None]:
# Fit a BERTopic model on the corrected Netflix review texts.
topic_model_netflix = BERTopic(
    embedding_model='all-mpnet-base-v2',
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)
netflix_docs = df_netflix['content_corrected'].tolist()
topics_netflix, probs_netflix = topic_model_netflix.fit_transform(netflix_docs)

In [None]:
# Reduce the number of Netflix topics; nr_topics='auto' leaves the target count to BERTopic.
# The result is not reassigned, so this presumably updates the model in place - confirm for the installed version.
topic_model_netflix.reduce_topics(df_netflix['content_corrected'], nr_topics='auto')

### Training YouTube

In [None]:
# Fit a BERTopic model on the corrected YouTube review texts.
# The former `kmeans = KMeans(...)` line was removed: KMeans is never imported
# in this notebook (NameError on a fresh kernel) and the object was never used;
# clustering is done by the shared HDBSCAN model like for the other datasets.
topic_model_youtube = BERTopic(
    embedding_model='all-mpnet-base-v2',
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)
topics_youtube, probs_youtube = topic_model_youtube.fit_transform(list(df_youtube['content_corrected']))

In [None]:
# Reduce the number of YouTube topics; nr_topics='auto' leaves the target count to BERTopic.
# The result is not reassigned, so this presumably updates the model in place - confirm for the installed version.
topic_model_youtube.reduce_topics(df_youtube['content_corrected'], nr_topics='auto')

### Training WhatsApp

In [None]:
# Fit a BERTopic model on the corrected WhatsApp review texts.
topic_model_whatsapp = BERTopic(
    embedding_model='all-mpnet-base-v2',
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)
whatsapp_docs = df_whatsapp['content_corrected'].tolist()
topics_whatsapp, probs_whatsapp = topic_model_whatsapp.fit_transform(whatsapp_docs)

In [None]:
# Reduce the number of WhatsApp topics; nr_topics='auto' leaves the target count to BERTopic.
# The result is not reassigned, so this presumably updates the model in place - confirm for the installed version.
topic_model_whatsapp.reduce_topics(df_whatsapp['content_corrected'], nr_topics = 'auto')

### Training PayPal

In [None]:
# Fit a BERTopic model on the corrected PayPal review texts.
topic_model_paypal = BERTopic(
    embedding_model='all-mpnet-base-v2',
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)
paypal_docs = df_paypal['content_corrected'].tolist()
topics_paypal, probs_paypal = topic_model_paypal.fit_transform(paypal_docs)

In [None]:
# Reduce the number of PayPal topics; nr_topics='auto' leaves the target count to BERTopic.
# The result is not reassigned, so this presumably updates the model in place - confirm for the installed version.
topic_model_paypal.reduce_topics(df_paypal['content_corrected'], nr_topics='auto')

### Training Amazon

In [None]:
# Fit a BERTopic model on the corrected Amazon review texts.
topic_model_amazon = BERTopic(
    embedding_model='all-mpnet-base-v2',
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)
amazon_docs = df_amazon['content_corrected'].tolist()
topics_amazon, probs_amazon = topic_model_amazon.fit_transform(amazon_docs)

In [None]:
# Reduce the number of Amazon topics; nr_topics='auto' leaves the target count to BERTopic.
# The result is not reassigned, so this presumably updates the model in place - confirm for the installed version.
topic_model_amazon.reduce_topics(df_amazon['content_corrected'], nr_topics='auto')

# Save model for further use

In [45]:
# Persist every fitted model (version 1: before outlier reduction).
for app_name, fitted_model in (
    ('netflix', topic_model_netflix),
    ('youtube', topic_model_youtube),
    ('whatsapp', topic_model_whatsapp),
    ('paypal', topic_model_paypal),
    ('amazon', topic_model_amazon),
):
    fitted_model.save(f'models/topicmodel_{app_name}_hdbscan_v1.model')

# Load model

In [3]:
# Restore the version-1 models saved above, one file per app.
MODEL_PATH_V1 = 'models/topicmodel_{}_hdbscan_v1.model'

topic_model_netflix = BERTopic.load(MODEL_PATH_V1.format('netflix'))
topic_model_youtube = BERTopic.load(MODEL_PATH_V1.format('youtube'))
topic_model_whatsapp = BERTopic.load(MODEL_PATH_V1.format('whatsapp'))
topic_model_paypal = BERTopic.load(MODEL_PATH_V1.format('paypal'))
topic_model_amazon = BERTopic.load(MODEL_PATH_V1.format('amazon'))

# Analysis of Outliers

In [27]:
# Share of documents assigned to the outlier topic (Topic == -1) per dataset.
# The outlier row is selected by its topic id rather than by assuming it is
# the first row of get_topic_info(), and the table is built once per model.
# A model without an outlier topic yields a share of 0.
for fitted_model, reviews in (
    (topic_model_netflix, df_netflix),
    (topic_model_youtube, df_youtube),
    (topic_model_whatsapp, df_whatsapp),
    (topic_model_paypal, df_paypal),
    (topic_model_amazon, df_amazon),
):
    topic_info = fitted_model.get_topic_info()
    outlier_count = topic_info.loc[topic_info['Topic'] == -1, 'Count'].sum()
    print(outlier_count / len(reviews))

0.490743485683326
0.5164818573753166
0.4635664298306533
0.5097446310724613
0.49839795241255097


# Reduction of Outliers

In [4]:
# Compute new topic assignments for Netflix with reduce_outliers
# (strategy "probabilities", threshold 0.010) and pass them to update_topics.
netflix_docs = df_netflix['content_corrected']
new_topics_netflix = topic_model_netflix.reduce_outliers(
    netflix_docs,
    topic_model_netflix.topics_,
    probabilities=topic_model_netflix.probabilities_,
    threshold=0.010,
    strategy="probabilities",
)
topic_model_netflix.update_topics(netflix_docs, topics=new_topics_netflix)

In [4]:
# Compute new topic assignments for YouTube with reduce_outliers
# (strategy "probabilities", threshold 0.010) and pass them to update_topics.
youtube_docs = df_youtube['content_corrected']
new_topics_youtube = topic_model_youtube.reduce_outliers(
    youtube_docs,
    topic_model_youtube.topics_,
    probabilities=topic_model_youtube.probabilities_,
    threshold=0.010,
    strategy="probabilities",
)
topic_model_youtube.update_topics(youtube_docs, topics=new_topics_youtube)

In [5]:
# Compute new topic assignments for WhatsApp with reduce_outliers
# (strategy "probabilities", threshold 0.010) and pass them to update_topics.
whatsapp_docs = df_whatsapp['content_corrected']
new_topics_whatsapp = topic_model_whatsapp.reduce_outliers(
    whatsapp_docs,
    topic_model_whatsapp.topics_,
    probabilities=topic_model_whatsapp.probabilities_,
    threshold=0.010,
    strategy="probabilities",
)
topic_model_whatsapp.update_topics(whatsapp_docs, topics=new_topics_whatsapp)

In [6]:
# Compute new topic assignments for PayPal with reduce_outliers
# (strategy "probabilities", threshold 0.010) and pass them to update_topics.
paypal_docs = df_paypal['content_corrected']
new_topics_paypal = topic_model_paypal.reduce_outliers(
    paypal_docs,
    topic_model_paypal.topics_,
    probabilities=topic_model_paypal.probabilities_,
    threshold=0.010,
    strategy="probabilities",
)
topic_model_paypal.update_topics(paypal_docs, topics=new_topics_paypal)

In [7]:
# Compute new topic assignments for Amazon with reduce_outliers
# (strategy "probabilities", threshold 0.010) and pass them to update_topics.
amazon_docs = df_amazon['content_corrected']
new_topics_amazon = topic_model_amazon.reduce_outliers(
    amazon_docs,
    topic_model_amazon.topics_,
    probabilities=topic_model_amazon.probabilities_,
    threshold=0.010,
    strategy="probabilities",
)
topic_model_amazon.update_topics(amazon_docs, topics=new_topics_amazon)

# Saving new model versions

In [23]:
# Persist every model again as version 2 (after outlier reduction).
for app_name, fitted_model in (
    ('netflix', topic_model_netflix),
    ('youtube', topic_model_youtube),
    ('whatsapp', topic_model_whatsapp),
    ('paypal', topic_model_paypal),
    ('amazon', topic_model_amazon),
):
    fitted_model.save(f'models/topicmodel_{app_name}_hdbscan_v2.model')

# Loading new model versions

In [26]:
# Restore the version-2 models saved above, one file per app.
MODEL_PATH_V2 = 'models/topicmodel_{}_hdbscan_v2.model'

topic_model_netflix = BERTopic.load(MODEL_PATH_V2.format('netflix'))
topic_model_youtube = BERTopic.load(MODEL_PATH_V2.format('youtube'))
topic_model_whatsapp = BERTopic.load(MODEL_PATH_V2.format('whatsapp'))
topic_model_paypal = BERTopic.load(MODEL_PATH_V2.format('paypal'))
topic_model_amazon = BERTopic.load(MODEL_PATH_V2.format('amazon'))

In [34]:
# Netflix topic overview, largest topics first.
netflix_topic_info = topic_model_netflix.get_topic_info()
netflix_topic_info.sort_values('Count', ascending=False)

Unnamed: 0,Topic,Count,Name
0,-1,15235,-1_movies_good_you_like
16,15,5067,15_freezes_audio_video_freezing
24,23,2389,23_failure_1001_error_1023
9,8,2070,8_payment_card_repay_method
3,2,2047,2_movies_best_great_nice
...,...,...,...
1085,1084,5,1084_85_88_myth_reflects
1087,1086,5,1086_locked_geez_unlocked_indeed
1091,1090,5,1090_language_filter_voiceover_dubbed
1093,1092,5,1092_minimize_minimizes_min_suck


In [35]:
# YouTube topic overview, largest topics first.
youtube_topic_info = topic_model_youtube.get_topic_info()
youtube_topic_info.sort_values('Count', ascending=False)

Unnamed: 0,Topic,Count,Name
0,-1,16310,-1_you_app_premium_like
11,10,8808,10_crashing_freezes_crashes_freezing
2,1,8726,1_ads_ad_advertisements_too
1,0,4940,0_quality_resolution_480p_1080p
6,5,3436,5_loading_working_load_connection
...,...,...,...
1006,1005,5,1005_luminosity_buffoons_interfaces_mon
1070,1069,5,1069_29gb_literary_hung_downloaded
995,994,5,994_exotic_animals_animal_train
1046,1045,5,1045_processor_880_surly_fiction


In [36]:
# WhatsApp topic overview, largest topics first.
whatsapp_topic_info = topic_model_whatsapp.get_topic_info()
whatsapp_topic_info.sort_values('Count', ascending=False)

Unnamed: 0,Topic,Count,Name
0,-1,8627,-1_app_and_that_it
7,6,4249,6_30_seconds_videos_video
10,9,3752,9_call_connecting_calls_reconnecting
4,3,2573,3_family_friends_communication_easy
12,11,2247,11_online_seen_offline_last
...,...,...,...
740,739,5,739_ujjawal_shivansh_seducing_terrorist
737,736,5,736_elements_misses_dost_takes
734,733,5,733_welcomed_disc_procedure_spending
717,716,5,716_ave_mkt_tube_posting


In [37]:
# PayPal topic overview, largest topics first.
paypal_topic_info = topic_model_paypal.get_topic_info()
paypal_topic_info.sort_values('Count', ascending=False)

Unnamed: 0,Topic,Count,Name
2,1,4758,1_link_card_bank_debit
0,-1,4464,-1_you_to_for_account
1,0,3584,0_easy_quick_fast_convenient
30,29,1807,29_ebay_seller_paypal_buyer
8,7,1376,7_crashing_crashes_open_keeps
...,...,...,...
588,587,6,587_sorting_code_vpn_kicking
604,603,6,603_leads_lift_surprised_error
542,541,6,541_button_indication_satisfactory_nervous
590,589,5,589_transverse_congrats_explanatory_supports


In [40]:
# Amazon topic overview, largest topics first.
amazon_topic_info = topic_model_amazon.get_topic_info()
amazon_topic_info.sort_values('Count', ascending=False)

Unnamed: 0,Topic,Count,Name
6,5,5244,5_they_delivery_service_delivered
0,-1,4607,-1_to_you_the_but
1,0,3229,0_crashing_crashes_open_keeps
4,3,1907,3_password_sign_log_account
41,40,1705,40_chat_customer_service_refund
...,...,...,...
620,619,6,619_op_preferences_s20be_mam
631,630,5,630_pmone_justomer_number_buddy
673,672,5,672_fa_crashare_oneplus7_lo
640,639,5,639_winning_deception_rigged_cheated


# Example sentences for the most discussed topic for each app review dataset

In [65]:
# Join BERTopic's per-document info back onto the Netflix reviews (matching on
# the corrected text) so the original, uncorrected 'content' can be shown.
# NOTE(review): duplicated review texts would match several rows in this merge,
# and 16491 is a hard-coded row label of the merged frame.
netflix_doc_info = topic_model_netflix.get_document_info(df_netflix['content_corrected'])
netflix_info = netflix_doc_info.merge(df_netflix, left_on='Document', right_on='content_corrected')
topic_0_netflix = netflix_info.loc[netflix_info['Topic'] == 0]

netflix_is_representative = topic_0_netflix['Representative_document'] == True
topic_0_netflix.loc[netflix_is_representative, 'content'].loc[16491]

"Put in my email and password and says incorrect password but on any other device it's fine"

In [56]:
# Join BERTopic's per-document info back onto the YouTube reviews (matching on
# the corrected text) so the original, uncorrected 'content' can be shown.
# NOTE(review): duplicated review texts would match several rows in this merge,
# and 48171 is a hard-coded row label of the merged frame.
youtube_doc_info = topic_model_youtube.get_document_info(df_youtube['content_corrected'])
youtube_info = youtube_doc_info.merge(df_youtube, left_on='Document', right_on='content_corrected')
topic_0_youtube = youtube_info.loc[youtube_info['Topic'] == 0]

youtube_is_representative = topic_0_youtube['Representative_document'] == True
topic_0_youtube.loc[youtube_is_representative, 'content'].loc[48171]

'The new update for video quality is very annoying. It automatically either goes to 720p or goes to 144p and you have to manually set it to 480p/360p everytime. Please do fix this as son as possible.'

In [59]:
# Join BERTopic's per-document info back onto the WhatsApp reviews (matching on
# the corrected text) so the original, uncorrected 'content' can be shown.
# NOTE(review): duplicated review texts would match several rows in this merge,
# and 2809 is a hard-coded row label of the merged frame.
whatsapp_doc_info = topic_model_whatsapp.get_document_info(df_whatsapp['content_corrected'])
whatsapp_info = whatsapp_doc_info.merge(df_whatsapp, left_on='Document', right_on='content_corrected')
topic_0_whatsapp = whatsapp_info.loc[whatsapp_info['Topic'] == 0]

whatsapp_is_representative = topic_0_whatsapp['Representative_document'] == True
topic_0_whatsapp.loc[whatsapp_is_representative, 'content'].loc[2809]

'When I trying to restore my messages from google drive backup in whatsapp, I cannot restore. It is continuously showing that preparing to restore messages. But not restoring. Fix this bug.'

In [62]:
# Join BERTopic's per-document info back onto the PayPal reviews (matching on
# the corrected text) so the original, uncorrected 'content' can be shown.
# NOTE(review): duplicated review texts would match several rows in this merge,
# and 34554 is a hard-coded row label of the merged frame.
paypal_doc_info = topic_model_paypal.get_document_info(df_paypal['content_corrected'])
paypal_info = paypal_doc_info.merge(df_paypal, left_on='Document', right_on='content_corrected')
topic_0_paypal = paypal_info.loc[paypal_info['Topic'] == 0]

paypal_is_representative = topic_0_paypal['Representative_document'] == True
topic_0_paypal.loc[paypal_is_representative, 'content'].loc[34554]

'Quick, fast and easy...nothing better.'

In [64]:
# Join BERTopic's per-document info back onto the Amazon reviews (matching on
# the corrected text) so the original, uncorrected 'content' can be shown.
# NOTE(review): duplicated review texts would match several rows in this merge,
# and 32496 is a hard-coded row label of the merged frame.
amazon_doc_info = topic_model_amazon.get_document_info(df_amazon['content_corrected'])
amazon_info = amazon_doc_info.merge(df_amazon, left_on='Document', right_on='content_corrected')
topic_0_amazon = amazon_info.loc[amazon_info['Topic'] == 0]

amazon_is_representative = topic_0_amazon['Representative_document'] == True
topic_0_amazon.loc[amazon_is_representative, 'content'].loc[32496]

"Keeps crashing. Won't even open."

# Evaluation

## Topic Coherence

Topic coherence is a score that measures how similar the words within a topic are. Values lie between -1 and 1; the higher the value, the better.

In [8]:
# Append the Netflix coherence score to the shared results file.
with open('results_topic_coherence.txt', mode='a', encoding='utf-8') as results_file:
    netflix_coherence = calc_topic_coherence(topic_model_netflix, df_netflix)
    results_file.write(f'Netflix topics have achieved a topic coherence of: {netflix_coherence}\n')

In [9]:
# Append the YouTube coherence score to the shared results file.
with open('results_topic_coherence.txt', mode='a', encoding='utf-8') as results_file:
    youtube_coherence = calc_topic_coherence(topic_model_youtube, df_youtube)
    results_file.write(f'YouTube topics have achieved a topic coherence of: {youtube_coherence}\n')

In [10]:
# Append the WhatsApp coherence score to the shared results file.
with open('results_topic_coherence.txt', mode='a', encoding='utf-8') as results_file:
    whatsapp_coherence = calc_topic_coherence(topic_model_whatsapp, df_whatsapp)
    results_file.write(f'WhatsApp topics have achieved a topic coherence of: {whatsapp_coherence}\n')

In [11]:
# Append the PayPal coherence score to the shared results file.
with open('results_topic_coherence.txt', mode='a', encoding='utf-8') as results_file:
    paypal_coherence = calc_topic_coherence(topic_model_paypal, df_paypal)
    results_file.write(f'PayPal topics have achieved a topic coherence of: {paypal_coherence}\n')

In [12]:
# Append the Amazon coherence score to the shared results file.
with open('results_topic_coherence.txt', mode='a', encoding='utf-8') as results_file:
    amazon_coherence = calc_topic_coherence(topic_model_amazon, df_amazon)
    results_file.write(f'Amazon topics have achieved a topic coherence of: {amazon_coherence}\n')

## Topic Diversity

In [7]:
# Drop a progress bar left over from an earlier interactive run, if there is one.
# No cell above this one creates `progress_bar`, so an unconditional `del`
# raises NameError when the notebook is run top to bottom on a fresh kernel.
if 'progress_bar' in globals():
    del progress_bar

Saved topic coherence for amazon to file: 100%|██████████| 5/5 [40:34<00:00, 486.83s/it]


In [22]:
# Save result to txt

# One entry per dataset: (label written to the results file,
# lowercase name shown in the progress bar, fitted model).
diversity_runs = [
    ('Netflix', 'netflix', topic_model_netflix),
    ('YouTube', 'youtube', topic_model_youtube),
    ('WhatsApp', 'whatsapp', topic_model_whatsapp),
    ('PayPal', 'paypal', topic_model_paypal),
    ('Amazon', 'amazon', topic_model_amazon),
]

# The context managers close the progress bar and the results file even if a
# diversity calculation fails part-way through.
with tqdm(total=len(diversity_runs)) as progress_bar:
    with open('results_topic_diversity_outlierR.txt', 'w', encoding='utf-8') as f:
        for label, app_name, fitted_model in diversity_runs:
            progress_bar.set_description(f'Calculating topic diversity for {app_name}...')
            f.write(f'{label} topics have achieved a topic diversity of: {calc_topic_diversity(fitted_model)}\n')
            progress_bar.update(1)
            progress_bar.set_description(f'Saved topic diversity for {app_name} to file')

# Remove the name again so no stale progress bar lingers in the kernel.
del progress_bar
    

Saved topic diversity for amazon to file: 100%|██████████| 5/5 [00:00<00:00, 175.70it/s]   
