# Topic Modeling using BERTopic

In [1]:
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.15.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.8/90.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Prep

## Import libraries

In [70]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from transformers import pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt 

from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob

from tqdm import tqdm

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from sklearn.metrics.pairwise import cosine_similarity


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/martjebuss/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/martjebuss/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Load data

In [44]:
# Load df
df = pd.read_csv(
    "../data/chatgpt_after_datacleaning.csv",
    encoding="utf-8",
)

df_long = pd.read_csv(
    "../data/chatgpt-long_after_datacleaning_withoutspellcorrection.csv",
    encoding="utf-8",
)

## Remove NAs

In [45]:
df = df.dropna(subset=['content'])

In [65]:
len(df)

22371

## Delete reviews of one or two words

In [55]:
def has_multiple_words(text):
    words = text.split()
    return len(words) > 2

df = df[df['content'].apply(has_multiple_words)]


## BERTopic simple model

In [56]:
docs = list(df.content.values)
topic_model = BERTopic()

In [57]:
topics, probs = topic_model.fit_transform(docs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [58]:
topic_model.get_topic_info().head(15).set_index('Topic')[
   ['Count', 'Name', 'Representation']]

Unnamed: 0_level_0,Count,Name,Representation
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,6978,-1_and_it_to_you,"[and, it, to, you, this, app, the, of, its, ai]"
0,419,0_catgut_life_love_me,"[catgut, life, love, me, help, friend, my, is,..."
1,389,1_2021_september_update_updated,"[2021, september, update, updated, data, 2023,..."
2,377,2_log_login_sign_cant,"[log, login, sign, cant, account, unable, let,..."
3,363,3_voice_speech_text_feature,"[voice, speech, text, feature, audio, recognit..."
4,363,4_catgut_android_finally_official,"[catgut, android, finally, official, now, vers..."
5,347,5_students_study_student_homework,"[students, study, student, homework, assignmen..."
6,250,6_chat_get_because_original,"[chat, get, because, original, he, gpt, telegr..."
7,226,7_answers_answer_questions_wrong,"[answers, answer, questions, wrong, gives, que..."
8,222,8_amazing_app_this_fantastic,"[amazing, app, this, fantastic, love, wonderfu..."


In [None]:
topic_model.visualize_barchart(top_n_topics = 16, n_words = 10)

## Improving the topic model

In [59]:
main_representation_model = KeyBERTInspired()
aspect_representation_model1 = PartOfSpeech("en_core_web_sm")
aspect_representation_model2 = [KeyBERTInspired(top_n_words=30), 
                                MaximalMarginalRelevance(diversity=.5)]

representation_model = {
   "Main": main_representation_model,
   "Aspect1":  aspect_representation_model1,
   "Aspect2":  aspect_representation_model2 
}

vectorizer_model = CountVectorizer(min_df=5, stop_words = 'english')
topic_model = BERTopic(nr_topics = 'auto', 
                      vectorizer_model = vectorizer_model,
                      representation_model = representation_model)


In [60]:
topics, ini_probs = topic_model.fit_transform(docs)

In [61]:
def get_topic_stats(topic_model, extra_cols = []):
    topics_info_df = topic_model.get_topic_info().sort_values('Count', ascending = False)
    topics_info_df['Share'] = 100.*topics_info_df['Count']/topics_info_df['Count'].sum()
    topics_info_df['CumulativeShare'] = 100.*topics_info_df['Count'].cumsum()/topics_info_df['Count'].sum()
    return topics_info_df[['Topic', 'Count', 'Share', 'CumulativeShare', 
                           'Name', 'Representation'] + extra_cols]

get_topic_stats(topic_model, ['Aspect1', 'Aspect2']).head(25)\
    .set_index('Topic')

Unnamed: 0_level_0,Count,Share,CumulativeShare,Name,Representation,Aspect1,Aspect2
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1,6841,30.579768,30.579768,-1_app_apps_android_application,"[app, apps, android, application, useful, use,...","[app, good, great, useful, chat, amazing, work...","[apps, android, ai, catgut, features, conversa..."
0,2737,12.234589,42.814358,0_application_apps_app_useful,"[application, apps, app, useful, using, use, h...","[app, application, students, nice, good, usefu...","[application, apps, useful, interface, study, ..."
1,2198,9.82522,52.639578,1_catgut_ai_apps_app,"[catgut, ai, apps, app, android, processing, a...","[catgut, language, android, responses, tool, i...","[catgut, apps, processing, assistant, intellig..."
2,1041,4.653346,57.292924,2_best_greatest_excellent_good,"[best, greatest, excellent, good, great, wonde...","[good, best, amazing, bad, excellent, better, ...","[best, greatest, wonderful, thanks, impressed,..."
3,665,2.972598,60.265522,3_login_unable_password_logged,"[login, unable, password, logged, tried, accou...","[login, log, account, unable, error, page, pas...","[login, error, try, app, loading, broken, stuc..."
4,519,2.319968,62.58549,4_2021_2023_uptodate_updates,"[2021, 2023, uptodate, updates, 2022, latest, ...","[update, data, information, date, current, dat...","[2023, uptodate, latest, outdated, upgrade, se..."
5,369,1.649457,64.234947,5_voice_microphone_audio_speech,"[voice, microphone, audio, speech, speak, soun...","[voice, speech, text, audio, feature, read, as...","[voice, microphone, speak, alexa, assistant, i..."
6,291,1.300791,65.535738,6_useful_usefulness_students_purposes,"[useful, usefulness, students, purposes, stude...","[useful, students, studies, study, student, he...","[usefulness, students, purpose, studies, exams..."
7,288,1.287381,66.823119,7_chatgpt_chatgpts_chat_gpt4,"[chatgpt, chatgpts, chat, gpt4, talking, conve...","[chatgpt, conversations, model, language, enga...","[chatgpt, chat, gpt35, conversational, communi..."
8,236,1.054937,67.878056,8_helpful_helps_helped_helping,"[helpful, helps, helped, helping, useful, assi...","[helpful, lot, thanks, love, work, important, ...","[helpful, assistance, important, thankyou, doi..."


In [63]:
topic_model.visualize_barchart(top_n_topics = 21, n_words = 10)

In [68]:
topic_model.visualize_topics()

Summary: 
- Bugs with login, other errors  
- Educational purpose, studying 
- Helping, Assistance 
- Phone number for registration needed, verification
- Feature request on uploading photos, gettung responses with images 
- Translation problems for hindi speakers 
- Topic 18: future-oriented

## Dealing with Outliers

-1 represents outliers: 6841! 

In [67]:
percentage_outliers = (6841 / 22371)*100 
percentage_outliers

30.579768450225735

Four different strategies to deal with the outliers:
1. based on topic-document probabilities,
2. based on topic distributions,
3. based on c-TF-IFD representations,
4. based on document and topic embeddings.

## Topics by Class (Different Versions of the App)

In [64]:
topics_per_class = topic_model.topics_per_class(docs, 
    classes=filt_df.hotel)

topic_model.visualize_topics_per_class(topics_per_class, 
    top_n_topics=10, normalize_frequency = True)

NameError: name 'filt_df' is not defined

## Reducing the number of topics

In [69]:
topic_model.visualize_heatmap(n_clusters = 20)

In [71]:
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_))
dist_df = pd.DataFrame(distance_matrix, columns=topic_model.topic_labels_.values(), 
                       index=topic_model.topic_labels_.values())

tmp = []
for rec in dist_df.reset_index().to_dict('records'):
    t1 = rec['index']
    for t2 in rec:
        if t2 == 'index': 
            continue
        tmp.append(
            {
                'topic1': t1, 
                'topic2': t2, 
                'distance': rec[t2]
            }
        )

pair_dist_df = pd.DataFrame(tmp)

pair_dist_df = pair_dist_df[(pair_dist_df.topic1.map(
      lambda x: not x.startswith('-1'))) & 
            (pair_dist_df.topic2.map(lambda x: not x.startswith('-1')))]
pair_dist_df = pair_dist_df[pair_dist_df.topic1 < pair_dist_df.topic2]
pair_dist_df.sort_values('distance', ascending = False).head(20)

Unnamed: 0,topic1,topic2,distance
18771,112_android_chat_mobile_chats,12_chat_chats_chatty_app,0.883406
2204,12_chat_chats_chatty_app,45_chat_chatty_talk_getting,0.862427
18645,111_app_ads_ad_application,52_ads_ad_free_youtube,0.846736
2166,12_chat_chats_chatty_app,7_chatgpt_chatgpts_chat_gpt4,0.81765
19097,114_programming_education_reading_skills,6_useful_usefulness_students_purposes,0.796908
11212,66_bangladesh_nepal_apply_luck,89_bangladesh_mobile_india_app,0.793711
238,0_application_apps_app_useful,71_app_google_googled_youtube,0.789005
5716,33_google_googled_search_searching,71_app_google_googled_youtube,0.788343
18099,108_2023_ai_2022_2021,4_2021_2023_uptodate_updates,0.78792
300,0_application_apps_app_useful,133_app_mindblowing_application_mind,0.77911


### Merging

Intentionally left blank as we should take a deeper look into the clusters to see what can be merged

In [None]:
#topic_model.merge_topics(docs, [[26, 74], [43, 68, 62], [16, 50, 91]])
#df['merged_topic'] = topic_model.topics_

## Topic Distributions

In [72]:
topic_distr, topic_token_distr = topic_model.approximate_distribution(
      docs, window = 4, calculate_tokens=True)

In [74]:
topic_model.visualize_distribution(topic_distr[1])

In [79]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distributions
df = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])
df

Unnamed: 0,ve,been,using,catgut,for,while,but,ve.1,just,tested,out,the,microphone,speech,recognition,option,for.1,the.1,first,time,and,let,say,COMPLETELY,BLOWN,away,not,seriously,It,literally,puts,ALL,the.2,expressions,punctuation,in,the.3,right,place,No,matter,how,you,talk,it,converts,it.1,without,problem,It.1,amazing,and.1,will,probably,will.1,never,type,to,catgut.1,again,Still,though,that,some,outstanding,work,Now,we,wait,for.2,voice,responses,from,the.4,both,Hopefully
0_application_apps_app_useful,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.201,0.329,0.329,0.229,0.129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1_catgut_ai_apps_app,0.0,0.185,0.369,0.694,0.694,0.509,0.325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325,0.649,0.649,0.649,0.325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.124,0.124,0.124,0.124,0.0
2_best_greatest_excellent_good,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105,0.21,0.447,0.447,0.342,0.237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3_login_unable_password_logged,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156,0.297,0.297,0.297,0.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_voice_microphone_audio_speech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249,0.541,0.843,1.163,1.087,0.795,0.492,0.172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.339,0.666,1.182,1.698,1.359,1.033,0.516,0.0,0.0
10_images_photos_image_pictures,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146,0.146,0.146,0.146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11_answers_answering_answered_questions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.108,0.108,0.108,0.108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14_chat_chats_whatsapp_messages,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129,0.129,0.129,0.129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19_words_word_speechless_extraordinary,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129,0.129,0.129,0.129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22_prompt_prompts_editing_editor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
