# Topic modeling with BERT

Roughly following: https://towardsdatascience.com/meet-bertopic-berts-cousin-for-advanced-topic-modeling-ea5bf0b7faa3

BERTopic documentation: https://maartengr.github.io/BERTopic/index.html

In [15]:
# Setting to ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from bertopic import BERTopic

# Text processing libraries
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import emoji
import contractions  # from https://github.com/kootenpv/contractions
import string

pd.set_option('display.max_colwidth', 100)

## 1 Prepare Data

In [16]:
# Read data and clean a bit
spotify = pd.read_csv("../data/reviews.csv")
data_in = spotify.copy()
data_in = data_in.drop_duplicates(subset="Review")  # Drop dupes in Reviews
data_in.drop(["Total_thumbsup",
             "Reply"], axis=1, inplace=True)
data_in["Length"] = data_in["Review"].str.split(" ").str.len()
data_in = data_in[data_in.Length < 150]
data_in.drop(["Length"], axis=1, inplace=True)
data = data_in
data.head(10)

Unnamed: 0,Time_submitted,Review,Rating
0,2022-07-09 15:00:00,"Great music service, the audio is high quality and the app is easy to use. Also very quick and f...",5
1,2022-07-09 14:21:22,Please ignore previous negative rating. This app is super great. I give it five stars+,5
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience on Android 12"" is too annoying. Please let's get ri...",4
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't put on my playlist??? And why do we have shuffle p...,1
5,2022-07-09 13:20:20,The player controls sometimes disappear for no reason. App restart forgets what I was playing bu...,3
6,2022-07-09 13:19:21,I love the selection and the lyrics are provided with the song you're listening to!,5
7,2022-07-09 13:17:22,Still extremely slow when changing storage to external sd card.. I'm convinced this is done on p...,3
8,2022-07-09 13:16:49,"It's a great app and the best mp3 music app I have ever used but there is one problem that, why ...",5
9,2022-07-09 13:11:32,"I'm deleting this app, for the following reasons: This app now has a failing business model. Whe...",1


In [17]:
## Define a function to do some text cleaning
def clean_text(text):
    text = emoji.demojize(text)  # fix emojis
    text = contractions.fix(text)  # fix contractions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # remove strange fonts
    text = re.sub(r"\d+", "number", text)  # replace numbers with "number"
    text = re.sub(r'[^\w\s]', '', text)  # remove non-alphanumeric chars
    text = re.sub(r'\b\w{1}\b', '', text) # remove words less than 3 characters
    text = text.replace('_', ' ')  # replace underscores with space
    text = text.strip()  # strip extra spaces
    return text.lower()

In [18]:
## Test our text cleaner
clean_text("I ❤️ Spotify, even though it is critizised for exploiting artists")

'red heart spotify even though it is critizised for exploiting artists'

In [19]:
data["Review"] = data["Review"].apply(clean_text)

In [20]:
## Tokenize/remove punctuations
tokenizer = RegexpTokenizer(r'\w+')

In [21]:
data["Review"] = data["Review"].apply(tokenizer.tokenize)

In [22]:
data.sample(10)

Unnamed: 0,Time_submitted,Review,Rating
17194,2022-05-29 08:43:41,"[very, good, app, for, listening, music, musical, scoremusical, notes]",5
28143,2022-04-28 00:54:43,"[will, not, let, me, play, music, click, on, this, one, song, and, it, plays, me, something, way...",3
31314,2022-04-21 07:44:49,"[enjoy, the, play, list, because, can, choose]",5
31971,2022-04-20 13:39:22,"[lately, have, noticed, the, app, has, about, number, chance, of, not, working, when, first, ope...",1
8112,2022-06-21 16:20:27,"[this, good, but, never, dawnload, song, person, facepalming]",3
9381,2022-06-18 16:33:11,"[got, to, love, free, music, and, podcast, thank, you, spotifybeaming, face, with, smiling, eyes]",5
9601,2022-06-18 06:23:01,"[after, uninstalled, for, months, am, reinstalling, this, app, the, previous, reason, for, unist...",2
26661,2022-05-02 04:30:56,"[love, it, one, of, the, few, apps, that, really, like, easy, to, use, even, for, someone, as, t...",5
38108,2022-04-12 01:03:51,"[the, new, update, is, super, buggy, songs, keep, stopping, randomly, also, there, is, no, playe...",1
53689,2022-02-09 18:27:53,"[li, give, this, app, number, star, because, you, only, get, number, skips, and, you, have, to, ...",1


In [23]:
# Get NLTK English stopwords
our_stopwords = set(stopwords.words('english'))
additional_stopwords = ["spotify", "app", "apps"]
our_stopwords.update(additional_stopwords)
#our_stopwords.remove("not")
# Remove stopwords
# We loop through each list of words and remove stopwords
data['Review'] = data['Review'].apply(lambda x: [word for word in x if word not in (our_stopwords)])
data.sample(5)

Unnamed: 0,Time_submitted,Review,Rating
50466,2022-02-22 10:53:23,"[offline, playback, garbage, downloaded, stuff, would, go, back, itunes, library, rogan]",2
20598,2022-05-19 06:20:44,"[days, keep, pause, disconnect, lost, super, annoying, feel, like, using, turn, soundcloud, inst...",1
59850,2022-01-20 06:24:31,"[keeps, saying, offline, although, good, signal, data]",1
20440,2022-05-19 17:30:51,"[playlists, awesome, thumbs]",5
16988,2022-05-29 19:16:55,"[still, best, streaming, music, service, yet, rating, gone, since, put, random, ads, podcasts, r...",3


In [24]:
## Lemmatise tokens
## Download these if needed
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dominikfreunberger/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dominikfreunberger/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
## Test lemmatizer
[lemmatizer.lemmatize(word) for word in ["reasons", "plays", "playlists", "apps", "app", "leaves", "leaf"]]

['reason', 'play', 'playlist', 'apps', 'app', 'leaf', 'leaf']

In [26]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

In [27]:
data["Review"] = data["Review"].apply(lemmatize_text)

In [28]:
data.head(10)

Unnamed: 0,Time_submitted,Review,Rating
0,2022-07-09 15:00:00,"[great, music, service, audio, high, quality, easy, use, also, quick, friendly, support]",5
1,2022-07-09 14:21:22,"[please, ignore, previous, negative, rating, super, great, give, five, star]",5
2,2022-07-09 13:27:32,"[popup, get, best, experience, android, number, annoying, please, let, u, get, rid]",4
3,2022-07-09 13:26:45,"[really, buggy, terrible, use, recently]",1
4,2022-07-09 13:20:49,"[dear, get, song, put, playlist, shuffle, play]",1
5,2022-07-09 13:20:20,"[player, control, sometimes, disappear, reason, restart, forgets, playing, fix, issue]",3
6,2022-07-09 13:19:21,"[love, selection, lyric, provided, song, listening]",5
7,2022-07-09 13:17:22,"[still, extremely, slow, changing, storage, external, sd, card, convinced, done, purpose, know, ...",3
8,2022-07-09 13:16:49,"[great, best, mpnumber, music, ever, used, one, problem, cannot, play, song, find, song, despite...",5
9,2022-07-09 13:11:32,"[deleting, following, reason, failing, business, model, whether, streaming, service, like, consu...",1


In [29]:
data["Review"] = data["Review"].apply(lambda l: " ".join(l))
data.head()

Unnamed: 0,Time_submitted,Review,Rating
0,2022-07-09 15:00:00,great music service audio high quality easy use also quick friendly support,5
1,2022-07-09 14:21:22,please ignore previous negative rating super great give five star,5
2,2022-07-09 13:27:32,popup get best experience android number annoying please let u get rid,4
3,2022-07-09 13:26:45,really buggy terrible use recently,1
4,2022-07-09 13:20:49,dear get song put playlist shuffle play,1


In [30]:
data = data.reset_index()

In [31]:
model = BERTopic(verbose=True,embedding_model='paraphrase-MiniLM-L3-v2', min_topic_size = 50)

In [21]:
review_topics, _ = model.fit_transform(data["Review"])

Batches: 100%|██████████| 1918/1918 [04:27<00:00,  7.18it/s]
2022-12-18 15:27:27,433 - BERTopic - Transformed documents to Embeddings
2022-12-18 15:28:24,705 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2022-12-18 15:28:32,168 - BERTopic - Clustered reduced embeddings


In [None]:
model.save("../models/bert_topic_model")

In [34]:
model = model.load("../models/bert_topic_model")

In [35]:
freq = model.get_topic_info()
print("Number of topics: {}".format(len(freq)))
freq.head(7)

Number of topics: 115


Unnamed: 0,Topic,Count,Name
0,-1,25552,-1_playlist_song_listen_number
1,0,3824,0_playing_stop_pause_bar
2,1,1922,1_song_love_find_favorite
3,2,1525,2_shuffle_play_playlist_song
4,3,1493,3_log_account_logged_login
5,4,1432,4_ad_minute_number_every
6,5,1385,5_podcast_podcasts_episode_stop


In [36]:
a_topic = freq.iloc[1]["Topic"] # Select the 1st topic
model.get_topic(a_topic) # Show the words and their c-TF-IDF scores

[('playing', 0.022956288037585874),
 ('stop', 0.022174742384027772),
 ('pause', 0.018814986660804223),
 ('bar', 0.015711207532191164),
 ('control', 0.014049069265785363),
 ('randomly', 0.013576976333477419),
 ('update', 0.01262804117629182),
 ('phone', 0.011477279341211946),
 ('keep', 0.010970527220981997),
 ('fix', 0.010708118626688194)]

In [37]:
model.visualize_barchart(top_n_topics=12)

In [38]:
model.visualize_topics()

In [39]:
model.visualize_hierarchy(top_n_topics=50)

In [40]:
# Select most 3 similar topics
similar_topics, similarity = model.find_topics("bug", top_n = 3)
similar_topics

[44, 61, 81]

In [41]:
most_similar = similar_topics[0]
print("Most Similar Topic Info: \n{}".format(model.get_topic(most_similar)))
print("Similarity Score: {}".format(similarity[0]))

Most Similar Topic Info: 
[('buggy', 0.13015931795584082), ('bug', 0.1242785862446515), ('fix', 0.024684106563145146), ('lately', 0.021479107802768013), ('fixed', 0.02055804475804504), ('super', 0.019251414422199895), ('recent', 0.018819382676115732), ('extremely', 0.018543584035248493), ('update', 0.017688537872655303), ('downhill', 0.016693536176433817)]
Similarity Score: 0.9055053847679345


## Dynamic Topic Modeling (DTM)
https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html

In [42]:
data["Time_submitted"] = pd.to_datetime(data["Time_submitted"])
data["Week"] = data["Time_submitted"].dt.isocalendar().week

In [43]:
## Topics over time
topics_over_time = model.topics_over_time(data["Review"], data["Week"])

28it [00:15,  1.81it/s]


In [47]:
fig = model.visualize_topics_over_time(topics_over_time, top_n_topics=75, normalize_frequency=True)
fig.update_layout(xaxis_title="Week number")
fig.update_xaxes(range=[0, 27])
fig.show()