### Import Packages

In [1]:
import os
import json
import pandas as pd

### Import Data

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be read by path in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the Amazon Alexa review data (tab-separated) from the mounted Drive.
# The path is absolute, rooted at the '/content/drive' mount point, so the read
# does not depend on the kernel's current working directory.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amazon_alexa.tsv', sep='\t')

### Modeling

In [4]:
from bertopic import BERTopic

In [5]:
# Collect the review texts as a plain list of strings, one entry per DataFrame row.
# pandas parses blank TSV fields as float NaN, while BERTopic expects every
# document to be a string; missing values therefore become '' and all values
# are cast to str. The list length and row alignment are unchanged.
reviews = df['verified_reviews'].fillna('').astype(str).to_list()

In [6]:
# Configure BERTopic: topic terms may be 1- to 3-word phrases, the reduced
# number of topics is chosen automatically, per-document topic probabilities
# are computed, and each pipeline stage is logged.
# NOTE(review): no random seed is set anywhere in this cell, so the resulting
# topics may differ between runs — confirm whether reproducibility is required.
topic_model = BERTopic(
    n_gram_range=(1, 3),
    nr_topics='auto',
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the raw review texts and keep the topic assignments and probabilities.
topics, probabilities = topic_model.fit_transform(reviews)

Batches:   0%|          | 0/99 [00:00<?, ?it/s]

2022-04-24 21:09:29,681 - BERTopic - Transformed documents to Embeddings
2022-04-24 21:09:57,907 - BERTopic - Reduced dimensionality with UMAP
2022-04-24 21:09:58,841 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-04-24 21:10:04,850 - BERTopic - Reduced number of topics from 79 to 46


In [7]:
# Summary table of the fitted topics (columns: Topic, Count, Name);
# display its first ten rows.
freq = topic_model.get_topic_info()
freq.head(n=10)

Unnamed: 0,Topic,Count,Name
0,0,939,0_the_to_and_echo
1,-1,663,-1_the_to_and_it
2,1,231,1_easy_set up_easy to_set
3,2,191,2_love it_love_it love_it love it
4,3,108,3_product_great product_great_product great
5,4,66,4_barry_meh ight_meh ight barry_ight barry barry
6,5,61,5_great_cool_cool amazing_great very
7,6,56,6_works_great works_works great works_works like
8,7,53,7_stick_fire_fire stick_firestick
9,8,50,8_learning_still_still learning_fun


In [8]:
# Take the topic id found at position 5 (the sixth row) of the summary table
# and list that topic's terms together with their scores.
topic_nr = freq["Topic"].iloc[5]
topic_model.get_topic(topic_nr)

[('barry', 1.7326400761741159),
 ('meh ight', 0.9433135549577893),
 ('meh ight barry', 0.9433135549577893),
 ('ight barry barry', 0.9433135549577893),
 ('ight barry', 0.9433135549577893),
 ('ight', 0.9433135549577893),
 ('barry barry', 0.9433135549577893),
 ('meh', 0.8212911912405938),
 ('', 1e-05),
 ('', 1e-05)]

In [9]:
# Display BERTopic's bar-chart visualization for the fitted `topic_model`
# (the rendered figure is not captured in this export).
topic_model.visualize_barchart()

In [10]:
# Display BERTopic's topic-overview visualization for the fitted `topic_model`
# (the rendered figure is not captured in this export).
topic_model.visualize_topics()

In [11]:
# Second reduction pass on the already-fitted model: pass back the documents
# with their current topic assignments and probabilities, and again let
# BERTopic choose the target number of topics.
# NOTE(review): this argument list matches the BERTopic release this notebook
# was run with; the reduce_topics signature may differ in other releases —
# confirm against the installed version.
new_topics, new_probs = topic_model.reduce_topics(
    reviews,
    topics,
    probabilities,
    nr_topics='auto',
)

2022-04-24 21:10:13,812 - BERTopic - Reduced number of topics from 46 to 39


In [19]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Encode every review up front with the "all-MiniLM-L6-v2" sentence-transformer
# (progress bar suppressed).
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(reviews, show_progress_bar=False)

# Fit a default-configured BERTopic model on the precomputed embeddings.
# NOTE(review): this rebinds `topics` and `probabilities`, which previously held
# the results of the earlier `topic_model` fit — confirm that is intended.
model = BERTopic()
topics, probabilities = model.fit_transform(reviews, embeddings=embeddings)

2022-04-24 21:19:35,987 - BERTopic - Reduced dimensionality with UMAP
2022-04-24 21:19:36,921 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-04-24 21:19:41,585 - BERTopic - Reduced number of topics from 73 to 73


In [13]:
# First ten rows of the topic summary table (Topic, Count, Name) for the
# model fitted on precomputed embeddings.
model.get_topic_info().head(n=10)

Unnamed: 0,Topic,Count,Name
0,-1,774,-1_to_the_and_my
1,0,101,0_dot_echo_dots_the
2,1,91,1_loves_gift_one_it
3,2,71,2_smart_home_hub_thermostat
4,3,70,3_tv_netflix_hulu_watch
5,4,66,4_barry_ight_meh_
6,5,63,5_music_listen_many_it
7,6,61,6_cool_amazing_paroduct_terrific
8,7,58,7_show_echo_video_the
9,8,57,8_alexa_or_she_to


In [14]:
# Display BERTopic's bar-chart visualization for `model`, the topic model
# fitted on precomputed embeddings (figure not captured in this export).
model.visualize_barchart()

In [15]:
# Display BERTopic's topic-overview visualization for `model`, the topic model
# fitted on precomputed embeddings (figure not captured in this export).
model.visualize_topics()

In [16]:
# Persist `model` (the topic model fitted on precomputed embeddings, not
# `topic_model`) under the name "my_topics_model".
# NOTE(review): the path is relative, so the file lands in the kernel's current
# working directory rather than the mounted Drive — confirm that location persists.
model.save("my_topics_model")