In [None]:
%pip install bertopic 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import bertopic
from bertopic import BERTopic


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

df = pd.read_csv('SPOTIFY_REVIEWS_CLEANED.csv')
df.shape

(3308617, 13)

In [4]:
import datetime as dt
df['review_timestamp'] = pd.to_datetime(df['review_timestamp'])
df2 = df[(df['review_timestamp'] >= '2019-11-15') & (df['review_timestamp'] <= '2023-11-15')]
print("Original:", len(df))
print("After the most recent 5 year:", len(df2))

Original: 3308617
After the most recent 5 year: 1669701


In [5]:
df_final = df2[~df2['length_type'].isin(['Short', 'Very short'])].reset_index(drop=True)
print(df2['length_type'].value_counts())
print(df_final['length_type'].value_counts())


length_type
Very short    614051
Long          587948
Short         269962
Medium        197652
Name: count, dtype: int64
length_type
Long      587948
Medium    197652
Name: count, dtype: int64


In [6]:
df_final = df_final[~df_final['review_text'].isin(['[deleted]', '[removed]'])].reset_index(drop=True)
df_final.shape
df_final.head()

Unnamed: 0.1,Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp,raw_word_count,length_type,length_type2,tokens
0,1663991,979989f1-78a2-4576-a783-c763ae7a9ffa,157768270865747512306,A Google user,I love the fact that I can listen to nearly an...,4,1,8.5.31.676,2019-11-15 00:02:50,68.0,Long,Long,"['i', 'love', 'the', 'fact', 'that', 'i', 'can..."
1,1663997,bd98f73f-1bb1-42f8-ad7a-d12f4c3662e9,280812221700598190021,A Google user,Randomly stops playing.... I will be listening...,2,2,8.5.31.676,2019-11-15 00:18:54,64.0,Long,Long,"['randomly', 'stops', 'playing', '.', '.', '.'..."
2,1663998,7167a53f-b1c0-4eae-9f33-26295c73de76,765391996510868237903,A Google user,So far this has been a better experience than ...,5,0,8.5.31.676,2019-11-15 00:19:11,22.0,Long,Long,"['so', 'far', 'this', 'has', 'been', 'a', 'bet..."
3,1664000,9fb431b7-7f2b-494c-8f0d-2805dec40b70,307958352133874143584,A Google user,Great app for looking up and listening to ur f...,5,0,,2019-11-15 00:20:11,11.0,Long,Long,"['great', 'app', 'for', 'looking', 'up', 'and'..."
4,1664001,e3872654-d562-422e-96c1-d2480db446f4,180538848993703574960,A Google user,I love this app so .uch cause zi get to listen...,5,0,8.5.29.828,2019-11-15 00:20:20,18.0,Long,Long,"['i', 'love', 'this', 'app', 'so', '.', 'uc', ..."


In [7]:
import re

def light_clean(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df_final['review_text_clean'] = df_final['review_text'].apply(light_clean)
docs = df_final['review_text_clean'].tolist()

In [13]:
#%pip install scikit-learn
#%pip install PCA
from sklearn.decomposition import PCA
from bertopic import BERTopic

%pip install bertopic sentence-transformers umap-learn hdbscan scikit-learn



Note: you may need to restart the kernel to use updated packages.


In [None]:
#simplifier model(chatgpt)
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import umap, hdbscan, torch

# 1) Pick a *small* but strong SBERT (fast)
device = "cuda" if torch.cuda.is_available() else "cpu"
emb_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# 2) Fast vectorizer (removes punctuation via token_pattern; English stopwords)
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=5,            # raise for speed on big corpora
    max_df=0.95,
    stop_words="english",
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b"  # words ≥2 letters; skips punctuation/digits
)

# 3) Aggressive dimensionality reduction (fewer components = faster)
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,      # ↓ speeds clustering a lot; try 5–10
    min_dist=0.0,
    metric="cosine",
    random_state=42,
    verbose=True
)

# 4) Faster density clustering (larger min_cluster_size = fewer, faster clusters)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=50,     # tune: larger = faster, coarser topics
    metric="euclidean",      # BERTopic default after UMAP
    cluster_selection_method="eom",
    prediction_data=False    # disable soft probs for speed
)

# 5) Build the fast BERTopic
topic_model = BERTopic(
    embedding_model=emb_model,
    vectorizer_model=vectorizer,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    low_memory=True,
    calculate_probabilities=False,  # big speed win
    nr_topics="auto",               # or an int to force a target # of topics
    verbose=True
)

# ---- Your docs list here (list[str]) ----
# Example:
# docs = df['review_text'].fillna("").astype(str).tolist()

topics, probs = topic_model.fit_transform(docs)  # probs=None since disabled

# Inspect
topic_info = topic_model.get_topic_info()
print(topic_info.head())            # topic sizes
print(topic_model.get_topic(0)[:10])  # top terms for topic 0

# Representative docs per topic
rep = topic_model.get_representative_docs()
# rep is {topic_id: [doc1, doc2, ...]}


2025-11-10 12:06:03,798 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 24553/24553 [50:52<00:00,  8.04it/s]  
2025-11-10 12:57:25,346 - BERTopic - Embedding - Completed ✓
2025-11-10 12:57:25,495 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_components=5, n_jobs=1, random_state=42, verbose=True)


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Mon Nov 10 12:57:29 2025 Construct fuzzy simplicial set
Mon Nov 10 12:57:30 2025 Finding Nearest Neighbors
Mon Nov 10 12:57:30 2025 Building RP forest with 49 trees
Mon Nov 10 13:00:32 2025 NN descent for 20 iterations
	 1  /  20
	 2  /  20
	 3  /  20
	 4  /  20
	 5  /  20
	 6  /  20
	Stopping threshold met -- exiting after 6 iterations
Mon Nov 10 13:03:38 2025 Finished Nearest Neighbor Search
Mon Nov 10 13:03:55 2025 Construct embedding


: 

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# 1. Load a small + fast BERT embedding model
emb_model2 = SentenceTransformer("all-MiniLM-L6-v2")

# 2. Create BERTopic with defaults (simple mode)
topic_model2 = BERTopic(embedding_model=emb_model)


# 4. Fit and get topic assignments
topics, probs = topic_model2.fit_transform(docs)

# 5. View top topics
topic_info2 = topic_model2.get_topic_info()
print(topic_info2)

# 6. View top words for a topic (example: topic 0)
print(topic_model2.get_topic(0))
