Whole dataset - run bertopic

In [1]:
import pandas as pd
import numpy as np
import os

os.getcwd()

os.chdir("/Users/amymiao/Documents/UC Berkeley/Fall 8.27-12.16/Applied Computing/final-project-repo-group-four")

In [2]:
df = pd.read_csv('data/clean_processed.csv')

In [3]:
docs = df['Lyrics'].tolist()

In [50]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

stopwords_nltk = stopwords.words('english')
general_lyrics_stopwords = stopwords_nltk + [
    'la', 'lo', 'lola', 'du', 'oh', 'yeah', 'la la', 'lo lo', 'du du', 'got', 'gonna',
    'ooh', 'get', 'take', 'da', 'know', 'like', 'one', 'said', 'ron',
    'let', 'go', 'come', 'want', 'toke', 'say', 'day', 'good', 'see', 'make',
    'niggas', 'nigga', 'ya', 'choo', 'na', 'uh', 'yuh', 'hmm', 'wanna',
    'doo', 'dat', 'woo', 'nah', 'whoomp', 'yo', 'whoo', 'whatta', 'wa',
    'johnny', 'boaw', 'th', 'whoa', 'eh', 'ooo', 'um', 'dum', 'thy', 'oo',
    'the', 'on', 'and', 'ta', 'pa', 'oop', 'ha', 'bop', 'ah', 'yah', 'shes',
    'aa', 'im', 'ohoh', 'whats', 'huh', 'youre', 'dont', 'self', 'isnt', 'yeh', 
    'jo', 'cant', 'hes', 'uhuh', 'ive', 'yah','hmmmmm','ohh','woulda','nae','lil',
    'chh','ayy','aha','dit','ding','ling','feat','justin','mbabarara','rainin','burnin',
    'em','drake','sylvia','alejandro','mmm','whatcha','somethin','rumours'
]


vectorizer = CountVectorizer(
    stop_words=general_lyrics_stopwords
)

umap_model = UMAP(
    n_neighbors=10,
    n_components=10,
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

hdbscan_model = HDBSCAN(  # HDBSCAN is for clustering 
    min_cluster_size=20,   # how many docs per topic (granularity)
    min_samples=10,      # how many clusters are allowed to be formed
    prediction_data=True,
    cluster_selection_method="leaf"  # leaf gives more granular topics
)

general_topic_model = BERTopic(    # top level model (compare with HDBSCAN and umap)
    embedding_model="all-distilroberta-v1",   # all-distilroberta-v1
    vectorizer_model=vectorizer,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=20,      #if a topic survives HDBSCAN but is below this min_topic_size, it will get merged into another topic
    top_n_words=15,
    calculate_probabilities=True,
    verbose=True
)


In [51]:
topics_general, probs_general = general_topic_model.fit_transform(docs)
# Reduce the number of topics (set to the number you want)
general_topic_model.reduce_topics(docs, nr_topics=30)

general_topic_model.get_topic_info()

2025-11-23 12:32:48,100 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 196/196 [02:55<00:00,  1.12it/s]
2025-11-23 12:35:46,239 - BERTopic - Embedding - Completed ✓
2025-11-23 12:35:46,239 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 12:35:51,173 - BERTopic - Dimensionality - Completed ✓
2025-11-23 12:35:51,174 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 12:35:51,422 - BERTopic - Cluster - Completed ✓
2025-11-23 12:35:51,424 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 12:35:51,849 - BERTopic - Representation - Completed ✓
2025-11-23 12:35:52,045 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 12:35:52,049 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 12:35:52,453 - BERTopic - Representation - Completed ✓
2025-11-23 12:35:52,455 - BERTopic - Topic reduction - Reduced 

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4351,-1_love_baby_time_cause,"[love, baby, time, cause, never, girl, way, ba...",[ Where do they get off telling you That I am ...
1,0,229,0_love_never_heart_without,"[love, never, heart, without, away, cry, gone,...",[ I heard from a friend today And she said you...
2,1,194,1_baby_girl_cause_need,"[baby, girl, cause, need, back, mind, around, ...",[ I never felt nothing in the world like this ...
3,2,149,2_bitch_fuck_shit_back,"[bitch, fuck, shit, back, gon, money, city, tw...","[ Fuck bitches, get money (What?) Fuck niggas,..."
4,3,111,3_boogie_mony_shake_twist,"[boogie, mony, shake, twist, watch, dance, ton...","[(That's right) Get up and boogie, get up and ..."
5,4,99,4_love_baby_lovin_need,"[love, baby, lovin, need, girl, give, feel, lo...",[If you want my lovin' If you really do Don't ...
6,5,99,5_bay_crank_watch_soulja,"[bay, crank, watch, soulja, boy, wee, girl, wo...","[ Okay, check it, check it, check it out It'..."
7,6,96,6_baby_body_love_long,"[baby, body, love, long, way, girl, shake, fre...",[ What we gotta do right here is go back Back ...
8,7,91,7_chance_woman_girl_love,"[chance, woman, girl, love, man, mmmm, give, n...","[ Ain't another woman that can take your spot,..."
9,8,87,8_promise_hope_bright_believe,"[promise, hope, bright, believe, church, find,...",[ Well I see him on the TV Preachin' 'bout the...


In [52]:
# Re-visualize the intertopic distance map
fig = general_topic_model.visualize_topics()
fig.show()

In [83]:
docs_per_topic = general_topic_model.get_representative_docs()
docs_per_topic[28]

[" I'm tryna put you in the worst mood, ah P1 cleaner than your church shoes, ah Milli point two just to hurt you, ah All red Lamb’ just to tease you, ah None of these toys on lease too, ah Made your whole year in a week too, yah Main bitch out your league too, ah Side bitch out of your league too, ah   House so empty, need a centerpiece 20 racks a table cut from ebony Cut that ivory into skinny pieces Then she clean it with her face man I love my baby You talking money, need a hearing aid You talking bout me, I don't see the shade Switch up my style, I take any lane I switch up my cup, I kill any pain   Look what you've done I’m a motherfuckin' starboy Look what you've done I'm a motherfuckin' starboy   Every day a nigga try to test me, ah Every day a nigga try to end me, ah Pull off in that Roadster SV, ah Pockets overweight, gettin' hefty, ah Coming for the king, that's a far cry, ah I come alive in the fall time, I No competition, I don't really listen I’m in the blue Mulsanne bump

without "love","baby","girl" ---BANNEd

In [48]:
stopwords_nltk = stopwords.words('english')
general_lyrics_stopwords2 = stopwords_nltk + [
    'la', 'lo', 'lola', 'du', 'oh', 'yeah', 'la la', 'lo lo', 'du du', 'got', 'gonna',
    'ooh', 'get', 'take', 'da', 'know', 'like', 'one', 'said', 'ron',
    'let', 'go', 'come', 'want', 'toke', 'say', 'day', 'good', 'see', 'make',
    'niggas', 'nigga', 'ya', 'choo', 'na', 'uh', 'yuh', 'hmm', 'wanna',
    'doo', 'dat', 'woo', 'nah', 'whoomp', 'yo', 'whoo', 'whatta', 'wa',
    'johnny', 'boaw', 'th', 'whoa', 'eh', 'ooo', 'um', 'dum', 'thy', 'oo',
    'the', 'on', 'and', 'ta', 'pa', 'oop', 'ha', 'bop', 'ah', 'yah', 'shes',
    'aa', 'im', 'ohoh', 'whats', 'huh', 'youre', 'dont', 'self', 'isnt', 'yeh', 
    'jo', 'cant', 'hes', 'uhuh', 'ive', 'yah','hmmmmm','ohh','woulda','nae','lil',
    'chh','ayy','aha','dit','ding','ling','feat','justin','mbabarara','rainin','burnin',
    'em','drake','sylvia','alejandro','love','baby','girl','girls','boy','boys'
]


vectorizer2 = CountVectorizer(
    stop_words=general_lyrics_stopwords2
)

umap_model = UMAP(
    n_neighbors=10,
    n_components=10,
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

hdbscan_model = HDBSCAN(  # HDBSCAN is for clustering 
    min_cluster_size=20,   # how many docs per topic (granularity)
    min_samples=10,      # how many clusters are allowed to be formed
    prediction_data=True,
    cluster_selection_method="leaf"  # leaf gives more granular topics
)

general_topic_model2 = BERTopic(    # top level model (compare with HDBSCAN and umap)
    embedding_model="all-distilroberta-v1",   # all-distilroberta-v1
    vectorizer_model=vectorizer2,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    min_topic_size=20,      #if a topic survives HDBSCAN but is below this min_topic_size, it will get merged into another topic
    top_n_words=15,
    calculate_probabilities=True,
    verbose=True
)

topics_general, probs_general = general_topic_model2.fit_transform(docs)
# Reduce the number of topics (set to the number you want)
general_topic_model2.reduce_topics(docs, nr_topics=30)
general_topic_model2.get_topic_info()

2025-11-23 12:24:46,055 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 196/196 [03:05<00:00,  1.06it/s]
2025-11-23 12:27:53,283 - BERTopic - Embedding - Completed ✓
2025-11-23 12:27:53,284 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 12:27:58,497 - BERTopic - Dimensionality - Completed ✓
2025-11-23 12:27:58,498 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 12:27:58,746 - BERTopic - Cluster - Completed ✓
2025-11-23 12:27:58,748 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 12:27:59,170 - BERTopic - Representation - Completed ✓
2025-11-23 12:27:59,366 - BERTopic - Topic reduction - Reducing number of topics
2025-11-23 12:27:59,370 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 12:27:59,778 - BERTopic - Representation - Completed ✓
2025-11-23 12:27:59,779 - BERTopic - Topic reduction - Reduced 

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4351,-1_time_cause_never_way,"[time, cause, never, way, back, right, tell, n...","[Oh, darling I'm so lonely without you Can't s..."
1,0,229,0_never_heart_without_away,"[never, heart, without, away, cry, gone, time,...",[ I heard from a friend today And she said you...
2,1,194,1_cause_need_back_mind,"[cause, need, back, mind, around, tell, shake,...",[ They say around the way you've asked for me ...
3,2,149,2_bitch_fuck_shit_back,"[bitch, fuck, shit, back, gon, money, city, tw...","[ Fuck bitches, get money (What?) Fuck niggas,..."
4,3,111,3_boogie_mony_shake_twist,"[boogie, mony, shake, twist, watch, dance, mus...","[Boogie Boogie down baby Ooo-uh, boogie Baby, ..."
5,4,99,4_lovin_need_mmm_give,"[lovin, need, mmm, give, feel, loving, wo, eve...","[ Baby, I need your lovin' Baby, I need your l..."
6,5,99,5_bay_crank_watch_soulja,"[bay, crank, watch, soulja, wee, work, yuuuuuu...","[ Okay, check it, check it, check it out It'..."
7,6,96,6_body_long_way_shake,"[body, long, way, shake, fresh, party, right, ...","[ Hey, hey Bed, stay in bed The feeling of y..."
8,7,91,7_chance_woman_man_whatcha,"[chance, woman, man, whatcha, mmmm, give, need...",[ Wha-wha-wha-wha-what did you say? (J-J-J-J-J...
9,8,87,8_promise_hope_bright_believe,"[promise, hope, bright, believe, church, find,...",[ Well I see him on the TV Preachin' 'bout the...


In [49]:
# Re-visualize the intertopic distance map
fig = general_topic_model2.visualize_topics()
fig.show()