In [2]:
import pandas as pd
import numpy as np
import os
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np
import random

# Directory that relative paths used later in the notebook (e.g. 'data/clean_processed.csv')
# are resolved against. Set the PROJECT_ROOT environment variable to run on another
# machine; the fallback is the original hard-coded checkout location, so behaviour is
# unchanged when the variable is not set.
PROJECT_ROOT = os.environ.get(
    "PROJECT_ROOT",
    "/Users/amymiao/Documents/UC Berkeley/Fall 8.27-12.16/Applied Computing/final-project-repo-group-four",
)

# Raises FileNotFoundError if the directory does not exist, exactly as before.
os.chdir(PROJECT_ROOT)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the lyrics table; the path is relative to the current working directory.
lyrics_csv_path = 'data/clean_processed.csv'
df = pd.read_csv(lyrics_csv_path)

In [4]:
# Keep only the rows whose "Year" lies in the 2010s, i.e. 2010 <= Year < 2020.
release_year = df["Year"]
in_2010s = (release_year >= 2010) & (release_year < 2020)
df_2010s = df[in_2010s]

# Display (rows, columns) of the decade subset.
df_2010s.shape

(981, 11)

In [5]:
# One entry per row of the 2010s subset, taken from the 'Lyrics' column;
# this list is the document collection handed to the topic model.
lyrics_column = df_2010s['Lyrics']
docs = lyrics_column.tolist()

In [6]:
# Single seed shared by every stochastic component configured in this cell.
SEED = 42

# Seed both global generators: Python's `random` module and NumPy's legacy global RNG.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)

# Base list: NLTK's English stopwords.
stopwords_nltk = stopwords.words('english')

# Hand-picked additions for this lyrics corpus: vocal fillers ('oh', 'yeah', ...),
# contractions written without apostrophes ('dont', 'youre', ...), a few names and
# other tokens the author chose to exclude from topic representations.
extra_lyric_stopwords = [
    'la', 'lo', 'lola', 'du', 'oh', 'yeah', 'la la', 'lo lo', 'du du', 'got', 'gonna',
    'ooh', 'get', 'take', 'da', 'know', 'like', 'one', 'said', 'ron',
    'let', 'go', 'come', 'want', 'toke', 'say', 'day', 'good', 'see', 'make',
    'niggas', 'nigga', 'ya', 'choo', 'na', 'uh', 'yuh', 'hmm', 'wanna',
    'doo', 'dat', 'woo', 'nah', 'whoomp', 'yo', 'whoo', 'whatta', 'wa',
    'johnny', 'boaw', 'th', 'whoa', 'eh', 'ooo', 'um', 'dum', 'thy', 'oo',
    'the', 'on', 'and', 'ta', 'pa', 'oop', 'ha', 'bop', 'ah', 'yah', 'shes',
    'aa', 'im', 'ohoh', 'whats', 'huh', 'youre', 'dont', 'self', 'isnt', 'yeh',
    'jo', 'cant', 'hes', 'uhuh', 'ive', 'yah','hmmmmm','ohh','woulda','nae','lil',
    'chh','ayy','aha','dit','ding','ling','feat','justin','mbabarara','rainin','burnin',
    'em','drake','sylvia','alejandro','mmm','whatcha','somethin','rumours'
]

# Final stopword list passed to the vectorizer: NLTK entries first, then the additions.
lyrics_2010s_stopwords = stopwords_nltk + extra_lyric_stopwords

# Token counter used by BERTopic to build topic representations; every token in
# `lyrics_2010s_stopwords` is dropped. All other CountVectorizer settings are defaults.
vectorizer = CountVectorizer(stop_words=lyrics_2010s_stopwords)

# Dimensionality-reduction step for the document embeddings.
# Seeded with SEED so the projection is repeatable across runs.
umap_settings = dict(
    n_neighbors=5,
    n_components=10,
    min_dist=0.0,
    metric="cosine",
    random_state=SEED,
)
umap_model = UMAP(**umap_settings)

# Clustering step applied to the reduced embeddings.
hdbscan_settings = dict(
    min_cluster_size=10,              # how many docs per topic (granularity)
    min_samples=10,                   # how strict the clustering is
    prediction_data=True,
    cluster_selection_method="leaf",  # leaf gives more granular topics, usually more topics
)
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Assemble the BERTopic pipeline for the 2010s lyrics from the components defined above.
#
# `min_topic_size` is intentionally not passed: BERTopic only uses it to build its
# default HDBSCAN model, so it has no effect when a custom `hdbscan_model` is supplied.
# Topic granularity is controlled by `min_cluster_size` on `hdbscan_model` instead.
topic_model_2010s = BERTopic(
    embedding_model="all-distilroberta-v1",   # sentence-transformers model name used to embed each lyric
    vectorizer_model=vectorizer,              # stopword-filtered token counts for topic keywords
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    calculate_probabilities=True,             # also return per-document topic probabilities from fit_transform
    verbose=True                              # log each pipeline stage while fitting
)


In [7]:
# Fit the pipeline on the 2010s lyrics; returns one topic id per document and,
# because calculate_probabilities=True, the topic probabilities per document.
topics_2010s, probs_2010s = topic_model_2010s.fit_transform(docs)

# Merge the fitted topics down to 10 (the resulting table includes the -1 outlier row).
# Be careful when reducing topics: topics that do not make sense can instead be
# excluded manually afterwards.
topic_model_2010s.reduce_topics(docs, nr_topics=10)

# reduce_topics updates the model in place, so the values returned by fit_transform
# above still describe the pre-reduction topics. Re-read them from the model so the
# variables agree with the topic ids shown in the table below.
topics_2010s = topic_model_2010s.topics_
probs_2010s = topic_model_2010s.probabilities_

# Summary table: one row per topic with its size, name and representative terms/docs.
topic_info_2010s = topic_model_2010s.get_topic_info()
topic_info_2010s

2025-11-30 15:01:55,462 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 31/31 [00:25<00:00,  1.21it/s]
2025-11-30 15:02:23,265 - BERTopic - Embedding - Completed ✓
2025-11-30 15:02:23,265 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-11-30 15:02:26,827 - BERTopic - Dimensionality - Completed ✓
2025-11-30 15:02:26,828 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-30 15:02:26,852 - BERTopic - Cluster - Completed ✓
2025-11-30 15:02:26,854 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 15:02:26,941 - BERTopic - Representation - Completed ✓
2025-11-30 15:02:27,031 - BERTopic - Topic reduction - Reducing number of topics
2025-11-30 15:02:27,033 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 15:02:27,115 - BERTopic

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,444,-1_love_baby_cause_girl,"[love, baby, cause, girl, hey, time, way, back...","[ Yeah... right... Usher baby... okay, yeah ..."
1,0,201,0_love_baby_never_cause,"[love, baby, never, cause, feel, heart, tell, ...","[ Oh, woah Oh, woah Oh, woah You know you ..."
2,1,86,1_bitch_fuck_ass_shit,"[bitch, fuck, ass, shit, walk, mma, back, mone...","[ Southside Hah, it's Gucci It's Drop Top, Wiz..."
3,2,63,2_doh_cake_put_baby,"[doh, cake, put, baby, work, party, girl, hey,...",[ Party Rock Yeah Whoo! Let's go! Party rock...
4,3,63,3_thunder_made_man_solo,"[thunder, made, man, solo, cause, cold, away, ...","[ Lately, I've been, I've been losing sleep Dr..."
5,4,54,4_back_little_night_old,"[back, little, night, old, road, country, girl...",[ Been going round and round all day Bailing s...
6,5,22,5_que_de_el_je,"[que, de, el, je, te, tu, eu, en, tú, mi]","[ D-D-D-D-D-DY ¡Ay, Daddy! Play N' Skillz Ka..."
7,6,19,6_beautiful_says_love_baby,"[beautiful, says, love, baby, rock, cause, gir...",[ She just wants to be beautiful She goes unno...
8,7,15,7_burn_fire_try_rolling,"[burn, fire, try, rolling, deep, radioactive, ...",[ There's a fire starting in my heart Reaching...
9,8,14,8_low_young_dumb_turn,"[low, young, dumb, turn, para, broke, aaa, sai...",[ So you're still thinking of me Just like I k...


In [8]:
# Plot every document in 2-D, coloured by the topic currently stored on the model.
# The `topics` argument of visualize_documents is a *selection of topic ids to show*,
# not the per-document assignments returned by fit_transform, so it is left unset
# here; all topics are then drawn.
topic_model_2010s.visualize_documents(docs)

: 

In [11]:
# Representative documents stored on the model, indexed by topic id.
docs_per_topic = topic_model_2010s.get_representative_docs()

# Show the example lyrics for topic 1.
topic_1_examples = docs_per_topic[1]
topic_1_examples

[" Southside Hah, it's Gucci It's Drop Top, Wizop, ooh If Young Metro don't trust you (yeah) I'm gon' shoot ya (ayy)   You get the bag and fumble it I get the bag and flip it and tumble it (yeah) Straight out the lot, 300 cash (cash) And the car came with a blunt in it (yeah) Lil mama a thot, and she got ass (thot) Then she gon' fuck up a bag (yeah) Pull up to the spot, livin' too fast (yeah) Droppin' the dope in the stash (yah) In Italy, got two foreign hoes, they DM me (ooh, brr, ayy) Drop the top when it's cold (drop top) But you feel the heat (skrrt, yah, ayy) Be real with me Keep it 100, just be real with me (ayy) Eat it up like it's a feast (whoa, eat it up) They say the dope on fleek (yep)   Percocet pill on me (Percocet) Ice on my neck, baby, chill with me (ice) Them niggas that post in the back don't say nothin' Them niggas will kill for me Back ends I count in my sleep, on fleek 100k spent on a Patek Phillippe (Phillippe) Bitch, I'm a dog, eat my treat (hrr) Hop out the frog 

In [13]:
# Representative documents stored on the model, indexed by topic id.
docs_per_topic = topic_model_2010s.get_representative_docs()

# Show the example lyrics for topic 0.
topic_0_examples = docs_per_topic[0]
topic_0_examples

['   Oh, woah Oh, woah Oh, woah   You know you love me, I know you care Just shout whenever and I’ll be there You want my love, you want my heart And we will never, ever, ever be apart Are we an item? Girl, quit playing We’re just friends, what are you saying? Said "There’s another," and looked right in my eyes My first love broke my heart for the first time, and I was like...   Baby, baby, baby oh Like baby, baby, baby no Like baby, baby, baby no oh Thought you\'d always be mine, mine Baby, baby, baby oh Like baby, baby, baby no Like baby, baby, baby no oh Thought you’d always be mine, mine   Oh, for you I would have done whatever And I just can’t believe we ain\'t together And I wanna play it cool, but I\'m losing you I\'ll buy you anything, I\'ll buy you any ring And I\'m in pieces, baby, fix me And just shake me \'til you wake me from this bad dream I\'m going down, down, down, down And I just can’t believe my first love won’t be around, and I\'m like...   Baby, baby, baby oh Like 