In [2]:
import pandas as pd
import numpy as np
import os
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np
import random

# Run the notebook from the repository root so that relative paths such as
# 'data/clean_processed.csv' resolve. The location can be overridden with the
# LYRICS_PROJECT_ROOT environment variable; the default is the original
# hard-coded checkout, so an existing setup behaves exactly as before.
PROJECT_ROOT = os.environ.get(
    "LYRICS_PROJECT_ROOT",
    "/Users/amymiao/Documents/UC Berkeley/Fall 8.27-12.16/Applied Computing/final-project-repo-group-four",
)

if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)
else:
    # On another machine the default path does not exist. Keep the current
    # directory (e.g. Jupyter launched from the repo root) instead of aborting
    # the whole notebook with FileNotFoundError.
    print(f"PROJECT_ROOT not found ({PROJECT_ROOT!r}); staying in {os.getcwd()!r}")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read data/clean_processed.csv into a DataFrame; the path is relative to the
# current working directory.
df = pd.read_csv(filepath_or_buffer='data/clean_processed.csv')

In [4]:
# Keep only rows whose Year falls in the half-open range [2010, 2020),
# then show (rows, columns) of the subset.
df_2010s = df.loc[df["Year"].ge(2010) & df["Year"].lt(2020)]
df_2010s.shape

(981, 11)

In [5]:
# One list entry per row of the 2010s subset, taken from the 'Lyrics' column;
# this list is what gets passed to the topic model.
docs = df_2010s.loc[:, 'Lyrics'].to_list()

In [14]:
# --- Reproducibility ---------------------------------------------------------
# Seed Python's and NumPy's global RNGs; UMAP additionally receives the seed
# through its own random_state argument below.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Single source of truth for the minimum topic size. It is passed to HDBSCAN
# (which actually applies it) and to BERTopic's min_topic_size so the two
# values cannot drift apart.
MIN_TOPIC_SIZE = 10

# --- Stop words --------------------------------------------------------------
# NLTK's English list plus lyric-specific filler (ad-libs, vocalisations and
# very common verbs) that would otherwise dominate the topic keywords.
# Only single-word entries are listed: CountVectorizer compares stop words
# against individual tokens, so multi-word entries such as 'la la' could never
# match and have been dropped; their single-word forms are already included.
stopwords_nltk = stopwords.words('english')
custom_lyric_stopwords = [
    'la', 'lo', 'lola', 'du', 'oh', 'yeah', 'got', 'gonna',
    'ooh', 'get', 'take', 'da', 'know', 'like', 'one', 'said', 'ron', 'baby',
    'let', 'go', 'come', 'want', 'toke', 'say', 'day', 'good', 'see', 'make',
    'niggas', 'nigga', 'love', 'ya', 'choo', 'na', 'uh', 'yuh', 'hmm', 'wanna',
    'doo', 'dat', 'woo', 'nah', 'whoomp', 'yo', 'whoo', 'whatta',
]
# dict.fromkeys() removes duplicates while keeping first-seen order.
lyrics_2010s_stopwords = list(dict.fromkeys(stopwords_nltk + custom_lyric_stopwords))

vectorizer = CountVectorizer(
    stop_words=lyrics_2010s_stopwords
)

# --- Dimensionality reduction ------------------------------------------------
umap_model = UMAP(
    n_neighbors=5,
    n_components=10,
    min_dist=0.0,
    metric="cosine",
    random_state=SEED
)

# --- Clustering --------------------------------------------------------------
hdbscan_model = HDBSCAN(
    min_cluster_size=MIN_TOPIC_SIZE,  # how many docs per topic (granularity)
    min_samples=10,                   # how strict the clustering is
    prediction_data=True,
    cluster_selection_method="leaf"   # leaf gives more granular topics, usually more topics
)

# --- Topic model -------------------------------------------------------------
topic_model_2010s = BERTopic(
    embedding_model="all-distilroberta-v1",
    vectorizer_model=vectorizer,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # NOTE: BERTopic documents min_topic_size as unused when a custom
    # hdbscan_model is supplied; it is kept (and tied to MIN_TOPIC_SIZE) so
    # the intended value is still visible on the model.
    min_topic_size=MIN_TOPIC_SIZE,
    calculate_probabilities=True,
    verbose=True
)


In [15]:
# Fit the model on the 2010s lyrics (embedding -> dimensionality reduction ->
# clustering -> topic representation, as shown in the log output).
topics_2010s, probs_2010s = topic_model_2010s.fit_transform(docs)

# Merge the fine-grained clusters down to 10 topics (in the saved run: the
# outlier topic -1 plus topics 0-8). Be careful when reducing topics; topics
# that do not make sense can instead be excluded manually afterwards.
topic_model_2010s.reduce_topics(docs, nr_topics=10)

# reduce_topics() re-maps the per-document assignments stored on the model.
# Refresh the local copies so they match the reduced topic ids instead of the
# pre-reduction ones returned by fit_transform().
topics_2010s = topic_model_2010s.topics_
probs_2010s = topic_model_2010s.probabilities_

topic_info_2010s = topic_model_2010s.get_topic_info()
topic_info_2010s

2025-11-30 16:21:37,330 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 31/31 [00:25<00:00,  1.20it/s]
2025-11-30 16:22:05,410 - BERTopic - Embedding - Completed ✓
2025-11-30 16:22:05,411 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-30 16:22:06,337 - BERTopic - Dimensionality - Completed ✓
2025-11-30 16:22:06,338 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-30 16:22:06,369 - BERTopic - Cluster - Completed ✓
2025-11-30 16:22:06,373 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 16:22:06,468 - BERTopic - Representation - Completed ✓
2025-11-30 16:22:06,559 - BERTopic - Topic reduction - Reducing number of topics
2025-11-30 16:22:06,563 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 16:22:06,645 - BERTopic - Representation - Completed ✓
2025-11-30 16:22:06,646 - BERTopic - Topic reduction - Reduced nu

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,444,-1_cause_girl_hey_time,"[cause, girl, hey, time, way, back, need, neve...",[ I might be too strung out on compliments O...
1,0,201,0_never_cause_feel_heart,"[never, cause, feel, heart, tell, shake, right...",[ I drove by all the places we used to hang ou...
2,1,86,1_bitch_ayy_fuck_ass,"[bitch, ayy, fuck, ass, shit, walk, mma, back,...","[ Southside Hah, it's Gucci It's Drop Top, Wiz..."
3,2,63,2_doh_cake_put_work,"[doh, cake, put, work, party, girl, hey, night...",[ Party Rock Yeah Whoo! Let's go! Party rock...
4,3,63,3_thunder_made_man_solo,"[thunder, made, man, solo, cause, cold, away, ...","[ Lately, I've been, I've been losing sleep Dr..."
5,4,54,4_back_little_night_old,"[back, little, night, old, road, country, girl...",[Sit in that six-lane backed up traffic Horns ...
6,5,22,5_que_de_el_je,"[que, de, el, je, te, tu, eu, en, tú, mi]","[ D-D-D-D-D-DY ¡Ay, Daddy! Play N' Skillz Ka..."
7,6,19,6_beautiful_says_rock_cause,"[beautiful, says, rock, cause, girl, peat, pre...",[ She just wants to be beautiful She goes unno...
8,7,15,7_burn_fire_try_rolling,"[burn, fire, try, rolling, deep, radioactive, ...",[ There's a fire starting in my heart Reaching...
9,8,14,8_aa_low_oo_young,"[aa, low, oo, young, dumb, whoa, turn, para, b...",[ I was the knight in shining armor in your mo...


In [8]:
# Plot every document coloured by its current topic. The `topics` argument of
# visualize_documents() is a selection of topic ids to show, not the
# per-document assignment list returned by fit_transform(), so it is omitted
# here and the model's own (post-reduction) assignments are used.
topic_model_2010s.visualize_documents(docs)

: 

In [16]:
# Fetch the representative lyrics the fitted model keeps for every topic
# (topic=None requests all topics), then display the entry for topic 4.
docs_per_topic = topic_model_2010s.get_representative_docs(topic=None)
docs_per_topic[4]

["Sit in that six-lane backed up traffic Horns are honking, I've about had it I'm looking for an exit sign Gotta get out of here, get it all off my mind And like a memory from your grandpa's attic A song comes slippin' through the radio static Changing my mood A little George Strait 1982  And it makes me wanna take a back road Makes me wanna take the long way home Put a little gravel in my travel Unwind, unravel all night long Makes me wanna grab my honey Tear down some two-lane country Who knows Get lost and get right with my soul Makes me wanna take Makes me wanna take a back road  I've been cooped up, tied down, 'bout forgotten What a field looks like, full of corn and cotton If I'm gonna hit a traffic jam Well it better be a tractor man So sick and tired of this interstate system I need a curve and wind-a-twistin' Dusty path to nowhere With the wind blowing through my baby's hair  Yeah, makes me wanna take a back road Makes me wanna take the long way home Put a little gravel in my 

In [13]:
# Rebuild the per-topic representative documents from the fitted model and
# display the entry for topic 0.
docs_per_topic = topic_model_2010s.get_representative_docs(None)
docs_per_topic[0]

['   Oh, woah Oh, woah Oh, woah   You know you love me, I know you care Just shout whenever and I’ll be there You want my love, you want my heart And we will never, ever, ever be apart Are we an item? Girl, quit playing We’re just friends, what are you saying? Said "There’s another," and looked right in my eyes My first love broke my heart for the first time, and I was like...   Baby, baby, baby oh Like baby, baby, baby no Like baby, baby, baby no oh Thought you\'d always be mine, mine Baby, baby, baby oh Like baby, baby, baby no Like baby, baby, baby no oh Thought you’d always be mine, mine   Oh, for you I would have done whatever And I just can’t believe we ain\'t together And I wanna play it cool, but I\'m losing you I\'ll buy you anything, I\'ll buy you any ring And I\'m in pieces, baby, fix me And just shake me \'til you wake me from this bad dream I\'m going down, down, down, down And I just can’t believe my first love won’t be around, and I\'m like...   Baby, baby, baby oh Like 