<a href="https://colab.research.google.com/github/emngarcia/TikTokViralSongs/blob/main/cluster_creation_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# reading in df with lyrics and audio features
import pandas as pd

# Build the clustering input in one chain: load the merged table, drop the
# 'Unnamed: 0' column, then keep only the first row for each distinct lyrics
# text (a song that was popular in several years appears once per year).
df = (
    pd.read_csv('merged_right.csv')
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['lyrics'], keep='first')
)

df

Unnamed: 0,track_name,artist_name,artist_pop,album,track_pop,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,year,lyrics
0,Shake It,Metro Station,53,Metro Station,68,0.618,0.955,-3.836,1,4,0.0798,0.002210,0.000003,0.486,0.7900,150.034,4,179947,2019,"Let’s drop!\nYeah, come on\nShake, shake\n\nI'..."
1,Chinese New Year,SALES,61,SALES - EP,53,0.744,0.845,-7.422,0,4,0.2530,0.759000,0.232000,0.100,0.7490,75.221,4,160000,2019,I'll see you at the movies\nI see you with you...
3,Baby I'm Yours,Breakbot,57,By Your Side,69,0.829,0.792,-3.755,0,2,0.0668,0.726000,0.000006,0.122,0.7580,118.050,4,215507,2019,I thought I had it all together\nBut I was led...
6,The Git Up,Blanco Brown,60,The Git Up,2,0.847,0.678,-8.635,1,9,0.1090,0.066900,0.000000,0.274,0.8110,97.984,4,200594,2019,Right now\nI just need you to get real loose\n...
8,Say Hey (I Love You),Michael Franti & Spearhead,58,All Rebel Rockers,0,0.738,0.983,-4.374,0,5,0.0855,0.038000,0.000006,0.183,0.9570,92.998,4,235760,2019,"Comme da selecta\nAyy, uh-huh, woo (That's rig..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,California Dreaming,Jorm,42,California Dreaming,53,0.717,0.865,-4.126,0,1,0.0307,0.022200,0.001200,0.605,0.4870,124.048,4,152323,2022,All the leaves are brown\nAnd the sky is grey\...
840,Rockstar,Ilkay Sencan,61,Rockstar,69,0.771,0.826,-3.978,1,11,0.0604,0.142000,0.030000,0.145,0.1850,100.012,4,212280,2022,I've been fuckin' hoes and poppin' pillies\nMa...
841,Lovely,Alfons,58,Lovely,45,0.504,0.677,-6.954,0,4,0.0640,0.316000,0.001270,0.268,0.0375,128.078,4,159023,2022,Одинокий прекрасный\nВ синем небе парил\nПрола...
842,Jimmy Cooks (feat. 21 Savage),Drake,95,"Honestly, Nevermind",92,0.529,0.673,-4.711,1,0,0.1750,0.000307,0.000002,0.093,0.3660,165.921,4,218365,2022,"Вновь проснулся бодреньким, так что знай: я в ..."


In [None]:
# Disabled export of the de-duplicated frame. Uncomment to write it to disk.
# df.to_csv('merged_right_nodupes.csv')

In [None]:
# k means clustering with lyrics and audio features
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

# Feature preparation for clustering:
#   * the numeric audio features are standardised so no single scale dominates
#     the distance computation;
#   * the lyrics text is turned into a TF-IDF matrix capped at 1000 terms.
# 'lyrics' is given as a bare string (not a list) because the text vectoriser
# expects a 1-D sequence of documents rather than a 2-D column selection.
transformer = make_column_transformer(
    (StandardScaler(), ['key', 'mode', 'valence', 'tempo', 'time_signature', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness']),
    (TfidfVectorizer(max_features = 1000), 'lyrics')
)

# Five clusters, seeded. K-Means initialisation is random, so without a fixed
# random_state the cluster ids (and possibly the partition) change between runs,
# and anything keyed on those ids — exported CSVs, hand-assigned cluster names —
# goes stale. The seed matches the one used for the random forest and t-SNE
# later in this notebook. n_init is set explicitly so the number of restarts
# does not depend on the installed scikit-learn version's default.
model = KMeans(n_clusters=5, random_state=42, n_init=10)

pipeline = Pipeline([
    ('transform', transformer),
    ('cluster', model)
])


# Fit the transformer and K-Means together on the de-duplicated frame.
pipeline.fit(df)

In [None]:
# Read the results off the fitted K-Means estimator: one label per input row
# and the coordinates of each cluster centre.
clusters = model.labels_
centroids = model.cluster_centers_

# Store each row's cluster id on the frame.
df["Cluster"] = clusters

# For every cluster id, print the id followed by the artists assigned to it.
for cluster_id in range(model.n_clusters):
  in_cluster = df["Cluster"] == cluster_id
  print(cluster_id, df.loc[in_cluster, "artist_name"], sep="\n")

0
0           Metro Station
6            Blanco Brown
11            Bag Raiders
15               Ashley O
16     Falling In Reverse
              ...        
832             Dame Dame
834                DaBaby
836                   CPX
840          Ilkay Sencan
842                 Drake
Name: artist_name, Length: 225, dtype: object
1
21          Andrew Gold
25            Flo Milli
29           KRYPTO9095
30     Supa Dupa Humble
36         Ant Saunders
             ...       
808       The Kid LAROI
818    Young T & Bugsey
835          DripReport
837              Halsey
838        Rich The Kid
Name: artist_name, Length: 144, dtype: object
2
3                        Breakbot
8      Michael Franti & Spearhead
18                         WILLOW
23                   Selena Gomez
31                   Tierra Whack
                  ...            
822                        Coopex
830                     Dame Dame
831                    The Weeknd
833                  Lil Uzi Vert
839         

In [None]:
# For every cluster id, print the id followed by the track names assigned to it.
for cluster_id in range(model.n_clusters):
  in_cluster = df["Cluster"] == cluster_id
  print(cluster_id, df.loc[in_cluster, "track_name"], sep="\n")

0
0                           Shake It
6                         The Git Up
11                    Shooting Stars
15                         On A Roll
16               Good Girls Bad Guys
                   ...              
832                        Let It Go
834                              BOP
836                           Demons
840                         Rockstar
842    Jimmy Cooks (feat. 21 Savage)
Name: track_name, Length: 225, dtype: object
1
21           Spooky, Scary Skeletons
25                       Beef FloMix
29            Woah (feat. D3Mstreet)
30                          Steppin'
36                     Yellow Hearts
                   ...              
808                      WITHOUT YOU
818    Don't Rush (feat. Headie One)
835                         Skechers
837                       Without Me
838                        Plug Walk
Name: track_name, Length: 144, dtype: object
2
3                                   Baby I'm Yours
8                             Say Hey (

In [None]:
# Disabled export of the de-duplicated frame together with its Cluster column.
# NOTE(review): a later cell reads 'merged_right_nodupes_clusters.csv', so this
# export presumably has to be run at least once to produce that file — confirm.
# df.to_csv('merged_right_nodupes_clusters.csv')

In [None]:
# Reload the merged table without de-duplicating, so repeated listings of a
# song are retained; only the 'Unnamed: 0' column is dropped.
df_dupes = pd.read_csv('merged_right.csv').drop(columns=['Unnamed: 0'])

df_dupes

Unnamed: 0,track_name,artist_name,artist_pop,album,track_pop,danceability,energy,loudness,mode,key,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,year,lyrics
0,Shake It,Metro Station,53,Metro Station,68,0.618,0.955,-3.836,1,4,0.0798,0.002210,0.000003,0.486,0.7900,150.034,4,179947,2019,"Let’s drop!\nYeah, come on\nShake, shake\n\nI'..."
1,Chinese New Year,SALES,61,SALES - EP,53,0.744,0.845,-7.422,0,4,0.2530,0.759000,0.232000,0.100,0.7490,75.221,4,160000,2019,I'll see you at the movies\nI see you with you...
2,Chinese New Year,SALES,61,Chinese New Year,66,0.744,0.845,-7.422,0,4,0.2530,0.759000,0.232000,0.100,0.7490,75.221,4,160000,2020,I'll see you at the movies\nI see you with you...
3,Baby I'm Yours,Breakbot,57,By Your Side,69,0.829,0.792,-3.755,0,2,0.0668,0.726000,0.000006,0.122,0.7580,118.050,4,215507,2019,I thought I had it all together\nBut I was led...
4,Baby I'm Yours,Breakbot,57,By Your Side,69,0.829,0.792,-3.755,0,2,0.0668,0.726000,0.000006,0.122,0.7580,118.050,4,215507,2020,I thought I had it all together\nBut I was led...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,California Dreaming,Jorm,42,California Dreaming,53,0.717,0.865,-4.126,0,1,0.0307,0.022200,0.001200,0.605,0.4870,124.048,4,152323,2022,All the leaves are brown\nAnd the sky is grey\...
840,Rockstar,Ilkay Sencan,61,Rockstar,69,0.771,0.826,-3.978,1,11,0.0604,0.142000,0.030000,0.145,0.1850,100.012,4,212280,2022,I've been fuckin' hoes and poppin' pillies\nMa...
841,Lovely,Alfons,58,Lovely,45,0.504,0.677,-6.954,0,4,0.0640,0.316000,0.001270,0.268,0.0375,128.078,4,159023,2022,Одинокий прекрасный\nВ синем небе парил\nПрола...
842,Jimmy Cooks (feat. 21 Savage),Drake,95,"Honestly, Nevermind",92,0.529,0.673,-4.711,1,0,0.1750,0.000307,0.000002,0.093,0.3660,165.921,4,218365,2022,"Вновь проснулся бодреньким, так что знай: я в ..."


In [None]:
# Narrow the clustered frame to the song title, the artist and the cluster id.
cluster_view_columns = ['track_name', 'artist_name', 'Cluster']
df_cluster = df[cluster_view_columns]
df_cluster

Unnamed: 0,track_name,artist_name,Cluster
0,Shake It,Metro Station,0
1,Chinese New Year,SALES,3
3,Baby I'm Yours,Breakbot,2
6,The Git Up,Blanco Brown,0
8,Say Hey (I Love You),Michael Franti & Spearhead,2
...,...,...,...
839,California Dreaming,Jorm,2
840,Rockstar,Ilkay Sencan,0
841,Lovely,Alfons,3
842,Jimmy Cooks (feat. 21 Savage),Drake,0


In [None]:
# Attach a cluster id to every row of the full table, including the repeated
# listings of a song under different years.
#
# The clustered frame `df` was de-duplicated on 'lyrics', so 'lyrics' is the
# column that is unique there and is the key used for the lookup. Joining on
# ('track_name', 'artist_name') instead leaves Cluster missing for any row whose
# title/artist pair was discarded as a lyrics duplicate (the missing values also
# force the column to float), and would multiply rows if a title/artist pair
# survived de-duplication more than once.
# NOTE(review): rows that share lyrics with a kept row inherit that row's
# cluster even if their audio features differ — confirm this is acceptable.
df_dupes_cluster = df_dupes.merge(
    df[['lyrics', 'Cluster']],
    on='lyrics',
    how='left',
    validate='many_to_one',  # raise if the lookup side is not unique per key
)

df_dupes_cluster

Unnamed: 0,track_name,artist_name,artist_pop,album,track_pop,danceability,energy,loudness,mode,key,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,year,lyrics,Cluster
0,Shake It,Metro Station,53,Metro Station,68,0.618,0.955,-3.836,1,4,...,0.002210,0.000003,0.486,0.7900,150.034,4,179947,2019,"Let’s drop!\nYeah, come on\nShake, shake\n\nI'...",0.0
1,Chinese New Year,SALES,61,SALES - EP,53,0.744,0.845,-7.422,0,4,...,0.759000,0.232000,0.100,0.7490,75.221,4,160000,2019,I'll see you at the movies\nI see you with you...,3.0
2,Chinese New Year,SALES,61,Chinese New Year,66,0.744,0.845,-7.422,0,4,...,0.759000,0.232000,0.100,0.7490,75.221,4,160000,2020,I'll see you at the movies\nI see you with you...,3.0
3,Baby I'm Yours,Breakbot,57,By Your Side,69,0.829,0.792,-3.755,0,2,...,0.726000,0.000006,0.122,0.7580,118.050,4,215507,2019,I thought I had it all together\nBut I was led...,2.0
4,Baby I'm Yours,Breakbot,57,By Your Side,69,0.829,0.792,-3.755,0,2,...,0.726000,0.000006,0.122,0.7580,118.050,4,215507,2020,I thought I had it all together\nBut I was led...,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,California Dreaming,Jorm,42,California Dreaming,53,0.717,0.865,-4.126,0,1,...,0.022200,0.001200,0.605,0.4870,124.048,4,152323,2022,All the leaves are brown\nAnd the sky is grey\...,2.0
840,Rockstar,Ilkay Sencan,61,Rockstar,69,0.771,0.826,-3.978,1,11,...,0.142000,0.030000,0.145,0.1850,100.012,4,212280,2022,I've been fuckin' hoes and poppin' pillies\nMa...,0.0
841,Lovely,Alfons,58,Lovely,45,0.504,0.677,-6.954,0,4,...,0.316000,0.001270,0.268,0.0375,128.078,4,159023,2022,Одинокий прекрасный\nВ синем небе парил\nПрола...,3.0
842,Jimmy Cooks (feat. 21 Savage),Drake,95,"Honestly, Nevermind",92,0.529,0.673,-4.711,1,0,...,0.000307,0.000002,0.093,0.3660,165.921,4,218365,2022,"Вновь проснулся бодреньким, так что знай: я в ...",0.0


In [None]:
# Disabled export of the full table (duplicates included) with cluster ids.
# Uncomment to write it to disk.
# df_dupes_cluster.to_csv('merged_right_withdupes_clusters.csv')

In [None]:
# Rebind `df` to the clustered, de-duplicated table stored on disk, dropping
# the 'Unnamed: 0' column that comes back from the CSV.
# NOTE(review): this replaces the in-memory `df` with whatever cluster ids were
# saved in the file; they may differ from the ids computed earlier in this
# session — confirm the file is the intended source.
df = pd.read_csv('merged_right_nodupes_clusters.csv').drop(columns=['Unnamed: 0'])

df

Unnamed: 0,track_name,artist_name,artist_pop,album,track_pop,danceability,energy,loudness,mode,key,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,year,lyrics,Cluster
0,Shake It,Metro Station,53,Metro Station,68,0.618,0.955,-3.836,1,4,...,0.002210,0.000003,0.486,0.7900,150.034,4,179947,2019,"Let’s drop!\nYeah, come on\nShake, shake\n\nI'...",0
1,Chinese New Year,SALES,61,SALES - EP,53,0.744,0.845,-7.422,0,4,...,0.759000,0.232000,0.100,0.7490,75.221,4,160000,2019,I'll see you at the movies\nI see you with you...,2
2,Baby I'm Yours,Breakbot,57,By Your Side,69,0.829,0.792,-3.755,0,2,...,0.726000,0.000006,0.122,0.7580,118.050,4,215507,2019,I thought I had it all together\nBut I was led...,2
3,The Git Up,Blanco Brown,60,The Git Up,2,0.847,0.678,-8.635,1,9,...,0.066900,0.000000,0.274,0.8110,97.984,4,200594,2019,Right now\nI just need you to get real loose\n...,0
4,Say Hey (I Love You),Michael Franti & Spearhead,58,All Rebel Rockers,0,0.738,0.983,-4.374,0,5,...,0.038000,0.000006,0.183,0.9570,92.998,4,235760,2019,"Comme da selecta\nAyy, uh-huh, woo (That's rig...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,California Dreaming,Jorm,42,California Dreaming,53,0.717,0.865,-4.126,0,1,...,0.022200,0.001200,0.605,0.4870,124.048,4,152323,2022,All the leaves are brown\nAnd the sky is grey\...,2
696,Rockstar,Ilkay Sencan,61,Rockstar,69,0.771,0.826,-3.978,1,11,...,0.142000,0.030000,0.145,0.1850,100.012,4,212280,2022,I've been fuckin' hoes and poppin' pillies\nMa...,0
697,Lovely,Alfons,58,Lovely,45,0.504,0.677,-6.954,0,4,...,0.316000,0.001270,0.268,0.0375,128.078,4,159023,2022,Одинокий прекрасный\nВ синем небе парил\nПрола...,2
698,Jimmy Cooks (feat. 21 Savage),Drake,95,"Honestly, Nevermind",92,0.529,0.673,-4.711,1,0,...,0.000307,0.000002,0.093,0.3660,165.921,4,218365,2022,"Вновь проснулся бодреньким, так что знай: я в ...",0


In [None]:
# using random forest classifier to evaluate the most important features in determining the kmeans clusters
from sklearn.ensemble import RandomForestClassifier

# Train a random forest to predict the stored cluster id from the same inputs
# that were clustered, then rank the transformed features by the forest's
# importance scores.
y_train = df['Cluster']
X_train = df.drop('Cluster', axis=1)

# Numeric audio features are standardised; lyrics become a 1000-term TF-IDF
# matrix ('lyrics' is passed as a bare string so the vectoriser receives 1-D text).
rf_numeric_columns = ['mode', 'key', 'valence', 'tempo', 'time_signature', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness']
transformer = make_column_transformer(
    (StandardScaler(), rf_numeric_columns),
    (TfidfVectorizer(max_features=1000), 'lyrics'),
)

model = RandomForestClassifier(random_state=42, n_estimators=100)

# The classifier step keeps the name 'cluster' so it can be looked up below.
pipeline = Pipeline(steps=[('transform', transformer), ('cluster', model)])
pipeline.fit(X_train, y_train)

# Pair each output column of the fitted transformer with its importance score.
trained_model = pipeline.named_steps['cluster']
importances = trained_model.feature_importances_
transformed_features = pipeline.named_steps['transform'].get_feature_names_out()

feature_importances = pd.DataFrame(
    {'Feature': transformed_features, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Show the 50 highest-ranked features.
feature_importances.head(50)

Unnamed: 0,Feature,Importance
0,standardscaler__mode,0.124084
6,standardscaler__energy,0.046399
7,standardscaler__loudness,0.041085
5,standardscaler__danceability,0.028495
8,standardscaler__speechiness,0.023112
9,standardscaler__acousticness,0.019014
10,standardscaler__instrumentalness,0.015624
2,standardscaler__valence,0.010848
4,standardscaler__time_signature,0.009732
11,standardscaler__liveness,0.008054


In [None]:
# using a tsne to evaluate the quality of our clusters and how distinct they are
from sklearn.manifold import TSNE

# `px` is used for the scatter plot below but is not imported in any visible
# cell of this notebook, so bring it into scope here.
import plotly.express as px

# Standardise only the numeric audio features for the projection.
transformer = make_column_transformer(
    (StandardScaler(), ['mode', 'key', 'valence', 'tempo', 'time_signature', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness']),
)

transformed = transformer.fit_transform(X_train)
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(transformed) # note: did not include lyrics bc the tsne transform doesn't allow for a sparse matrix

# Replace numeric cluster ids with their display names. The conversion is done
# only while the column is still numeric: once it holds names, astype(int)
# would raise, so an unguarded version cannot be re-run.
# NOTE(review): `cluster_labels` (presumably a dict mapping cluster id -> name)
# is not defined in the visible cells; it must exist before this cell runs.
if pd.api.types.is_numeric_dtype(df['Cluster']):
    df['Cluster'] = df['Cluster'].astype(int).replace(cluster_labels)
df['Cluster'] = df['Cluster'].astype(str).str.strip()

# Legend order for the cluster names.
sorted_clusters = [
    'High-Energy Dance Music',
    'Indie & Electronic',
    'Nostalgic Pop',
    'Rap & Viral Memes',
    'Heartbreak & Emotional Songs'
]

# Fixed colour per cluster name.
custom_colors = {
    'High-Energy Dance Music': '#93179c',
    'Indie & Electronic': '#802ebf',
    'Nostalgic Pop': '#5244d4',
    'Rap & Viral Memes': '#769ee3',
    'Heartbreak & Emotional Songs': '#c2ddf2'
}

# x and y are passed as bare arrays, so plotly would title the axes "x" and
# "y"; `labels` gives them descriptive names instead.
fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=df['Cluster'],
                 category_orders={"Cluster": sorted_clusters},
                 color_discrete_map=custom_colors,
                 labels={"x": "t-SNE 1", "y": "t-SNE 2"},
                 title="TSNE Projection of K-Means Clusters",)
fig.update_traces(marker=dict(size=10, line=dict(width=1, color="white")))
fig.show()