In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-92idy2na because the default path (/home/cvillarin/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
years = range(2006,2022)

df_all = pd.DataFrame()
for year in years:
    pickle = f'songs_{year}.pkl'
    df = pd.read_pickle(pickle)
    df_all = pd.concat([df_all, df])
    
df_all = (df_all.drop(columns=['query'])).reset_index(drop=True)
display(df_all)

Unnamed: 0,rank,song,artist,year,lyrics
0,1,Bad Day,Daniel Powter,2006,Bad Day Lyrics\nWhere is the moment we needed ...
1,2,Temperature,Sean Paul,2006,Temperature Lyrics\nOh-oh\nOh-oh\nDi gyaldem S...
2,3,Promiscuous,Nelly Furtado Featuring Timbaland,2006,TranslationsPortuguêsPromiscuous Lyrics\nAm I ...
3,4,You're Beautiful,James Blunt,2006,You’re Beautiful Lyrics\nMy life is brilliant\...
4,5,Hips Don't Lie,Shakira Featuring Wyclef Jean,2006,Hips Don’t Lie Lyrics\nLadies up in here tonig...
...,...,...,...,...,...
1593,96,Things A Man Oughta Know,Lainey Wilson,2021,Things a Man Oughta Know Lyrics\nI can hook a ...
1594,97,Throat Baby (Go Baby),BRS Kash,2021,Throat Baby (Go Baby) Lyrics\n(What's happenin...
1595,98,Tombstone,Rod Wave,2021,"Tombstone Lyrics\nDamn, this motherfucker too ..."
1596,99,Drinkin' Beer. Talkin' God. Amen.,Chase Rice Featuring Florida Georgia Line,2021,Drinkin’ Beer. Talkin’ God. Amen. Lyrics\nFire...


In [3]:
import re

def preprocess_lyrics(lyrics):
    
    # Convert to lowercase
    lyrics = lyrics.lower()
    
    # Removes titles
    lyrics = lyrics.split('yrics\n', 1)[-1]
    
    # Removes 'Embed from the end'
    lyrics = lyrics.rsplit('embed', 1)[0]
    
    # Removes digits before embed
    lyrics = lyrics.rstrip('1234567890')
    
    # Removes 'you might also like'
    lyrics = lyrics.rsplit('you might also like', 1)[0]
    
    # Replace dash with space
    lyrics = (re.sub("[-]", " ", lyrics))
    
    # Remove Non-letters
    lyrics = (re.sub("[^\sA-Za-z]", "", lyrics))
    
    # Standardize whitespace
    lyrics = (re.sub("[\s]", " ", lyrics))
    
    return lyrics

In [4]:
df_all['lyrics'] = df_all['lyrics'].apply(preprocess_lyrics)

In [5]:
df_all

Unnamed: 0,rank,song,artist,year,lyrics
0,1,Bad Day,Daniel Powter,2006,where is the moment we needed the most you kic...
1,2,Temperature,Sean Paul,2006,oh oh oh oh di gyaldem schillaci sean da paul ...
2,3,Promiscuous,Nelly Furtado Featuring Timbaland,2006,am i throwin you off nope didnt think so how ...
3,4,You're Beautiful,James Blunt,2006,my life is brilliant my life is brilliant my l...
4,5,Hips Don't Lie,Shakira Featuring Wyclef Jean,2006,ladies up in here tonight no fighting we got t...
...,...,...,...,...,...
1593,96,Things A Man Oughta Know,Lainey Wilson,2021,i can hook a trailer on a two inch hitch i can...
1594,97,Throat Baby (Go Baby),BRS Kash,2021,whats happenin chi chi sexy lil bitch sexy li...
1595,98,Tombstone,Rod Wave,2021,damn this motherfucker too crazy saucii let th...
1596,99,Drinkin' Beer. Talkin' God. Amen.,Chase Rice Featuring Florida Georgia Line,2021,firewood crackle in the fall air red dirt play...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = df_all['lyrics'].tolist()
vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()


print(X.shape)

(1598, 12089)


In [7]:
pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,aap,able,abraza,ac,academy,accept,account,ace,ache,aching,...,yum,yung,yung joc,yup,zero,zero hero,zip,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
