In [29]:
import pandas as pd
import re
import nltk
from collections import Counter

In [94]:
# load data
songs_data = pd.read_csv("data/top50_2018_2022.csv")
bad_words_data = pd.read_csv("data/bad-words.csv")
songs_data

Unnamed: 0,position,title,artist,year,Lyrics,Primary Artists
0,1,God's Plan,Drake,2018,And they wishin' and wishin' And wishin' and w...,Drake
1,2,I Like It,"Cardi B, Bad Bunny & J Balvin",2018,"Yeah, baby, I like it like that You gotta beli...","Cardi B, Bad Bunny & J Balvin"
2,3,In My Feelings,Drake,2018,"Trap, TrapMoneyBenny This shit got me in my fe...",Drake
3,4,Psycho,Post Malone Featuring Ty Dolla $ign,2018,"Damn, my AP goin' psycho, lil' mama bad like M...",Post Malone
4,5,Nice For What,Drake,2018,I wanna know who mothafuckin' representin' in ...,Drake
...,...,...,...,...,...,...
245,46,Already Dead,Juice WRLD,2022,"You can see the pain in my laugh (Yeah, yeah) ...",Juice WRLD
246,47,Family Ties,Baby Keem & Kendrick Lamar,2022,"Jump in that— (Hah, Hah), summon that bitch Ju...",Baby Keem & Kendrick Lamar
247,48,Alone,Rod Wave,2022,B Squared beats Oh (Oh-woah) Oh-woah Tell me w...,Rod Wave
248,49,Plan B,Megan Thee Stallion,2022,"Who the fuck you think you talkin' to, nigga? ...",Megan Thee Stallion


In [95]:
# primary artists -> separate by comma
songs_data["Primary Artists"] = songs_data["Primary Artists"].str.replace(" & ",",")
songs_data["Primary Artists"] = songs_data["Primary Artists"].str.replace(", ", ",")
songs_data["Primary Artists"] = songs_data["Primary Artists"].str.split(",")

In [96]:
songs_data["Primary Artists"]

0                             [Drake]
1      [Cardi B, Bad Bunny, J Balvin]
2                             [Drake]
3                       [Post Malone]
4                             [Drake]
                    ...              
245                      [Juice WRLD]
246       [Baby Keem, Kendrick Lamar]
247                        [Rod Wave]
248             [Megan Thee Stallion]
249                    [Armani White]
Name: Primary Artists, Length: 250, dtype: object

In [22]:
def preprocess_lyrics(lyrics):
    # Remove punctuation
    lyrics = re.sub('[^a-zA-Z0-9\s]', '', lyrics)
    # Convert to lowercase
    lyrics = lyrics.lower()
    # Tokenize into individual words
    words = nltk.word_tokenize(lyrics)
    return words

In [55]:
def word_list_to_df(words):
    return pd.DataFrame.from_dict(Counter(used_words), orient="index").reset_index()

In [57]:
# a table of ALL words (independent of any other variable)
used_words = []
for lyrics in songs_data["Lyrics"]:
    for word in preprocess_lyrics(lyrics):
        used_words.append(word)

all_words_df = word_list_to_df(used_words)

In [49]:
songs_data["year"].unique()

array([2018, 2019, 2020, 2021, 2022], dtype=int64)

In [66]:
# a table of words FOR EACH year
words_year_dfs = {}
for year in songs_data["year"].unique():
    words_year_dfs[year] = []

# combine all lyrics of a year
year_lyrics_df = songs_data.groupby("year")["Lyrics"].apply(' '.join).reset_index()
for year in words_year_dfs.keys():
    lyrics_of_that_year = year_lyrics_df.loc[year_lyrics_df["year"]==year, "Lyrics"].values[0]
    words=preprocess_lyrics(lyrics_of_that_year)
    words_year_dfs[year] = word_list_to_df(words)

In [68]:
words_year_dfs[2020]

Unnamed: 0,index,0
0,and,1978
1,they,912
2,wishin,30
3,on,1903
4,me,2294
...,...,...
11243,mitt,1
11244,kimbo,1
11245,gen3,1
11246,propped,1


In [112]:
# a table of words FOR EACH primary artist
words_artist_dfs = {}
# if multiple primary artists: EXPLODE!! (means per duplicate row for each artist)
exploded_songs_data = songs_data.explode('Primary Artists')
for artist in exploded_songs_data["Primary Artists"].unique():
    words_artist_dfs[artist] = []

# combine all lyrics of an artist
artist_lyrics_df = explored_songs_data.groupby("Primary Artists")["Lyrics"].apply(' '.join).reset_index()
for artist in words_artist_dfs.keys():
    lyrics_of_that_artist = artist_lyrics_df.loc[artist_lyrics_df["Primary Artists"]==artist, "Lyrics"].values[0]
    words=preprocess_lyrics(lyrics_of_that_artist)
    words_artist_dfs[artist] = word_list_to_df(words)

In [115]:
words_artist_dfs["Post Malone"]

Unnamed: 0,index,0
0,and,1978
1,they,912
2,wishin,30
3,on,1903
4,me,2294
...,...,...
11243,mitt,1
11244,kimbo,1
11245,gen3,1
11246,propped,1
