In [1]:
import pandas as pd
from collections import defaultdict
import math
import re

In [2]:
df_final = pd.read_csv("../data/Hot100Data.csv")
df_final

Unnamed: 0.1,Unnamed: 0,Year,Performer,Song,Lyrics
0,0,2019,Post Malone Featuring DaBaby,Enemies,\n\n[Chorus: Post Malone]\nUsed to have friend...
1,1,2019,"Yella Beezy, Gucci Mane & Quavo",Bacc At It Again,"\n\n[Intro]\nCook that shit up, Quay\n\n[Choru..."
2,2,2019,DaBaby,VIBEZ,"\n\n[Intro]\nLet's go (Yeah, yeah, Neeko, you ..."
3,3,2019,NF,When I Grow Up,"\n\n[Verse 1]\nYeah, when I grow up, you know ..."
4,4,2019,Post Malone,Hollywood's Bleeding,"\n\n[Intro]\nHollywood's bleeding, vampires fe..."
...,...,...,...,...,...
3443,3589,2016,Chris Brown,Zero,\n\n[Verse 1]\nI thought we were great\nYou to...
3444,3590,2018,Kodak Black Featuring Travis Scott & Offset,ZEZE,\n\n[Intro]\nD.A. got that dope!\n\n[Chorus: T...
3445,3591,2017,Future,Zoom,"\n\n[Intro]\nBought a drop-top Porsche, about ..."
3446,3592,2006,Lil' Boosie Featuring Yung Joc,Zoom,"\n\n[Intro]\nLil' Boosie, Bad Azz\nYung Joc - ..."


In [3]:
# dictionary that keeps track of words and the number of documents they appear in
docCount = defaultdict(int)

punctuation = set('!"%&()*+,-.:;<=>?@[]^_`{|}~')

# build dictionary
for i, row in df_final.iterrows():
    # convert lyrics to lowercase
    r = row['Lyrics'].lower()
    # remove words in square brackets (i.e. [intro], [chorus: artist name])
    r = re.sub('[\(\[].*?[\)\]]', '', r)
    # remove punctuation
    r = ''.join([c for c in r if not c in punctuation])
    words = r.split() # split r into an array of separate words
    seen = set() # words seen in current document
    for w in words:
        # increment docCount for word w if it is in current document
        if w not in seen:
            docCount[w] += 1
            seen.add(w)

In [4]:
# counts = [(docCount[w], w) for w in docCount]
# counts.sort()
# counts.reverse()
# counts[:25]

In [5]:
# term frequency
# t: term
# d: document
# returns the number of times t appears in d
def tf(t, d):
    return d.count(t)

# inverse document frequency
# t: term
# N: total number of documents
# returns the IDF of a term
def idf(t, N):
    n_t = abs(docCount[t]) # number of documents containing t
    return math.log(N/n_t, 10)

In [6]:
# dictionary that keeps track of words and their tfidf values
tfidf_dict = defaultdict(int)

punctuation = set('!"%&()*+,-.:;<=>?@[]^_`{|}~')

for i, row in df_final.iterrows():
    # convert lyrics to lowercase
    r = row['Lyrics'].lower()
    # remove words in square brackets (i.e. [intro], [chorus: artist name])
    r = re.sub('[\(\[].*?[\)\]]', '', r)
    # remove punctuation
    r = ''.join([c for c in r if not c in punctuation])
    #print(r)
    words = r.split()
    #print(words)
    for w in words:
        my_tf = tf(w, r)
        my_idf = idf(w, len(df_final))
        tfidf_dict[w] += my_tf * my_idf

print(len(tfidf_dict))

39034


In [7]:
# get top 100 tfidf values
tfidf_counts = [(tfidf_dict[w], w) for w in tfidf_dict]
tfidf_counts.sort()
tfidf_counts.reverse()
tfidf_counts[:100]

[(147116.5055199298, 'a'),
 (140232.60587090033, 'na'),
 (107183.39751603428, 'he'),
 (104589.77195571616, 'wop'),
 (94547.10674735022, 'i'),
 (83191.29900879071, 'la'),
 (61218.101386667644, 'luh'),
 (56068.28996505899, 'low'),
 (53394.047260379004, 'versace'),
 (50197.761348729335, 'she'),
 (50014.45200561936, 'oh'),
 (48842.435258191465, 'e'),
 (45458.54690632048, 'yo'),
 (44602.435990924765, 'da'),
 (44375.24367372304, 'howbow'),
 (41262.18448742878, 'diddily'),
 (41249.88129236593, 'it'),
 (40299.673055024694, 'boom'),
 (39102.21780893886, 'ya'),
 (38912.107080956914, 'do'),
 (38396.35554652455, 'dah'),
 (36440.212263871974, 'o'),
 (36148.20405931003, 'in'),
 (35413.213563942896, 'baby'),
 (35199.64741314996, 'go'),
 (35100.221195349746, 'nigga'),
 (35047.14659725923, 'love'),
 (34844.2539208421, 'at'),
 (33489.4525500261, 'y'),
 (32602.21984191899, 'bwok'),
 (32155.917229750637, 'yeah'),
 (31892.000406412793, 'n'),
 (31847.649316640913, 'ooh'),
 (31319.72683807217, 'up'),
 (31197