In [71]:
import pandas as pd
from collections import defaultdict
import math
import re

In [72]:
df_final = pd.read_csv("../data/Hot100Data.csv")

In [73]:
# separate df_final by decade
df_5960 = df_final[df_final['Year'] <= 1969] # includes 1959, 35 observations
df_70 = df_final.loc[(df_final['Year'] >= 1970) & (df_final['Year'] <= 1979)] # 14 observations
df_80 = df_final.loc[(df_final['Year'] >= 1980) & (df_final['Year'] <= 1989)] # 53 observations
df_90 = df_final.loc[(df_final['Year'] >= 1990) & (df_final['Year'] <= 1999)] # 483 observations
df_00 = df_final.loc[(df_final['Year'] >= 2000) & (df_final['Year'] <= 2009)] # 1050 observations
df_10 = df_final.loc[(df_final['Year'] >= 2010) & (df_final['Year'] <= 2019)] # 1813 observations

In [74]:
# term frequency
# t: term
# d: document
# returns the number of times t appears in d
def tf(t, d):
    return d.count(t)

# inverse document frequency
# t: term
# N: total number of documents
# returns the IDF of a term
def idf(t, N, docCount):
    n_t = abs(docCount[t]) # number of documents containing t
    return math.log(N/n_t, 10)

In [86]:
def get_tfidf_df(df):
    # dictionary that keeps track of words and the number of documents they appear in
    docCount = defaultdict(int)
    
    punctuation = set('!"%&(#)*+,-.:;<=>?@[]^_`{|}~')

    # build dictionary
    for i, row in df.iterrows():
        # convert lyrics to lowercase
        r = row['Lyrics'].lower()
        # remove words in square brackets (i.e. [intro], [chorus: artist name])
        r = re.sub('[\(\[].*?[\)\]]', '', r)
        # remove punctuation
        r = ''.join([c for c in r if not c in punctuation])
        words = r.split() # split r into an array of separate words
        seen = set() # words seen in current document
        for w in words:
            # increment docCount for word w if it is in current document
            if w not in seen:
                docCount[w] += 1
                seen.add(w)

    # dictionary that keeps track of words and their tfidf values
    tfidf_dict = defaultdict(int)

    for i, row in df.iterrows():
        # convert lyrics to lowercase
        r = row['Lyrics'].lower()
        # remove words in square brackets (i.e. [intro], [chorus: artist name])
        r = re.sub('[\(\[].*?[\)\]]', '', r)
        # remove punctuation
        r = ''.join([c for c in r if not c in punctuation])
        words = r.split()
        for w in words:
            my_tf = tf(w, r)
            my_idf = idf(w, len(df), docCount)
            tfidf_dict[w] += my_tf * my_idf
    
    # sort by tfidf values, descending
    tfidf_counts = [(tfidf_dict[w], w) for w in tfidf_dict]
    tfidf_counts.sort()
    tfidf_counts.reverse()
    
    # turn into dataframe
    tfidf_df = pd.DataFrame(tfidf_counts)
    tfidf_df.columns = ["TF-IDF", "Term"]
    
    return tfidf_df

In [88]:
tfidf_5960 = get_tfidf_df(df_5960)
tfidf_70 = get_tfidf_df(df_70)
tfidf_80 = get_tfidf_df(df_80)
tfidf_90 = get_tfidf_df(df_90)
tfidf_00 = get_tfidf_df(df_00)
tfidf_10 = get_tfidf_df(df_10)

In [95]:
print(tfidf_5960[:10])
print(tfidf_70[:10])

        TF-IDF   Term
0  3557.532774  giddy
1  1435.662118      i
2  1214.185447     la
3  1031.015035     up
4   954.466718      a
5   680.934008    mia
6   680.934008  hound
7   680.934008    dog
8   680.934008   cara
9   648.101540    why
        TF-IDF    Term
0  4264.742421     off
1  2830.510338     get
2   691.115206       o
3   616.921569   gotta
4   428.651885  number
5   412.777184     hot
6   398.783409       i
7   359.962215    want
8   197.752941    work
9   193.695638  summer


In [90]:
tfidf_70[:10]

Unnamed: 0,TF-IDF,Term
0,4264.742421,off
1,2830.510338,get
2,691.115206,o
3,616.921569,gotta
4,428.651885,number
5,412.777184,hot
6,398.783409,i
7,359.962215,want
8,197.752941,work
9,193.695638,summer


In [91]:
tfidf_80[:10]

Unnamed: 0,TF-IDF,Term
0,9339.339425,wild
1,3937.181139,a
2,3521.110292,ah
3,3119.133692,push
4,3073.578926,he
5,2778.175946,jump
6,2267.230677,e
7,2251.53676,it
8,2112.23794,cali
9,2056.590288,type


In [92]:
tfidf_90[:10]

Unnamed: 0,TF-IDF,Term
0,17113.600217,a
1,16233.703815,dat
2,14178.955956,wake
3,14110.383987,he
4,12971.444075,ha
5,12850.039849,jump
6,12410.571533,duh
7,11339.676627,bling
8,11285.70753,weasel
9,10546.009076,e


In [93]:
tfidf_00[:10]

Unnamed: 0,TF-IDF,Term
0,54648.318113,na
1,51467.4551,a
2,45059.655164,la
3,36486.189421,he
4,27843.28058,bwok
5,26444.980058,i
6,23130.505502,da
7,22121.452979,she
8,21969.803025,n
9,21931.421142,boom


In [94]:
tfidf_10[:10]

Unnamed: 0,TF-IDF,Term
0,104300.096065,wop
1,85510.278521,na
2,67398.575395,a
3,59147.356169,luh
4,52869.009283,he
5,49965.397119,versace
6,49894.124046,low
7,43536.504045,i
8,40873.342055,howbow
9,38005.951987,diddily


In [56]:
# dictionary that keeps track of words and the number of documents they appear in
docCount = defaultdict(int)

punctuation = set('!"%&()*+,-.:;<=>?@[]^_`{|}~')

# build dictionary
for i, row in df_final.iterrows():
    # convert lyrics to lowercase
    r = row['Lyrics'].lower()
    # remove words in square brackets (i.e. [intro], [chorus: artist name])
    r = re.sub('[\(\[].*?[\)\]]', '', r)
    # remove punctuation
    r = ''.join([c for c in r if not c in punctuation])
    words = r.split() # split r into an array of separate words
    seen = set() # words seen in current document
    for w in words:
        # increment docCount for word w if it is in current document
        if w not in seen:
            docCount[w] += 1
            seen.add(w)
            


In [57]:
# counts = [(docCount[w], w) for w in docCount]
# counts.sort()
# counts.reverse()
# counts[:25]

In [58]:
# dictionary that keeps track of words and their tfidf values
tfidf_dict = defaultdict(int)

punctuation = set('!"%&()*+,-.:;<=>?@[]^_`{|}~')

for i, row in df_final.iterrows():
    # convert lyrics to lowercase
    r = row['Lyrics'].lower()
    # remove words in square brackets (i.e. [intro], [chorus: artist name])
    r = re.sub('[\(\[].*?[\)\]]', '', r)
    # remove punctuation
    r = ''.join([c for c in r if not c in punctuation])
    #print(r)
    words = r.split()
    #print(words)
    for w in words:
        my_tf = tf(w, r)
        my_idf = idf(w, len(df_final))
        tfidf_dict[w] += my_tf * my_idf

print(len(tfidf_dict))

TypeError: idf() missing 1 required positional argument: 'docCount'

In [None]:
# get top 20 tfidf values
tfidf_counts = [(tfidf_dict[w], w) for w in tfidf_dict]
tfidf_counts.sort()
tfidf_counts.reverse()
tfidf_counts[:20]