In [1]:
import lucem_illud_2020 #pip install -U git+https://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git

#All these packages need to be installed from pip
#For ML
import sklearn
import sklearn.naive_bayes
import sklearn.tree
import sklearn.ensemble
import sklearn.neural_network
import sklearn.decomposition

import nltk #For tokenizing and normalizing
import numpy as np #arrays
import matplotlib.pyplot as plt #Plots
import matplotlib.colors # For nice colours
import seaborn as sns#Makes plots look nice, also heatmaps
import scipy as sp #for interp

#These are from the standard library (except pandas and requests, which are third-party)
import collections
import os
import os.path
import random
import re
import glob
import pandas as pd
import requests
import json
import math

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

In [3]:
# Read the raw Billboard lyrics table from the local data folder and show it.
billboard_df = pd.read_csv(
    filepath_or_buffer='billboard-lyrics/billboard_lyrics_1964-2015.csv',
)
billboard_df

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0
2,3,i cant get no satisfaction,the rolling stones,1965,,1.0
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0
...,...,...,...,...,...,...
5095,96,el perdon,nicky jam and enrique iglesias,2015,enrique iglesias dime si es verdad me dijeron ...,3.0
5096,97,she knows,neyo featuring juicy j,2015,,
5097,98,night changes,one direction,2015,going out tonight changes into something red ...,1.0
5098,99,back to back,drake,2015,oh man oh man oh man not againyeah i learned ...,1.0


In [7]:
# Discard every row that has a missing value in any column.
billboard_df = billboard_df.dropna(axis=0, how='any')

In [9]:
# Remove rows that are exact duplicates across all columns, keeping the first.
billboard_df = billboard_df.drop_duplicates(subset=None, keep='first')

In [6]:
# Extra stop words passed to the normalizer: the single digits '0'-'9'
# followed by song-structure labels.
my_stop_words = [
    *map(str, range(10)),
    'intro',
    'verse',
    'pre',
    'post',
    'lift',
    'chorus',
    'bridge',
    'outro',
    'instrumental',
]
my_stop_words

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'intro',
 'verse',
 'pre',
 'post',
 'lift',
 'chorus',
 'bridge',
 'outro',
 'instrumental']

In [11]:
from multiprocessing import Pool

# Tokenize and normalize every lyric in parallel.
# The pool is opened in a `with` block so its worker processes are always
# shut down once both passes are done (the pool was previously never closed).
# Both passes share the same pool, so they live in one cell; re-running the
# cell simply creates a fresh pool.
with Pool() as p:
    # One call of lucem_illud_2020.word_tokenize per lyric string.
    tokens = p.map(lucem_illud_2020.word_tokenize, billboard_df['Lyrics'])
    # One call of lucem_illud_2020.normalizeTokens per token list, with the
    # custom stop words passed as the second positional argument.
    normalized = p.starmap(
        lucem_illud_2020.normalizeTokens,
        [(song_tokens, my_stop_words) for song_tokens in tokens],
    )

# map/starmap already return lists aligned with the rows of billboard_df.
billboard_df['tokens'] = tokens
billboard_df['normalized'] = normalized

In [15]:
# Renumber the rows 0..n-1 and discard the old index labels instead of
# keeping them as a column.
billboard_df = billboard_df.reset_index(level=None, drop=True)

In [19]:
# Flag songs whose normalized token list has 10 entries or fewer.
# Lengths are computed for the whole column at once and the matching *index
# labels* are collected, so the result can be handed straight to
# DataFrame.drop and does not rely on the index being a fresh 0..n-1 range
# (the previous positional loop with .loc required that).
normalized_lengths = billboard_df['normalized'].map(len)
invalid = normalized_lengths[normalized_lengths <= 10].index.tolist()
invalid

[2,
 17,
 155,
 169,
 171,
 188,
 205,
 256,
 274,
 287,
 289,
 308,
 329,
 349,
 350,
 446,
 457,
 465,
 480,
 553,
 608,
 652,
 665,
 683,
 698,
 706,
 724,
 765,
 784,
 791,
 890,
 903,
 988,
 1021,
 1042,
 1061,
 1075,
 1221,
 1234,
 1241,
 1289,
 1301,
 1356,
 1363,
 1393,
 1405,
 1462,
 1518,
 1535,
 1550,
 1574,
 1654,
 1662,
 1675,
 1679,
 1841,
 1919,
 1947,
 1953,
 1987,
 2019,
 2108,
 2143,
 2157,
 2172,
 2240,
 2255,
 2311,
 2423,
 2471,
 2543,
 2546,
 2575,
 2650,
 2654,
 2752,
 2775,
 2845,
 2914,
 2965,
 3027,
 3198,
 3366,
 3490,
 4637,
 4786]

In [20]:
# Number of rows flagged as having 10 or fewer normalized tokens.
len(invalid)

86

In [21]:
# Remove the flagged rows by index label, then display what is left.
# The index is not renumbered here, so the dropped labels leave gaps.
billboard_df = billboard_df.drop(index=invalid)
billboard_df

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Source,tokens,normalized
0,1,wooly bully,sam the sham and the pharaohs,1965,sam the sham miscellaneous wooly bully wooly b...,3.0,"[sam, the, sham, miscellaneous, wooly, bully, ...","[sam, sham, miscellaneous, wooly, bully, wooly..."
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love yo...,1.0,"[sugar, pie, honey, bunch, you, know, that, i,...","[sugar, pie, honey, bunch, know, love, not, he..."
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my mi...,1.0,"[when, i, woke, up, this, morning, you, were, ...","[wake, morning, mind, mind, get, trouble, whoa..."
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss...,1.0,"[you, never, close, your, eyes, anymore, when,...","[close, eye, anymore, kiss, lip, s, tenderness..."
5,6,downtown,petula clark,1965,when youre alone and life is making you lonel...,1.0,"[when, you, re, alone, and, life, is, making, ...","[life, make, lonely, downtown, have, get, worr..."
...,...,...,...,...,...,...,...,...
4908,95,waves,mr probz,2015,my face above the water my feet cant touch th...,1.0,"[my, face, above, the, water, my, feet, ca, nt...","[face, water, foot, not, touch, grind, touch, ..."
4909,96,el perdon,nicky jam and enrique iglesias,2015,enrique iglesias dime si es verdad me dijeron ...,3.0,"[enrique, iglesias, dime, si, es, verdad, me, ...","[enrique, iglesias, dime, si, es, verdad, dije..."
4910,98,night changes,one direction,2015,going out tonight changes into something red ...,1.0,"[going, out, tonight, changes, into, something...","[go, tonight, change, red, mother, not, like, ..."
4911,99,back to back,drake,2015,oh man oh man oh man not againyeah i learned ...,1.0,"[oh, man, oh, man, oh, man, not, againyeah, i,...","[oh, man, oh, man, oh, man, againyeah, learn, ..."


In [22]:
# Group the remaining songs by the 'Year' column.
billboard_groups = billboard_df.groupby(by=['Year'])

In [28]:
# Print how many songs remain in each Year group, one count per line,
# in the order the groups are iterated.
for _year_key, songs_in_year in billboard_groups:
    print(len(songs_in_year))

93
91
94
88
90
95
86
86
95
91
91
94
90
92
91
93
91
92
98
97
95
95
96
97
99
96
94
96
97
93
95
90
92
94
97
99
97
99
97
98
98
96
95
97
97
99
100
98
97
98
98


In [30]:
# Write the cleaned data set to disk, keeping only the listed columns.
# index=False (the documented boolean form, rather than relying on None being
# falsy) leaves the row index out of the file.
# NOTE(review): 'normalized' holds a token list per row, so it is presumably
# written as each list's string form and will need parsing when read back.
billboard_df.to_csv(
    'cleaned_billboard.csv',
    columns=['Rank', 'Song', 'Artist', 'Year', 'Lyrics', 'normalized'],
    index=False,
)