In [1]:
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git

#All these packages need to be installed from pip
#For ML
import sklearn

import nltk #For tokenizing and normalizing
import numpy as np #arrays
import matplotlib.pyplot as plt #Plots
import matplotlib.colors # For nice colours
import seaborn as sns#Makes plots look nice, also heatmaps
import scipy as sp #for interp

#These are from the standard library
import collections
import os
import os.path
import random
import re
import glob
import pandas as pd
import requests
import json
import math

from spellchecker import SpellChecker
import ftfy
import ast
#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

In [2]:
billboard_df = pd.read_csv('cleaned_billboard.csv')
billboard_df[:5]

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Language
0,1,wooly bully,sam the sham and the pharaohs,1965,uno dos one two tres quatro matty told hatty a...,1
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love you...,1
2,3,i cant get no satisfaction,the rolling stones,1965,i can't get no satisfaction i can't get me no ...,1
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my min...,1
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss ...,1


In [3]:
billboard_df = billboard_df.dropna()

In [4]:
len(billboard_df)

5068

In [5]:
billboard_df = billboard_df.drop_duplicates()
len(billboard_df)

5068

In [6]:
billboard_df = billboard_df.reset_index(drop=True)

In [26]:
# lyrics cleaning
import string
import re
import spacy
import ftfy
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary('integrated_uni_dict.txt', term_index=0, count_index=1, separator=',')
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

mark_words = ['pre-hook','post-hook','pre-chorus','post-chorus','bridge','outro','verse [0-9]',
              '(repeat(ed)* )*intro( repeat(ed)*)*', '(repeat(ed)* )*verse( repeat(ed)*)*',
              '(repeat(ed)* )*chorus( repeat(ed)*)*', '(repeat(ed)* )*hook( repeat(ed)*)*',
              'interlude','instrumental( break )*','x[0-9]+','[0-9]+x','repeat ((to end)|(to faded*))', 
              r"[#&$%\*\+@~=\^_\|`\(\)\[\]]"
              ]

neg_words = ['( doesn)t ','( didn)t ','( haven)t ','( hadn)t ','( isn)t ','( aren)t ','( weren)t ','( don)t ','( wasn)t ','( can)t ']

def clean_garbled(s):
    no_garb = ftfy.fix_text(s)
    no_uni = ''.join([w if ord(w) < 128 else '' for w in no_garb]).lower()
    return no_uni

def hyphen_modify(s):
    return s.replace('-','')

def repeat_modify(s):
    # replace single repeat letters
    single_modified = re.sub(r'(\w|\!|\?)\1\1+',r'\1',s)
    # replace double and trible repeat units
    double_modified = re.sub(r'(\w\w)\1\1+',r'\1 \1 \1',single_modified)
    trible_modified = re.sub(r'(\w\w\w)\1\1+',r'\1 \1 \1',double_modified)
    return trible_modified

def correction(s):
    suggestions = sym_spell.lookup_compound(s, max_edit_distance=2)
    return suggestions[0].term

def remove_marks(s):
    command = '|'.join(['('+ w + ')' for w in mark_words])
    regex = re.compile(command)
    no_marks = re.sub(regex, '', s)
    return re.sub(' +', ' ', no_marks)

def negation_return(s):
    temp = s
    for w in neg_words:
        temp = re.sub(w, "\g<1>'t ", temp)
    return temp

def lyrics_clean(s):
    no_uni = clean_garbled(s)
    no_hyphen = hyphen_modify(no_uni)
    no_repeat = repeat_modify(no_hyphen)
    corrected = correction(no_repeat)
    no_marks = remove_marks(corrected)
    negated = negation_return(no_marks)    
    return negated

In [9]:
from multiprocessing import Pool
#cleaned result
p = Pool()
result = list(p.map(lyrics_clean,billboard_df['Lyrics'])) 

In [10]:
billboard_df['Lyrics'] = result

In [56]:
billboard_df = billboard_df.loc[billboard_df['Lyrics']!='']

In [11]:
billboard_df = billboard_df.reset_index(drop=True)

In [12]:
len(billboard_df)

5068

In [12]:
# language detection
from langdetect import detect, detect_langs, DetectorFactory
p = Pool()
DetectorFactory.seed = 0

In [24]:
final = set()
for i in range(10):
    result = p.map(detect, billboard_df['Lyrics'])
    if len(final) != 0:
        final &= set(enumerate(result))
    else:
        final = set(enumerate(result))
    print('done',i)

done 0
done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9


In [25]:
final_list = [x[0] for x in final]

In [31]:
len(final_list)

5049

In [32]:
en_list = [x[0] for x in final if x[1]=='en']

In [33]:
len(en_list)

5031

In [37]:
billboard_df = billboard_df.drop(columns='language')

In [39]:
billboard_df['Language']=0
billboard_df.loc[en_list,'Language']=1

In [41]:
billboard_df

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Language
0,1,wooly bully,sam the sham and the pharaohs,1965,uno dos one two tres quatro matty told hatty a...,1
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love you...,1
2,3,i cant get no satisfaction,the rolling stones,1965,i can't get no satisfaction i can't get me no ...,1
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my min...,1
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss ...,1
...,...,...,...,...,...,...
5063,96,el perdon,nicky jam and enrique iglesias,2015,dime si es ver dad me dijon que te estes casa ...,0
5064,97,she knows,neyo featuring juicy j,2015,yeah uhuh your boy juicy j she bad yeah in cas...,1
5065,98,night changes,one direction,2015,going out tonight changes into something red h...,1
5066,99,back to back,drake,2015,oh man oh man oh man not again yeah i learned ...,1


In [8]:
from multiprocessing import Pool
# tokenization
p = Pool()
billboard_df['Tokens'] = p.map(lucem_illud_2020.word_tokenize,billboard_df['Lyrics']) 

In [9]:
import string
my_stop_words = [str(i) for i in range(10)] \
     + [x for x in string.ascii_lowercase if x not in ['a','u','o','i']] + ['-pron-']
my_stop_words

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'j',
 'k',
 'l',
 'm',
 'n',
 'p',
 'q',
 'r',
 's',
 't',
 'v',
 'w',
 'x',
 'y',
 'z',
 '-pron-']

In [10]:
# normalization
billboard_df['Normalized'] = p.starmap(lucem_illud_2020.normalizeTokens,
                                       [(x,my_stop_words) for x in billboard_df['Tokens']])

In [11]:
# re-normalization for '-PRON-'
billboard_df['Normalized'] = p.starmap(lucem_illud_2020.normalizeTokens,
                                       [(x,my_stop_words) for x in billboard_df['Normalized']])

In [12]:
billboard_df

Unnamed: 0,Rank,Song,Artist,Year,Lyrics,Language,Tokens,Normalized
0,1,wooly bully,sam the sham and the pharaohs,1965,uno dos one two tres quatro matty told hatty a...,1,"[uno, dos, one, two, tres, quatro, matty, told...","[uno, dos, tres, quatro, matty, tell, hatty, t..."
1,2,i cant help myself sugar pie honey bunch,four tops,1965,sugar pie honey bunch you know that i love you...,1,"[sugar, pie, honey, bunch, you, know, that, i,...","[sugar, pie, honey, bunch, know, love, help, l..."
2,3,i cant get no satisfaction,the rolling stones,1965,i can't get no satisfaction i can't get me no ...,1,"[i, ca, n't, get, no, satisfaction, i, ca, n't...","[satisfaction, satisfaction, try, try, try, tr..."
3,4,you were on my mind,we five,1965,when i woke up this morning you were on my min...,1,"[when, i, woke, up, this, morning, you, were, ...","[wake, morning, mind, mind, trouble, whoa, oh,..."
4,5,youve lost that lovin feelin,the righteous brothers,1965,you never close your eyes anymore when i kiss ...,1,"[you, never, close, your, eyes, anymore, when,...","[close, eye, anymore, kiss, lip, tenderness, l..."
...,...,...,...,...,...,...,...,...
5063,96,el perdon,nicky jam and enrique iglesias,2015,dime si es ver dad me dijon que te estes casa ...,0,"[dime, si, es, ver, dad, me, dijon, que, te, e...","[dime, si, es, ver, dad, dijon, que, te, estes..."
5064,97,she knows,neyo featuring juicy j,2015,yeah uhuh your boy juicy j she bad yeah in cas...,1,"[yeah, uhuh, your, boy, juicy, j, she, bad, ye...","[yes, uhuh, boy, juicy, bad, yes, case, know, ..."
5065,98,night changes,one direction,2015,going out tonight changes into something red h...,1,"[going, out, tonight, changes, into, something...","[tonight, change, red, mother, like, kind, dre..."
5066,99,back to back,drake,2015,oh man oh man oh man not again yeah i learned ...,1,"[oh, man, oh, man, oh, man, not, again, yeah, ...","[oh, man, oh, man, oh, man, yes, learn, game, ..."


In [13]:
billboard_df.to_csv('cleaned_billboard.csv',index=None)