In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import math

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from contractions import expandContractions

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 01:11:02 2016
@author: DIP
author's repo: https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/contractions.py
"""

CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

#remove contractions, code by Dipanjan Sarkar (git linked on contraction map cell)
def expandContractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    
    return expanded_text


In [3]:
#lyric files
eurovision = open("data/intersection.txt", "r")
c60s = open("data/1960s.txt", "r")
c70s = open("data/1970s.txt", "r")
c80s = open("data/1980s.txt", "r")
c90s = open("data/1990s.txt", "r")
c00s = open("data/2000s.txt", "r")
c10s = open("data/2010s.txt", "r")

eurovisionLyrics = eurovision.read()
lyrics60s = c60s.read()
lyrics70s = c70s.read()
lyrics80s = c80s.read()
lyrics90s = c90s.read()
lyrics00s = c00s.read()
lyrics10s = c10s.read()

eurovision.close()
c60s.close()
c70s.close()
c80s.close()
c90s.close()
c00s.close()
c10s.close()

In [4]:
#make lowercase
def convLower(text):
    text = text.lower()
    return text

#remove punctuation
def removePunctuation(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

#remove non-alphabet characters
def removeNonAlphabet(text):
    regex = re.sub(r'[^a-zA-Z]', '', text) 
    return text

#remove numbers
def removeNumbers(text):
    text = "".join(i for i in text if not i.isdigit())
    return text

def cleanText(data):
    data = convLower(data)
    data = expandContractions(data)
    data = removePunctuation(data)
    data = removeNonAlphabet(data)
    data = removeNumbers(data)
    data = removePunctuation(data)
    return data

In [5]:
#create a dictionary of words and their occurence for each document
def wordCount(data):
    dictName = dict.fromkeys(uniqueWords, 0)
    for word in data:
        dictName[word] += 1
    return dictName

def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

#returns dictionary of TF values
#basic IDF with log
def computeIDFBasic(documents):
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict 


def computeIDFPlusOne(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = 1 + math.log(N / float(val))
    return idfDict

def computeTFIDF(tfData, idfs):
    tfidf = {}
    for word, val in tfData.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [6]:
#clean lyrics
cleanEV = cleanText(eurovisionLyrics)
clean60s = cleanText(lyrics60s)
clean70s = cleanText(lyrics70s)
clean80s = cleanText(lyrics80s)
clean90s = cleanText(lyrics90s)
clean00s = cleanText(lyrics00s)
clean10s = cleanText(lyrics10s)

In [7]:
cleanDecades = clean60s + clean70s + clean80s + clean90s + clean00s + clean10s

In [10]:
#tokenize as words
wordsEurovision = word_tokenize(cleanEV)
words60s = word_tokenize(clean60s)
words70s = word_tokenize(clean70s)
words80s = word_tokenize(clean80s)
words90s = word_tokenize(clean90s)
words00s = word_tokenize(clean00s)
words10s = word_tokenize(clean10s)
wordsDecades = word_tokenize(cleanDecades)

In [14]:
#get unique words
uniqueWords = set(wordsEurovision).union(set(words60s), set(words70s), set(words80s), set(words90s), set(words00s), set(words10s), set(wordsDecades))



In [15]:
#word count
wcEV = wordCount(wordsEurovision)
wc60s = wordCount(words60s)
wc70s = wordCount(words70s)
wc80s = wordCount(words80s)
wc90s = wordCount(words90s)
wc00s = wordCount(words00s)
wc10s = wordCount(words10s)
wcDecades = wordCount(wordsDecades)

In [16]:
#TF values 
tfEV = computeTF(wcEV, wordsEurovision)
tf60s = computeTF(wc60s, words60s)
tf70s = computeTF(wc70s, words70s)
tf80s = computeTF(wc80s, words80s)
tf90s = computeTF(wc90s, words90s)
tf00s = computeTF(wc00s, words00s)
tf10s = computeTF(wc10s, words10s)
tfDecades = computeTF(wcDecades, wordsDecades)

In [17]:
#IDF values (basic idf, log)
allIDFBasic = computeIDFBasic([wcEV, wc60s, wc70s, wc80s, wc90s, wc00s, wc10s, wcDecades])
#IDF with plus one (basic idf, 1 + log)
allIDFPlusOne = computeIDFPlusOne([wcEV, wc60s, wc70s, wc80s, wc90s, wc00s, wc10s, wcDecades])

In [21]:
#basic ttfidf
tfidfBEV = computeTFIDF(tfEV, allIDFBasic)
tfidfB60s = computeTFIDF(tf60s, allIDFBasic)
tfidfB70s = computeTFIDF(tf70s, allIDFBasic)
tfidfB80s = computeTFIDF(tf80s, allIDFBasic)
tfidfB90s = computeTFIDF(tf90s, allIDFBasic)
tfidfB00s = computeTFIDF(tf00s, allIDFBasic)
tfidfB10s = computeTFIDF(tf10s, allIDFBasic)
tfidfBDecades = computeTFIDF(tfDecades, allIDFBasic)

In [26]:
#summarise results in dataframe
basicData = {'EV': tfidfBEV, '60s': tfidfB60s, '70s': tfidfB70s, '80s': tfidfB80s, '90s': tfidfB90s, '00s': tfidfB00s, '10s': tfidfB10s, 'decades': tfidfBDecades}
dfBasic = pd.DataFrame(data=basicData)

#top 15 values sorted by Eurovision and Decades
top15EVB = dfBasic.nlargest(15, 'EV')
top15DecadesB = dfBasic.nlargest(15, 'decades')

In [24]:
#tfidf with idf plus one
tfidfPEV = computeTFIDF(tfEV, allIDFPlusOne)
tfidfP60s = computeTFIDF(tf60s, allIDFPlusOne)
tfidfP70s = computeTFIDF(tf70s, allIDFPlusOne)
tfidfP80s = computeTFIDF(tf80s, allIDFPlusOne)
tfidfP90s = computeTFIDF(tf90s, allIDFPlusOne)
tfidfP00s = computeTFIDF(tf00s, allIDFPlusOne)
tfidfP10s = computeTFIDF(tf10s, allIDFPlusOne)
tfidfPDecades = computeTFIDF(tfDecades, allIDFPlusOne)

In [27]:
plusOneData = {'EV': tfidfPEV, '60s': tfidfP60s, '70s': tfidfP70s, '80s': tfidfP80s, '90s': tfidfP90s, '00s': tfidfP00s, '10s': tfidfP10s, 'decades': tfidfPDecades}
dfPlusOne = pd.DataFrame(data=plusOneData)

#top 15 values sorted by Eurovision and Decades
top15EVP = dfPlusOne.nlargest(15, 'EV')
top15DecadesP = dfPlusOne.nlargest(15, 'decades')

In [38]:
top15DecadesB

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s,decades
mam,0.0,0.004833,0.0,0.0,0.0,0.0,0.0,0.001999
malachy,0.0,0.002872,0.0,0.0,0.0,0.0,0.0,0.001188
bond,0.0,0.0,0.0,0.0,1.8e-05,0.004235,6.1e-05,0.000759
limerick,0.0,0.001648,0.0,0.0,0.0,0.0,0.0,0.000681
natalya,0.0,0.0,0.0,0.0,0.0,0.00364,0.0,0.000646
raphael,0.0,0.0,0.0,0.004191,0.0,0.0,0.0,0.000629
mrs,0.0,0.001155,0.0,0.0,0.0,0.0,2.2e-05,0.00048
alec,0.0,0.0,0.0,0.0,0.0,0.002671,0.0,0.000474
mr,0.0,0.000851,0.0,0.0,7.4e-05,0.000279,2.1e-05,0.00041
paddy,0.0,0.000863,0.0,0.0,0.0,0.0,0.0,0.000357


In [39]:
top15EVB 

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s,decades
–,0.001868,2e-06,2.1e-05,0.0,0.0,8e-06,1.3e-05,5e-06
vote,0.001075,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hu,0.000597,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sweden,0.000597,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ukraine,0.000597,0.0,0.0,0.0,0.0,0.0,0.0,0.0
waterloo,0.000597,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bye,0.000577,4e-06,0.0,0.0,0.0,0.000466,0.0,8.4e-05
mamma,0.000518,2.4e-05,0.000281,0.0,0.0,0.0,0.0,2.8e-05
cululoo,0.000478,0.0,0.0,0.0,0.0,0.0,0.0,0.0
douze,0.000478,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
top15DecadesP

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s,decades
the,0.036856,0.055953,0.037153,0.050398,0.030471,0.042687,0.026866,0.046213
i,0.042372,0.024269,0.043707,0.025635,0.049463,0.032513,0.049653,0.032111
and,0.022119,0.041639,0.02062,0.023906,0.018105,0.019358,0.015805,0.028848
you,0.037402,0.01733,0.039252,0.024872,0.04868,0.028502,0.036553,0.026681
to,0.019103,0.025441,0.020767,0.024778,0.022018,0.018751,0.014962,0.022444
a,0.018069,0.021377,0.022498,0.023578,0.0138,0.020215,0.016204,0.020347
is,0.014651,0.019645,0.017822,0.016971,0.01967,0.015808,0.01115,0.017552
in,0.01376,0.016719,0.012409,0.015818,0.008713,0.012773,0.012812,0.014479
it,0.014363,0.008954,0.015391,0.014088,0.019462,0.019278,0.016425,0.013697
not,0.011749,0.012389,0.013955,0.011142,0.019905,0.013961,0.015982,0.013634


In [41]:
top15EVP

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s,decades
i,0.042372,0.024269,0.043707,0.025635,0.049463,0.032513,0.049653,0.032111
you,0.037402,0.01733,0.039252,0.024872,0.04868,0.028502,0.036553,0.026681
the,0.036856,0.055953,0.037153,0.050398,0.030471,0.042687,0.026866,0.046213
and,0.022119,0.041639,0.02062,0.023906,0.018105,0.019358,0.015805,0.028848
to,0.019103,0.025441,0.020767,0.024778,0.022018,0.018751,0.014962,0.022444
a,0.018069,0.021377,0.022498,0.023578,0.0138,0.020215,0.016204,0.020347
my,0.015943,0.007867,0.015612,0.008119,0.012861,0.008933,0.015029,0.009791
me,0.015771,0.008144,0.016275,0.010924,0.020792,0.012087,0.019241,0.012085
is,0.014651,0.019645,0.017822,0.016971,0.01967,0.015808,0.01115,0.017552
’,0.014651,5.1e-05,0.00151,0.000732,0.001148,0.007693,0.001795,0.001884
