In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import math

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from contractions import expandContractions

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 01 01:11:02 2016
@author: DIP
author's repo: https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/bonus%20content/nlp%20proven%20approach/contractions.py
"""

CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

#remove contractions, code by Dipanjan Sarkar (git linked on contraction map cell)
def expandContractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    
    return expanded_text


In [3]:
#lyric files
eurovision = open("data/intersection.txt", "r")
c60s = open("data/1960s.txt", "r")
c70s = open("data/1970s.txt", "r")
c80s = open("data/1980s.txt", "r")
c90s = open("data/1990s.txt", "r")
c00s = open("data/2000s.txt", "r")
c10s = open("data/2010s.txt", "r")

eurovisionLyrics = eurovision.read()
lyrics60s = c60s.read()
lyrics70s = c70s.read()
lyrics80s = c80s.read()
lyrics90s = c90s.read()
lyrics00s = c00s.read()
lyrics10s = c10s.read()

eurovision.close()
c60s.close()
c70s.close()
c80s.close()
c90s.close()
c00s.close()
c10s.close()

In [4]:
#make lowercase
def convLower(text):
    text = text.lower()
    return text

#remove punctuation
def removePunctuation(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

#remove non-alphabet characters
def removeNonAlphabet(text):
    regex = re.sub(r'[^a-zA-Z]', '', text) 
    return text

#remove numbers
def removeNumbers(text):
    text = "".join(i for i in text if not i.isdigit())
    return text

def cleanText(data):
    data = convLower(data)
    data = expandContractions(data)
    data = removePunctuation(data)
    data = removeNonAlphabet(data)
    data = removeNumbers(data)
    data = removePunctuation(data)
    return data

In [5]:
#clean lyrics
cleanEV = cleanText(eurovisionLyrics)
clean60s = cleanText(lyrics60s)
clean70s = cleanText(lyrics70s)
clean80s = cleanText(lyrics80s)
clean90s = cleanText(lyrics90s)
clean00s = cleanText(lyrics00s)
clean10s = cleanText(lyrics10s)

In [6]:
cleanDecades = clean60s + clean70s + clean80s + clean90s + clean00s + clean10s

In [7]:
#tokenize as words
wordsEurovision = word_tokenize(cleanEV)
words60s = word_tokenize(clean60s)
words70s = word_tokenize(clean70s)
words80s = word_tokenize(clean80s)
words90s = word_tokenize(clean90s)
words00s = word_tokenize(clean00s)
words10s = word_tokenize(clean10s)
wordsDecades = word_tokenize(cleanDecades)

In [9]:
#bigrams for each lyrics set
bgEV = list(nltk.bigrams(wordsEurovision))
bg60s = list(nltk.bigrams(words60s))
bg70s = list(nltk.bigrams(words70s))
bg80s = list(nltk.bigrams(words80s))
bg90s = list(nltk.bigrams(words90s))
bg00s = list(nltk.bigrams(words00s))
bg10s = list(nltk.bigrams(words10s))
bgDecades = list(nltk.bigrams(wordsDecades))

In [10]:
#get unique bigrams
uniqueBigrams = set(bgEV).union(set(bg60s), set(bg70s), set(bg80s), set(bg90s), set(bg00s), set(bg10s), set(bgDecades))

In [11]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

#returns dictionary of TF values
#basic IDF with log
def computeIDFBasic(documents):
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict 


def computeIDFPlusOne(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = 1 + math.log(N / float(val))
    return idfDict

def computeTFIDF(tfData, idfs):
    tfidf = {}
    for word, val in tfData.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [12]:
#bigram count
def bigramCount(data):
    dictName = dict.fromkeys(uniqueBigrams, 0)
    for i in data:
        dictName[i] += 1
    return dictName

bgcEV = bigramCount(bgEV)
bgc60s = bigramCount(bg60s)
bgc70s = bigramCount(bg70s)
bgc80s = bigramCount(bg80s)
bgc90s = bigramCount(bg90s)
bgc00s = bigramCount(bg00s)
bgc10s = bigramCount(bg10s)
bgcDecades = bigramCount(bgDecades)

In [13]:
#TF values for bigrams
tfbgEV = computeTF(bgcEV, bgEV)
tfbg60s = computeTF(bgc60s, bg60s)
tfbg70s = computeTF(bgc70s, bg70s)
tfbg80s = computeTF(bgc80s, bg80s)
tfbg90s = computeTF(bgc90s, bg90s)
tfbg10s = computeTF(bgc00s, bg00s)
tfbg00s = computeTF(bgc10s, bg10s)
tfbgDecades = computeTF(bgcDecades, bgDecades)

In [17]:
#basic IDF value for bigrams
bgIDF = computeIDFBasic([bgcEV, bgc60s, bgc70s, bgc80s, bgc90s, bgc00s, bgc10s, bgcDecades])

In [18]:
#tfidf values
tfidfBigramEV = computeTFIDF(tfbgEV, bgIDF)
tfidfBigram60s = computeTFIDF(tfbg60s, bgIDF)
tfidfBigram70s = computeTFIDF(tfbg70s, bgIDF)
tfidfBigram80s = computeTFIDF(tfbg80s, bgIDF)
tfidfBigram90s = computeTFIDF(tfbg90s, bgIDF)
tfidfBigram00s = computeTFIDF(tfbg00s, bgIDF)
tfidfBigram10s = computeTFIDF(tfbg10s, bgIDF)
tfidfBigramDecades = computeTFIDF(tfbgDecades, bgIDF)

In [21]:
biGramData = {'EV': tfidfBigramEV, '60s': tfidfBigram60s, '70s': tfidfBigram70s, '80s': tfidfBigram80s, '90s': tfidfBigram90s, '00s': tfidfBigram00s, '10s': tfidfBigram10s, 'decades': tfidfBigramDecades}
dfBigram = pd.DataFrame(data=biGramData)

In [22]:
top15Decades = dfBigram.nlargest(15, 'decades')
top15EV = dfBigram.nlargest(15, 'decades')

In [25]:
#trigrams for each lyrics set
tgEV = list(nltk.trigrams(wordsEurovision))
tg60s = list(nltk.trigrams(words60s))
tg70s = list(nltk.trigrams(words70s))
tg80s = list(nltk.trigrams(words80s))
tg90s = list(nltk.trigrams(words90s))
tg00s = list(nltk.trigrams(words00s))
tg10s = list(nltk.trigrams(words10s))
tgDecades = list(nltk.trigrams(wordsDecades))


#get unique trigrams
uniqueTrigrams = set(tgEV).union(set(tg60s), set(tg70s), set(tg80s), set(tg90s), set(tg00s), set(tg10s), set(tgDecades))

#trigram count
def trigramCount(data):
    dictName = dict.fromkeys(uniqueTrigrams, 0)
    for i in data:
        dictName[i] += 1
    return dictName

tgcEV = trigramCount(tgEV)
tgc60s = trigramCount(tg60s)
tgc70s = trigramCount(tg70s)
tgc80s = trigramCount(tg80s)
tgc90s = trigramCount(tg90s)
tgc00s = trigramCount(tg00s)
tgc10s = trigramCount(tg10s)
tgcDecades = trigramCount(tgDecades)

#TF values for bigrams
tftgEV = computeTF(tgcEV, tgEV)
tftg60s = computeTF(tgc60s, tg60s)
tftg70s = computeTF(tgc70s, tg70s)
tftg80s = computeTF(tgc80s, tg80s)
tftg90s = computeTF(tgc90s, tg90s)
tftg10s = computeTF(tgc00s, tg00s)
tftg00s = computeTF(tgc10s, tg10s)
tftgDecades = computeTF(tgcDecades, tgDecades)

#basic IDF value for bigrams
tgIDF = computeIDFBasic([tgcEV, tgc60s, tgc70s, tgc80s, tgc90s, tgc00s, tgc10s, tgcDecades])

#tfidf values
tfidfTrigramEV = computeTFIDF(tftgEV, tgIDF)
tfidfTrigram60s = computeTFIDF(tftg60s, tgIDF)
tfidfTrigram70s = computeTFIDF(tftg70s, tgIDF)
tfidfTrigram80s = computeTFIDF(tftg80s, tgIDF)
tfidfTrigram90s = computeTFIDF(tftg90s, tgIDF)
tfidfTrigram00s = computeTFIDF(tftg00s, tgIDF)
tfidfTrigram10s = computeTFIDF(tftg10s, tgIDF)
tfidfTrigramDecades = computeTFIDF(tftgDecades, tgIDF)

#summarise results in dataframe
triGramData = {'EV': tfidfTrigramEV, '60s': tfidfTrigram60s, '70s': tfidfTrigram70s, '80s': tfidfTrigram80s, '90s': tfidfTrigram90s, '00s': tfidfTrigram00s, '10s': tfidfTrigram10s, 'decades': tfidfTrigramDecades}
dfTrigram = pd.DataFrame(data=triGramData)

#top 15 values sorted by Eurovision and Decades
top15EV = dfTrigram.nlargest(15, 'EV')
top15Decades = dfTrigram.nlargest(15, 'decades')

In [29]:
top15Decades 

Unnamed: 0,Unnamed: 1,Unnamed: 2,EV,60s,70s,80s,90s,00s,10s,decades
don,’,t,0.000878,0.0,0.0,0.0,6.1e-05,4.2e-05,0.000229,5.1e-05
you,thank,you,0.000836,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thank,you,thank,0.000717,0.0,0.0,0.0,0.0,0.0,0.0,0.0
we,got,love,0.000717,0.0,0.0,0.0,0.0,0.0,0.0,0.0
heaven,and,earth,0.000657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i,might,now,0.000657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
not,your,toy,0.000657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
shine,a,light,0.000657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
am,the,voice,0.000597,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rock,me,baby,0.000538,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
top15EV

Unnamed: 0,Unnamed: 1,Unnamed: 2,EV,60s,70s,80s,90s,00s,10s,decades
lie,la,lie,0.0,0.0,0.005054,0.0,0.0,0.0,0.0,0.000321
lovely,day,lovely,0.0,0.0,0.004799,0.0,0.0,0.0,0.0,0.000305
day,lovely,day,0.0,0.0,0.004748,0.0,0.0,0.0,0.0,0.000302
na,na,na,9.9e-05,3e-06,0.0,0.000596,0.001996,0.0,4.2e-05,0.000277
la,lie,la,0.0,0.0,0.003369,0.0,0.0,0.0,0.0,0.000214
of,the,species,0.0,0.0,0.0,0.0,0.0,0.0,0.001098,0.000195
the,post,office,0.0,0.000432,0.0,0.0,0.0,0.0,0.0,0.000178
lo,lo,lo,0.0,0.0,0.002655,0.0,0.0,0.0,0.0,0.000169
by,the,fire,0.0,0.000377,0.0,0.0,0.0,0.0,0.0,0.000156
be,gone,with,0.0,0.0,0.0,0.0,0.0,0.0,0.000823,0.000146


In [35]:
triGramCount = {'EV': tgcEV, '60s': tgc60s, '70s': tgc70s, '80s': tgc80s, '90s': tgc90s, '00s': tgc00s, '10s': tgc10s, 'decades': tgcDecades}
dfTrigramCount = pd.DataFrame(data=triGramCount)

biGramCount = {'EV': bgcEV, '60s': bgc60s, '70s': bgc70s, '80s': bgc80s, '90s': bgc90s, '00s': bgc00s, '10s': bgc10s, 'decades': bgcDecades}
dfBigramCount = pd.DataFrame(data=biGramCount)

In [39]:
dfTrigram.nlargest(15, 'decades')

Unnamed: 0,Unnamed: 1,Unnamed: 2,EV,60s,70s,80s,90s,00s,10s,decades
lie,la,lie,0.0,0.0,0.005054,0.0,0.0,0.0,0.0,0.000321
lovely,day,lovely,0.0,0.0,0.004799,0.0,0.0,0.0,0.0,0.000305
day,lovely,day,0.0,0.0,0.004748,0.0,0.0,0.0,0.0,0.000302
na,na,na,9.9e-05,3e-06,0.0,0.000596,0.001996,0.0,4.2e-05,0.000277
la,lie,la,0.0,0.0,0.003369,0.0,0.0,0.0,0.0,0.000214
of,the,species,0.0,0.0,0.0,0.0,0.0,0.0,0.001098,0.000195
the,post,office,0.0,0.000432,0.0,0.0,0.0,0.0,0.0,0.000178
lo,lo,lo,0.0,0.0,0.002655,0.0,0.0,0.0,0.0,0.000169
by,the,fire,0.0,0.000377,0.0,0.0,0.0,0.0,0.0,0.000156
be,gone,with,0.0,0.0,0.0,0.0,0.0,0.0,0.000823,0.000146


In [40]:
dfBigram.nlargest(15, 'decades')

Unnamed: 0,Unnamed: 1,EV,60s,70s,80s,90s,00s,10s,decades
mam,says,0.0,0.001185,0.0,0.0,0.0,0.0,0.0,0.00049
la,lie,0.0,0.0,0.005615,0.0,0.0,0.0,0.0,0.000357
lie,la,0.0,0.0,0.005615,0.0,0.0,0.0,0.0,0.000357
na,na,0.000124,5e-06,0.0,0.000619,0.002349,0.0,4.6e-05,0.000314
day,lovely,0.0,0.0,0.004798,0.0,0.0,0.0,0.0,0.000305
malachy,and,0.0,0.000659,0.0,0.0,0.0,0.0,0.0,0.000273
the,lane,0.0,0.000659,0.0,0.0,0.0,0.0,0.0,0.000273
tea,and,0.0,0.000604,0.0,0.0,0.0,0.0,0.0,0.00025
uncle,pa,0.0,0.000596,0.0,0.0,0.0,0.0,0.0,0.000247
aunt,aggie,0.0,0.000588,0.0,0.0,0.0,0.0,0.0,0.000243
