In [30]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import math

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter

In [2]:
#lyric files
eurovision = open("data/intersection.txt", "r")
c60s = open("data/1960s.txt", "r")
c70s = open("data/1970s.txt", "r")
c80s = open("data/1980s.txt", "r")
c90s = open("data/1990s.txt", "r")
c00s = open("data/2000s.txt", "r")
c10s = open("data/2010s.txt", "r")

eurovisionLyrics = eurovision.read()
lyrics60s = c60s.read()
lyrics70s = c70s.read()
lyrics80s = c80s.read()
lyrics90s = c90s.read()
lyrics00s = c00s.read()
lyrics10s = c10s.read()

eurovision.close()
c60s.close()
c70s.close()
c80s.close()
c90s.close()
c00s.close()
c10s.close()

In [10]:
allLyrics = eurovisionLyrics + lyrics60s + lyrics70s + lyrics80s + lyrics90s + lyrics00s + lyrics10s
len(allLyrics)

2289901

In [3]:
#make lowercase
def convLower(text):
    text = text.lower()
    return text

In [4]:
#remove punctuation
def removePunctuation(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [5]:
#remove non-alphabet characters
def removeNonAlphabet(text):
    regex = re.sub(r'[^a-zA-Z]', '', text) 
    return text

In [11]:
#remove numbers
def removeNumbers(text):
    text = "".join(i for i in text if not i.isdigit())
    return text

In [12]:
def cleanText(data):
    data = convLower(data)
    data = removePunctuation(data)
    data = removeNonAlphabet(data)
    data = removeNumbers(data)
    data = removePunctuation(data)
    return data

In [13]:
cleanEV = cleanText(eurovisionLyrics)
clean60s = cleanText(lyrics60s)
clean70s = cleanText(lyrics70s)
clean80s = cleanText(lyrics80s)
clean90s = cleanText(lyrics90s)
clean00s = cleanText(lyrics00s)
clean10s = cleanText(lyrics10s)

In [14]:
#tokenize as words
wordsEurovision = word_tokenize(cleanEV)
words60s = word_tokenize(clean60s)
words70s = word_tokenize(clean70s)
words80s = word_tokenize(clean80s)
words90s = word_tokenize(clean90s)
words00s = word_tokenize(clean00s)
words10s = word_tokenize(clean10s)

In [31]:
#get unique words
uniqueWords = set(wordsEurovision).union(set(words60s), set(words70s), set(words80s), set(words90s), set(words00s), set(words10s))

In [124]:
def wordCount(data):
    dictName = dict.fromkeys(uniqueWords, 0)
    for word in data:
        dictName[word] += 1
    return dictName

wcEV = wordCount(wordsEurovision)
wc60s = wordCount(words60s)
wc70s = wordCount(words70s)
wc80s = wordCount(words80s)
wc90s = wordCount(words90s)
wc00s = wordCount(words00s)
wc10s = wordCount(words10s)

20421

In [26]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [29]:
tfEV = computeTF(wcEV, wordsEurovision)
tf60s = computeTF(wc60s, words60s)
tf70s = computeTF(wc60s, words70s)
tf80s = computeTF(wc60s, words80s)
tf90s = computeTF(wc60s, words90s)
tf10s = computeTF(wc60s, words10s)
tf00s = computeTF(wc60s, words00s)

In [41]:
def computeIDFBasic(documents):
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict 

allIDFBasic = computeIDFBasic([wcEV, wc60s, wc70s, wc80s, wc90s, wc00s, wc10s])

In [39]:
def computeTFIDF(tfData, idfs):
    tfidf = {}
    for word, val in tfData.items():
        tfidf[word] = val * idfs[word]

        return tfidf

In [72]:
tfidfBEV = computeTFIDF(tfEV, allIDFBasic)
tfidfB60s = computeTFIDF(tf60s, allIDFBasic)
tfidfB70s = computeTFIDF(tf70s, allIDFBasic)
tfidfB80s = computeTFIDF(tf80s, allIDFBasic)
tfidfB90s = computeTFIDF(tf90s, allIDFBasic)
tfidfB00s = computeTFIDF(tf00s, allIDFBasic)
tfidfB10s = computeTFIDF(tf10s, allIDFBasic)

In [107]:
basicData = {'EV': tfidfBEV, '60s': tfidfB60s, '70s': tfidfB70s, '80s': tfidfB80s, '90s': tfidfB90s, '00s': tfidfB00s, '10s': tfidfB10s}
dfBasic = pd.DataFrame(data=basicData)
dfBasic.head()

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s
a,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaa,3.7e-05,0.0,0.0,0.0,0.0,0.0,0.0
aaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaani,5.7e-05,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
dfBasic.nsmallest(15, '10s')

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s
a,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaa,3.7e-05,0.0,0.0,0.0,0.0,0.0,0.0
aaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaani,5.7e-05,0.0,0.0,0.0,0.0,0.0,0.0
aaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aagg,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aah,5e-05,0.0,0.0,0.0,0.0,0.0,0.0
aah…,5.7e-05,0.0,0.0,0.0,0.0,0.0,0.0
aaliyah,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
def computeIDFPlusOne(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = 1 + math.log(N / float(val))
    return idfDict 

allIDFPlusOne = computeIDFPlusOne([wcEV, wc60s, wc70s, wc80s, wc90s, wc00s, wc10s])

In [97]:
tfidfPEV = computeTFIDF(tfEV, allIDFPlusOne)
tfidfP60s = computeTFIDF(tf60s, allIDFPlusOne)
tfidfP70s = computeTFIDF(tf70s, allIDFPlusOne)
tfidfP80s = computeTFIDF(tf80s, allIDFPlusOne)
tfidfP90s = computeTFIDF(tf90s, allIDFPlusOne)
tfidfP00s = computeTFIDF(tf00s, allIDFPlusOne)
tfidfP10s = computeTFIDF(tf10s, allIDFPlusOne)

In [99]:
plusOneData = {'EV': tfidfPEV, '60s': tfidfP60s, '70s': tfidfP70s, '80s': tfidfP80s, '90s': tfidfP90s, '00s': tfidfP00s, '10s': tfidfP10s}
dfPlusOne = pd.DataFrame(data=plusOneData)
dfPlusOne.head()

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s
a,0.018561,0.022156,0.145605,0.060466,0.10411,0.051436,0.087829
aaa,6.6e-05,0.0,0.0,0.0,0.0,0.0,0.0
aaaa,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaah,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aaaani,8.7e-05,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
dfPlusOne.nlargest(15, '00s')

Unnamed: 0,EV,60s,70s,80s,90s,00s,10s
the,0.03786,0.057969,0.380956,0.158201,0.272389,0.134575,0.229793
and,0.022722,0.043157,0.283616,0.117778,0.202789,0.100189,0.171077
to,0.019623,0.026368,0.173285,0.07196,0.123901,0.061214,0.104525
a,0.018561,0.022156,0.145605,0.060466,0.10411,0.051436,0.087829
i,0.036089,0.018713,0.122976,0.051069,0.08793,0.043442,0.074179
in,0.014135,0.017329,0.113878,0.04729,0.081425,0.040228,0.068691
you,0.034348,0.014806,0.097301,0.040407,0.069572,0.034372,0.058692
of,0.013368,0.014055,0.092367,0.038357,0.066044,0.032629,0.055716
he,0.001682,0.011914,0.078296,0.032514,0.055983,0.027659,0.047228
mam,0.0,0.010645,0.069957,0.029051,0.05002,0.024713,0.042198


In [128]:
bgEV = list(nltk.bigrams(wordsEurovision))
bg60s = list(nltk.bigrams(words60s))
bg70s = list(nltk.bigrams(words70s))
bg80s = list(nltk.bigrams(words80s))
bg90s = list(nltk.bigrams(words90s))
bg00s = list(nltk.bigrams(words00s))
bg10s = list(nltk.bigrams(words10s))

('your', 'legs')

In [61]:
#get unique bigrams
uniqueBigrams = set(bgEV).union(set(bg60s), set(bg70s), set(bg80s), set(bg90s), set(bg00s), set(bg10s))

In [162]:
#bigram count
def bigramCount(data):
    dictName = dict.fromkeys(uniqueBigrams, 0)
    for i in data:
        dictName[i] += 1
    return dictName

bgcEV = bigramCount(bgEV)
bgc60s = bigramCount(bg60s)
bgc70s = bigramCount(bg70s)
bgc80s = bigramCount(bg80s)
bgc90s = bigramCount(bg90s)
bgc00s = bigramCount(bg00s)
bgc10s = bigramCount(bg10s)

In [137]:
tfbgEV = computeTF(bgcEV, bgEV)
tfbg60s = computeTF(bgc60s, bg60s)
tfbg70s = computeTF(bgc70s, bg70s)
tfbg80s = computeTF(bgc80s, bg80s)
tfbg90s = computeTF(bgc90s, bg90s)
tfbg10s = computeTF(bgc00s, bg00s)
tfbg00s = computeTF(bgc10s, bg10s)

In [138]:
bgIDF = computeIDFBasic([bgcEV, bgc60s, bgc70s, bgc80s, bgc90s, bgc00s, bgc10s])

In [150]:
tfidfBigramEV = computeTFIDF(tfbgEV, bgIDF)
tfidfBigram60s = computeTFIDF(tfbg60s, bgIDF)
tfidfBigram70s = computeTFIDF(tfbg70s, bgIDF)
tfidfBigram80s = computeTFIDF(tfbg80s, bgIDF)
tfidfBigram90s = computeTFIDF(tfbg90s, bgIDF)
tfidfBigram00s = computeTFIDF(tfbg00s, bgIDF)
tfidfBigram10s = computeTFIDF(tfbg10s, bgIDF)
dfBigram = pd.DataFrame([tfidfBigramEV, tfidfBigram60s, tfidfBigram70s, tfidfBigram80s, tfidfBigram90s, tfidfBigram00s, tfidfBigram10s])
dfBigram

Unnamed: 0,"(a, a)","(a, ass)","(a, attitude)","(a, baby)","(a, babys)","(a, baccarat)","(a, back)","(a, background)","(a, bad)","(a, badbad)",...,"(”, —boil)","(”, “)","(„, wie)","(•, bring)","(…, and)","(…, but)","(…, sed)","(…but, now)","(…no…, this)","(﻿i, was)"
0,0.0,0.0,0.0,2.4e-05,1.1e-05,0.0,1.5e-05,0.0,9.2e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.1e-05,0.0,0.0,0.0,3.1e-05,3.6e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,9e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.9e-05,0.0,0.0,8e-06,0.0,0.0,0.0,0.0,3.9e-05,9.1e-05,...,0.0,0.0,4.5e-05,0.0,0.0,4.5e-05,0.0,0.0,4.5e-05,4.5e-05
5,0.0,2.7e-05,0.000106,1.4e-05,0.0,2.7e-05,1.7e-05,0.0,0.000137,0.0,...,2.7e-05,2.7e-05,0.0,0.0,2.7e-05,0.0,2.7e-05,0.0,0.0,0.0


In [156]:
biGramData = {'EV': tfidfBigramEV, '60s': tfidfBigram60s, '70s': tfidfBigram70s, '80s': tfidfBigram80s, '90s': tfidfBigram90s, '00s': tfidfBigram00s, '10s': tfidfBigram10s}
dfBigram = pd.DataFrame(data=biGramData)
dfBigram.nlargest(15, 'EV')

Unnamed: 0,Unnamed: 1,EV,60s,70s,80s,90s,00s,10s
don,’,0.001073,0.0,0.0,0.0,7.7e-05,5.2e-05,0.000282
you,thank,0.000861,0.0,0.0,0.0,0.0,0.0,0.0
s,no,0.000702,0.0,0.0,0.0,0.0,0.0,3.4e-05
and,earth,0.000632,0.0,0.0,0.0,0.0,0.0,0.0
might,now,0.000632,0.0,0.0,0.0,0.0,0.0,0.0
shine,a,0.000632,0.0,0.0,0.0,0.0,0.0,0.0
vote,vote,0.000632,0.0,0.0,0.0,0.0,0.0,0.0
your,toy,0.000632,0.0,0.0,0.0,0.0,0.0,0.0
drum,drum,0.000574,0.0,0.0,0.0,0.0,0.0,0.0
hu,ha,0.000574,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
tgEV = list(nltk.trigrams(wordsEurovision))
tg60s = list(nltk.trigrams(words60s))
tg70s = list(nltk.trigrams(words70s))
tg80s = list(nltk.trigrams(words80s))
tg90s = list(nltk.trigrams(words90s))
tg00s = list(nltk.trigrams(words00s))
tg10s = list(nltk.trigrams(words10s))

In [160]:
#get unique trigrams
uniqueTrigrams = set(tgEV).union(set(tg60s), set(tg70s), set(tg80s), set(tg90s), set(tg00s), set(tg10s))

In [164]:
#bigram count
def trigramCount(data):
    dictName = dict.fromkeys(uniqueTrigrams, 0)
    for i in data:
        dictName[i] += 1
    return dictName

tgcEV = trigramCount(tgEV)
tgc60s = trigramCount(tg60s)
tgc70s = trigramCount(tg70s)
tgc80s = trigramCount(tg80s)
tgc90s = trigramCount(tg90s)
tgc00s = trigramCount(tg00s)
tgc10s = trigramCount(tg10s)

In [165]:
tgIDF = computeIDFBasic([tgcEV, tgc60s, tgc70s, tgc80s, tgc90s, tgc00s, tgc10s])

In [167]:
tftgEV = computeTF(tgcEV, tgEV)
tftg60s = computeTF(tgc60s, tg60s)
tftg70s = computeTF(tgc70s, tg70s)
tftg80s = computeTF(tgc80s, tg80s)
tftg90s = computeTF(tgc90s, tg90s)
tftg10s = computeTF(tgc00s, tg00s)
tftg00s = computeTF(tgc10s, tg10s)

In [168]:
tfidfTrigramEV = computeTFIDF(tftgEV, tgIDF)
tfidfTrigram60s = computeTFIDF(tftg60s, tgIDF)
tfidfTrigram70s = computeTFIDF(tftg70s, tgIDF)
tfidfTrigram80s = computeTFIDF(tftg80s, tgIDF)
tfidfTrigram90s = computeTFIDF(tftg90s, tgIDF)
tfidfTrigram00s = computeTFIDF(tftg00s, tgIDF)
tfidfTrigram10s = computeTFIDF(tftg10s, tgIDF)
dfTrigram = pd.DataFrame([tfidfTrigramEV, tfidfTrigram60s, tfidfTrigram70s, tfidfTrigram80s, tfidfTrigram90s, tfidfTrigram00s, tfidfTrigram10s])


Unnamed: 0,"(a, a, a)","(a, a, little)","(a, a, were)","(a, ass, like)","(a, attitude, pop)","(a, baby, again)","(a, baby, and)","(a, baby, anymore)","(a, baby, around)","(a, baby, but)",...,"(”, —boil, experience)","(”, “, seven)","(„, wie, gehts)","(•, bring, me)","(…, and, he)","(…, but, like)","(…, sed, propter)","(…but, now, it)","(…no…, this, man)","(﻿i, was, lost)"
0,0.000115,0.0,0.000115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.7e-05,0.0,0.0,0.0,5.7e-05,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.1e-05,2.3e-05,1.1e-05,1.1e-05,1.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,4.5e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.5e-05,0.0,0.0,4.5e-05,0.0,0.0,4.5e-05,4.5e-05
6,0.0,0.0,0.0,2.7e-05,0.000106,0.0,0.0,0.0,0.0,0.0,...,2.7e-05,2.7e-05,0.0,0.0,2.7e-05,0.0,2.7e-05,0.0,0.0,0.0


In [172]:
triGramData = {'EV': tfidfTrigramEV, '60s': tfidfTrigram60s, '70s': tfidfTrigram70s, '80s': tfidfTrigram80s, '90s': tfidfTrigram90s, '00s': tfidfTrigram00s, '10s': tfidfTrigram10s}
dfTrigram = pd.DataFrame(data=triGramData)
dfTrigram.nlargest(15, '10s')

Unnamed: 0,Unnamed: 1,Unnamed: 2,EV,60s,70s,80s,90s,00s,10s
of,the,species,0.0,0.0,0.0,0.0,0.0,0.0,0.00159
be,gone,with,0.0,0.0,0.0,0.0,0.0,0.0,0.001193
go,head,be,0.0,0.0,0.0,0.0,0.0,0.0,0.001193
gone,with,it,0.0,0.0,0.0,0.0,0.0,0.0,0.001193
head,be,gone,0.0,0.0,0.0,0.0,0.0,0.0,0.001193
youre,ready,for,0.0,0.0,0.0,0.0,0.0,0.0,0.000954
this,my,shit,0.0,0.0,0.0,0.0,0.0,0.0,0.000848
best,the,best,0.0,0.0,0.0,0.0,0.0,0.0,0.000795
dont,stop,the,0.0,0.0,0.0,0.0,0.0,0.0,0.000795
ready,for,this,0.0,0.0,0.0,0.0,0.0,0.0,0.000795
