In [30]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import math

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter

In [2]:
#lyric files
eurovision = open("data/intersection.txt", "r")
c60s = open("data/1960s.txt", "r")
c70s = open("data/1970s.txt", "r")
c80s = open("data/1980s.txt", "r")
c90s = open("data/1990s.txt", "r")
c00s = open("data/2000s.txt", "r")
c10s = open("data/2010s.txt", "r")

eurovisionLyrics = eurovision.read()
lyrics60s = c60s.read()
lyrics70s = c70s.read()
lyrics80s = c80s.read()
lyrics90s = c90s.read()
lyrics00s = c00s.read()
lyrics10s = c10s.read()

eurovision.close()
c60s.close()
c70s.close()
c80s.close()
c90s.close()
c00s.close()
c10s.close()

In [10]:
allLyrics = eurovisionLyrics + lyrics60s + lyrics70s + lyrics80s + lyrics90s + lyrics00s + lyrics10s
len(allLyrics)

2289901

In [3]:
#make lowercase
def convLower(text):
    text = text.lower()
    return text

In [4]:
#remove punctuation
def removePunctuation(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

In [5]:
#remove non-alphabet characters
def removeNonAlphabet(text):
    regex = re.sub(r'[^a-zA-Z]', '', text) 
    return text

In [11]:
#remove numbers
def removeNumbers(text):
    text = "".join(i for i in text if not i.isdigit())
    return text

In [12]:
def cleanText(data):
    data = convLower(data)
    data = removePunctuation(data)
    data = removeNonAlphabet(data)
    data = removeNumbers(data)
    data = removePunctuation(data)
    return data

In [13]:
cleanEV = cleanText(eurovisionLyrics)
clean60s = cleanText(lyrics60s)
clean70s = cleanText(lyrics70s)
clean80s = cleanText(lyrics80s)
clean90s = cleanText(lyrics90s)
clean00s = cleanText(lyrics00s)
clean10s = cleanText(lyrics10s)

In [14]:
#tokenize as words
wordsEurovision = word_tokenize(cleanEV)
words60s = word_tokenize(clean60s)
words70s = word_tokenize(clean70s)
words80s = word_tokenize(clean80s)
words90s = word_tokenize(clean90s)
words00s = word_tokenize(clean00s)
words10s = word_tokenize(clean10s)

In [31]:
#get unique words
uniqueWords = set(wordsEurovision).union(set(words60s), set(words70s), set(words80s), set(words90s), set(words00s), set(words10s))

In [22]:
def wordCount(data):
    dictName = dict.fromkeys(uniqueWords, 0)
    for word in data:
        dictName[word] += 1
    return dictName

wcEV = wordCount(wordsEurovision)
wc60s = wordCount(words60s)
wc70s = wordCount(words70s)
wc80s = wordCount(words80s)
wc90s = wordCount(words90s)
wc00s = wordCount(words00s)
wc10s = wordCount(words10s)

In [26]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [29]:
tfEV = computeTF(wcEV, wordsEurovision)
tf60s = computeTF(wc60s, words60s)
tf70s = computeTF(wc60s, words70s)
tf80s = computeTF(wc60s, words80s)
tf90s = computeTF(wc60s, words90s)
tf10s = computeTF(wc60s, words10s)
tf00s = computeTF(wc60s, words00s)

In [41]:
def computeIDFBasic(documents):
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict 

allIDFBasic = computeIDFBasic([wcEV, wc60s, wc70s, wc80s, wc90s, wc00s, wc10s])

In [39]:
def computeTFIDF(tfData, idfs):
    tfidf = {}
    for word, val in tfData.items():
        tfidf[word] = val * idfs[word]

        return tfidf

In [72]:
tfidfBEV = computeTFIDF(tfEV, allIDFBasic)
tfidfB60s = computeTFIDF(tf60s, allIDFBasic)
tfidfB70s = computeTFIDF(tf70s, allIDFBasic)
tfidfB80s = computeTFIDF(tf80s, allIDFBasic)
tfidfB90s = computeTFIDF(tf90s, allIDFBasic)
tfidfB00s = computeTFIDF(tf00s, allIDFBasic)
tfidfB10s = computeTFIDF(tf10s, allIDFBasic)
dfBasic = pd.DataFrame([tfidfBEV, tfidfB60s, tfidfB70s, tfidfB80s, tfidfB90s, tfidfB00s, tfidfB10s])

In [75]:
dfBasic

Unnamed: 0,a,aaa,aaaa,aaaah,aaaani,aaah,aagg,aah,aah…,aaliyah,...,‘,’,“,”,„,•,…,…but,…no…,﻿i
0,0.0,3.7e-05,0.0,0.0,5.7e-05,0.0,0.0,5e-05,5.7e-05,0.0,...,0.00045,0.0,2.274414e-05,6e-05,0.0,5.7e-05,0.0,5.7e-05,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.042687e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.942586e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.467793e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.249033e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.099259e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.584566e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
def computeIDFPlusOne(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = 1 + math.log(N / float(val))
    return idfDict 

allIDFPlusOne = computeIDFPlusOne([wcEV, wc60s, wc70s, wc80s, wc90s, wc00s, wc10s])

In [76]:
tfidfPEV = computeTFIDF(tfEV, allIDFPlusOne)
tfidfP60s = computeTFIDF(tf60s, allIDFPlusOne)
tfidfP70s = computeTFIDF(tf70s, allIDFPlusOne)
tfidfP80s = computeTFIDF(tf80s, allIDFPlusOne)
tfidfP90s = computeTFIDF(tf90s, allIDFPlusOne)
tfidfP00s = computeTFIDF(tf00s, allIDFPlusOne)
tfidfP10s = computeTFIDF(tf10s, allIDFPlusOne)
dfPlusOne = pd.DataFrame([tfidfPEV, tfidfP60s, tfidfP70s, tfidfP80s, tfidfP90s, tfidfP00s, tfidfP10s])

In [77]:
dfPlusOne

Unnamed: 0,a,aaa,aaaa,aaaah,aaaani,aaah,aagg,aah,aah…,aaliyah,...,‘,’,“,”,„,•,…,…but,…no…,﻿i
0,0.018561,6.6e-05,0.0,0.0,8.7e-05,0.0,0.0,0.000109,8.7e-05,0.0,...,0.000981,0.01505,0.00017,0.000237,0.0,8.7e-05,0.0,8.7e-05,0.0,0.0
1,0.022156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.3e-05,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.145605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000347,4.4e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.060466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000144,1.8e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.10411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000248,3.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.051436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000123,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.087829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000209,2.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
bgEV = list(nltk.bigrams(wordsEurovision))
bg60s = list(nltk.bigrams(words60s))
bg70s = list(nltk.bigrams(words70s))
bg80s = list(nltk.bigrams(words80s))
bg90s = list(nltk.bigrams(words90s))
bg00s = list(nltk.bigrams(words00s))
bg10s = list(nltk.bigrams(words10s))

In [61]:
#get unique bigrams
uniqueBigrams = set(bgEV).union(set(bg60s), set(bg70s), set(bg80s), set(bg90s), set(bg00s), set(bg10s))

In [62]:
#bigram count


In [65]:
tfbgEV = computeTF(bgcEV, bgEV)
tfbg60s = computeTF(bgc60s, bg60s)
tfbg70s = computeTF(bgc70s, bg70s)
tfbg80s = computeTF(bgc80s, bg80s)
tfbg90s = computeTF(bgc90s, bg90s)
tfbg10s = computeTF(bgc00s, bg00s)
tfbg00s = computeTF(bgc10s, bg10s)

In [66]:
bgIDF = computeIDFBasic([bgcEV, bgc60s, bgc70s, bgc80s, bgc90s, bgc00s, bgc10s])

KeyError: ('little', 'prayer')

In [None]:
tfidfBBigram60s = computeTFIDF(tf60s, allIDFBasic)
tfidfBBigram70s = computeTFIDF(tf70s, allIDFBasic)
tfidfBBigram80s = computeTFIDF(tf80s, allIDFBasic)
tfidfBBigram90s = computeTFIDF(tf90s, allIDFBasic)
tfidfBBigram00s = computeTFIDF(tf00s, allIDFBasic)
tfidfBBigram10s = computeTFIDF(tf10s, allIDFBasic)
dfBasic = pd.DataFrame([tfidfBBigram60s, tfidfBBigram70s, tfidfBBigram80s, tfidfBBigram90s, tfidfBBigram00s, tfidfBBigram10s])