In [None]:
# https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [2]:
%%time
import pandas as pd
import os
import bisect
import codecs
import nltk
from aux_bisect import *
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import PorterStemmer


CPU times: user 445 ms, sys: 48.6 ms, total: 494 ms
Wall time: 632 ms


In [6]:
# Directory holding the lexicon files read later (anew.csv, positive/negative word lists).
LEXICONS_DIR = 'datasets/lexicons/'
# Directory whose files are each read as one lyric by load_moody().
MOODY_DIR = 'datasets/moody_lyrics/raw_lyrics/'
# Text file that load_bilboard() splits into one list entry per line.
BILBOARD = 'datasets/lyrics_bilboard.txt'

## Load the corpus

In [7]:
## Each lyric is loaded as one element of a list
def load_moody(directory=None):
    """Read every lyric file of the Moody corpus into a list of strings.

    Args:
        directory: folder to read from; defaults to the module-level MOODY_DIR.

    Returns:
        A list with the full UTF-8 decoded content of each regular file in
        `directory`, ordered by file name.
    """
    if directory is None:
        directory = MOODY_DIR

    lyrics_list = []
    # os.listdir() returns entries in arbitrary order; sorting keeps list
    # positions (e.g. lyrics_list[0]) stable across runs and machines.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories, which cannot be opened as lyric files.
        if not os.path.isfile(path):
            continue
        with codecs.open(path, 'r', 'utf-8') as f:
            lyrics_list.append(f.read())
    return lyrics_list


def load_bilboard():
    """Read the BILBOARD file as UTF-8 and return its content split into lines."""
    with codecs.open(BILBOARD, 'r', 'utf-8') as handle:
        content = handle.read()
    return content.splitlines()


In [11]:
# Corpus to analyse: 'moody' or 'bilboard'.
option = 'moody'

if option == 'moody':
    lyrics_list = load_moody()

elif option == 'bilboard':
    lyrics_list = load_bilboard()

else:
    # Fail loudly instead of leaving lyrics_list undefined (or stale from a
    # previous run of this cell) when the option is mistyped.
    raise ValueError("Unknown corpus option: %r (expected 'moody' or 'bilboard')" % option)

### Lyric example

In [12]:
# Show the first loaded lyric as plain text (print keeps the line breaks readable).
print(lyrics_list[0])

In this world of ordinary people
Extraordinary people
I'm glad there is you

In this world of over-rated treasures
Of under-rated pleasures
I'm so glad there is you

I live to love
I love to live with you be, beside me
This role so new
I'll muddle through, with you to guide me

So in this world where many, many play at love
And hardly any stay in love
I'm glad there is you more than ever
I'm glad there is you

I live to love
I love to live with you beside me
This role so new
I'll muddle through with you to guide me

So in this world, in this world, in this world
Where many, many play at love
But hardly any stay in love
I'm glad there is you more than ever
I'm glad there is you


## Corpus filtering

In [15]:
# English stopword list from NLTK, used by word_processor().
STOPWORDS = nltk.corpus.stopwords.words('english')

# ANEW lexicon: keep the word plus its valence/arousal means, ordered by word.
# NOTE(review): the ordering presumably matters because find_index() (from
# aux_bisect) searches the word list by bisection -- confirm against aux_bisect.
anew_raw = pd.read_csv(LEXICONS_DIR+'anew.csv')
anew_clean = anew_raw[['Description', 'Valence Mean', 'Arousal Mean']].sort_values(by='Description')
anew_words = anew_clean['Description'].tolist()
print("Length ANEW Lexicon: ", len(anew_words))

# Disabled: pre-processing of the General Inquirer sentiment lexicon.
# 'general_inquirer.csv' is not present under LEXICONS_DIR (reading it raised
# FileNotFoundError), so the code is kept only as a reference. It is written as
# real comments rather than a bare string literal so that it is not evaluated
# and echoed as the cell's output.
#
# general_inquirer = pd.read_csv(LEXICONS_DIR+'general_inquirer.csv')['A'].tolist()
# gi = []
# for senti in general_inquirer:
#     senti = str(senti.split('#')[0])
#     if not senti in gi:
#         gi.append(senti.lower())
# print("Length General Inquirer Lexicon: ", len(gi))


Length ANEW Lexicon:  1030


FileNotFoundError: File b'datasets/lexicons/general_inquirer.csv' does not exist

In [16]:
# Pre-process the positive/negative English word lists (one entry per line)
with open(LEXICONS_DIR + 'positive_words_en.txt') as pos_file:
    en_pos = pos_file.read().splitlines()

with open(LEXICONS_DIR + 'negative_words_en.txt') as neg_file:
    en_neg = neg_file.read().splitlines()

en_pos_neg = en_pos + en_neg
en_pos_neg.sort()
print("Length Positive/Negative English Lexicon: ", len(en_pos_neg))

mixed_corpus_not_stemmed = anew_words + en_pos_neg

# The combined lexicon has to be stemmed, because lyric tokens are stemmed
# before they are looked up in it; the result is kept in sorted order.
p_stemmer = PorterStemmer()
mixed_corpus = sorted(map(p_stemmer.stem, mixed_corpus_not_stemmed))
    

Length Positive/Negative English Lexicon:  4376


In [17]:
## Process a word
def word_processor(word, sentiment, lexicon=anew_words):
    """Normalise one token and decide whether it is kept.

    Args:
        word: token to examine; it is lower-cased before any check.
        sentiment: when truthy, the token is kept only if find_index() reports
            it in `lexicon`; when falsy, the lexicon is not consulted.
        lexicon: word list handed to find_index().
            NOTE(review): presumably must be sorted for find_index -- confirm.

    Returns:
        The lower-cased token when it is kept, otherwise False (stopword,
        non-alphabetic token, or not found in the lexicon).
    """
    token = word.lower()

    # Reject stopwords and anything containing non-letters up front.
    if token in STOPWORDS or not token.isalpha():
        return False

    if not sentiment:
        return token

    return token if find_index(lexicon, token) else False


def tokenize_lyric(lyric, sentiment=True):
    """Tokenize a lyric and return the stemmed tokens that pass word_processor().

    Args:
        lyric: raw lyric text.
        sentiment: forwarded to word_processor(); when truthy only stems found
            in `mixed_corpus` are kept.

    Returns:
        List of lower-cased stems, in order of appearance (duplicates kept).
    """
    ## Pre-process the text
    toktok = ToktokTokenizer()

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    filtered_words = []
    for w in toktok.tokenize(lyric):
        # Check the stopword list on the raw token, before stemming: the
        # Porter stem of many stopwords ("this" -> "thi", "was" -> "wa") is no
        # longer in STOPWORDS, so checking only the stem lets them through.
        if w.lower() in STOPWORDS:
            continue
        stemmed = p_stemmer.stem(w)
        clean_word = word_processor(stemmed, sentiment, mixed_corpus)
        if clean_word:
            filtered_words.append(clean_word)

    return filtered_words


# Tokenize every lyric and drop those with fewer than 3 sentiment tokens
all_tokenized = (tokenize_lyric(lyric, True) for lyric in lyrics_list)
filtered_lyrics = [tokens for tokens in all_tokenized if len(tokens) >= 3]
len(filtered_lyrics)
#print(filtered_lyrics[0:3])

2535

## Dictionary and BOW corpus

In [19]:
from gensim import corpora

# Build the gensim dictionary from the filtered lyrics, then convert each
# lyric to its bag-of-words representation with that dictionary.
dictionary = corpora.Dictionary(filtered_lyrics)
corpus_bow = list(map(dictionary.doc2bow, filtered_lyrics))
print("BOW Corpus Length = " + str(len(corpus_bow)))

BOW Corpus Length = 2535


## Constructing a document-term matrix + LDA

In [20]:
from gensim.models.ldamulticore import LdaMulticore

def apply_lda(corpus, num_topics, passes, id2word=None, workers=3, random_state=None):
    """Train an LdaMulticore topic model on a bag-of-words corpus.

    Args:
        corpus: bag-of-words corpus to train on.
        num_topics: number of topics to fit.
        passes: number of training passes over the corpus.
        id2word: id-to-token mapping; defaults to the module-level `dictionary`.
        workers: number of worker processes (default 3, as before).
        random_state: optional seed forwarded to LdaMulticore so a run can be
            repeated; None (default) keeps the previous unseeded behaviour.

    Returns:
        The trained LdaMulticore model.
    """
    if id2word is None:
        # Fall back to the notebook-level dictionary for existing callers.
        id2word = dictionary

    ldamodel = LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=id2word,
        passes=passes,
        workers=workers,
        minimum_probability=0.001,
        random_state=random_state
    )
    return ldamodel


In [21]:
def print_lda(ldamodel, num_words=6):
    """Print every topic of a trained topic model, one topic per line.

    Args:
        ldamodel: model exposing `num_topics` and
            `print_topics(num_topics=..., num_words=...)`.
        num_words: how many terms to show for each topic.
    """
    # Ask the model for its own topic count instead of reading a notebook
    # global, which may be undefined or belong to a different model.
    topics = ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=num_words)
    for topic in topics:
        print(topic)


## LSA

In [22]:
from gensim.models import LsiModel

# Topic count used for the LSI model; the name num_topics is reassigned later for the LDA run.
num_topics = 6
# Fit the LSI model on the bag-of-words corpus, using the dictionary to map ids to tokens.
lsi = LsiModel(corpus_bow, id2word=dictionary, num_topics=num_topics)
# Last expression of the cell: displays each topic with 5 terms.
lsi.print_topics(num_topics=num_topics, num_words=5)

[(0,
  '0.989*"love" + 0.082*"babi" + 0.057*"like" + 0.036*"prais" + 0.034*"heart"'),
 (1,
  '0.920*"god" + 0.202*"like" + 0.158*"power" + 0.156*"awesom" + 0.085*"home"'),
 (2,
  '-0.534*"babi" + -0.415*"home" + -0.388*"girl" + -0.272*"like" + 0.237*"god"'),
 (3,
  '-0.724*"fire" + 0.361*"home" + -0.304*"like" + -0.284*"burn" + 0.258*"babi"'),
 (4,
  '-0.526*"christma" + -0.446*"merri" + 0.444*"girl" + -0.377*"home" + 0.235*"lone"'),
 (5,
  '0.465*"home" + -0.460*"christma" + -0.415*"merri" + -0.367*"girl" + 0.279*"fire"')]

## Hierarchical Dirichlet Process, HDP

In [23]:
from gensim.models import HdpModel

# Fit an HDP model on the bag-of-words corpus; no topic count is passed to the constructor.
hdp = HdpModel(corpus_bow, id2word=dictionary)
# Last expression of the cell: displays 8 topics with 5 terms each.
hdp.print_topics(num_topics=8, num_words=5)

[(0, '0.184*love + 0.029*babi + 0.021*like + 0.017*heart + 0.013*girl'),
 (1, '0.063*love + 0.019*girl + 0.017*babi + 0.017*joy + 0.014*like'),
 (2, '0.033*lone + 0.017*girl + 0.008*fire + 0.007*heaven + 0.007*holi'),
 (3, '0.021*fire + 0.011*love + 0.009*god + 0.007*fight + 0.007*like'),
 (4, '0.030*home + 0.024*love + 0.021*babi + 0.007*like + 0.006*girl'),
 (5, '0.023*love + 0.008*babi + 0.007*lie + 0.004*time + 0.004*rock'),
 (6, '0.019*love + 0.010*happi + 0.007*lie + 0.005*music + 0.005*like'),
 (7, '0.056*love + 0.005*song + 0.004*time + 0.004*hate + 0.004*life')]

## K-means with TF-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# TF-IDF matrix over the raw lyrics, tokenized with the lexicon-filtering tokenizer.
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize_lyric)
X = vectorizer.fit_transform(lyrics_list)

true_k = 8
# Fixed seed so the (single) k-means++ initialisation, and therefore the
# clusters shown below, are the same on every run.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=1, random_state=42)
model.fit(X)

print("Top terms per cluster:")
# For each centroid, feature indices ordered from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() is gone in recent scikit-learn releases; use its
# replacement when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster: ", i)
    for ind in order_centroids[i, :7]:
        print(terms[ind])
    # Blank line between clusters (a bare `print` is a no-op in Python 3).
    print()

Top terms per cluster:
Cluster:  0
lone
long
girl
time
die
babi
like
Cluster:  1
love
babi
like
heart
life
time
world
Cluster:  2
sin
hay
su
mar
culpabl
error
air
Cluster:  3
home
babi
long
sweet
alon
christma
like
Cluster:  4
god
peac
holi
heavenli
sleep
silent
heaven
Cluster:  5
christma
merri
blue
happi
white
tree
love
Cluster:  6
war
fight
burn
like
die
death
time
Cluster:  7
love
babi
girl
like
time
heart
life


## Results

In [28]:
## Process a word
def word_processor2(word, sentiment, lexicon=anew_words):
    """Variant of word_processor() that does not filter stopwords.

    Args:
        word: token to examine; it is lower-cased before any check.
        sentiment: when truthy, keep the token only if find_index() reports it
            in `lexicon`; when falsy, the lexicon is not consulted.
        lexicon: word list handed to find_index().

    Returns:
        The lower-cased token when it is kept, otherwise False.
    """
    token = word.lower()

    # Only purely alphabetic tokens are eligible.
    if not token.isalpha():
        return False

    if not sentiment:
        return token

    return token if find_index(lexicon, token) else False


def tokenize_lyric2(lyric, sentiment=True):
    """Tokenize and stem a lyric, keeping the stems accepted by word_processor2().

    Args:
        lyric: raw lyric text.
        sentiment: forwarded to word_processor2().

    Returns:
        List of kept stems in order of appearance.
    """
    tokenizer = ToktokTokenizer()
    stemmer = PorterStemmer()

    # Stem each token and pass it through word_processor2; rejected tokens
    # come back as False and are filtered out below.
    processed = (
        word_processor2(stemmer.stem(token), sentiment, mixed_corpus)
        for token in tokenizer.tokenize(lyric)
    )
    return [kept for kept in processed if kept]


# Tokenize every lyric without lexicon filtering; no lyric is discarded, so
# tokenized_lyrics has one entry per element of lyrics_list.
tokenized_lyrics = [tokenize_lyric2(lyric, False) for lyric in lyrics_list]
len(tokenized_lyrics)


def get_context(word, corpus, window=3):
    """Collect the surrounding tokens of every occurrence of `word`.

    Args:
        word: token to look for (compared with == against each token).
        corpus: iterable of tokenized lyrics (each a list of tokens).
        window: number of tokens to include on each side of the match.

    Returns:
        One entry per lyric that contains `word`; each entry is a list of
        distinct context windows (lists of tokens). A window is shorter than
        2 * window + 1 when the match is close to the start or end of the lyric.
    """
    corpus_context = []
    for lyric in corpus:
        lyric_context = []
        for i, token in enumerate(lyric):
            if token != word:
                continue
            # Clamp the left edge: a negative index would wrap around and pull
            # tokens from the END of the lyric into the context. Slicing also
            # truncates at the right edge instead of raising, so matches near
            # the end are no longer silently dropped.
            start = max(0, i - window)
            word_context = list(lyric[start:i + window + 1])
            if word_context not in lyric_context:
                lyric_context.append(word_context)
        if lyric_context:
            corpus_context.append(lyric_context)
    return corpus_context


In [30]:
%%time
# LDA summary: train an 8-topic model with 100 passes and print its topics
num_topics = 8
ldamodel = apply_lda(corpus=corpus_bow, num_topics=num_topics, passes=100)
print_lda(ldamodel)

(0, '0.244*"babi" + 0.033*"like" + 0.032*"rock" + 0.031*"parti" + 0.028*"time" + 0.026*"name"')
(1, '0.569*"love" + 0.025*"like" + 0.022*"heart" + 0.020*"life" + 0.011*"time" + 0.009*"world"')
(2, '0.116*"girl" + 0.055*"burn" + 0.046*"blue" + 0.039*"like" + 0.030*"dream" + 0.026*"free"')
(3, '0.129*"christma" + 0.096*"war" + 0.086*"happi" + 0.065*"merri" + 0.049*"sin" + 0.029*"con"')
(4, '0.147*"home" + 0.063*"joy" + 0.059*"good" + 0.026*"sweet" + 0.023*"honey" + 0.023*"sleep"')
(5, '0.076*"god" + 0.049*"time" + 0.046*"fight" + 0.030*"life" + 0.029*"fall" + 0.027*"like"')
(6, '0.062*"heart" + 0.047*"pain" + 0.039*"hate" + 0.038*"kiss" + 0.024*"better" + 0.021*"like"')
(7, '0.103*"fire" + 0.068*"lone" + 0.043*"sun" + 0.031*"easi" + 0.028*"like" + 0.027*"lie"')
CPU times: user 1min 13s, sys: 8.39 s, total: 1min 22s
Wall time: 1min 14s


## Cluster exploration from word

In [33]:
word = 'troubl'
# Stem once and use the stem for BOTH lookups: the LDA vocabulary and
# tokenized_lyrics contain stems, so searching the context with the raw word
# only worked when the word happened to be typed in its stemmed form.
stemmed_word = p_stemmer.stem(word)
print(ldamodel.get_term_topics(stemmed_word))

res = get_context(stemmed_word, tokenized_lyrics)
res[:3]

[(2, 0.0174636), (3, 0.005617701)]


[[['never', 'ring', 'the', 'troubl', 'that', 'it', 'bring']],
 [['to', 'tell', 'my', 'troubl', 'to', 'i', 'don'],
  ['to', 'tell', 'my', 'troubl', 'to', 'caus', 'sinc']],
 [['behind', 'me', 'where', 'troubl', 'melt', 'like', 'lemondrop']]]

## Classify new lyric

In [29]:
coldplay_love = ''''
Look at the stars
Look how they shine for you
And everything you do
Yeah they were all yellow

I came along
I wrote a song for you
And all the things you do
And it was called yellow

So then I took my turn
Oh what a thing to have done
And it was all yellow

Your skin
Oh yeah your skin and bones
Turn into something beautiful
You know you know I love you so
You know I love you so

I swam across
I jumped across for you
Oh what a thing to do

'Cause you were all yellow
I drew a line
I drew a line for you
Oh what a thing to do
And it was all yellow

Your skin
Oh yeah your skin and bones
Turn into something beautiful
And you know
For you I'd bleed myself dry
For you I'd bleed myself dry

It's true
Look how they shine for you
Look how they shine for you
Look how they shine for
Look how they shine for you
Look how they shine for you
Look how they shine

Look at the stars
Look how they shine for you
And all the things that you do
'''

coldplay_sad = '''
And the hardest part
Was letting go, not taking part
Was the hardest part

And the strangest thing
Was waiting for that bell to ring
It was the strangest start

I could feel it go down
Bittersweet I could taste in my mouth
Silver lining the cloud
Oh and I
I wish that I could work it out

And the hardest part
Was letting go, not taking part
You really broke my heart, oh

And I tried to sing
But I couldn't think of anything
And that was the hardest part, oh

I could feel it go down
You left the sweetest taste in my mouth
You're silver lining the clouds
Oh and I
Oh and I
I wonder what it's all about
I wonder what it's all about

Everything I know is wrong
Everything I do, it's just comes undone
And everything is torn apart

Oh and it's the hardest part
That's the hardest part
Yeah that's the hardest part
That's the hardest part
'''

eminem_violence = '''
I told y'all mothafuckas I was comin' back 
What now nigga what now what 
You's the projects nigga
One shot two shot three shot four shots 
All I hear is gunshots this is where the fun stops 
Bodies drop hit the floor music's off 
Parties stop, everybody hit the door someone's lickin' shots off
You bitches is gone I'm dropped in the club 
And I'm tryna run and get my motherfuckin' gun 
(Nigga what about your wife) 
Nigga fuck my wife I'm tryna run and save my motherfuckin' life 
Oh shit the shoot is comin' 
Bitches, hoes niggas is runnin' 
People shot all over the floor 
And I'm tryna make it to the St. Andrew's door 
That's the sound of the glock 
Even D-J House fucked around and go shot 
I done messed around and forgot my tec 
I don't see nobody but Fab Five and Hex 
(Kuniva you aight) 
These niggas is trippin' 
(Where's Bizarre at?) 
I'm tryna slip through the exit and get to where my car is at 
Bitches screamin' everywhere and niggas is wildin' 
Two minutes ago we was all jokin' and smilin' 
This chick is clingin' onto me sobbin' and sighin' 
Sayin' she didn't mean to diss me earlier and she cryin' 
But its real and cats is gettin' killed 
So I hugged her and used her body as a human shield 
And she got hit now she yellin' 
(Don't leave me!) 
I told her I'd be right back and the dumb bitch believed me 
I squeezed through the back door and made my escape 
I ran and got my 38 I hope its not to late
One shot two shot three shot four shots 
All I hear is gunshots this is where the fun stops 
Bodies drop hit the floor music's off 
Parties stop, everybody hit the door someone's lickin' shots off
I'm on seven mile what the fuck was that 
Damn somebody hit me from the back 
(With they car?) 
With a gat nigga and my tire is flat 
And I just hit a pole, them niggas some hoes 
(Is you hit?) 
I don't know but I can tell you what they drove 
It was a black Mitsubishi 
(Shit that's the clique we beefin' wit I swear) 
Man and I was on my way there 
Believe me I'm leavin' a carcus today 
I'm a park my car and walk the rest of the way 
I'm in the mood to strut, my A-K ain't even tuck 
I'm a meet you at the club we goin' fuck these hoes up
One shot two shot three shot four shots 
All I hear is gunshots this is where the fun stops 
Bodies drop hit the floor music's off 
Parties stop, everybody hit the door someone's lickin' shots off
I never seen no shit like this is my life before 
People will still camp out from the night before 
Sleepin' outside the door waitin' in line 
Still tryna get inside the club to see D12 perform 
The fire marshals no, the venue's too small 
People are wall to wall three thousand and some odd vans 
And some come walk from out the parkin' lot 
Get into an argument over a parkin' spot 
He's about to pull his gun out and let's a few of 'em off 
Missed who he's aimin' for six feet away's the door 
In St. Andrew's hall not a stray slidin' all over the place 
Sprays one bitch in the face another one of 'em came through the wall 
Before anyone could even hear the first shot go off 
I'm posted up by the bar havin' a Mozeltoff 
Bullet wizzed right by my ear damn near shot it off 
Thank god I'm alive I gotta find Denaun 
And where the fuck is Von he usually tucks one on him 
Wait a minute I think I just saw Bizarre 
Nah I guess not, what the fuck oh my god it was 
I never saw him run so fast in my life 
Look at him haulin' ass I think he left his wife 
There she is on the ground bein' trampled 
I go to grab her up by the damn hand and I can't pull her 
God damn there just went another damn bullet I'm hit 
My vest is barely able to handle it, its to thin 
If I get hit again I can't do it, I scoop deep 
Follow Bizarre's path and ran through it 
And made it to the front door and collapsed on the steps 
Looked up and I seen Swift shootin' it out 
But I can't see who he's shootin' it out with 
But Denaun's right behind him squeezin' his four fifth
One shot two shot three shot four shots 
All I hear is gunshots this is where the fun stops 
Bodies drop hit the floor music's off 
Parties stop, everybody hit the door someone's lickin' shots off
It's Friday night came to this bitch right 
Big ass to my left and Desert Eagle to my right 
I ain't come in this bitch to party I came in this bitch to fight 
Although I can't stay here to fight 'cause I'm poppin' niggas tonight 
That's right bitches I'm drunk with revenge 
Shot a bouncer in the neck for tryna check when I get in 
Swift told me to meet him here so its clear that this fucker
Shoot out the back of his truck goes up in this motherfucker 
So one shot for the money two's to stop the show 
Third's for the bartender there's plenty of shots to go 
(I just wanna know who's drivin' a black Mitsubishi) 
He tried to run so Proof shot him in the knee wit a three piece
One shot two shot three shot four shots 
All I hear is gunshots this is where the fun stops 
Bodies drop hit the floor music's off 
Parties stop, everybody hit the door someone's lickin' shots off
'''

depressing_sad = '''
I was bruised and battered, I couldn't tell what I felt.
I was unrecognizable to myself.
Saw my reflection in a window and didn't know my own face.
Oh brother are you gonna leave me wastin' away
On the streets of Philadelphia.

I walked the avenue, 'til my legs felt like stone, 
I heard the voices of friends, vanished and gone, 
At night I could hear the blood in my veins, 
It was just as black and whispering as the rain, 
On the streets of Philadelphia.

Ain't no angel gonna greet me.
It's just you and I my friend.
And my clothes don't fit me no more, 
I walked a thousand miles
Just to slip this skin.

Night has fallen, I'm lyin' awake, 
I can feel myself fading away, 
So receive me brother with your faithless kiss, 
Or will we leave each other alone like this
On the streets of Philadelphia.
'''

love_happy = '''
Imagine me and you, I do
I think about you day and night, it's only right
To think about the girl you love and hold her tight
So happy together
If I should call you up, invest a dime
And you say you belong to me and ease my mind
Imagine how the world could be, so very fine
So happy together
I can't see me lovin' nobody but you
For all my life
When you're with me, baby the skies'll be blue
For all my life
Me and you and you and me
No matter how they toss the dice, it had to be
The only one for me is you, and you for me
So happy together
I can't see me lovin' nobody but you
For all my life
When you're with me, baby the skies'll be blue
For all my life
Me and you and you and me
No matter how they toss the dice, it had to be
The only one for me is you, and you for me
So happy together
Ba-ba-ba-ba ba-ba-ba-ba ba-ba-ba ba-ba-ba-ba
Ba-ba-ba-ba ba-ba-ba-ba ba-ba-ba ba-ba-ba-ba
Me and you and you and me
No matter how they toss the dice, it had to be
The only one for me is you, and you for me
So happy together
So happy together
How is the weather
So happy together
We're happy together
So happy together
Happy together
So happy together
So happy together (ba-ba-ba-ba ba-ba-ba-ba)

'''


In [34]:
# Classify each example lyric with the trained LDA model: tokenize it with the
# sentiment tokenizer, convert it to bag-of-words and print the topic mixture.
example_lyrics = [
    # Love
    ("Yellow - Love", coldplay_love),
    # Love Melancholic, sad
    ("Hardest part - Love Sad", coldplay_sad),
    # Depressing, sad
    ("Street of philadelphia - Depressing, sad", depressing_sad),
    # Violence, Anger
    ("Eminem | One shot, two shot - Violence, Anger", eminem_violence),
    # Happy -- the love_happy text is "Happy Together"; it was previously
    # printed under a "Bob Marley | Three Little Birds" label.
    ("The Turtles | Happy Together - Love Happy", love_happy),
]

for title, example_text in example_lyrics:
    new_tokenized = tokenize_lyric(example_text)
    vec = dictionary.doc2bow(new_tokenized)
    topics_list = ldamodel[vec]
    print(title)
    print("Sentiment Words")
    print(set(new_tokenized))
    print(topics_list)
    # Blank line between examples (a bare `print` is a no-op in Python 3).
    print()

Yellow - Love
Sentiment Words
{'star', 'yellow', 'song', 'shine', 'bleed', 'beauti', 'love'}
[(0, 0.26569563), (1, 0.14224355), (2, 0.005443275), (3, 0.0054450766), (4, 0.0054396936), (5, 0.0054365839), (6, 0.0054387404), (7, 0.56485742)]
Hardest part - Love Sad
Sentiment Words
{'part', 'wrong', 'undon', 'heart', 'strangest', 'wonder', 'tast', 'work', 'cloud', 'broke'}
[(0, 0.0052139075), (1, 0.0052115009), (2, 0.0052127019), (3, 0.0052145175), (4, 0.0052107354), (5, 0.0052111535), (6, 0.69215435), (7, 0.27657118)]
Street of philadelphia - Depressing, sad
Sentiment Words
{'fallen', 'friend', 'avenu', 'black', 'angel', 'faithless', 'rain', 'batter', 'bruis', 'like', 'kiss', 'brother', 'greet', 'cloth', 'window', 'alon', 'street'}
[(0, 0.005694435), (1, 0.19262962), (2, 0.28156328), (3, 0.0056924908), (4, 0.10997805), (5, 0.005693973), (6, 0.23129344), (7, 0.16745467)]
Eminem | One shot, two shot - Violence, Anger
Sentiment Words
{'gun', 'right', 'human', 'hit', 'black', 'door', 'argumen

<function print>