# Goal of notebook : 

1. Load Lyrics with genre
2. Clean text in parenthesis from lyrics
3. Create Fasttext vectors and read them into dataframe
4. Create column with indicator of last word of line
5. Create column with type of word (noum, verb, etc)

In [214]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Load Lyrics with genre

In [288]:
#filepath = "../augment db/full_lyrics.csv"
filepath = "../../kaggleDataset/lyrics_final.csv"
lyrics_df = pd.read_csv(filepath)
print(len(lyrics_df))
lyrics_df.head()

317015


Unnamed: 0,artist,genre,language,lyrics,song,year
0,beyonce-knowles,pop,en,"Oh baby, how you doing?\r\r\r\r\r\nYou know I'...",ego-remix,2009.0
1,beyonce-knowles,pop,en,"playin' everything so easy,\r\r\r\r\r\nit's li...",then-tell-me,2009.0
2,beyonce-knowles,pop,en,If you search\r\r\r\r\r\nFor tenderness\r\r\r\...,honesty,2009.0
3,beyonce-knowles,pop,en,"Oh oh oh I, oh oh oh I\r\r\r\r\r\n[Verse 1:]\r...",you-are-my-rock,2009.0
4,beyonce-knowles,pop,en,"Party the people, the people the party it's po...",black-culture,2009.0


In [34]:
lyrics_df = lyrics_df.drop(['lang'], axis = 1)

In [35]:
lyrics_df.head()

Unnamed: 0,artist,genre,lyrics,title
0,beyonce-knowles,pop,"Oh baby, how you doing?\r\r\nYou know I'm gonn...",ego-remix
1,beyonce-knowles,pop,"playin' everything so easy,\r\r\nit's like you...",then-tell-me
2,beyonce-knowles,pop,If you search\r\r\nFor tenderness\r\r\nIt isn'...,honesty
3,beyonce-knowles,pop,"Oh oh oh I, oh oh oh I\r\r\n[Verse 1:]\r\r\nIf...",you-are-my-rock
4,beyonce-knowles,pop,"Party the people, the people the party it's po...",black-culture


## 2. Clean text in parenthesis from lyrics

In [289]:
sample_df = lyrics_df.sample(n=20)
for i, s in sample_df.iterrows():
    print(s['lyrics'])

A king would trade his finest crown for love, love like this
And warriors have laid weapons down for love, love like this
Hearts will break a thousand time for love, love like this
And arms will wait a thousand nights for love, love like this

Love may come and love may go
But here inside your arms I know
That only you will ever show  me
Love like this

You and I have finally found love, love like this
A place to lay our burdens down love, love like this
Hearts to speak without out a sound love, love like this
What makes the world keep turning round is love, love like this

Love may come and love may go
But here inside your arms I know
That only you will ever show  me
Love like this

The moon has never shown before on love, love like this
And I have never wanted more  than love, love like this
Love like this
Love like this
Love like this

If I didn't forget you
My heart would explode
Yeah. There is too

### Remove [<30 chars] and (<30 chars) and x(digit)

This information gives structure for the singer but not usefull for lyrics. This is [Chorus], choir in parentheses, x2 etc. <br>
To test regex expressions : https://regex101.com

In [290]:
import string
import re

In [298]:
regex_bracket = re.compile("\[(.*?)\]")
regex_parentheses = re.compile("\((.*?)\)")
regex_curly_bracket = re.compile("\{(.*?)\}")
regex_timesx = re.compile("x[0-9]")

def clean_structure_words(l):
    res= regex_bracket.sub('', l)
    res = regex_parentheses.sub('', res)
    res = regex_curly_bracket.sub('', res)
    res = regex_timesx.sub('', res)
    res = res.lower()
    return res

In [299]:
clean_structure_words("Hello, my. name is +emma* (nice)[to]meet -x3--")

'hello, my. name is +emma* meet ---'

In [300]:
lyrics_df['lyrics'] = lyrics_df['lyrics'].apply(lambda x : clean_structure_words(x))

In [301]:
lyrics_df.tail()

Unnamed: 0,artist,genre,language,lyrics,song,year
317010,godhead,metal,en,eleanor rigby picks up the\r\r\r\nrice in the ...,eleanor-rigby,
317011,enter-shikari,metal,en,this is an expedition\r\r\r\ninto the arctic t...,arguing-with-thermometers,
317012,rev-theory,metal,en,"hey i want a taste, you're a black heart devil...",light-it-up,
317013,sum-41,metal,en,"the faster we're falling,\r\r\r\nwe're stoppin...",in-too-deep,
317014,papa-roach,metal,en,"born with no soul, lack of control\r\r\ncut fr...",dead-cell,


### Remove . , ! ? ... ---
- We want to keep the * as this represents some swear words in english
- We want to keep the ' as this is part of many words in english, especially when words are cropped, ie 'Cause 
- Otherwise punctuation is not meaningful

In [302]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [303]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [304]:
#punctuation_to_remove = '!#$%\+,-./:;<=>?@\\^_`|~()[]{}'
#regex_punct = re.compile('[%s]' % re.escape(punctuation_to_remove))
chars_to_keep = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ* \n"

def clean_punctuation(l):
    #res = regex_punct.sub('', l)
    res = re.sub(' +', ' ', l)#res)
    res = res.replace("'", ' ') # replace the parentheses with a space.
    res = ''.join(char for char in res if char in chars_to_keep)
    return res

In [305]:
repr(clean_punctuation("Hello,     my.   name I'm (coucou) is +emma* meet ---"))

'"Hello my name I\'m coucou is emma* meet "'

In [306]:
lyrics_df['lyrics'] = lyrics_df['lyrics'].apply(lambda x : clean_punctuation(x))

In [307]:
lyrics_df.tail()

Unnamed: 0,artist,genre,language,lyrics,song,year
317010,godhead,metal,en,eleanor rigby picks up the\nrice in the church...,eleanor-rigby,
317011,enter-shikari,metal,en,this is an expedition\ninto the arctic tundra\...,arguing-with-thermometers,
317012,rev-theory,metal,en,hey i want a taste you re a black heart devil\...,light-it-up,
317013,sum-41,metal,en,the faster we re falling\nwe re stopping and s...,in-too-deep,
317014,papa-roach,metal,en,born with no soul lack of control\ncut from th...,dead-cell,


## 3. Create Fasttext vectors and read them into dataframe

In [309]:
# First create a txt file with the lyrics
f = open("lyrics.txt", "w+")
for i, r in lyrics_df.iterrows():
    f.write(r['lyrics'])
    
f.close()

https://fasttext.cc/docs/en/unsupervised-tutorial.html

In [310]:
# Generate embeddings
#! ./fasttext skipgram -input lyrics.txt -output model_lyrics
! ../../fastText/fasttext skipgram -input lyrics.txt -output model_lyrics3

Read 87M words
Number of words:  88213
Number of labels: 0
Progress: 100.0% words/sec/thread:  204077 lr:  0.000000 loss:  1.603853 ETA:   0h 0mmh 2mm 1mmmmmmmmm0m


In [64]:
# To check first words : The first line is a header containing the number of words and the dimensionality of the vectors. 
# The subsequent lines are the word vectors for all words in the vocabulary, sorted by decreasing frequency.
#! head -n 4 model_lyrics.vec

In [311]:
import sys
import codecs
file_vec = 'model_lyrics3.vec'
file_bin = 'model_lyrics3.bin'

In [312]:
# Load embeddings
def load_embeddings(file_name):
    with codecs.open(file_name, 'r', 'utf-8') as f_in:
        lines = f_in.readlines()
        lines = lines[1:]
        vocabulary, wv = zip(*[line.strip().split(' ', 1) for line in lines])
    wv = np.loadtxt(wv)
    return wv, vocabulary

In [313]:
word_embeddings, vocabulary = load_embeddings(file_vec)

In [314]:
vocabulary = list(vocabulary)

In [315]:
"eversince" in vocabulary

True

In [316]:
len(vocabulary)

88213

In [317]:
word_embeddings.shape

(88213, 100)

In [318]:
def get_embedding_for_missing_word(w, file_bin = file_bin):
    #data = ! echo $w | ./fasttext print-word-vectors model_lyrics.bin #emma
    data = ! echo $w | ./fasttext print-word-vectors $file_bin #oli
    emb = np.zeros(100)
    for i, x in enumerate(data[0].split( )[1:]):
        try:
            emb[i] = float(x)
        except ValueError:
            print(data[0])
    return emb

In [319]:
! echo "environment" | ./fasttext print-word-vectors $file_bin

environment -0.13956 0.39715 -0.16328 0.44154 -0.25917 0.068301 -0.40802 0.29887 0.3078 0.57416 -0.37117 -0.37019 0.42183 -0.51193 -0.38435 0.49242 -0.16212 -0.27712 -0.11974 0.14308 0.18436 -0.59502 0.31742 -0.05954 -0.62733 -0.88829 0.60522 -0.18148 0.25192 0.0088631 0.49228 0.029216 -0.44215 -0.05881 -0.41463 0.21338 0.25346 -0.086722 0.048448 0.095128 0.45532 0.16692 0.53699 -0.31469 0.35777 -0.0007568 0.6378 -0.76667 -0.49559 -0.4017 0.32498 -0.1134 -0.081226 0.25142 -0.1777 -0.23577 -0.3432 0.011128 -0.35867 -0.64709 0.1285 0.067267 0.17216 0.38183 0.15164 0.025435 -0.44227 0.68807 -0.042678 0.41345 -0.42113 -0.35407 0.20841 -0.16725 -0.015706 -0.26245 0.38864 -0.45108 0.28486 -0.095763 0.1112 -0.35914 -0.2907 -0.33486 -0.20412 0.75666 0.36177 0.80754 -0.2279 -0.062341 0.46662 -0.0052635 0.32678 -0.32547 -0.039614 -0.25879 -0.35734 -0.56961 -0.28363 -0.54875 


In [320]:
get_embedding_for_missing_word("we re")

array([ 0.29783  ,  0.21201  , -0.04428  , -0.21724  , -0.40256  ,
       -0.09561  ,  0.27005  ,  0.035976 ,  0.41611  ,  0.17687  ,
       -0.096361 ,  0.05534  ,  0.1041   , -0.55563  , -0.20159  ,
       -0.020378 , -0.050577 ,  0.13754  ,  0.35045  , -0.11657  ,
       -0.0062261, -0.20972  ,  0.3326   , -0.098936 ,  0.11213  ,
       -0.42462  ,  0.24342  ,  0.36843  ,  0.08445  , -0.19725  ,
        0.33089  ,  0.2577   ,  0.44877  , -0.33745  , -0.042147 ,
       -0.20386  , -0.071768 ,  0.12298  ,  0.12241  , -0.26141  ,
        0.080055 , -0.13491  , -0.28003  ,  0.38589  ,  0.060752 ,
        0.23097  , -0.20877  , -0.329    ,  0.035687 , -0.36589  ,
       -0.21418  , -0.1552   ,  0.15799  , -0.087406 ,  0.16495  ,
       -0.2686   ,  0.3036   ,  0.17785  , -0.061034 ,  0.30043  ,
       -0.047854 ,  0.015371 ,  0.11438  ,  0.05328  , -0.043036 ,
        0.033904 , -0.13546  ,  0.34273  ,  0.13733  , -0.049146 ,
       -0.22783  , -0.59114  ,  0.12467  ,  0.38411  , -0.0114

In [321]:
get_embedding_for_missing_word("we are")

array([ 0.29783  ,  0.21201  , -0.04428  , -0.21724  , -0.40256  ,
       -0.09561  ,  0.27005  ,  0.035976 ,  0.41611  ,  0.17687  ,
       -0.096361 ,  0.05534  ,  0.1041   , -0.55563  , -0.20159  ,
       -0.020378 , -0.050577 ,  0.13754  ,  0.35045  , -0.11657  ,
       -0.0062261, -0.20972  ,  0.3326   , -0.098936 ,  0.11213  ,
       -0.42462  ,  0.24342  ,  0.36843  ,  0.08445  , -0.19725  ,
        0.33089  ,  0.2577   ,  0.44877  , -0.33745  , -0.042147 ,
       -0.20386  , -0.071768 ,  0.12298  ,  0.12241  , -0.26141  ,
        0.080055 , -0.13491  , -0.28003  ,  0.38589  ,  0.060752 ,
        0.23097  , -0.20877  , -0.329    ,  0.035687 , -0.36589  ,
       -0.21418  , -0.1552   ,  0.15799  , -0.087406 ,  0.16495  ,
       -0.2686   ,  0.3036   ,  0.17785  , -0.061034 ,  0.30043  ,
       -0.047854 ,  0.015371 ,  0.11438  ,  0.05328  , -0.043036 ,
        0.033904 , -0.13546  ,  0.34273  ,  0.13733  , -0.049146 ,
       -0.22783  , -0.59114  ,  0.12467  ,  0.38411  , -0.0114

Check for words with apostrophes (') such as I'm, you're:

## 4. Create column with indicator of last word of line

In [322]:
test_l = lyrics_df.loc[190]['lyrics']
test_l

'oh baby how you doing\nyou know i m gonna cut right to the chase\nsome women were made but me myself\ni like to think that i was created for a special purpose\nyou know what s more special than you you feel me\nit s on baby let s get lost\nyou don t need to call into work  cause you re the boss\nfor real want you to show me how you feel\ni consider myself lucky that s a big deal\nwhy well you got the key to my heart\nbut you ain t gonna need it i d rather you open up my body\nand show me secrets you didn t know was inside\nno need for me to lie\nit s too big it s too wide\nit s too strong it won t fit\nit s too much it s too tough\nhe talk like this  cause he can back it up\nhe got a big ego such a huge ego\ni love his big ego it s too much\nhe walk like this  cause he can back it up\nusually i m humble right now i don t choose\nyou can leave with me or you could have the blues\nsome call it arrogant i call it confident\nyou decide when you find on what i m working with\ndamn i know i

In [323]:
import json
def generate_embedding_for_lyrics(l):
    #print(l)
    sentences = l.split('\n')
    embedding = []
    #missing_words = []
    for s in sentences:
        words = s.split(' ')
        #print(repr(words))
        for i, w in enumerate(words):
            if len(w)>0:
                try : 
                    idx = vocabulary.index(w)
                    vector = word_embeddings[idx]
                except ValueError : 
                    #print("Word not found :", repr(w))
                    #missing_words.append(w)
                    vector = get_embedding_for_missing_word(w)
                if i == (len(words)-1) :
                    vector = np.append(vector, 1)
                else :
                    vector = np.append(vector, 0)
                vector = np.append(w, vector)
                embedding.append(vector)
                
    return embedding

In [325]:
generate_embedding_for_lyrics("oh baby how you doing\nyou know im gonna cut right to the chase")

[array(['oh', '0.10379', '0.32986', '-0.34839', '0.070715', '-0.22461',
        '0.06979', '0.046826', '0.28013', '0.080266', '-0.091009',
        '-0.086079', '0.60234', '0.61265', '0.20444', '-0.24218',
        '0.19384', '-0.25945', '0.23119', '-0.10847', '-0.053188',
        '-0.088627', '-0.37306', '-0.17445', '-0.12089', '0.20406',
        '-0.47211', '0.074021', '0.22309', '0.26263', '-0.077209',
        '-0.054993', '0.045765', '-0.33694', '0.090265', '0.061225',
        '-0.031373', '0.34355', '-0.0080525', '-0.078579', '-0.3383',
        '-0.0084812', '0.26118', '-0.1201', '0.020674', '0.011981',
        '-0.0098026', '-0.067438', '0.097171', '0.33965', '-0.17555',
        '0.38893', '-0.095315', '-0.1466', '-0.087356', '0.090826',
        '-0.62537', '0.26035', '0.12335', '0.35735', '0.15872', '0.082386',
        '0.42808', '0.034145', '0.18829', '-0.098424', '0.10311',
        '0.082719', '-0.20298', '0.26479', '0.073556', '0.024633',
        '-0.34269', '0.025242', '0.3888

In [271]:
#lyrics_df['embedding'] = lyrics_df['lyrics'][:30].apply(lambda x : generate_embedding_for_lyrics(x))

echo "eversince" | ./fasttext print-word-vectors model_lyrics.bin
eversince 0.26688 -0.1294 0.21473 -0.0022411 -0.21671 0.032355 -0.18573 0.14012 0.26681 0.37774 -0.33736 0.26381 -0.053749 0.37619 -0.21128 -0.056934 0.20461 0.33779 -0.10611 0.021266 -0.17509 0.27 0.41086 0.13709 0.16705 -0.60041 0.38402 0.087207 -0.33848 0.19228 -0.27138 -0.30764 -0.2702 -0.30729 -0.72478 0.16182 0.11382 0.19189 -0.27027 0.27689 0.039753 -0.21678 -0.4436 0.24418 0.12741 0.21971 0.014112 0.11523 0.00034974 0.24912 -0.62545 -0.04865 -0.32367 -0.38435 -0.058886 -0.13295 0.48399 0.49655 0.34541 0.65489 -0.18568 -0.076875 0.087327 -0.0079827 -0.073017 -0.046165 0.009111 -0.024724 -0.44245 0.092923 0.33201 -0.54304 -0.22664 0.41005 0.14919 -0.30418 -0.13195 -0.10742 -0.18871 0.14914 0.15383 -0.30302 0.362 0.26775 0.11757 0.057333 0.38091 0.095017 -0.10443 0.13355 0.64025 0.070291 0.045029 -0.053138 0.19487 -0.062763 0.19003 0.32966 0.049617 0.207

In [326]:
lyrics_df.head()

Unnamed: 0,artist,genre,language,lyrics,song,year
0,beyonce-knowles,pop,en,oh baby how you doing\nyou know i m gonna cut ...,ego-remix,2009.0
1,beyonce-knowles,pop,en,playin everything so easy\nit s like you seem...,then-tell-me,2009.0
2,beyonce-knowles,pop,en,if you search\nfor tenderness\nit isn t hard t...,honesty,2009.0
3,beyonce-knowles,pop,en,oh oh oh i oh oh oh i\n\nif i wrote a book abo...,you-are-my-rock,2009.0
4,beyonce-knowles,pop,en,party the people the people the party it s pop...,black-culture,2009.0


In [327]:
lyrics_df.to_csv('../../kaggleDataset/lyrics_final_clean_noQuotes.csv', index = False)

## 5. Create column with type of word (noum, verb, etc)

In [40]:
import spacy

In [42]:
en_nlp = spacy.load('en')

ADJ: adjective ADP: adposition ADV: adverb AUX: auxiliary verb CONJ: coordinating conjunction DET: determiner INTJ: interjection NOUN: noun NUM: numeral PART: particle PRON: pronoun PROPN: proper noun PUNCT: punctuation SCONJ: subordinating conjunction SYM: symbol VERB: verb X: other

In [171]:
def generate_pos_for_lyrics(l):
    doc = en_nlp(l)
    embedding = []
    for i in range(len(doc)):
        print(doc[i], doc[i].pos_)
        type_ = doc[i].pos_
        if type_ != 'SPACE' and type_ != 'PART':
            embedding.append(type_)
    return embedding

In [257]:
phrase = "Oh baby how you doing\nYou know I m gonna cut right to the chase 2342"
generate_pos_for_lyrics(phrase)

Oh INTJ
baby NOUN
how ADV
you PRON
doing VERB

 SPACE
You PRON
know VERB
I PRON
m VERB
gon VERB
na PART
cut VERB
right ADV
to ADP
the DET
chase NOUN
2342 NUM


['INTJ',
 'NOUN',
 'ADV',
 'PRON',
 'VERB',
 'PRON',
 'VERB',
 'PRON',
 'VERB',
 'VERB',
 'VERB',
 'ADV',
 'ADP',
 'DET',
 'NOUN',
 'NUM']

In [160]:
def generate_pos_for_word(w):
    doc = en_nlp(w)
    #print(doc[0], doc[0].pos_)
    return doc[0].pos_

In [164]:
merge_embedding_pos(phrase)

/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''


[array(['Oh', '0.20192', '0.17829', '0.11377', '0.25585', '0.33173',
        '0.15312', '0.24776', '0.5862', '0.093913', '0.19943', '0.10433',
        '-0.18693', '0.26373', '0.015889', '-0.161', '-0.25633',
        '0.089251', '0.29465', '-0.16769', '0.37362', '-0.42326',
        '-0.38022', '-0.36808', '0.3859', '-0.3937', '0.14879', '0.03692',
        '-0.093622', '-0.29215', '0.55122', '0.0025661', '0.39107',
        '0.47622', '-0.11309', '0.099956', '0.47135', '0.12469',
        '-0.11252', '-0.14087', '0.51835', '-0.18488', '0.40184',
        '0.16747', '-0.027179', '0.34297', '-0.052577', '-0.092113',
        '0.037731', '-0.14673', '0.046032', '-0.41669', '-0.12625',
        '0.081563', '0.57855', '-0.12862', '0.094819', '0.21436',
        '0.47401', '0.06485', '-0.19564', '-0.014255', '-0.027256',
        '0.050378', '0.068006', '0.27775', '0.25269', '-0.48629',
        '0.018667', '-0.067289', '-0.13964', '0.10941', '-0.034141',
        '-0.07083', '-0.13839', '-0.069037', '

In [94]:
pos_to_idx = {'ADJ': 0, 'ADP': 1, 'ADV':2,'AUX': 3,'CONJ': 4,'CCONJ': 5,'DET': 6, 'INTJ': 7, 'NOUN': 8,
              'NUM': 9, 'PART': 10,'PRON':11, 'PROPN': 12, 'PUNCT': 13, 'SCONJ': 14, 'SYM': 15, 
              'VERB': 16, 'X': 17}

In [67]:
"""def merge_embedding_pos(l):
    embedding = generate_embedding_for_lyrics(l)
    pos = generate_pos_for_lyrics(l)
    #print(len(embedding))
    for i in range(len(embedding)):
        bin_vector = np.zeros(18)
        bin_vector[pos_to_idx[pos[i]]] = 1
        embedding[i] = np.append(embedding[i], bin_vector)
    return embedding"""

In [108]:
def merge_embedding_pos(l):
    embedding = generate_embedding_for_lyrics(l)
    for i in range(len(embedding)):
        w = embedding[i][0]
        #print(w)
        if w != '\n':
            pos = generate_pos_for_word(str(w))
            bin_vector = np.zeros(18)
            bin_vector[pos_to_idx[pos]] = 1
            embedding[i] = np.append(embedding[i], bin_vector)
    return embedding

In [109]:
merge_embedding_pos("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

[array(['Oh', '-0.26756', '0.16066', '-0.026644', '-0.061647', '0.12856',
        '-0.16667', '-0.31698', '0.21519', '0.087186', '0.70611',
        '0.097478', '-0.017475', '0.13562', '-0.34733', '0.12901',
        '0.060213', '0.0038623', '-0.003505', '-0.309', '-0.1214',
        '-0.31168', '-0.038396', '-0.13885', '0.13174', '0.48543',
        '0.27012', '0.023631', '0.039914', '-0.0085227', '-0.32833',
        '-0.34518', '-0.039726', '-0.19771', '0.23373', '-0.041411',
        '-0.036932', '-0.073936', '-0.47071', '0.30993', '0.32195',
        '-0.074116', '0.17782', '-0.33701', '0.12694', '0.07969',
        '-0.078333', '-0.32187', '-0.21694', '-0.32914', '0.38804',
        '-0.47941', '0.52507', '0.13306', '-0.086922', '-0.1153',
        '0.64853', '-0.28492', '0.55657', '0.037141', '-0.24895',
        '0.26131', '-0.094953', '0.061678', '0.19854', '0.22078',
        '0.27887', '-0.65879', '-0.03955', '0.62236', '-0.22495',
        '-0.056741', '0.017541', '0.072202', '0.017473'

In [98]:
lyrics_df['embedding'] = lyrics_df['lyrics'][:30].apply(lambda x : merge_embedding_pos(x))

/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected 

Can we remove SYM for symbol, PUNCT for punctuation, and/or merge some categories ?

In [107]:
lyrics_df.head(10)

Unnamed: 0,artist,genre,lyrics,title,embedding
0,beyonce-knowles,pop,Oh baby how you doing\nYou know I'm gonna cut ...,ego-remix,"[[Oh, -0.26756, 0.16066, -0.026644, -0.061647,..."
1,beyonce-knowles,pop,playin' everything so easy\nit's like you seem...,then-tell-me,"[[playin', 0.046408, -0.22275, 0.27501, 0.3307..."
2,beyonce-knowles,pop,If you search\nFor tenderness\nIt isn't hard t...,honesty,"[[If, -0.077863, 0.26915, 0.24783, 0.078624, 0..."
3,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,you-are-my-rock,"[[Oh, -0.26756, 0.16066, -0.026644, -0.061647,..."
4,beyonce-knowles,pop,Party the people the people the party it's pop...,black-culture,"[[Party, -0.031328, -0.60994, -0.27762, -0.313..."
5,beyonce-knowles,pop,I heard\nChurch bells ringing\nI heard\nA choi...,all-i-could-do-was-cry,"[[I, -0.028172, 0.063204, 0.13383, 0.013919, -..."
6,beyonce-knowles,pop,This is just another day that I would spend\nW...,once-in-a-lifetime,"[[This, 0.026437, 0.014923, 0.17864, -0.10028,..."
7,beyonce-knowles,pop,Waiting waiting waiting waiting\nWaiting waiti...,waiting,"[[Waiting, -0.36918, 0.38065, 0.68427, -0.1569..."
8,beyonce-knowles,pop,\nI read all of the magazines\nwhile waiting a...,slow-love,"[[I, -0.028172, 0.063204, 0.13383, 0.013919, -..."
9,beyonce-knowles,pop,Nnnow honey\nYou better sit down and look arou...,why-don-t-you-love-me,"[[Nnnow, 0.014805, -0.076101, -0.017499, -0.13..."


## 6. Create column with Entity Recognition

In [122]:
nlp = spacy.load('en_core_web_sm')

Entity types : <br>
PERSON	People, including fictional.<br>
NORP	Nationalities or religious or political groups.<br>
FAC	Buildings, airports, highways, bridges, etc.<br>
ORG	Companies, agencies, institutions, etc.<br>
GPE	Countries, cities, states.<br>
LOC	Non-GPE locations, mountain ranges, bodies of water.<br>
PRODUCT	Objects, vehicles, foods, etc. (Not services.)<br>
EVENT	Named hurricanes, battles, wars, sports events, etc.<br>
WORK_OF_ART	Titles of books, songs, etc.<br>
LAW	Named documents made into laws.<br>
LANGUAGE	Any named language.<br>
DATE	Absolute or relative dates or periods.<br>
TIME	Times smaller than a day.<br>
PERCENT	Percentage, including "%".<br>
MONEY	Monetary values, including unit.<br>
QUANTITY	Measurements, as of weight or distance.<br>
ORDINAL	"first", "second", etc.<br>
CARDINAL	Numerals that do not fall under another type.<br>

In [136]:
doc = nlp(lyrics_df.loc[24]['lyrics'].replace('\n', '').lower())
for ent in doc.ents:
    print(ent.text, ent.label_)

the first day DATE
first ORDINAL
the first day DATE
the first day DATE
the first day the first day DATE
the first day DATE
the first day DATE
the first day DATE


In [154]:
def find_sub_list(sl,l):
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            return ind,ind+sll-1

In [195]:
def get_entity_for_lyrics(l):
    l = l.replace('\n', ' ').lower()
    doc = nlp(l)
    l_list = l.split(' ')
    curr_i = 0
    result = np.empty(shape=len(l_list), dtype=object)
    for ent in doc.ents:
        if ent.text.replace(' ', '') != '':
            ent_list = ent.text.split(' ')
            try:
                s, e = find_sub_list(ent_list, l_list[curr_i:])
                s += curr_i
                e += curr_i
                for i in range(s, e+1):
                    result[i] = ent.label_
                curr_i = e
            except : 
                print(ent_list)
                print(l_list[curr_i:])
    return result

In [159]:
ent_to_idx = {'PERSON' : 0,
'NORP': 1,
'FAC' : 2,
'ORG' : 3,
'GPE' : 4,
'LOC' : 5,
'PRODUCT' : 6,
'EVENT' : 7,
'WORK_OF_ART' : 8,
'LAW': 9,
'LANGUAGE' : 10,
'DATE' : 11,
'TIME' : 12,
'PERCENT' : 13,
'MONEY' : 14,
'QUANTITY' : 15,
'ORDINAL' : 16,
'CARDINAL': 17}

In [169]:
def add_ent_to_embedding(emb, l):
    # Compute the entities for the lyrics
    ents = get_entity_for_lyrics(l)
    # Go over the embedding of the words
    for i, w in enumerate(emb):
        # Create empty binary vector
        bin_vector = np.zeros(18)
        # If the entity is None, means we don't know --> Keep all 0s
        if ents[i] != None :
            # Put 1 where the entity is when we know it : current entity is ents[i], pass to index with ent_to_idx
            bin_vector[ent_to_idx[ents[i]]] = 1
        # Add the binary vector at the end of embedding and return
        emb[i] = np.append(w, bin_vector)
    return emb

In [171]:
add_ent_to_embedding(lyrics_df.loc[0]['embedding'], lyrics_df.loc[0]['lyrics'])

[array(['Oh', '-0.26756', '0.16066', '-0.026644', '-0.061647', '0.12856',
        '-0.16667', '-0.31698', '0.21519', '0.087186', '0.70611',
        '0.097478', '-0.017475', '0.13562', '-0.34733', '0.12901',
        '0.060213', '0.0038623', '-0.003505', '-0.309', '-0.1214',
        '-0.31168', '-0.038396', '-0.13885', '0.13174', '0.48543',
        '0.27012', '0.023631', '0.039914', '-0.0085227', '-0.32833',
        '-0.34518', '-0.039726', '-0.19771', '0.23373', '-0.041411',
        '-0.036932', '-0.073936', '-0.47071', '0.30993', '0.32195',
        '-0.074116', '0.17782', '-0.33701', '0.12694', '0.07969',
        '-0.078333', '-0.32187', '-0.21694', '-0.32914', '0.38804',
        '-0.47941', '0.52507', '0.13306', '-0.086922', '-0.1153',
        '0.64853', '-0.28492', '0.55657', '0.037141', '-0.24895',
        '0.26131', '-0.094953', '0.061678', '0.19854', '0.22078',
        '0.27887', '-0.65879', '-0.03955', '0.62236', '-0.22495',
        '-0.056741', '0.017541', '0.072202', '0.017473'

In [196]:
for i, r in lyrics_df[:30].iterrows() :
    r['embedding'] = add_ent_to_embedding(r['embedding'], r['lyrics'])

['night']
['one', 'more', 'dance', 'then', "i'm", 'ready', 'to', 'go', 'well', 'the', "dj's", 'playing', 'all', 'the', 'same', 'songs', 'and', 'the', "night's", 'about', 'to', 'end', 'can', 'we', 'meet', 'in', 'the', 'parking', 'lot', 'find', 'a', 'quiet', 'place', 'were', 'we', 'can', 'talk', 'to', 'find', 'out', 'more', 'about', 'each', 'other', 'baby', 'can', 'we', 'repeat', 'prechorus', 'and', 'chorus', 'bridge', 'you', 'know', "i'm", 'feelin', 'you', 'tonight', 'so', "let's", 'find', 'a', 'certain', 'spot', 'to', 'go', 'where', 'we', 'can', 'get', 'to', 'know', 'each', 'other', 'better', "i'll", 'go', 'and', 'tell', 'my', 'girls', 'you', 'go', 'and', 'tell', 'your', 'boys', 'before', 'we', "leavin'", 'tell', 'me', 'whats', "it'", "gon'", 'be', '', 'chorus', 'until', 'fade']


Wikipedia scheme : <br>
PER	Named person or family.<br>
LOC	Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains).<br>
ORG	Named corporate, governmental, or other organizational entity.<br>
MISC	Miscellaneous entities, e.g. events, nationalities, products or works of art.<br>

In [111]:
#with wikipedia scheme
nlp_wiki = spacy.load('xx_ent_wiki_sm')

In [120]:
doc = nlp_wiki(lyrics_df.loc[24]['lyrics'].replace('\n', '').lower())
for ent in doc.ents:
    print(ent, ent.label_)