# Goal of notebook : 

1. Load Lyrics with genre
2. Clean text in parentheses from lyrics
3. Create Fasttext vectors and read them into dataframe
4. Create column with indicator of last word of line
5. Create column with type of word (noun, verb, etc.)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Load Lyrics with genre

In [3]:
# Path is relative to the notebook's working directory.
filepath = "../kaggleDataset/lyrics_cleaned.csv"
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# that was saved with the CSV — confirm whether it should be read with index_col=0.
lyrics_df = pd.read_csv(filepath)
# Preview the first rows to check that the columns were parsed as expected.
lyrics_df.head()

Unnamed: 0,song,year,artist,genre,lyrics,language
0,ego-remix,2009,beyonce-knowles,pop,"Oh baby, how you doing?\r\nYou know I'm gonna ...",en
1,then-tell-me,2009,beyonce-knowles,pop,"playin' everything so easy,\r\nit's like you s...",en
2,honesty,2009,beyonce-knowles,pop,If you search\r\nFor tenderness\r\nIt isn't ha...,en
3,you-are-my-rock,2009,beyonce-knowles,pop,"Oh oh oh I, oh oh oh I\r\n[Verse 1:]\r\nIf I w...",en
4,black-culture,2009,beyonce-knowles,pop,"Party the people, the people the party it's po...",en


## 2. Clean text in parentheses from lyrics

In [13]:
# Print a small random sample of raw lyrics to eyeball what needs cleaning.
# A fixed random_state keeps the sample — and therefore this cell's output —
# identical across kernel restarts.
sample_df = lyrics_df.sample(n=20, random_state=42)
for lyric in sample_df['lyrics']:
    print(lyric)

Fix the mental, as if that changes anything
Your heart is cured now, you're normal once again
One confession, you thought that's all it took
But redemption, doesn't let you off the hook
One of us
Can't erase the past to change who he would be
One of us
On a whim would act, then look for sympathy
Not from me
Hear me say
One more
Chance to believe in
That you can even the score
One more
Place at the table
Always room for one more
Room for one more
Thinking evil, is that just your normal gig
Fueled by hatred, happy as a stuffed pig
You're so lonely, everyone around you reeks
Of indifference, thriving in their apathy
One of us
Can't erase the past to change who he would be
One of us
On a whim would act, then look for sympathy
Not from me
Hear me say
One more
Chance to believe in
That you can even the score
One more
Place at the table
Always room for one more
One more
Chance to believe in
That you can even the score
One more
Place at the table
Always ro

### Remove [<30 chars] and (<30 chars) and x(digit)

This information gives structure for the singer but is not useful for the lyrics themselves, e.g. [Chorus], choir parts in parentheses, x2, etc. <br>
To test regex expressions : https://regex101.com

In [4]:
import string
import re

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [30]:
# Patterns for structural annotations (how the song is performed, not what is
# sung). Raw strings make the backslashes unambiguous regex escapes instead of
# relying on Python passing unknown string escapes through unchanged.
regex_bracket = re.compile(r"\[[a-zA-Z0-9: ]{0,30}\]")      # e.g. [Chorus], [Verse 1:]
regex_parentheses = re.compile(r"\([a-zA-Z0-9]{0,30}\)")    # e.g. (nice)
# Previously a copy of the parentheses pattern, so {...} was never removed.
regex_curly_bracket = re.compile(r"\{[a-zA-Z0-9]{0,30}\}")  # e.g. {bridge}
# Consume every digit of the repeat count so that "x10" does not leave "0" behind.
regex_timesx = re.compile(r"x[0-9]+")                       # e.g. x2, x10

def clean_structure_words(l):
    """Remove structural annotations from a lyrics string.

    Strips, in this order: ``[...]`` groups (letters, digits, ':' and spaces,
    at most 30 characters), ``(...)`` and ``{...}`` groups (letters and digits
    only, at most 30 characters) and repeat markers made of ``x`` followed by
    digits. Anything else, including surrounding whitespace, is left untouched.

    Parameters
    ----------
    l : str
        Raw lyrics.

    Returns
    -------
    str
        The lyrics with the matched annotations replaced by the empty string.
    """
    res = l
    for pattern in (regex_bracket, regex_parentheses, regex_curly_bracket, regex_timesx):
        res = pattern.sub('', res)
    return res

In [56]:
clean_structure_words("Hello, my. name is +emma* (nice)[to]meet -x3--")

'Hello, my. name is +emma* meet ---'

In [32]:
# Strip the structural annotations from every song; the function is passed
# directly since it already takes a single lyrics string.
lyrics_df['lyrics'] = lyrics_df['lyrics'].apply(clean_structure_words)

In [37]:
lyrics_df.head()

Unnamed: 0,song,year,artist,genre,lyrics,language
0,ego-remix,2009,beyonce-knowles,pop,Oh baby how you doing\nYou know I'm gonna cut ...,en
1,then-tell-me,2009,beyonce-knowles,pop,playin' everything so easy\nit's like you seem...,en
2,honesty,2009,beyonce-knowles,pop,If you search\nFor tenderness\nIt isn't hard t...,en
3,you-are-my-rock,2009,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,en
4,black-culture,2009,beyonce-knowles,pop,Party the people the people the party it's pop...,en


### Remove . , ! ? ... ---
- We want to keep the [] and () as we need to delete what is inside also
- We want to keep the * as this represents some swear words in english
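- We want to keep the ' as this is part of many words in English, especially when words are cropped, e.g. 'Cause (note: the `chars_to_keep` whitelist used below does not contain ', so apostrophes are currently removed as well)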
- Otherwise punctuation is not meaningful

In [33]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [64]:
# Whitelist of characters that survive cleaning: ASCII digits and letters, '*',
# the space, and the newline that separates lyric lines. Everything else —
# including apostrophes and '\r' — is dropped.
chars_to_keep = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ* \n'

def clean_punctuation(l):
    """Drop every character outside ``chars_to_keep`` and collapse space runs.

    Parameters
    ----------
    l : str
        Lyrics text.

    Returns
    -------
    str
        The filtered text in which every run of spaces is a single space.
        Newlines are kept as they are; leading/trailing spaces are not trimmed.
    """
    kept = ''.join(char for char in l if char in chars_to_keep)
    # Collapse AFTER filtering: removing a standalone punctuation mark
    # ("a - b") creates a new run of spaces that would otherwise survive.
    return re.sub(' +', ' ', kept)

In [65]:
repr(clean_punctuation("Hello,     my.   name (coucou) is +emma* meet ---"))

"'Hello my name coucou is emma* meet '"

In [66]:
# Apply the character whitelist to every song; no lambda wrapper is needed for
# a one-argument function.
lyrics_df['lyrics'] = lyrics_df['lyrics'].apply(clean_punctuation)

In [67]:
lyrics_df.head()

Unnamed: 0,song,year,artist,genre,lyrics,language,embedding
0,ego-remix,2009,beyonce-knowles,pop,Oh baby how you doing\nYou know Im gonna cut r...,en,"[[Oh, 0.10919, -0.75727, -0.042626, 0.23598, 0..."
1,then-tell-me,2009,beyonce-knowles,pop,playin everything so easy\nits like you seem s...,en,"[[playin', 0.20792, 0.57511, -0.26275, 0.02359..."
2,honesty,2009,beyonce-knowles,pop,If you search\nFor tenderness\nIt isnt hard to...,en,"[[If, -0.083604, -0.12674, 0.075451, -0.17801,..."
3,you-are-my-rock,2009,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,en,"[[Oh, 0.10919, -0.75727, -0.042626, 0.23598, 0..."
4,black-culture,2009,beyonce-knowles,pop,Party the people the people the party its popp...,en,"[[Party, 0.21165, -0.17531, -0.16863, 0.24736,..."


## 3. Create Fasttext vectors and read them into dataframe

In [68]:
# First create a txt file with the lyrics.
# The context manager closes the file even if a write fails. The newline
# written after each song keeps the last word of one song from being glued to
# the first word of the next one in the corpus file.
with open("lyrics.txt", "w", encoding="utf-8") as f:
    for lyric in lyrics_df['lyrics']:
        f.write(lyric)
        f.write('\n')

https://fasttext.cc/docs/en/unsupervised-tutorial.html

In [None]:
# Generate embeddings
#! ./fasttext skipgram -input lyrics.txt -output model_lyrics

In [None]:
# To check first words : The first line is a header containing the number of words and the dimensionality of the vectors. 
# The subsequent lines are the word vectors for all words in the vocabulary, sorted by decreasing frequency.
#! head -n 4 model_lyrics.vec

In [15]:
import sys
import codecs

In [69]:
# Load embeddings
def load_embeddings(file_name):
    """Read a ``.vec`` text file of word vectors into a matrix and a vocabulary.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line is expected to be ``<word> <v1> <v2> ...`` with the word
    separated from its values by a single space.

    Parameters
    ----------
    file_name : str
        Path to the UTF-8 encoded file.

    Returns
    -------
    wv : numpy.ndarray
        The values parsed by ``np.loadtxt``, one row per word.
    vocabulary : tuple of str
        The words in file order; ``vocabulary[i]`` belongs to row ``i`` of ``wv``.
    """
    vocabulary = []
    vectors = []
    # newline='\n' makes '\n' the only line terminator. Reading through
    # codecs.open split lines on every Unicode line boundary (U+2028, U+0085,
    # ...), which cut rows in two whenever a token contained such a character.
    with open(file_name, 'r', encoding='utf-8', newline='\n') as f_in:
        next(f_in, None)  # skip the header line
        for line in f_in:
            line = line.rstrip()
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            word, values = line.split(' ', 1)
            vocabulary.append(word)
            vectors.append(values)
    wv = np.loadtxt(vectors)
    return wv, tuple(vocabulary)

In [70]:
word_embeddings, vocabulary = load_embeddings('model_lyrics.vec')

In [71]:
vocabulary = list(vocabulary)

In [72]:
"eversince" in vocabulary

False

In [73]:
len(vocabulary)

97718

In [74]:
word_embeddings.shape

(97718, 100)

In [91]:
def get_embedding_for_missing_word(w):
    """Ask the fastText binary for the vector of a word.

    Runs ``./fasttext print-word-vectors model_lyrics.bin`` and feeds ``w`` on
    standard input. The word is never passed through a shell, so characters
    such as ``*`` (kept on purpose in the cleaned lyrics) cannot be expanded
    or interpreted. The vector length is taken from the output instead of
    being fixed in the code.

    Parameters
    ----------
    w : str
        The word to look up.

    Returns
    -------
    numpy.ndarray
        1-D float array with the values printed after the word on the first
        output line.

    Raises
    ------
    subprocess.CalledProcessError
        If the fastText process exits with a non-zero status.
    ValueError
        If fastText prints nothing for ``w``.
    """
    import subprocess  # local import: only this helper starts a process

    result = subprocess.run(
        ['./fasttext', 'print-word-vectors', 'model_lyrics.bin'],
        input=w + '\n',
        stdout=subprocess.PIPE,
        universal_newlines=True,
        check=True,
    )
    lines = result.stdout.splitlines()
    if not lines:
        raise ValueError("fasttext returned no vector for %r" % (w,))
    # The first field of the line is the word itself; the rest are the values.
    return np.array([float(x) for x in lines[0].split()[1:]], dtype=float)

In [92]:
get_embedding_for_missing_word("eversince")

array([ 0.15354  , -0.082218 , -0.2494   , -0.39903  , -0.053004 ,
       -0.20358  ,  0.46507  , -0.48215  ,  0.031456 ,  0.29864  ,
       -0.45612  , -0.52815  ,  0.42328  ,  0.48587  , -0.13842  ,
       -0.027078 , -0.29471  ,  0.31589  , -0.052668 , -0.17239  ,
       -0.062075 , -0.1166   ,  0.062651 , -0.36563  ,  0.59316  ,
       -0.11488  ,  0.31329  ,  0.0054363, -0.22374  ,  0.19368  ,
        0.46279  , -0.13704  , -0.48086  , -0.21136  , -0.41806  ,
        0.096683 ,  0.17842  , -0.12313  , -0.47596  , -0.11186  ,
       -0.093677 ,  0.078458 , -0.11241  , -0.20531  ,  0.28876  ,
        0.25704  , -0.049918 ,  0.54401  ,  0.10132  ,  0.062308 ,
        0.31098  , -0.068861 ,  0.24074  , -0.29187  , -0.2174   ,
       -0.22046  ,  0.075688 ,  0.088568 , -0.25472  ,  0.24982  ,
        0.02957  , -0.011789 , -0.081646 ,  0.031771 , -0.34703  ,
       -0.34078  , -0.24647  , -0.078897 , -0.26882  ,  0.25468  ,
       -0.049018 ,  0.31585  , -0.075016 , -0.0050666, -0.1406

## 4. Create column with indicator of last word of line

In [88]:
test_l = lyrics_df.loc[0]['lyrics']
test_l

"Oh baby how you doing\nYou know I'm gonna cut right to the chase\nSome women were made but me myself\nI like to think that I was created for a special purpose\nYou know what's more special than you You feel me\nIt's on baby let's get lost\nYou don't need to call into work 'cause you're the boss\nFor real want you to show me how you feel\nI consider myself lucky that's a big deal\nWhy Well you got the key to my heart\nBut you ain't gonna need it I'd rather you open up my body\nAnd show me secrets you didn't know was inside\nNo need for me to lie\nIt's too big it's too wide\nIt's too strong it won't fit\nIt's too much it's too tough\nHe talk like this 'cause he can back it up\nHe got a big ego such a huge ego\nI love his big ego it's too much\nHe walk like this 'cause he can back it up\nUsually I'm humble right now I don't choose\nYou can leave with me or you could have the blues\nSome call it arrogant I call it confident\nYou decide when you find on what I'm working with\nDamn I know I

In [95]:
def generate_embedding_for_lyrics(l):
    """Build one row per word of the lyrics: word, vector, last-word flag.

    The text is split into lines on ``'\\n'`` and each line into words on
    ``' '``; empty strings are ignored. A word's vector is read from
    ``word_embeddings`` when the word is in ``vocabulary`` and obtained from
    ``get_embedding_for_missing_word`` otherwise.

    Parameters
    ----------
    l : str
        Lyrics text.

    Returns
    -------
    list of numpy.ndarray
        One string-typed array per word, laid out as
        ``[word, v1, ..., vn, flag]`` where ``flag`` is 1 for the last word of
        its line and 0 otherwise. ``np.append(w, vector)`` is what turns the
        numeric values into strings.
    """
    # One pass over the vocabulary per call replaces a linear `list.index`
    # scan per word. Filling from the back keeps the first occurrence of a
    # word, which is the position `list.index` returns.
    first_index = {}
    for idx in range(len(vocabulary) - 1, -1, -1):
        first_index[vocabulary[idx]] = idx

    embedding = []
    for s in l.split('\n'):
        # Drop empty strings BEFORE locating the last word: with a trailing
        # space the final element of the split is '' and, if it were counted,
        # no real word of the line would receive the last-word flag.
        words = [w for w in s.split(' ') if len(w) > 0]
        last = len(words) - 1
        for i, w in enumerate(words):
            idx = first_index.get(w)
            if idx is None:
                vector = get_embedding_for_missing_word(w)
            else:
                vector = word_embeddings[idx]
            vector = np.append(vector, 1 if i == last else 0)
            vector = np.append(w, vector)
            embedding.append(vector)

    return embedding

In [None]:
generate_embedding_for_lyrics("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

In [96]:
# Only the first 30 songs are embedded here; the function is passed directly
# instead of being wrapped in a lambda.
lyrics_df['embedding'] = lyrics_df['lyrics'][:30].apply(generate_embedding_for_lyrics)

echo "eversince" | ./fasttext print-word-vectors model_lyrics.bin
eversince 0.26688 -0.1294 0.21473 -0.0022411 -0.21671 0.032355 -0.18573 0.14012 0.26681 0.37774 -0.33736 0.26381 -0.053749 0.37619 -0.21128 -0.056934 0.20461 0.33779 -0.10611 0.021266 -0.17509 0.27 0.41086 0.13709 0.16705 -0.60041 0.38402 0.087207 -0.33848 0.19228 -0.27138 -0.30764 -0.2702 -0.30729 -0.72478 0.16182 0.11382 0.19189 -0.27027 0.27689 0.039753 -0.21678 -0.4436 0.24418 0.12741 0.21971 0.014112 0.11523 0.00034974 0.24912 -0.62545 -0.04865 -0.32367 -0.38435 -0.058886 -0.13295 0.48399 0.49655 0.34541 0.65489 -0.18568 -0.076875 0.087327 -0.0079827 -0.073017 -0.046165 0.009111 -0.024724 -0.44245 0.092923 0.33201 -0.54304 -0.22664 0.41005 0.14919 -0.30418 -0.13195 -0.10742 -0.18871 0.14914 0.15383 -0.30302 0.362 0.26775 0.11757 0.057333 0.38091 0.095017 -0.10443 0.13355 0.64025 0.070291 0.045029 -0.053138 0.19487 -0.062763 0.19003 0.32966 0.049617 0.207

In [97]:
lyrics_df.head()

Unnamed: 0,song,year,artist,genre,lyrics,language,embedding
0,ego-remix,2009,beyonce-knowles,pop,Oh baby how you doing\nYou know Im gonna cut r...,en,"[[Oh, 0.11909, 0.08231, -0.28486, 0.23137, 0.1..."
1,then-tell-me,2009,beyonce-knowles,pop,playin everything so easy\nits like you seem s...,en,"[[playin, 0.5371, -0.097827, 0.68525, -0.10739..."
2,honesty,2009,beyonce-knowles,pop,If you search\nFor tenderness\nIt isnt hard to...,en,"[[If, -0.3726, 0.33803, -0.36677, -0.29783, 0...."
3,you-are-my-rock,2009,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,en,"[[Oh, 0.11909, 0.08231, -0.28486, 0.23137, 0.1..."
4,black-culture,2009,beyonce-knowles,pop,Party the people the people the party its popp...,en,"[[Party, 0.28332, 0.68937, -0.4371, -0.084831,..."


## 5. Create column with type of word (noun, verb, etc.)

In [98]:
import spacy

In [99]:
# Load spaCy's English pipeline; it is used for the part-of-speech tags.
# NOTE(review): 'en' is a shortcut name — presumably it only resolves on
# installs where that shortcut has been linked; confirm the environment or use
# the full model package name.
en_nlp = spacy.load('en')

ADJ: adjective; ADP: adposition; ADV: adverb; AUX: auxiliary verb; CONJ: coordinating conjunction; DET: determiner; INTJ: interjection; NOUN: noun; NUM: numeral; PART: particle; PRON: pronoun; PROPN: proper noun; PUNCT: punctuation; SCONJ: subordinating conjunction; SYM: symbol; VERB: verb; X: other

In [100]:
def generate_pos_for_lyrics(l):
    """Return the ``pos_`` tag of every token that ``en_nlp`` produces for ``l``.

    The list follows the token order of the parsed document, so it has one
    entry per token of the pipeline's own tokenization.
    """
    return [token.pos_ for token in en_nlp(l)]

In [138]:
generate_pos_for_lyrics("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

['INTJ',
 'NOUN',
 'ADV',
 'PRON',
 'VERB',
 'SPACE',
 'PRON',
 'VERB',
 'PRON',
 'VERB',
 'VERB',
 'PART',
 'VERB',
 'ADV',
 'ADP',
 'DET',
 'NOUN']

In [101]:
def merge_embedding_pos(l):
    """Append a part-of-speech tag to every row of the lyrics embedding.

    The rows come from ``generate_embedding_for_lyrics``, one per
    whitespace-separated word. ``en_nlp`` tokenizes differently: it emits
    whitespace tokens and can split one word into several tokens. Tags
    therefore cannot be paired with rows by position. Each word is matched to
    the tokens that start inside its character span, and the tag of the first
    of them is used.

    Parameters
    ----------
    l : str
        Lyrics text.

    Returns
    -------
    list of numpy.ndarray
        The rows of ``generate_embedding_for_lyrics(l)`` with the tag appended
        as the last element. ``'X'`` is used when no token starts inside a
        word's span.
    """
    embedding = generate_embedding_for_lyrics(l)
    # Same segmentation as generate_embedding_for_lyrics: maximal runs of
    # characters that are neither a space nor a newline.
    word_spans = [m.span() for m in re.finditer(r'[^ \n]+', l)]
    tokens = [tok for tok in en_nlp(l) if not tok.is_space]

    cursor = 0
    for i, (start, end) in enumerate(word_spans[:len(embedding)]):
        # Skip tokens that start before this word, i.e. the remaining pieces
        # of a previous word that was split into several tokens.
        while cursor < len(tokens) and tokens[cursor].idx < start:
            cursor += 1
        if cursor < len(tokens) and tokens[cursor].idx < end:
            tag = tokens[cursor].pos_
        else:
            tag = 'X'
        embedding[i] = np.append(embedding[i], tag)
    return embedding

In [150]:
merge_embedding_pos("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

[array(['Oh', '0.26281', '0.20464', '0.52809', '0.53019', '0.18293',
        '0.037628', '-0.23044', '0.062628', '0.23377', '0.57916',
        '-0.54274', '0.08324', '-0.16041', '0.15954', '-0.020274',
        '0.29752', '0.2614', '0.20318', '-0.047337', '0.018324',
        '-0.058327', '0.038221', '-0.11085', '-0.27219', '0.38866',
        '-0.39597', '0.31506', '-0.2931', '-0.14714', '-0.26283',
        '-0.66931', '-0.19716', '-0.076028', '-0.19689', '-0.35428',
        '0.060643', '0.2279', '-0.68128', '-0.64535', '0.26321',
        '-0.23125', '0.096682', '-0.33837', '0.63975', '-0.14143',
        '0.0089394', '-0.26971', '0.0081875', '-0.41468', '0.13715',
        '-0.24524', '0.50966', '0.19552', '0.48978', '-0.69828',
        '0.022169', '-0.11176', '0.11083', '0.23648', '-0.0083582',
        '0.28457', '0.34085', '0.27281', '-0.18691', '0.28133', '0.07971',
        '0.08198', '0.57897', '-0.24385', '-0.30722', '-0.66809',
        '0.38878', '-0.38226', '-0.36377', '-0.38864', 

In [102]:
# Replace the embedding column with the rows that also carry the
# part-of-speech tag, again for the first 30 songs only.
lyrics_df['embedding'] = lyrics_df['lyrics'][:30].apply(merge_embedding_pos)

In [105]:
lyrics_df.head(10)

Unnamed: 0,song,year,artist,genre,lyrics,language,embedding
0,ego-remix,2009,beyonce-knowles,pop,Oh baby how you doing\nYou know Im gonna cut r...,en,"[[Oh, 0.11909, 0.08231, -0.28486, 0.23137, 0.1..."
1,then-tell-me,2009,beyonce-knowles,pop,playin everything so easy\nits like you seem s...,en,"[[playin, 0.5371, -0.097827, 0.68525, -0.10739..."
2,honesty,2009,beyonce-knowles,pop,If you search\nFor tenderness\nIt isnt hard to...,en,"[[If, -0.3726, 0.33803, -0.36677, -0.29783, 0...."
3,you-are-my-rock,2009,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,en,"[[Oh, 0.11909, 0.08231, -0.28486, 0.23137, 0.1..."
4,black-culture,2009,beyonce-knowles,pop,Party the people the people the party its popp...,en,"[[Party, 0.28332, 0.68937, -0.4371, -0.084831,..."
5,all-i-could-do-was-cry,2009,beyonce-knowles,pop,I heard\nChurch bells ringing\nI heard\nA choi...,en,"[[I, -0.11296, 0.096441, 0.053521, 0.14538, 0...."
6,once-in-a-lifetime,2009,beyonce-knowles,pop,This is just another day that I would spend\nW...,en,"[[This, 0.17977, 0.12549, 0.26417, -0.66416, -..."
7,waiting,2009,beyonce-knowles,pop,Waiting waiting waiting waiting\nWaiting waiti...,en,"[[Waiting, 0.050919, -0.035479, 0.1379, -0.298..."
8,slow-love,2009,beyonce-knowles,pop,\nI read all of the magazines\nwhile waiting a...,en,"[[I, -0.11296, 0.096441, 0.053521, 0.14538, 0...."
9,why-don-t-you-love-me,2009,beyonce-knowles,pop,Nnnow honey\nYou better sit down and look arou...,en,"[[Nnnow, 0.12027, 0.36235, -0.26464, -0.31749,..."
