# Goal of notebook : 

1. Load Lyrics with genre
2. Clean text in parenthesis from lyrics
3. Create Fasttext vectors and read them into dataframe
4. Create column with indicator of last word of line
5. Create column with type of word (noum, verb, etc)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Load Lyrics with genre

In [33]:
filepath = "../augment db/full_lyrics.csv"
lyrics_df = pd.read_csv(filepath)
lyrics_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,artist,genre,lang,lyrics,title
0,beyonce-knowles,pop,,"Oh baby, how you doing?\r\r\nYou know I'm gonn...",ego-remix
1,beyonce-knowles,pop,,"playin' everything so easy,\r\r\nit's like you...",then-tell-me
2,beyonce-knowles,pop,,If you search\r\r\nFor tenderness\r\r\nIt isn'...,honesty
3,beyonce-knowles,pop,,"Oh oh oh I, oh oh oh I\r\r\n[Verse 1:]\r\r\nIf...",you-are-my-rock
4,beyonce-knowles,pop,,"Party the people, the people the party it's po...",black-culture


In [34]:
lyrics_df = lyrics_df.drop(['lang'], axis = 1)

In [35]:
lyrics_df.head()

Unnamed: 0,artist,genre,lyrics,title
0,beyonce-knowles,pop,"Oh baby, how you doing?\r\r\nYou know I'm gonn...",ego-remix
1,beyonce-knowles,pop,"playin' everything so easy,\r\r\nit's like you...",then-tell-me
2,beyonce-knowles,pop,If you search\r\r\nFor tenderness\r\r\nIt isn'...,honesty
3,beyonce-knowles,pop,"Oh oh oh I, oh oh oh I\r\r\n[Verse 1:]\r\r\nIf...",you-are-my-rock
4,beyonce-knowles,pop,"Party the people, the people the party it's po...",black-culture


## 2. Clean text in parenthesis from lyrics

In [36]:
sample_df = lyrics_df.sample(n=20)
for i, s in sample_df.iterrows():
    print(s['lyrics'])

Everybody
Whoa-oh-oh
Everybody
Whoa-oh-oh
Everybody get up and dance now
Everybody show me how you smile
Everybody stand up and sing now
Take my hand; I'll make it worth your while
Everyone will be alright
Let's make this summer outta sight!
Life is great in my backyard
Let's crank up the volume on this guitar
This is a song for your eye drums
This is the anthem for having fun
Now that you're here and you're in the mood
Let's go party, cowabunga dude!
I wanna get kicked out of school
I wanna go to a place where the big kids rule
I want to enjoy life and just have fun
I don't wanna worry about anyone
I wanna be a big rock star
I wanna make me famous with this guitar
I wanna go see everywhere
Don't want to sit around and just be a square
This is a song for your eye drums
This is the anthem for having fun
Now that you're here and you're in the mood
Let's rock out and order pizza, dude!
This is a song for your eye drums
This is the 

### Remove [<30 chars] and (<30 chars) and x(digit)

This information gives structure for the singer but not usefull for lyrics. This is [Chorus], choir in parentheses, x2 etc. <br>
To test regex expressions : https://regex101.com

In [37]:
import string
import re

In [38]:
regex_bracket = re.compile("\[(.*?)\]")
regex_parentheses = re.compile("\((.*?)\)")
regex_curly_bracket = re.compile("\{(.*?)\}")
regex_timesx = re.compile("x[0-9]")

def clean_structure_words(l):
    res= regex_bracket.sub('', l)
    res = regex_parentheses.sub('', res)
    res = regex_curly_bracket.sub('', res)
    res = regex_timesx.sub('', res)
    return res

In [8]:
clean_structure_words("Hello, my. name is +emma* (nice)[to]meet -x3--")

'Hello, my. name is +emma* meet ---'

In [39]:
lyrics_df['lyrics'] = lyrics_df['lyrics'].apply(lambda x : clean_structure_words(x))

In [40]:
lyrics_df.head()

Unnamed: 0,artist,genre,lyrics,title
0,beyonce-knowles,pop,"Oh baby, how you doing?\r\r\nYou know I'm gonn...",ego-remix
1,beyonce-knowles,pop,"playin' everything so easy,\r\r\nit's like you...",then-tell-me
2,beyonce-knowles,pop,If you search\r\r\nFor tenderness\r\r\nIt isn'...,honesty
3,beyonce-knowles,pop,"Oh oh oh I, oh oh oh I\r\r\n\r\r\nIf I wrote a...",you-are-my-rock
4,beyonce-knowles,pop,"Party the people, the people the party it's po...",black-culture


### Remove . , ! ? ... ---
- We want to keep the * as this represents some swear words in english
- We want to keep the ' as this is part of many words in english, especially when words are cropped, ie 'Cause 
- Otherwise punctuation is not meaningful

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [41]:
#punctuation_to_remove = '!#$%\+,-./:;<=>?@\\^_`|~()[]{}'
#regex_punct = re.compile('[%s]' % re.escape(punctuation_to_remove))
chars_to_keep = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\'* \n'

def clean_punctuation(l):
    #res = regex_punct.sub('', l)
    res = re.sub(' +', ' ', l)#res)
    res = ''.join(char for char in res if char in chars_to_keep)
    return res

In [15]:
repr(clean_punctuation("Hello,     my.   name I'm (coucou) is +emma* meet ---"))

'"Hello my name I\'m coucou is emma* meet "'

In [42]:
lyrics_df['lyrics'] = lyrics_df['lyrics'].apply(lambda x : clean_punctuation(x))

In [43]:
lyrics_df.head()

Unnamed: 0,artist,genre,lyrics,title
0,beyonce-knowles,pop,Oh baby how you doing\nYou know I'm gonna cut ...,ego-remix
1,beyonce-knowles,pop,playin' everything so easy\nit's like you seem...,then-tell-me
2,beyonce-knowles,pop,If you search\nFor tenderness\nIt isn't hard t...,honesty
3,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,you-are-my-rock
4,beyonce-knowles,pop,Party the people the people the party it's pop...,black-culture


## 3. Create Fasttext vectors and read them into dataframe

In [44]:
# First create a txt file with the lyrics
f = open("lyrics.txt", "w+")
for i, r in lyrics_df.iterrows():
    f.write(r['lyrics'])
    
f.close()

https://fasttext.cc/docs/en/unsupervised-tutorial.html

In [45]:
# Generate embeddings
#! ./fasttext skipgram -input lyrics.txt -output model_lyrics

Read 82M words
Number of words:  125703
Number of labels: 0


Progress: 13.5%  words/sec/thread: 133900  lr: 0.043244  loss: 1.823365  eta: 0h3m  lr: 0.049940  loss: 2.882784  eta: 0h25m 0.2%  words/sec/thread: 31919  lr: 0.049905  loss: 2.513665  eta: 0h17m 12m 0.4%  words/sec/thread: 53753  lr: 0.049794  loss: 2.165484  eta: 0h10m   eta: 0h8m   loss: 2.011464  eta: 0h7m 73958  lr: 0.049628  loss: 1.999047  eta: 0h7m   eta: 0h7m   lr: 0.049554  loss: 1.997317  eta: 0h7m 85550  lr: 0.049479  loss: 1.997706  eta: 0h6m   lr: 0.049448  loss: 2.004879  eta: 0h6m m   lr: 0.049380  loss: 1.984411  eta: 0h6m %  words/sec/thread: 93960  lr: 0.049330  loss: 1.970854  eta: 0h5m 1.969319  eta: 0h5m %  words/sec/thread: 96354  lr: 0.049281  loss: 1.967838  eta: 0h5m h5m 0.049114  loss: 1.939019  eta: 0h5m 1.9%  words/sec/thread: 106083  lr: 0.049047  loss: 1.928800  eta: 0h5m 5m 0.049000  loss: 1.921255  eta: 0h5m   eta: 0h5m   lr: 0.048950  loss: 1.913636  eta: 0h5m m   words/sec/thread: 111915  lr: 0.048812  loss: 1.908214  eta: 0h4m 2.6%  words/sec/thread

Progress: 18.1%  words/sec/thread: 133978  lr: 0.040972  loss: 1.818897  eta: 0h3m   words/sec/thread: 133917  lr: 0.043223  loss: 1.823420  eta: 0h3m   lr: 0.043215  loss: 1.823393  eta: 0h3m   words/sec/thread: 133930  lr: 0.043212  loss: 1.823408  eta: 0h3m   words/sec/thread: 133912  lr: 0.043201  loss: 1.823351  eta: 0h3m 1.823177  eta: 0h3m gress: 13.6%  words/sec/thread: 133902  lr: 0.043181  loss: 1.823286  eta: 0h3m   lr: 0.043145  loss: 1.822866  eta: 0h3m 3m   words/sec/thread: 133948  lr: 0.043113  loss: 1.822445  eta: 0h3m m 1.821728  eta: 0h3m   words/sec/thread: 133906  lr: 0.043059  loss: 1.821549  eta: 0h3m 13.9%  words/sec/thread: 133923  lr: 0.043037  loss: 1.821278  eta: 0h3m   words/sec/thread: 133913  lr: 0.043009  loss: 1.820722  eta: 0h3m 0.042962  loss: 1.820333  eta: 0h3m   words/sec/thread: 133859  lr: 0.042951  loss: 1.820246  eta: 0h3m 1.819971  eta: 0h3m   words/sec/thread: 133916  lr: 0.042920  loss: 1.819542  eta: 0h3m   eta: 0h3m   words/sec/thread: 133

Progress: 22.4%  words/sec/thread: 133513  lr: 0.038801  loss: 1.826023  eta: 0h3m   words/sec/thread: 133980  lr: 0.040965  loss: 1.818916  eta: 0h3m   words/sec/thread: 133973  lr: 0.040954  loss: 1.818792  eta: 0h3m   words/sec/thread: 133960  lr: 0.040930  loss: 1.818576  eta: 0h3m   words/sec/thread: 133961  lr: 0.040920  loss: 1.818602  eta: 0h3m   words/sec/thread: 133962  lr: 0.040914  loss: 1.818567  eta: 0h3m   words/sec/thread: 133945  lr: 0.040902  loss: 1.818581  eta: 0h3m   words/sec/thread: 133902  lr: 0.040874  loss: 1.818421  eta: 0h3m   words/sec/thread: 133898  lr: 0.040870  loss: 1.818272  eta: 0h3m   words/sec/thread: 133875  lr: 0.040845  loss: 1.818129  eta: 0h3m   words/sec/thread: 133849  lr: 0.040828  loss: 1.818113  eta: 0h3m 0.040820  loss: 1.818117  eta: 0h3m gress: 18.4%  words/sec/thread: 133843  lr: 0.040799  loss: 1.817418  eta: 0h3m 3m   words/sec/thread: 133821  lr: 0.040783  loss: 1.817361  eta: 0h3m   words/sec/thread: 133815  lr: 0.040768  loss: 1.

Progress: 25.6%  words/sec/thread: 133341  lr: 0.037184  loss: 1.821525  eta: 0h3m   words/sec/thread: 133507  lr: 0.038794  loss: 1.825965  eta: 0h3m   words/sec/thread: 133500  lr: 0.038784  loss: 1.825757  eta: 0h3m   words/sec/thread: 133502  lr: 0.038773  loss: 1.825733  eta: 0h3m 0.038756  loss: 1.825500  eta: 0h3m   words/sec/thread: 133487  lr: 0.038746  loss: 1.825500  eta: 0h3m   words/sec/thread: 133480  lr: 0.038733  loss: 1.825340  eta: 0h3m   words/sec/thread: 133477  lr: 0.038729  loss: 1.825243  eta: 0h3m   loss: 1.825160  eta: 0h3m .6%  words/sec/thread: 133472  lr: 0.038709  loss: 1.824752  eta: 0h3m   words/sec/thread: 133471  lr: 0.038700  loss: 1.824534  eta: 0h3m   words/sec/thread: 133474  lr: 0.038693  loss: 1.824320  eta: 0h3m   words/sec/thread: 133481  lr: 0.038688  loss: 1.824253  eta: 0h3m 0.038663  loss: 1.823908  eta: 0h3m   words/sec/thread: 133488  lr: 0.038658  loss: 1.823909  eta: 0h3m   words/sec/thread: 133505  lr: 0.038629  loss: 1.823932  eta: 0h3

Progress: 28.9%  words/sec/thread: 133356  lr: 0.035556  loss: 1.822672  eta: 0h3m   words/sec/thread: 133316  lr: 0.037167  loss: 1.821200  eta: 0h3m m   words/sec/thread: 133312  lr: 0.037155  loss: 1.821107  eta: 0h3m   words/sec/thread: 133296  lr: 0.037122  loss: 1.821137  eta: 0h3m ad: 133283  lr: 0.037111  loss: 1.821055  eta: 0h3m   words/sec/thread: 133278  lr: 0.037102  loss: 1.820958  eta: 0h3m   words/sec/thread: 133278  lr: 0.037097  loss: 1.820977  eta: 0h3m 25.8%  words/sec/thread: 133272  lr: 0.037089  loss: 1.821028  eta: 0h3m   words/sec/thread: 133264  lr: 0.037078  loss: 1.821048  eta: 0h3m   words/sec/thread: 133265  lr: 0.037069  loss: 1.821168  eta: 0h3m   words/sec/thread: 133266  lr: 0.037058  loss: 1.821118  eta: 0h3m   words/sec/thread: 133268  lr: 0.037049  loss: 1.821102  eta: 0h3m   words/sec/thread: 133115  lr: 0.036998  loss: 1.821382  eta: 0h3m   eta: 0h3m   words/sec/thread: 133108  lr: 0.036983  loss: 1.821187  eta: 0h3m   words/sec/thread: 133100  lr

Progress: 32.0%  words/sec/thread: 133642  lr: 0.034021  loss: 1.821145  eta: 0h2m   words/sec/thread: 133361  lr: 0.035540  loss: 1.822777  eta: 0h3m 29.0%  words/sec/thread: 133363  lr: 0.035525  loss: 1.822824  eta: 0h3m /thread: 133364  lr: 0.035520  loss: 1.822843  eta: 0h3m   words/sec/thread: 133358  lr: 0.035491  loss: 1.823009  eta: 0h3m   words/sec/thread: 133361  lr: 0.035478  loss: 1.823096  eta: 0h3m   words/sec/thread: 133363  lr: 0.035469  loss: 1.823210  eta: 0h3m   words/sec/thread: 133363  lr: 0.035453  loss: 1.823233  eta: 0h3m ad: 133364  lr: 0.035447  loss: 1.823181  eta: 0h3m ad: 133371  lr: 0.035443  loss: 1.823158  eta: 0h3m   words/sec/thread: 133381  lr: 0.035431  loss: 1.823127  eta: 0h3m   words/sec/thread: 133406  lr: 0.035407  loss: 1.823037  eta: 0h3m   words/sec/thread: 133408  lr: 0.035401  loss: 1.822924  eta: 0h3m   words/sec/thread: 133410  lr: 0.035392  loss: 1.822893  eta: 0h3m   words/sec/thread: 133414  lr: 0.035385  loss: 1.822883  eta: 0h3m   w

Progress: 35.0%  words/sec/thread: 133074  lr: 0.032521  loss: 1.821041  eta: 0h2m 0.034009  loss: 1.821210  eta: 0h2m   loss: 1.821300  eta: 0h2m   words/sec/thread: 133600  lr: 0.033992  loss: 1.821400  eta: 0h2m   words/sec/thread: 133596  lr: 0.033985  loss: 1.821371  eta: 0h2m 0.033983  loss: 1.821383  eta: 0h2m 2.1%  words/sec/thread: 133602  lr: 0.033975  loss: 1.821364  eta: 0h2m   words/sec/thread: 133600  lr: 0.033968  loss: 1.821353  eta: 0h2m   words/sec/thread: 133601  lr: 0.033962  loss: 1.821337  eta: 0h2m   words/sec/thread: 133595  lr: 0.033942  loss: 1.821350  eta: 0h2m h2m ad: 133596  lr: 0.033928  loss: 1.821301  eta: 0h2m   words/sec/thread: 133594  lr: 0.033920  loss: 1.821298  eta: 0h2m   words/sec/thread: 133604  lr: 0.033905  loss: 1.821293  eta: 0h2m   words/sec/thread: 133602  lr: 0.033899  loss: 1.821271  eta: 0h2m   words/sec/thread: 133603  lr: 0.033894  loss: 1.821276  eta: 0h2m ad: 133607  lr: 0.033883  loss: 1.821316  eta: 0h2m   words/sec/thread: 13360

Progress: 37.5%  words/sec/thread: 133019  lr: 0.031226  loss: 1.819283  eta: 0h2m   words/sec/thread: 133073  lr: 0.032511  loss: 1.821060  eta: 0h2m ad: 133069  lr: 0.032502  loss: 1.821010  eta: 0h2m 0.032495  loss: 1.820969  eta: 0h2m 5.0%  words/sec/thread: 133067  lr: 0.032488  loss: 1.820944  eta: 0h2m a: 0h2m   words/sec/thread: 133062  lr: 0.032474  loss: 1.820817  eta: 0h2m   words/sec/thread: 133058  lr: 0.032465  loss: 1.820782  eta: 0h2m 1.820756  eta: 0h2m lr: 0.032454  loss: 1.820726  eta: 0h2m   words/sec/thread: 133052  lr: 0.032446  loss: 1.820733  eta: 0h2m   eta: 0h2m rds/sec/thread: 133041  lr: 0.032431  loss: 1.820756  eta: 0h2m 0.032423  loss: 1.820892  eta: 0h2m  lr: 0.032418  loss: 1.820884  eta: 0h2m 0.032397  loss: 1.820785  eta: 0h2m   words/sec/thread: 133033  lr: 0.032394  loss: 1.820781  eta: 0h2m ad: 133032  lr: 0.032392  loss: 1.820809  eta: 0h2m h2m   words/sec/thread: 133021  lr: 0.032370  loss: 1.820867  eta: 0h2m 0.032367  loss: 1.820862  eta: 0h2m 

Progress: 40.1%  words/sec/thread: 133178  lr: 0.029958  loss: 1.816395  eta: 0h2m   words/sec/thread: 133020  lr: 0.031208  loss: 1.819352  eta: 0h2m   words/sec/thread: 133022  lr: 0.031202  loss: 1.819393  eta: 0h2m 0.031186  loss: 1.819450  eta: 0h2m   words/sec/thread: 133030  lr: 0.031182  loss: 1.819429  eta: 0h2m 0.031170  loss: 1.819515  eta: 0h2m ad: 133047  lr: 0.031145  loss: 1.819623  eta: 0h2m   words/sec/thread: 133050  lr: 0.031141  loss: 1.819611  eta: 0h2m 1.819592  eta: 0h2m ss: 1.819622  eta: 0h2m /thread: 133077  lr: 0.031111  loss: 1.819712  eta: 0h2m   words/sec/thread: 133084  lr: 0.031104  loss: 1.819672  eta: 0h2m 1.819566  eta: 0h2m   words/sec/thread: 133091  lr: 0.031090  loss: 1.819587  eta: 0h2m   words/sec/thread: 133104  lr: 0.031067  loss: 1.819769  eta: 0h2m   words/sec/thread: 133099  lr: 0.031058  loss: 1.819844  eta: 0h2m   words/sec/thread: 133102  lr: 0.031038  loss: 1.819941  eta: 0h2m h2m ss: 1.819923  eta: 0h2m 18  lr: 0.031013  loss: 1.819873

Progress: 42.6%  words/sec/thread: 132816  lr: 0.028723  loss: 1.817121  eta: 0h2m   words/sec/thread: 133182  lr: 0.029943  loss: 1.816489  eta: 0h2m   words/sec/thread: 133178  lr: 0.029938  loss: 1.816530  eta: 0h2m 0.029932  loss: 1.816558  eta: 0h2m   words/sec/thread: 133173  lr: 0.029925  loss: 1.816567  eta: 0h2m h2m lr: 0.029911  loss: 1.816571  eta: 0h2m /thread: 133170  lr: 0.029903  loss: 1.816616  eta: 0h2m   words/sec/thread: 133168  lr: 0.029895  loss: 1.816639  eta: 0h2m   words/sec/thread: 133164  lr: 0.029890  loss: 1.816630  eta: 0h2m 0.029882  loss: 1.816629  eta: 0h2m   words/sec/thread: 133164  lr: 0.029877  loss: 1.816631  eta: 0h2m   words/sec/thread: 133158  lr: 0.029870  loss: 1.816662  eta: 0h2m 0.3%  words/sec/thread: 133161  lr: 0.029861  loss: 1.816644  eta: 0h2m 133153  lr: 0.029852  loss: 1.816707  eta: 0h2m ad: 133149  lr: 0.029839  loss: 1.816648  eta: 0h2m 0.3%  words/sec/thread: 133148  lr: 0.029830  loss: 1.816574  eta: 0h2m 0.029816  loss: 1.816479

Progress: 44.9%  words/sec/thread: 132303  lr: 0.027543  loss: 1.815090  eta: 0h2m ss: 42.6%  words/sec/thread: 132810  lr: 0.028712  loss: 1.817147  eta: 0h2m 0.028701  loss: 1.817089  eta: 0h2m   words/sec/thread: 132803  lr: 0.028692  loss: 1.817097  eta: 0h2m   words/sec/thread: 132801  lr: 0.028687  loss: 1.817052  eta: 0h2m   words/sec/thread: 132796  lr: 0.028682  loss: 1.817063  eta: 0h2m   words/sec/thread: 132794  lr: 0.028676  loss: 1.817079  eta: 0h2m   words/sec/thread: 132791  lr: 0.028671  loss: 1.817113  eta: 0h2m 1.817071  eta: 0h2m   words/sec/thread: 132780  lr: 0.028646  loss: 1.817145  eta: 0h2m 1.817162  eta: 0h2m 2.7%  words/sec/thread: 132778  lr: 0.028634  loss: 1.817169  eta: 0h2m ss: 1.817189  eta: 0h2m s: 42.8%  words/sec/thread: 132772  lr: 0.028622  loss: 1.817192  eta: 0h2m 67  lr: 0.028618  loss: 1.817152  eta: 0h2m gress: 42.8%  words/sec/thread: 132762  lr: 0.028615  loss: 1.817156  eta: 0h2m 132761  lr: 0.028610  loss: 1.817141  eta: 0h2m ad: 132758  

Progress: 47.2%  words/sec/thread: 132146  lr: 0.026388  loss: 1.813394  eta: 0h2m 1.815237  eta: 0h2m 5.0%  words/sec/thread: 132296  lr: 0.027520  loss: 1.815225  eta: 0h2m ss: 1.815186  eta: 0h2m   words/sec/thread: 132292  lr: 0.027509  loss: 1.815165  eta: 0h2m   words/sec/thread: 132294  lr: 0.027497  loss: 1.815121  eta: 0h2m 0.027490  loss: 1.815157  eta: 0h2m 5.0%  words/sec/thread: 132300  lr: 0.027480  loss: 1.815119  eta: 0h2m ss: 1.815060  eta: 0h2m sec/thread: 132300  lr: 0.027467  loss: 1.815059  eta: 0h2m   words/sec/thread: 132301  lr: 0.027466  loss: 1.815031  eta: 0h2m   words/sec/thread: 132299  lr: 0.027457  loss: 1.814976  eta: 0h2m 0.027452  loss: 1.814982  eta: 0h2m   words/sec/thread: 132296  lr: 0.027444  loss: 1.815038  eta: 0h2m   words/sec/thread: 132292  lr: 0.027434  loss: 1.815202  eta: 0h2m h2m lr: 0.027416  loss: 1.815104  eta: 0h2m a: 0h2m 96  lr: 0.027406  loss: 1.815025  eta: 0h2m gress: 45.2%  words/sec/thread: 132293  lr: 0.027398  loss: 1.815035 

Progress: 49.7%  words/sec/thread: 131817  lr: 0.025168  loss: 1.810441  eta: 0h2m   words/sec/thread: 132149  lr: 0.026373  loss: 1.813330  eta: 0h2m 1.813302  eta: 0h2m 7.3%  words/sec/thread: 132154  lr: 0.026359  loss: 1.813272  eta: 0h2m   words/sec/thread: 132156  lr: 0.026351  loss: 1.813255  eta: 0h2m 0.026346  loss: 1.813252  eta: 0h2m thread: 132156  lr: 0.026336  loss: 1.813216  eta: 0h2m 0.026322  loss: 1.813186  eta: 0h2m ad: 132157  lr: 0.026312  loss: 1.813159  eta: 0h2m 1.813158  eta: 0h2m   words/sec/thread: 132157  lr: 0.026297  loss: 1.813136  eta: 0h2m h2m /thread: 132158  lr: 0.026282  loss: 1.813114  eta: 0h2m oss: 1.813115  eta: 0h2m   words/sec/thread: 132153  lr: 0.026273  loss: 1.813103  eta: 0h2m   words/sec/thread: 132157  lr: 0.026263  loss: 1.813088  eta: 0h2m 0.026257  loss: 1.813083  eta: 0h2m ad: 132158  lr: 0.026247  loss: 1.813046  eta: 0h2m h2m thread: 132161  lr: 0.026237  loss: 1.813038  eta: 0h2m s: 47.5%  words/sec/thread: 132163  lr: 0.026228  l

Progress: 52.2%  words/sec/thread: 131325  lr: 0.023909  loss: 1.802033  eta: 0h2m ad: 131811  lr: 0.025156  loss: 1.810380  eta: 0h2m   words/sec/thread: 131807  lr: 0.025152  loss: 1.810281  eta: 0h2m 0.025144  loss: 1.810199  eta: 0h2m   words/sec/thread: 131800  lr: 0.025136  loss: 1.810248  eta: 0h2m 1.810227  eta: 0h2m 9.7%  words/sec/thread: 131796  lr: 0.025126  loss: 1.810237  eta: 0h2m   words/sec/thread: 131793  lr: 0.025114  loss: 1.810318  eta: 0h2m   words/sec/thread: 131788  lr: 0.025106  loss: 1.810371  eta: 0h2m 0.025104  loss: 1.810419  eta: 0h2m   words/sec/thread: 131781  lr: 0.025096  loss: 1.810388  eta: 0h2m   words/sec/thread: 131768  lr: 0.025081  loss: 1.810394  eta: 0h2m 1.810426  eta: 0h2m 9.9%  words/sec/thread: 131756  lr: 0.025072  loss: 1.810430  eta: 0h2m   words/sec/thread: 131733  lr: 0.025056  loss: 1.810409  eta: 0h2m   words/sec/thread: 131724  lr: 0.025049  loss: 1.810434  eta: 0h2m ad: 131723  lr: 0.025045  loss: 1.810421  eta: 0h2m ad: 131719  l

Progress: 54.6%  words/sec/thread: 130872  lr: 0.022716  loss: 1.787247  eta: 0h1m thread: 131317  lr: 0.023895  loss: 1.801837  eta: 0h2m   words/sec/thread: 131315  lr: 0.023887  loss: 1.801740  eta: 0h2m 1.801650  eta: 0h2m thread: 131310  lr: 0.023877  loss: 1.801550  eta: 0h2m ss: 1.801474  eta: 0h2m sec/thread: 131299  lr: 0.023864  loss: 1.801375  eta: 0h2m   loss: 1.801315  eta: 0h2m   eta: 0h2m rds/sec/thread: 131295  lr: 0.023852  loss: 1.801252  eta: 0h2m 1189  eta: 0h2m   words/sec/thread: 131291  lr: 0.023843  loss: 1.801115  eta: 0h2m h2m lr: 0.023829  loss: 1.800858  eta: 0h2m ad: 131283  lr: 0.023821  loss: 1.800714  eta: 0h2m h2m lr: 0.023808  loss: 1.800528  eta: 0h2m 0.023799  loss: 1.800443  eta: 0h2m thread: 131262  lr: 0.023795  loss: 1.800349  eta: 0h2m   words/sec/thread: 131257  lr: 0.023785  loss: 1.800280  eta: 0h2m ad: 131256  lr: 0.023778  loss: 1.800177  eta: 0h2m /thread: 131254  lr: 0.023773  loss: 1.800093  eta: 0h2m ad: 131256  lr: 0.023768  loss: 1.79

Progress: 57.0%  words/sec/thread: 130478  lr: 0.021499  loss: 1.775339  eta: 0h1m a: 0h1m   words/sec/thread: 130866  lr: 0.022698  loss: 1.786910  eta: 0h1m ad: 130863  lr: 0.022691  loss: 1.786747  eta: 0h1m   words/sec/thread: 130862  lr: 0.022682  loss: 1.786667  eta: 0h1m 1.786598  eta: 0h1m 4.7%  words/sec/thread: 130859  lr: 0.022666  loss: 1.786524  eta: 0h1m   words/sec/thread: 130857  lr: 0.022662  loss: 1.786421  eta: 0h1m 0.022657  loss: 1.786365  eta: 0h1m thread: 130851  lr: 0.022649  loss: 1.786247  eta: 0h1m ad: 130847  lr: 0.022645  loss: 1.786186  eta: 0h1m 0.022624  loss: 1.786160  eta: 0h1m   words/sec/thread: 130839  lr: 0.022619  loss: 1.786073  eta: 0h1m 0.022610  loss: 1.786019  eta: 0h1m   words/sec/thread: 130834  lr: 0.022595  loss: 1.785933  eta: 0h1m   words/sec/thread: 130833  lr: 0.022589  loss: 1.785818  eta: 0h1m 0.022581  loss: 1.785796  eta: 0h1m   words/sec/thread: 130828  lr: 0.022574  loss: 1.785737  eta: 0h1m   words/sec/thread: 130828  lr: 0.022

Progress: 59.7%  words/sec/thread: 130118  lr: 0.020150  loss: 1.761669  eta: 0h1m   words/sec/thread: 130476  lr: 0.021492  loss: 1.775308  eta: 0h1m   words/sec/thread: 130474  lr: 0.021473  loss: 1.775246  eta: 0h1m 1.775175  eta: 0h1m 7.1%  words/sec/thread: 130470  lr: 0.021460  loss: 1.775136  eta: 0h1m   words/sec/thread: 130466  lr: 0.021455  loss: 1.775081  eta: 0h1m   words/sec/thread: 130462  lr: 0.021444  loss: 1.775027  eta: 0h1m   words/sec/thread: 130460  lr: 0.021435  loss: 1.775001  eta: 0h1m 1.775004  eta: 0h1m   words/sec/thread: 130458  lr: 0.021423  loss: 1.775010  eta: 0h1m   words/sec/thread: 130456  lr: 0.021417  loss: 1.774988  eta: 0h1m   words/sec/thread: 130451  lr: 0.021406  loss: 1.774878  eta: 0h1m 1.774782  eta: 0h1m 7.2%  words/sec/thread: 130445  lr: 0.021396  loss: 1.774725  eta: 0h1m ss: 1.774614  eta: 0h1m s: 57.2%  words/sec/thread: 130440  lr: 0.021381  loss: 1.774583  eta: 0h1m   loss: 1.774490  eta: 0h1m   words/sec/thread: 130436  lr: 0.021369 

Progress: 62.8%  words/sec/thread: 130098  lr: 0.018602  loss: 1.747084  eta: 0h1m 9.7%  words/sec/thread: 130114  lr: 0.020130  loss: 1.761486  eta: 0h1m lr: 0.020123  loss: 1.761453  eta: 0h1m   words/sec/thread: 130112  lr: 0.020119  loss: 1.761418  eta: 0h1m   words/sec/thread: 130111  lr: 0.020109  loss: 1.761408  eta: 0h1m 1.761364  eta: 0h1m thread: 130110  lr: 0.020088  loss: 1.761340  eta: 0h1m ad: 130106  lr: 0.020085  loss: 1.761292  eta: 0h1m 1.761247  eta: 0h1m 0.020063  loss: 1.761061  eta: 0h1m ad: 130090  lr: 0.020041  loss: 1.760675  eta: 0h1m   words/sec/thread: 130088  lr: 0.020035  loss: 1.760608  eta: 0h1m   words/sec/thread: 130088  lr: 0.020027  loss: 1.760573  eta: 0h1m 0.020020  loss: 1.760556  eta: 0h1m 0.0%  words/sec/thread: 130089  lr: 0.020013  loss: 1.760512  eta: 0h1m ss: 1.760417  eta: 0h1m s: 60.0%  words/sec/thread: 130088  lr: 0.019999  loss: 1.760367  eta: 0h1m  lr: 0.019991  loss: 1.760274  eta: 0h1m 0.019982  loss: 1.760136  eta: 0h1m   words/sec/

Progress: 65.5%  words/sec/thread: 129794  lr: 0.017271  loss: 1.734014  eta: 0h1m ss: 1.746934  eta: 0h1m 91  lr: 0.018577  loss: 1.746883  eta: 0h1m ad: 130091  lr: 0.018560  loss: 1.746692  eta: 0h1m h1m ad: 130092  lr: 0.018544  loss: 1.746417  eta: 0h1m 2.9%  words/sec/thread: 130091  lr: 0.018531  loss: 1.746273  eta: 0h1m ad: 130093  lr: 0.018521  loss: 1.746122  eta: 0h1m   words/sec/thread: 130089  lr: 0.018512  loss: 1.745945  eta: 0h1m 1.745875  eta: 0h1m thread: 130090  lr: 0.018493  loss: 1.745809  eta: 0h1m a: 0h1m 86  lr: 0.018478  loss: 1.745644  eta: 0h1m rds/sec/thread: 130085  lr: 0.018473  loss: 1.745545  eta: 0h1m ad: 130084  lr: 0.018464  loss: 1.745456  eta: 0h1m h1m lr: 0.018442  loss: 1.745269  eta: 0h1m a: 0h1m gress: 63.1%  words/sec/thread: 130081  lr: 0.018429  loss: 1.745047  eta: 0h1m 8422  loss: 1.744963  eta: 0h1m   words/sec/thread: 130069  lr: 0.018414  loss: 1.744870  eta: 0h1m h1m lr: 0.018398  loss: 1.744619  eta: 0h1m   words/sec/thread: 130060  l

Progress: 68.2%  words/sec/thread: 129628  lr: 0.015883  loss: 1.717668  eta: 0h1m thread: 129792  lr: 0.017255  loss: 1.733957  eta: 0h1m a: 0h1m   words/sec/thread: 129789  lr: 0.017241  loss: 1.733815  eta: 0h1m 1.733715  eta: 0h1m 5.5%  words/sec/thread: 129785  lr: 0.017226  loss: 1.733662  eta: 0h1m ad: 129784  lr: 0.017220  loss: 1.733544  eta: 0h1m h1m thread: 129780  lr: 0.017200  loss: 1.733456  eta: 0h1m a: 0h1m sec/thread: 129778  lr: 0.017184  loss: 1.733297  eta: 0h1m   loss: 1.733235  eta: 0h1m rds/sec/thread: 129775  lr: 0.017175  loss: 1.733153  eta: 0h1m 3071  eta: 0h1m   words/sec/thread: 129771  lr: 0.017159  loss: 1.733024  eta: 0h1m 1.732923  eta: 0h1m 5.7%  words/sec/thread: 129765  lr: 0.017144  loss: 1.732866  eta: 0h1m ss: 1.732783  eta: 0h1m sec/thread: 129759  lr: 0.017130  loss: 1.732703  eta: 0h1m   loss: 1.732692  eta: 0h1m rds/sec/thread: 129759  lr: 0.017114  loss: 1.732647  eta: 0h1m 2564  eta: 0h1m   words/sec/thread: 129759  lr: 0.017096  loss: 1.732

Progress: 71.5%  words/sec/thread: 129551  lr: 0.014257  loss: 1.705448  eta: 0h1m   words/sec/thread: 129620  lr: 0.015863  loss: 1.717381  eta: 0h1m ad: 129615  lr: 0.015851  loss: 1.717233  eta: 0h1m ad: 129612  lr: 0.015843  loss: 1.717127  eta: 0h1m   words/sec/thread: 129610  lr: 0.015832  loss: 1.717049  eta: 0h1m 1.716964  eta: 0h1m ad: 129613  lr: 0.015814  loss: 1.716873  eta: 0h1m   words/sec/thread: 129614  lr: 0.015805  loss: 1.716783  eta: 0h1m 0.015798  loss: 1.716716  eta: 0h1m   words/sec/thread: 129612  lr: 0.015789  loss: 1.716634  eta: 0h1m h1m   words/sec/thread: 129607  lr: 0.015770  loss: 1.716445  eta: 0h1m 1.716376  eta: 0h1m 8.5%  words/sec/thread: 129604  lr: 0.015757  loss: 1.716327  eta: 0h1m ad: 129603  lr: 0.015749  loss: 1.716225  eta: 0h1m h1m thread: 129601  lr: 0.015728  loss: 1.716115  eta: 0h1m a: 0h1m ad: 129606  lr: 0.015709  loss: 1.715943  eta: 0h1m 8.6%  words/sec/thread: 129609  lr: 0.015694  loss: 1.715833  eta: 0h1m   words/sec/thread: 12960

Progress: 74.9%  words/sec/thread: 129654  lr: 0.012555  loss: 1.691579  eta: 0h1m lr: 0.014244  loss: 1.705316  eta: 0h1m s: 71.5%  words/sec/thread: 129553  lr: 0.014233  loss: 1.705238  eta: 0h1m   eta: 0h1m 129550  lr: 0.014216  loss: 1.705073  eta: 0h1m   words/sec/thread: 129553  lr: 0.014206  loss: 1.705009  eta: 0h1m 0.014198  loss: 1.704948  eta: 0h1m 1.6%  words/sec/thread: 129552  lr: 0.014188  loss: 1.704882  eta: 0h1m lr: 0.014184  loss: 1.704844  eta: 0h1m s: 71.6%  words/sec/thread: 129549  lr: 0.014177  loss: 1.704795  eta: 0h1m   loss: 1.704725  eta: 0h1m loss: 1.704651  eta: 0h1m ess: 71.7%  words/sec/thread: 129547  lr: 0.014152  loss: 1.704599  eta: 0h1m   words/sec/thread: 129548  lr: 0.014142  loss: 1.704554  eta: 0h1m 0.014131  loss: 1.704478  eta: 0h1m ad: 129551  lr: 0.014116  loss: 1.704378  eta: 0h1m 1.8%  words/sec/thread: 129551  lr: 0.014102  loss: 1.704241  eta: 0h1m   words/sec/thread: 129553  lr: 0.014091  loss: 1.704198  eta: 0h1m 0.014082  loss: 1.704

Progress: 78.6%  words/sec/thread: 130229  lr: 0.010703  loss: 1.679232  eta: 0h0m  lr: 0.012534  loss: 1.691447  eta: 0h1m   words/sec/thread: 129668  lr: 0.012524  loss: 1.691414  eta: 0h1m   words/sec/thread: 129675  lr: 0.012512  loss: 1.691367  eta: 0h1m 0.012503  loss: 1.691320  eta: 0h1m 5.0%  words/sec/thread: 129684  lr: 0.012492  loss: 1.691190  eta: 0h1m a: 0h1m /thread: 129694  lr: 0.012473  loss: 1.690958  eta: 0h1m oss: 1.690910  eta: 0h1m /sec/thread: 129704  lr: 0.012451  loss: 1.690837  eta: 0h1m   words/sec/thread: 129710  lr: 0.012441  loss: 1.690755  eta: 0h1m 0.012430  loss: 1.690725  eta: 0h1m ad: 129719  lr: 0.012420  loss: 1.690604  eta: 0h1m 2409  loss: 1.690496  eta: 0h1m   words/sec/thread: 129738  lr: 0.012387  loss: 1.690447  eta: 0h1m 1.690442  eta: 0h1m lr: 0.012363  loss: 1.690376  eta: 0h1m a: 0h1m 55  lr: 0.012344  loss: 1.690247  eta: 0h1m   words/sec/thread: 129758  lr: 0.012334  loss: 1.690249  eta: 0h1m 0.012322  loss: 1.690246  eta: 0h1m   words/s

Progress: 82.1%  words/sec/thread: 130455  lr: 0.008935  loss: 1.671441  eta: 0h0m thread: 130232  lr: 0.010683  loss: 1.679198  eta: 0h0m   words/sec/thread: 130235  lr: 0.010676  loss: 1.679169  eta: 0h0m ad: 130237  lr: 0.010666  loss: 1.679175  eta: 0h0m ad: 130243  lr: 0.010651  loss: 1.679118  eta: 0h0m h0m thread: 130249  lr: 0.010630  loss: 1.678986  eta: 0h0m   words/sec/thread: 130258  lr: 0.010617  loss: 1.678938  eta: 0h0m 1.678904  eta: 0h0m 8.8%  words/sec/thread: 130269  lr: 0.010595  loss: 1.678917  eta: 0h0m ad: 130273  lr: 0.010583  loss: 1.678847  eta: 0h0m ad: 130273  lr: 0.010573  loss: 1.678873  eta: 0h0m   words/sec/thread: 130275  lr: 0.010561  loss: 1.678821  eta: 0h0m 1.678752  eta: 0h0m thread: 130282  lr: 0.010540  loss: 1.678678  eta: 0h0m   words/sec/thread: 130289  lr: 0.010527  loss: 1.678609  eta: 0h0m 1.678515  eta: 0h0m thread: 130295  lr: 0.010503  loss: 1.678421  eta: 0h0m ss: 1.678361  eta: 0h0m sec/thread: 130300  lr: 0.010484  loss: 1.678305  eta

Progress: 85.6%  words/sec/thread: 130647  lr: 0.007176  loss: 1.663486  eta: 0h0m ss: 1.671483  eta: 0h0m   words/sec/thread: 130459  lr: 0.008908  loss: 1.671488  eta: 0h0m 1.671470  eta: 0h0m 2.2%  words/sec/thread: 130463  lr: 0.008888  loss: 1.671517  eta: 0h0m   words/sec/thread: 130464  lr: 0.008880  loss: 1.671484  eta: 0h0m 0.008869  loss: 1.671481  eta: 0h0m 2.3%  words/sec/thread: 130463  lr: 0.008859  loss: 1.671449  eta: 0h0m ss: 1.671368  eta: 0h0m sec/thread: 130465  lr: 0.008837  loss: 1.671331  eta: 0h0m   loss: 1.671299  eta: 0h0m   words/sec/thread: 130466  lr: 0.008823  loss: 1.671246  eta: 0h0m 0.008815  loss: 1.671218  eta: 0h0m   words/sec/thread: 130465  lr: 0.008807  loss: 1.671209  eta: 0h0m 0.008798  loss: 1.671168  eta: 0h0m   words/sec/thread: 130468  lr: 0.008792  loss: 1.671126  eta: 0h0m 1.671072  eta: 0h0m thread: 130473  lr: 0.008770  loss: 1.671037  eta: 0h0m ss: 1.671021  eta: 0h0m   words/sec/thread: 130478  lr: 0.008752  loss: 1.670992  eta: 0h0m 0

Progress: 89.4%  words/sec/thread: 130871  lr: 0.005288  loss: 1.655255  eta: 0h0m h0m lr: 0.007145  loss: 1.663205  eta: 0h0m a: 0h0m 48  lr: 0.007123  loss: 1.663049  eta: 0h0m rds/sec/thread: 130649  lr: 0.007114  loss: 1.662932  eta: 0h0m   words/sec/thread: 130650  lr: 0.007101  loss: 1.662858  eta: 0h0m 0.007090  loss: 1.662802  eta: 0h0m 5.8%  words/sec/thread: 130652  lr: 0.007083  loss: 1.662733  eta: 0h0m ss: 1.662653  eta: 0h0m sec/thread: 130657  lr: 0.007059  loss: 1.662594  eta: 0h0m   loss: 1.662584  eta: 0h0m ad: 130659  lr: 0.007038  loss: 1.662544  eta: 0h0m 6.0%  words/sec/thread: 130657  lr: 0.007025  loss: 1.662507  eta: 0h0m   words/sec/thread: 130656  lr: 0.007011  loss: 1.662482  eta: 0h0m 1.662437  eta: 0h0m ad: 130656  lr: 0.006990  loss: 1.662389  eta: 0h0m  lr: 0.006980  loss: 1.662297  eta: 0h0m ss: 86.1%  words/sec/thread: 130657  lr: 0.006970  loss: 1.662275  eta: 0h0m 4  eta: 0h0m  130660  lr: 0.006945  loss: 1.662124  eta: 0h0m 62085  eta: 0h0m ead: 130

Progress: 93.2%  words/sec/thread: 131126  lr: 0.003423  loss: 1.645956  eta: 0h0m 0.005262  loss: 1.655158  eta: 0h0m 9.5%  words/sec/thread: 130883  lr: 0.005248  loss: 1.655103  eta: 0h0m 1.655031  eta: 0h0m 9.6%  words/sec/thread: 130892  lr: 0.005225  loss: 1.654990  eta: 0h0m   eta: 0h0m   words/sec/thread: 130897  lr: 0.005203  loss: 1.654797  eta: 0h0m 1.654652  eta: 0h0m   words/sec/thread: 130901  lr: 0.005185  loss: 1.654546  eta: 0h0m h0m ad: 130911  lr: 0.005156  loss: 1.654155  eta: 0h0m   words/sec/thread: 130917  lr: 0.005142  loss: 1.654048  eta: 0h0m 0.005132  loss: 1.654011  eta: 0h0m 9.8%  words/sec/thread: 130920  lr: 0.005121  loss: 1.653946  eta: 0h0m ss: 1.653893  eta: 0h0m s: 89.8%  words/sec/thread: 130920  lr: 0.005102  loss: 1.653849  eta: 0h0m ad: 130926  lr: 0.005088  loss: 1.653723  eta: 0h0m 9.8%  words/sec/thread: 130926  lr: 0.005076  loss: 1.653661  eta: 0h0m   words/sec/thread: 130929  lr: 0.005066  loss: 1.653656  eta: 0h0m   words/sec/thread: 13093

Progress: 96.9%  words/sec/thread: 131461  lr: 0.001574  loss: 1.638938  eta: 0h0m 3.2%  words/sec/thread: 131125  lr: 0.003404  loss: 1.645869  eta: 0h0m   words/sec/thread: 131124  lr: 0.003391  loss: 1.645854  eta: 0h0m 0.003386  loss: 1.645833  eta: 0h0m   words/sec/thread: 131123  lr: 0.003377  loss: 1.645764  eta: 0h0m 0.003368  loss: 1.645720  eta: 0h0m thread: 131121  lr: 0.003362  loss: 1.645656  eta: 0h0m a: 0h0m   loss: 1.645555  eta: 0h0m 131122  lr: 0.003322  loss: 1.645495  eta: 0h0m ad: 131121  lr: 0.003298  loss: 1.645404  eta: 0h0m h0m lr: 0.003277  loss: 1.645289  eta: 0h0m a: 0h0m   words/sec/thread: 131131  lr: 0.003257  loss: 1.645191  eta: 0h0m 1.645143  eta: 0h0m 3.5%  words/sec/thread: 131134  lr: 0.003234  loss: 1.645117  eta: 0h0m   words/sec/thread: 131138  lr: 0.003225  loss: 1.645088  eta: 0h0m ad: 131138  lr: 0.003214  loss: 1.645047  eta: 0h0m 3.6%  words/sec/thread: 131140  lr: 0.003200  loss: 1.644938  eta: 0h0m a: 0h0m 41  lr: 0.003174  loss: 1.644776 

Progress: 100.0%  words/sec/thread: 131665  lr: 0.000000  loss: 1.633493  eta: 0h0m .638976  eta: 0h0m thread: 131462  lr: 0.001549  loss: 1.638937  eta: 0h0m a: 0h0m sec/thread: 131470  lr: 0.001528  loss: 1.638835  eta: 0h0m ad: 131470  lr: 0.001519  loss: 1.638762  eta: 0h0m ad: 131472  lr: 0.001509  loss: 1.638710  eta: 0h0m h0m   words/sec/thread: 131474  lr: 0.001490  loss: 1.638647  eta: 0h0m 1.638613  eta: 0h0m ad: 131471  lr: 0.001472  loss: 1.638583  eta: 0h0m   words/sec/thread: 131469  lr: 0.001464  loss: 1.638581  eta: 0h0m   words/sec/thread: 131468  lr: 0.001457  loss: 1.638557  eta: 0h0m 1.638534  eta: 0h0m ad: 131464  lr: 0.001442  loss: 1.638512  eta: 0h0m   words/sec/thread: 131467  lr: 0.001430  loss: 1.638499  eta: 0h0m 0.001421  loss: 1.638488  eta: 0h0m h0m thread: 131468  lr: 0.001408  loss: 1.638486  eta: 0h0m /thread: 131468  lr: 0.001399  loss: 1.638446  eta: 0h0m   words/sec/thread: 131470  lr: 0.001390  loss: 1.638386  eta: 0h0m 1.638329  eta: 0h0m 7.3%  wo

In [46]:
# To check first words : The first line is a header containing the number of words and the dimensionality of the vectors. 
# The subsequent lines are the word vectors for all words in the vocabulary, sorted by decreasing frequency.
#! head -n 4 model_lyrics.vec

125703 100
</s> -0.012681 0.039548 0.12973 -0.13181 0.2192 0.006487 -0.069694 0.11984 0.021715 0.1954 -0.070876 0.050559 0.040476 -0.14598 0.21885 -0.20444 -0.14007 0.056377 -0.21201 -0.14093 -0.19119 -0.20454 0.21292 -0.22055 0.22079 0.010239 -0.024036 -0.13012 -0.19501 0.039349 -0.088207 -0.12589 0.029011 0.13177 -0.04941 -0.16296 -0.0151 -0.23948 -0.12409 -0.11343 0.010718 0.083172 -0.16852 0.28312 0.075389 -0.31338 -0.20002 -0.1016 -0.18516 0.01608 -0.20757 0.24317 0.066537 -0.050611 -0.15119 0.054165 -0.13907 0.28233 0.082253 0.067789 0.24579 0.022096 -0.038027 -0.064011 -0.13251 0.043089 -0.22678 0.05545 0.10473 -0.18219 -0.080301 0.14894 0.06075 -0.02897 -0.016911 -0.29241 0.28462 0.096767 -0.086651 0.19748 -0.049296 -0.019989 -0.0040943 -0.21574 0.31605 -0.12374 0.27784 0.20987 -0.077757 0.26502 -0.12618 -0.091735 0.14919 -0.23601 -0.37268 -0.34082 0.094895 0.15425 -0.17881 0.010367 
the -0.041153 -0.048301 0.33608 -0.21116 0.065999 0.25024 0.010657 0.19707 0.057401 0.11175 -

In [47]:
import sys
import codecs

In [48]:
# Load embeddings
def load_embeddings(file_name):
    with codecs.open(file_name, 'r', 'utf-8') as f_in:
        lines = f_in.readlines()
        lines = lines[1:]
        vocabulary, wv = zip(*[line.strip().split(' ', 1) for line in lines])
    wv = np.loadtxt(wv)
    return wv, vocabulary

In [49]:
word_embeddings, vocabulary = load_embeddings('model_lyrics.vec')

In [21]:
vocabulary = list(vocabulary)

In [53]:
"eversince" in vocabulary

False

In [51]:
len(vocabulary)

125703

In [52]:
word_embeddings.shape

(125703, 100)

In [106]:
def get_embedding_for_missing_word(w):
    data = ! echo $w | ./fasttext print-word-vectors model_lyrics.bin
    emb = np.zeros(100)
    for i, x in enumerate(data[0].split( )[1:]):
        try:
            emb[i] = float(x)
        except : 
            continue
            #print(data[0])
    return emb

In [92]:
get_embedding_for_missing_word("eversince")

array([ 0.15354  , -0.082218 , -0.2494   , -0.39903  , -0.053004 ,
       -0.20358  ,  0.46507  , -0.48215  ,  0.031456 ,  0.29864  ,
       -0.45612  , -0.52815  ,  0.42328  ,  0.48587  , -0.13842  ,
       -0.027078 , -0.29471  ,  0.31589  , -0.052668 , -0.17239  ,
       -0.062075 , -0.1166   ,  0.062651 , -0.36563  ,  0.59316  ,
       -0.11488  ,  0.31329  ,  0.0054363, -0.22374  ,  0.19368  ,
        0.46279  , -0.13704  , -0.48086  , -0.21136  , -0.41806  ,
        0.096683 ,  0.17842  , -0.12313  , -0.47596  , -0.11186  ,
       -0.093677 ,  0.078458 , -0.11241  , -0.20531  ,  0.28876  ,
        0.25704  , -0.049918 ,  0.54401  ,  0.10132  ,  0.062308 ,
        0.31098  , -0.068861 ,  0.24074  , -0.29187  , -0.2174   ,
       -0.22046  ,  0.075688 ,  0.088568 , -0.25472  ,  0.24982  ,
        0.02957  , -0.011789 , -0.081646 ,  0.031771 , -0.34703  ,
       -0.34078  , -0.24647  , -0.078897 , -0.26882  ,  0.25468  ,
       -0.049018 ,  0.31585  , -0.075016 , -0.0050666, -0.1406

## 4. Create column with indicator of last word of line

In [27]:
test_l = lyrics_df.loc[190]['lyrics']
test_l

"You a bad girl and your friends bad too\nOh you got the swag sauce you dripping swagu\nYou a bad girl and your friends bad too\nOh you got the swag sauce you dripping swagu\n\nI may be young but I'm ready\nTo give you all my love\nI told my girls you can get it\nDon't slow it down just let it go\nSo in love I'll give it all away\nJust don't tell nobody tomorrow\nSo tonight I'll do it every way\nSpeakers knocking til the morning lights\nChorus\nWe like to party hey hey hey hey hey hey\n\nWe like to party hey hey hey hey hey hey\n\nYour touch is driving me crazy\nI can't explain the way I feel\nTop down with the radio on\nAnd the night belongs to us\nJust hold me close don't let me go\nSo in love I don't care what they say\nI don't care if they're talking tomorrow\nCause tonight is the night oh oh oh\nThat I give you everything\nMusic knocking til the morning light\nCause we like to party\nSet the scene 3000 degrees\nAin't worried 'bout them fuck niggas over there\nBut they worried 'bou

In [55]:
def generate_embedding_for_lyrics(l):
    #print(l)
    sentences = l.split('\n')
    embedding = []
    #missing_words = []
    for s in sentences:
        words = s.split(' ')
        #print(repr(words))
        for i, w in enumerate(words):
            if len(w)>0:
                try : 
                    idx = vocabulary.index(w)
                    vector = word_embeddings[idx]
                except ValueError : 
                    #print("Word not found :", repr(w))
                    #missing_words.append(w)
                    vector = get_embedding_for_missing_word(w)
                if i == (len(words)-1) :
                    vector = np.append(vector, 1)
                else :
                    vector = np.append(vector, 0)
                vector = np.append(w, vector)
                embedding.append(vector)
                
    return embedding

In [None]:
generate_embedding_for_lyrics("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

In [None]:
lyrics_df['embedding'] = lyrics_df['lyrics'][:30].apply(lambda x : generate_embedding_for_lyrics(x))

echo "eversince" | ./fasttext print-word-vectors model_lyrics.bin
eversince 0.26688 -0.1294 0.21473 -0.0022411 -0.21671 0.032355 -0.18573 0.14012 0.26681 0.37774 -0.33736 0.26381 -0.053749 0.37619 -0.21128 -0.056934 0.20461 0.33779 -0.10611 0.021266 -0.17509 0.27 0.41086 0.13709 0.16705 -0.60041 0.38402 0.087207 -0.33848 0.19228 -0.27138 -0.30764 -0.2702 -0.30729 -0.72478 0.16182 0.11382 0.19189 -0.27027 0.27689 0.039753 -0.21678 -0.4436 0.24418 0.12741 0.21971 0.014112 0.11523 0.00034974 0.24912 -0.62545 -0.04865 -0.32367 -0.38435 -0.058886 -0.13295 0.48399 0.49655 0.34541 0.65489 -0.18568 -0.076875 0.087327 -0.0079827 -0.073017 -0.046165 0.009111 -0.024724 -0.44245 0.092923 0.33201 -0.54304 -0.22664 0.41005 0.14919 -0.30418 -0.13195 -0.10742 -0.18871 0.14914 0.15383 -0.30302 0.362 0.26775 0.11757 0.057333 0.38091 0.095017 -0.10443 0.13355 0.64025 0.070291 0.045029 -0.053138 0.19487 -0.062763 0.19003 0.32966 0.049617 0.207

In [97]:
lyrics_df.head()

Unnamed: 0,song,year,artist,genre,lyrics,language,embedding
0,ego-remix,2009,beyonce-knowles,pop,Oh baby how you doing\nYou know Im gonna cut r...,en,"[[Oh, 0.11909, 0.08231, -0.28486, 0.23137, 0.1..."
1,then-tell-me,2009,beyonce-knowles,pop,playin everything so easy\nits like you seem s...,en,"[[playin, 0.5371, -0.097827, 0.68525, -0.10739..."
2,honesty,2009,beyonce-knowles,pop,If you search\nFor tenderness\nIt isnt hard to...,en,"[[If, -0.3726, 0.33803, -0.36677, -0.29783, 0...."
3,you-are-my-rock,2009,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,en,"[[Oh, 0.11909, 0.08231, -0.28486, 0.23137, 0.1..."
4,black-culture,2009,beyonce-knowles,pop,Party the people the people the party its popp...,en,"[[Party, 0.28332, 0.68937, -0.4371, -0.084831,..."


## 5. Create column with type of word (noum, verb, etc)

In [57]:
import spacy

In [58]:
en_nlp = spacy.load('en')

ADJ: adjective ADP: adposition ADV: adverb AUX: auxiliary verb CONJ: coordinating conjunction DET: determiner INTJ: interjection NOUN: noun NUM: numeral PART: particle PRON: pronoun PROPN: proper noun PUNCT: punctuation SCONJ: subordinating conjunction SYM: symbol VERB: verb X: other

In [71]:
def generate_pos_for_lyrics(l):
    doc = en_nlp(l)
    embedding = []
    for i in range(len(doc)):
        if doc[i].pos_ != 'SPACE':
            print(doc[i], doc[i].pos_)
            embedding.append(doc[i].pos_)
    return embedding

In [72]:
generate_pos_for_lyrics("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

Oh INTJ
baby NOUN
how ADV
you PRON
doing VERB
You PRON
know VERB
I PRON
'm VERB
gon VERB
na PART
cut VERB
right ADV
to ADP
the DET
chase NOUN


['INTJ',
 'NOUN',
 'ADV',
 'PRON',
 'VERB',
 'PRON',
 'VERB',
 'PRON',
 'VERB',
 'VERB',
 'PART',
 'VERB',
 'ADV',
 'ADP',
 'DET',
 'NOUN']

In [90]:
def generate_pos_for_word(w):
    doc = en_nlp(w)
    #print(doc[0], doc[0].pos_)
    return doc[0].pos_

In [91]:
generate_pos_for_word('Oh')

'INTJ'

In [94]:
pos_to_idx = {'ADJ': 0, 'ADP': 1, 'ADV':2,'AUX': 3,'CONJ': 4,'CCONJ': 5,'DET': 6, 'INTJ': 7, 'NOUN': 8,
              'NUM': 9, 'PART': 10,'PRON':11, 'PROPN': 12, 'PUNCT': 13, 'SCONJ': 14, 'SYM': 15, 
              'VERB': 16, 'X': 17}

In [67]:
"""def merge_embedding_pos(l):
    embedding = generate_embedding_for_lyrics(l)
    pos = generate_pos_for_lyrics(l)
    #print(len(embedding))
    for i in range(len(embedding)):
        bin_vector = np.zeros(18)
        bin_vector[pos_to_idx[pos[i]]] = 1
        embedding[i] = np.append(embedding[i], bin_vector)
    return embedding"""

In [108]:
def merge_embedding_pos(l):
    embedding = generate_embedding_for_lyrics(l)
    for i in range(len(embedding)):
        w = embedding[i][0]
        #print(w)
        if w != '\n':
            pos = generate_pos_for_word(str(w))
            bin_vector = np.zeros(18)
            bin_vector[pos_to_idx[pos]] = 1
            embedding[i] = np.append(embedding[i], bin_vector)
    return embedding

In [109]:
merge_embedding_pos("Oh baby how you doing\nYou know I'm gonna cut right to the chase")

[array(['Oh', '-0.26756', '0.16066', '-0.026644', '-0.061647', '0.12856',
        '-0.16667', '-0.31698', '0.21519', '0.087186', '0.70611',
        '0.097478', '-0.017475', '0.13562', '-0.34733', '0.12901',
        '0.060213', '0.0038623', '-0.003505', '-0.309', '-0.1214',
        '-0.31168', '-0.038396', '-0.13885', '0.13174', '0.48543',
        '0.27012', '0.023631', '0.039914', '-0.0085227', '-0.32833',
        '-0.34518', '-0.039726', '-0.19771', '0.23373', '-0.041411',
        '-0.036932', '-0.073936', '-0.47071', '0.30993', '0.32195',
        '-0.074116', '0.17782', '-0.33701', '0.12694', '0.07969',
        '-0.078333', '-0.32187', '-0.21694', '-0.32914', '0.38804',
        '-0.47941', '0.52507', '0.13306', '-0.086922', '-0.1153',
        '0.64853', '-0.28492', '0.55657', '0.037141', '-0.24895',
        '0.26131', '-0.094953', '0.061678', '0.19854', '0.22078',
        '0.27887', '-0.65879', '-0.03955', '0.62236', '-0.22495',
        '-0.056741', '0.017541', '0.072202', '0.017473'

In [98]:
lyrics_df['embedding'] = lyrics_df['lyrics'][:30].apply(lambda x : merge_embedding_pos(x))

/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected EOF while looking for matching `''
/bin/bash: -c: line 0: unexpected 

Can we remove SYM for symbol, PUNCT for punctuation, and/or merge some categories ?

In [107]:
lyrics_df.head(10)

Unnamed: 0,artist,genre,lyrics,title,embedding
0,beyonce-knowles,pop,Oh baby how you doing\nYou know I'm gonna cut ...,ego-remix,"[[Oh, -0.26756, 0.16066, -0.026644, -0.061647,..."
1,beyonce-knowles,pop,playin' everything so easy\nit's like you seem...,then-tell-me,"[[playin', 0.046408, -0.22275, 0.27501, 0.3307..."
2,beyonce-knowles,pop,If you search\nFor tenderness\nIt isn't hard t...,honesty,"[[If, -0.077863, 0.26915, 0.24783, 0.078624, 0..."
3,beyonce-knowles,pop,Oh oh oh I oh oh oh I\n\nIf I wrote a book abo...,you-are-my-rock,"[[Oh, -0.26756, 0.16066, -0.026644, -0.061647,..."
4,beyonce-knowles,pop,Party the people the people the party it's pop...,black-culture,"[[Party, -0.031328, -0.60994, -0.27762, -0.313..."
5,beyonce-knowles,pop,I heard\nChurch bells ringing\nI heard\nA choi...,all-i-could-do-was-cry,"[[I, -0.028172, 0.063204, 0.13383, 0.013919, -..."
6,beyonce-knowles,pop,This is just another day that I would spend\nW...,once-in-a-lifetime,"[[This, 0.026437, 0.014923, 0.17864, -0.10028,..."
7,beyonce-knowles,pop,Waiting waiting waiting waiting\nWaiting waiti...,waiting,"[[Waiting, -0.36918, 0.38065, 0.68427, -0.1569..."
8,beyonce-knowles,pop,\nI read all of the magazines\nwhile waiting a...,slow-love,"[[I, -0.028172, 0.063204, 0.13383, 0.013919, -..."
9,beyonce-knowles,pop,Nnnow honey\nYou better sit down and look arou...,why-don-t-you-love-me,"[[Nnnow, 0.014805, -0.076101, -0.017499, -0.13..."


## 6. Create column with Entity Recognition

In [122]:
nlp = spacy.load('en_core_web_sm')

Entity types : <br>
PERSON	People, including fictional.<br>
NORP	Nationalities or religious or political groups.<br>
FAC	Buildings, airports, highways, bridges, etc.<br>
ORG	Companies, agencies, institutions, etc.<br>
GPE	Countries, cities, states.<br>
LOC	Non-GPE locations, mountain ranges, bodies of water.<br>
PRODUCT	Objects, vehicles, foods, etc. (Not services.)<br>
EVENT	Named hurricanes, battles, wars, sports events, etc.<br>
WORK_OF_ART	Titles of books, songs, etc.<br>
LAW	Named documents made into laws.<br>
LANGUAGE	Any named language.<br>
DATE	Absolute or relative dates or periods.<br>
TIME	Times smaller than a day.<br>
PERCENT	Percentage, including "%".<br>
MONEY	Monetary values, including unit.<br>
QUANTITY	Measurements, as of weight or distance.<br>
ORDINAL	"first", "second", etc.<br>
CARDINAL	Numerals that do not fall under another type.<br>

In [136]:
doc = nlp(lyrics_df.loc[24]['lyrics'].replace('\n', '').lower())
for ent in doc.ents:
    print(ent.text, ent.label_)

the first day DATE
first ORDINAL
the first day DATE
the first day DATE
the first day the first day DATE
the first day DATE
the first day DATE
the first day DATE


In [154]:
def find_sub_list(sl,l):
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            return ind,ind+sll-1

In [195]:
def get_entity_for_lyrics(l):
    l = l.replace('\n', ' ').lower()
    doc = nlp(l)
    l_list = l.split(' ')
    curr_i = 0
    result = np.empty(shape=len(l_list), dtype=object)
    for ent in doc.ents:
        if ent.text.replace(' ', '') != '':
            ent_list = ent.text.split(' ')
            try:
                s, e = find_sub_list(ent_list, l_list[curr_i:])
                s += curr_i
                e += curr_i
                for i in range(s, e+1):
                    result[i] = ent.label_
                curr_i = e
            except : 
                print(ent_list)
                print(l_list[curr_i:])
    return result

In [159]:
ent_to_idx = {'PERSON' : 0,
'NORP': 1,
'FAC' : 2,
'ORG' : 3,
'GPE' : 4,
'LOC' : 5,
'PRODUCT' : 6,
'EVENT' : 7,
'WORK_OF_ART' : 8,
'LAW': 9,
'LANGUAGE' : 10,
'DATE' : 11,
'TIME' : 12,
'PERCENT' : 13,
'MONEY' : 14,
'QUANTITY' : 15,
'ORDINAL' : 16,
'CARDINAL': 17}

In [169]:
def add_ent_to_embedding(emb, l):
    # Compute the entities for the lyrics
    ents = get_entity_for_lyrics(l)
    # Go over the embedding of the words
    for i, w in enumerate(emb):
        # Create empty binary vector
        bin_vector = np.zeros(18)
        # If the entity is None, means we don't know --> Keep all 0s
        if ents[i] != None :
            # Put 1 where the entity is when we know it : current entity is ents[i], pass to index with ent_to_idx
            bin_vector[ent_to_idx[ents[i]]] = 1
        # Add the binary vector at the end of embedding and return
        emb[i] = np.append(w, bin_vector)
    return emb

In [171]:
add_ent_to_embedding(lyrics_df.loc[0]['embedding'], lyrics_df.loc[0]['lyrics'])

[array(['Oh', '-0.26756', '0.16066', '-0.026644', '-0.061647', '0.12856',
        '-0.16667', '-0.31698', '0.21519', '0.087186', '0.70611',
        '0.097478', '-0.017475', '0.13562', '-0.34733', '0.12901',
        '0.060213', '0.0038623', '-0.003505', '-0.309', '-0.1214',
        '-0.31168', '-0.038396', '-0.13885', '0.13174', '0.48543',
        '0.27012', '0.023631', '0.039914', '-0.0085227', '-0.32833',
        '-0.34518', '-0.039726', '-0.19771', '0.23373', '-0.041411',
        '-0.036932', '-0.073936', '-0.47071', '0.30993', '0.32195',
        '-0.074116', '0.17782', '-0.33701', '0.12694', '0.07969',
        '-0.078333', '-0.32187', '-0.21694', '-0.32914', '0.38804',
        '-0.47941', '0.52507', '0.13306', '-0.086922', '-0.1153',
        '0.64853', '-0.28492', '0.55657', '0.037141', '-0.24895',
        '0.26131', '-0.094953', '0.061678', '0.19854', '0.22078',
        '0.27887', '-0.65879', '-0.03955', '0.62236', '-0.22495',
        '-0.056741', '0.017541', '0.072202', '0.017473'

In [196]:
for i, r in lyrics_df[:30].iterrows() :
    r['embedding'] = add_ent_to_embedding(r['embedding'], r['lyrics'])

['night']
['one', 'more', 'dance', 'then', "i'm", 'ready', 'to', 'go', 'well', 'the', "dj's", 'playing', 'all', 'the', 'same', 'songs', 'and', 'the', "night's", 'about', 'to', 'end', 'can', 'we', 'meet', 'in', 'the', 'parking', 'lot', 'find', 'a', 'quiet', 'place', 'were', 'we', 'can', 'talk', 'to', 'find', 'out', 'more', 'about', 'each', 'other', 'baby', 'can', 'we', 'repeat', 'prechorus', 'and', 'chorus', 'bridge', 'you', 'know', "i'm", 'feelin', 'you', 'tonight', 'so', "let's", 'find', 'a', 'certain', 'spot', 'to', 'go', 'where', 'we', 'can', 'get', 'to', 'know', 'each', 'other', 'better', "i'll", 'go', 'and', 'tell', 'my', 'girls', 'you', 'go', 'and', 'tell', 'your', 'boys', 'before', 'we', "leavin'", 'tell', 'me', 'whats', "it'", "gon'", 'be', '', 'chorus', 'until', 'fade']


Wikipedia scheme : <br>
PER	Named person or family.<br>
LOC	Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains).<br>
ORG	Named corporate, governmental, or other organizational entity.<br>
MISC	Miscellaneous entities, e.g. events, nationalities, products or works of art.<br>

In [111]:
#with wikipedia scheme
nlp_wiki = spacy.load('xx_ent_wiki_sm')

In [120]:
doc = nlp_wiki(lyrics_df.loc[24]['lyrics'].replace('\n', '').lower())
for ent in doc.ents:
    print(ent, ent.label_)