In [1]:
%matplotlib inline
import importlib
import numpy as np
import pandas as pd
import json
import utils

In [2]:
# Pick up edits made to utils.py without restarting the kernel.
# The trailing ';' suppresses the module repr, which would otherwise write
# the author's absolute local filesystem path into the saved cell output.
importlib.reload(utils);

<module 'utils' from '/home/fei/Documents/projects/lyrics/stacked_lstm/utils.py'>

In [4]:
# Read only the lyrics column, then drop rows whose text is an exact duplicate.
dat = (
    pd.read_csv('../lyrics_files/songdata.csv', usecols=['text'])
    .drop_duplicates()
)

In [5]:
# Preview the first rows of the deduplicated lyrics frame.
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [7]:
# Load the GloVe table: the first space-separated field of each line becomes
# the row label, the remaining fields become the columns.
glove = pd.read_table(
    '../glove.6B.50d.txt',
    sep=' ',
    header=None,
    index_col=0,      # the token itself is the row label
    quoting=3,        # 3 == csv.QUOTE_NONE: quote characters are ordinary data
    na_filter=False,  # never turn fields that look like missing-value markers into NaN
)

In [8]:
# Sanity check: (number of tokens, embedding width) of the loaded table.
glove.shape

(400000, 50)

In [9]:
# Set of every token that has a GloVe vector; a set keeps the membership
# tests used when filtering the lyrics cheap.
glove_words = set(glove.index)

In [10]:
# Append the '(end)' marker to every lyric exactly once.
# The mask makes the cell idempotent: executing it a second time no longer
# stacks a further '(end)' onto texts that already carry the marker.
# na=False treats missing texts as "not yet marked", so they take the same
# `+=` path the unguarded statement used.
needs_marker = ~dat['text'].str.endswith('(end)', na=False)
dat.loc[needs_marker, 'text'] += '(end)'

In [26]:
# Clean the lyrics with the project helper. A copy is passed so that `dat`
# itself is not affected by whatever the helper does to its argument.
# NOTE(review): the cleaning rules live in utils.data_curating and are not
# visible here; the recorded preview shows a Series named `cleaned_text`.
cleaned = utils.data_curating(dat.copy())

In [27]:
# Preview the cleaned lyrics.
cleaned.head()

0    look at her face , it 's a wonderful face \n a...
1    take it easy with me , please \n touch me gent...
2    i 'll never know why i had to go \n why i had ...
3    making somebody happy is a question of give an...
4    making somebody happy is a question of give an...
Name: cleaned_text, dtype: object

In [28]:
# Split on single spaces only (not on all whitespace) so that the '\n'
# line-break markers remain list items of their own.
# Run once per fresh `cleaned`: the name is rebound from strings to token
# lists, so this cell is not meant to be executed twice in a row.
cleaned = cleaned.str.split(pat=' ')

In [29]:
# Preview the per-song token lists produced by the split.
cleaned.head()

0    [look, at, her, face, ,, it, 's, a, wonderful,...
1    [take, it, easy, with, me, ,, please, \n, touc...
2    [i, 'll, never, know, why, i, had, to, go, \n,...
3    [making, somebody, happy, is, a, question, of,...
4    [making, somebody, happy, is, a, question, of,...
Name: cleaned_text, dtype: object

In [30]:
# Keep a token if it has a GloVe vector; the '\n' line-break marker is kept
# unconditionally.
cleaned = cleaned.apply(
    lambda tokens: [tok for tok in tokens if tok == '\n' or tok in glove_words]
)

In [31]:
# Preview the token lists after dropping tokens without a GloVe vector.
cleaned.head()

0    [look, at, her, face, ,, it, 's, a, wonderful,...
1    [take, it, easy, with, me, ,, please, \n, touc...
2    [i, 'll, never, know, why, i, had, to, go, \n,...
3    [making, somebody, happy, is, a, question, of,...
4    [making, somebody, happy, is, a, question, of,...
Name: cleaned_text, dtype: object

In [32]:
# Reduce each song's token list to its set of distinct tokens.
cleaned_sets = cleaned.map(set)

In [33]:
# Derive the vocabulary from the per-song token sets.
# NOTE(review): presumably `unique_tokens` is the sorted union of all sets and
# `vocab_size` its length — confirm in utils.get_unique_tokens.
vocab_size, unique_tokens = utils.get_unique_tokens(cleaned_sets)

In [34]:
# Display the vocabulary size reported by the helper.
vocab_size

53375

In [35]:
# Peek at the first five vocabulary entries.
unique_tokens[:5]

['\n', '!', '"', "'", "'cause"]

In [36]:
# Hand the vocabulary to the project helper for saving.
# NOTE(review): destination and file format are decided inside utils — not visible here.
utils.save_unique_tokens(unique_tokens)

In [37]:
# Build the two lookup maps between tokens and integer ids from the vocabulary.
# NOTE(review): presumably ids follow the order of `unique_tokens` — confirm in utils.
word2ind, ind2word = utils.get_index_word_map(unique_tokens)

In [10]:
# Resume shortcut: after a kernel restart, reload the maps saved by an earlier
# session instead of rebuilding the vocabulary.
# The guard protects a top-to-bottom run: without it, the maps just built from
# `unique_tokens` would be replaced by whatever copy is on disk, and that older
# copy would then be re-saved and used to build the embedding.
if 'word2ind' not in globals() or 'ind2word' not in globals():
    word2ind, ind2word = utils.load_index_word_map()

In [38]:
# Hand both lookup maps to the project helper for saving.
# NOTE(review): destination and file format are decided inside utils — not visible here.
utils.save_index_word_map(word2ind, ind2word)

In [39]:
# Build the embedding for the vocabulary from the GloVe table.
# NOTE(review): '\n' is kept in the lyrics even though it is not required to be
# a GloVe token — verify how utils.create_emb handles ids without a GloVe row.
emb = utils.create_emb(ind2word, glove)

In [40]:
# Hand the embedding to the project helper for saving.
# NOTE(review): destination and file format are decided inside utils — not visible here.
utils.save_embedding(emb)

In [41]:
# Convert every song's token list into integer ids using `word2ind`.
tokenised = utils.tokenise_cleaned_data(cleaned, word2ind)

In [42]:
# Preview the integer-encoded songs.
tokenised.head()

0    [27610, 2505, 21539, 16310, 19, 24309, 12, 36,...
1    [46611, 24309, 14551, 52436, 29193, 19, 35503,...
2    [22747, 7, 31734, 25862, 52138, 22747, 20587, ...
3    [28378, 43832, 20904, 24238, 36, 37341, 32621,...
4    [28378, 43832, 20904, 24238, 36, 37341, 32621,...
Name: cleaned_text, dtype: object

In [43]:
%%time
# The cell magic above has to stay on the first line of the cell.
# Build the training table from the integer-encoded songs with a window of
# 200 tokens. The recorded run took roughly 42 minutes of wall time, so avoid
# re-executing this cell casually.
# NOTE(review): consider caching the result on disk and loading it when present.
samples = utils.make_training_samples(tokenised, input_length=200)

CPU times: user 37min 4s, sys: 3min 59s, total: 41min 3s
Wall time: 42min 6s


In [44]:
# Check the size of the training table.
# NOTE(review): the recorded width is input_length + 1 — presumably the 200
# input tokens plus one target token; confirm in utils.make_training_samples.
samples.shape

(6190186, 201)

In [45]:
# Preview the first training rows. In the recorded output each row is the
# previous one shifted left by a single token.
# NOTE(review): the first recorded row starts with the ids of song index 1, not
# song index 0 — presumably song 0 is too short for a full window; confirm.
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,46611,24309,14551,52436,29193,19,35503,0,48269,29193,...,19,19537,32810,19,35453,0,1552,19,1552,0
1,24309,14551,52436,29193,19,35503,0,48269,29193,19056,...,19537,32810,19,35453,0,1552,19,1552,0,1547
2,14551,52436,29193,19,35503,0,48269,29193,19056,27142,...,32810,19,35453,0,1552,19,1552,0,1547,26900
3,52436,29193,19,35503,0,48269,29193,19056,27142,36,...,19,35453,0,1552,19,1552,0,1547,26900,29193
4,29193,19,35503,0,48269,29193,19056,27142,36,45844,...,35453,0,1552,19,1552,0,1547,26900,29193,17520


In [46]:
# Hand the training table to the project helper for saving.
# NOTE(review): destination and file format are decided inside utils — not visible here.
utils.save_training_samples(samples)