In [1]:
%matplotlib inline
import importlib
import numpy as np
import pandas as pd
import json
import utils

In [2]:
# Re-import the local `utils` module so that edits made to utils.py after the
# first import take effect without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/encoder-decoder/utils.py'>

In [3]:
# Read only the `text` column of the first 10,000 rows, then drop rows whose
# text is an exact duplicate of an earlier row (the original index is kept).
dat = pd.read_csv(
    '../songdata.csv',
    usecols=['text'],
    nrows=10000,
).drop_duplicates()

In [4]:
# Preview the first rows of the loaded lyrics.
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [5]:
# Load the GloVe vectors (file name suggests 50 dimensions) as a table indexed
# by the first column of each line.
#   quoting=3       -> csv.QUOTE_NONE: quote characters are read literally.
#   na_filter=False -> no value is converted to NaN, so entries such as 'nan'
#                      or 'null' stay as strings.
glove = pd.read_table(
    '../glove.6B.50d.txt',
    sep=' ',
    header=None,
    index_col=0,
    quoting=3,
    na_filter=False,
)

In [6]:
# Append the literal '(end)' marker to every lyric exactly once.
# A plain `+=` on the whole column adds another marker every time this cell is
# re-run, so rows whose text already ends with the marker are skipped.
# `na=False` treats missing texts as "not marked"; they remain missing after
# the concatenation, as before.
END_MARKER = '(end)'
needs_marker = ~dat['text'].str.endswith(END_MARKER, na=False)
dat.loc[needs_marker, 'text'] += END_MARKER

In [7]:
# Clean the lyrics and store the result in a new `cleaned_text` column.
# A copy is passed, so `dat` itself is not affected by whatever the helper
# does to its argument.
# NOTE(review): utils.data_curating is not visible here; the previews further
# down suggest it lower-cases the text and space-separates punctuation --
# confirm in utils.py.
dat.loc[:, 'cleaned_text'] = utils.data_curating(dat.copy())

In [8]:
# Build the vocabulary from the cleaned lyrics.
# This must run while `cleaned_text` still holds plain strings (`.str.split`
# is applied to it); a later cell replaces the column with lists of lines.
cleaned = dat.cleaned_text.str.split(' ')  # one list of space-separated tokens per song
cleaned_sets = cleaned.apply(set)  # distinct tokens per song
# NOTE(review): presumably merges the per-song sets into one ordered token
# list and returns its size -- confirm in utils.get_unique_tokens.
vocab_size, unique_tokens = utils.get_unique_tokens(cleaned_sets)

In [9]:
# Break every song into its individual lines.
# NOTE: this overwrites `cleaned_text` in place, turning each string into a
# list of strings. The cell is therefore not re-runnable on its own: the
# cleaning cell that creates `cleaned_text` has to be re-run first.
dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.split('\n')

In [10]:
# Trim leading and trailing whitespace from every line of every song.
dat.loc[:, 'cleaned_text'] = dat.cleaned_text.apply(
    lambda lines: list(map(str.strip, lines))
)

In [11]:
# Preview: each row should now be a list of whitespace-stripped lyric lines.
dat.cleaned_text.head()

0    [look at her face , it 's a wonderful face, an...
1    [take it easy with me , please, touch me gentl...
2    [i 'll never know why i had to go, why i had t...
3    [making somebody happy is a question of give a...
4    [making somebody happy is a question of give a...
Name: cleaned_text, dtype: object

In [12]:
# Pair every line with the line that follows it within the same song:
# the encoder side drops the last line, the decoder side drops the first,
# so position i of the decoder list is the successor of position i of the
# encoder list.
encoder_input = dat.cleaned_text.map(lambda lines: lines[:-1])
decoder_input = dat.cleaned_text.map(lambda lines: lines[1:])

In [13]:
# Turn every line into a list of its space-separated tokens, giving one
# list of token lists per song.
encoder_input = encoder_input.map(
    lambda lines: [line.split(' ') for line in lines]
)
decoder_input = decoder_input.map(
    lambda lines: [line.split(' ') for line in lines]
)

In [14]:
# Flatten the per-song lists into one flat list of tokenised lines, keeping
# song order and line order.
# `Series.sum()` produces the same list through repeated `list + list`, which
# re-copies the growing result for every song (quadratic in the total number
# of lines) and returns 0 instead of [] for an empty Series. A single pass is
# linear. `.tolist()` keeps the cell failing loudly if it is re-run on the
# already-flattened list, instead of silently flattening one level further.
encoder_input = [line for song in encoder_input.tolist() for line in song]
decoder_input = [line for song in decoder_input.tolist() for line in song]

In [15]:
def pad_data(input_lists, length=20):
    """Truncate or right-pad every token list to exactly ``length`` entries.

    Parameters
    ----------
    input_lists : iterable of iterables of str
        One token sequence per row.
    length : int, default 20
        Number of tokens kept per row.

    Returns
    -------
    pandas.Series
        One list per input row, in input order. Tokens beyond ``length`` are
        dropped, missing positions are filled with ``' '`` and empty-string
        tokens are replaced by ``' '``.
    """
    filler = ' '

    def fit(tokens):
        # Zipping with range(length) stops after at most `length` tokens.
        kept = [
            filler if tok == '' else tok
            for _, tok in zip(range(length), tokens)
        ]
        # Right-pad short rows; a non-positive count adds nothing.
        return kept + [filler] * (length - len(kept))

    return pd.Series([fit(tokens) for tokens in input_lists])

In [16]:
# Bring every line to a fixed width of 20 tokens (pad_data's default),
# truncating longer lines and padding shorter ones with ' '.
encoder_input_data = pad_data(input_lists=encoder_input, length=20)
decoder_input_data = pad_data(input_lists=decoder_input, length=20)

In [17]:
# Vocabulary size as returned by utils.get_unique_tokens. It was computed
# before the ' ' padding token is inserted into `unique_tokens` below, so it
# does not count that token.
vocab_size

32916

In [18]:
# Check which tokens currently sit at the start of the vocabulary list.
unique_tokens[:5]

['\n', '!', '"', "'", "'cause"]

In [19]:
# Put the ' ' padding token (the filler used by pad_data) at position 0 of
# the vocabulary list.
# Guarded so that re-running this cell does not insert a second copy and
# shift every other token by one more position.
# NOTE: `vocab_size` was computed before this insertion and is not updated.
if not unique_tokens or unique_tokens[0] != ' ':
    unique_tokens.insert(0, ' ')

In [20]:
# Confirm that ' ' is now the first entry of the vocabulary list.
unique_tokens[:5]

[' ', '\n', '!', '"', "'"]

In [21]:
# Persist the token list (with ' ' at position 0) through the utils helper.
# NOTE(review): destination and file format are decided inside
# utils.save_unique_tokens, which is not visible here.
utils.save_unique_tokens(unique_tokens)

In [22]:
# Build the two lookups between tokens and integer ids.
# NOTE(review): presumably word2ind maps token -> position in unique_tokens
# and ind2word is the reverse mapping -- confirm in utils.get_index_word_map.
word2ind, ind2word = utils.get_index_word_map(unique_tokens)

In [23]:
# Persist both lookups through the utils helper.
# NOTE(review): destination and file format are defined in
# utils.save_index_word_map, which is not visible here.
utils.save_index_word_map(word2ind, ind2word)

In [24]:
# Build the embedding for the vocabulary from the GloVe table.
# NOTE(review): presumably one GloVe row is looked up per id in ind2word;
# how tokens missing from GloVe are handled is defined in utils.create_emb --
# confirm there.
emb = utils.create_emb(ind2word, glove)

In [25]:
# Persist the embedding through the utils helper.
# NOTE(review): destination and file format are defined in
# utils.save_embedding, which is not visible here.
utils.save_embedding(emb)

In [26]:
# Convert the padded encoder lines from tokens to integer ids via word2ind.
# NOTE(review): the preview below shows a Series of integer lists; handling
# of tokens absent from word2ind is defined in utils.tokenise_cleaned_data.
encoder_input_tokenised = utils.tokenise_cleaned_data(encoder_input_data, word2ind)

In [27]:
# Preview the id-encoded encoder lines.
encoder_input_tokenised.head()

0    [16652, 1464, 13082, 9845, 20, 14702, 13, 37, ...
1    [913, 14702, 17705, 26798, 27014, 29547, 17684...
2    [16652, 1464, 29139, 31819, 29133, 25641, 2651...
3    [13619, 16815, 4158, 20009, 10167, 2228, 34, 0...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
dtype: object

In [28]:
# Convert the padded decoder lines from tokens to integer ids via word2ind.
# NOTE(review): handling of tokens absent from word2ind is defined in
# utils.tokenise_cleaned_data, which is not visible here.
decoder_input_tokenised = utils.tokenise_cleaned_data(decoder_input_data, word2ind)

In [29]:
# Preview the id-encoded decoder lines; each row should be the line that
# follows the encoder row at the same position.
decoder_input_tokenised.head()

0    [913, 14702, 17705, 26798, 27014, 29547, 17684...
1    [16652, 1464, 29139, 31819, 29133, 25641, 2651...
2    [13619, 16815, 4158, 20009, 10167, 2228, 34, 0...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [25641, 13, 15253, 18949, 15519, 19876, 11745,...
dtype: object

In [30]:
# Sanity check: a row should have pad_data's default width of 20 ids.
len(decoder_input_tokenised.loc[0])

20

In [31]:
# Decoder targets: the decoder ids with the id of '\n' appended at the very
# end of each row (after any padding), giving one more id per row.
# This must run before the decoder inputs get their leading ' ' id, because
# it reads `decoder_input_tokenised` in its un-prefixed form.
decoder_target_tokenised = decoder_input_tokenised.apply(
    lambda ids: [*ids, word2ind['\n']]
)

In [32]:
# Preview the decoder targets.
decoder_target_tokenised.head()

0    [913, 14702, 17705, 26798, 27014, 29547, 17684...
1    [16652, 1464, 29139, 31819, 29133, 25641, 2651...
2    [13619, 16815, 4158, 20009, 10167, 2228, 34, 0...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [25641, 13, 15253, 18949, 15519, 19876, 11745,...
dtype: object

In [33]:
# Sanity check: one id longer than the decoder rows because of the appended
# '\n' id.
len(decoder_target_tokenised.loc[0])

21

In [34]:
# Decoder inputs: prepend the id of ' ' to every row, so the input at
# position t+1 equals the target at position t.
# NOTE: re-running this cell prepends another id each time.
decoder_input_tokenised = decoder_input_tokenised.apply(
    lambda ids: [word2ind[' '], *ids]
)

In [35]:
# Preview the decoder inputs; every row should now start with the id of ' '.
decoder_input_tokenised.head()

0    [0, 913, 14702, 17705, 26798, 27014, 29547, 17...
1    [0, 16652, 1464, 29139, 31819, 29133, 25641, 2...
2    [0, 13619, 16815, 4158, 20009, 10167, 2228, 34...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 25641, 13, 15253, 18949, 15519, 19876, 117...
dtype: object

In [36]:
# Sanity check: decoder inputs should now be as long as the decoder targets.
len(decoder_input_tokenised.iloc[0])

21

In [37]:
# Expand each Series of equal-length id lists into a DataFrame with one
# column per position.
# NOTE: the names are rebound from Series to DataFrame, so the earlier
# Series-based cells cannot be re-run after this one without starting over.
encoder_input_tokenised = pd.DataFrame(encoder_input_tokenised.tolist())
decoder_input_tokenised = pd.DataFrame(decoder_input_tokenised.tolist())
decoder_target_tokenised = pd.DataFrame(decoder_target_tokenised.tolist())

In [38]:
# Shape check: (number of line pairs, 20 ids per encoder line).
encoder_input_tokenised.shape

(397605, 20)

In [39]:
# Shape check: same row count as the encoder, 21 ids per row (leading ' '
# id plus 20).
decoder_input_tokenised.shape

(397605, 21)

In [40]:
# Shape check: must match the decoder input shape (20 ids plus the trailing
# '\n' id).
decoder_target_tokenised.shape

(397605, 21)

In [41]:
# Write the three id matrices to CSV files in the current working directory.
# index=False omits the row index; the header row (the integer column
# positions) is still written.
encoder_input_tokenised.to_csv('encoder_input.csv', index=False)
decoder_input_tokenised.to_csv('decoder_input.csv', index=False)
decoder_target_tokenised.to_csv('decoder_target.csv', index=False)