In [1]:
%matplotlib inline
import importlib
import numpy as np
import pandas as pd
import json
import utils

In [2]:
# Re-execute the already-imported `utils` module in this kernel so that edits
# made to the helper file take effect without restarting the notebook.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/encoder-decoder/utils.py'>

In [39]:
# Read only the `text` column of the first 10,000 CSV rows, then discard
# rows whose lyric text is an exact repeat of an earlier row.
dat = (
    pd.read_csv('../songdata.csv', usecols=['text'], nrows=10000)
    .drop_duplicates()
)

In [40]:
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [41]:
# Terminate every lyric with the literal marker '(end)'.
# Only rows that do not already finish with the marker are touched, so running
# this cell again (or out of order) cannot stack '(end)(end)...' onto the text.
# `na=False` treats missing lyrics as "not marked"; adding the marker to a
# missing value leaves it missing, exactly as the plain `+=` would.
dat.loc[~dat['text'].str.endswith('(end)', na=False), 'text'] += '(end)'

In [42]:
# For each lyric, list the distinct characters it contains. The order inside
# each list is arbitrary because it comes from iterating a set.
unique_chars = dat['text'].apply(lambda lyric: list(set(lyric)))

In [43]:
unique_chars.head()

0    [,, o, k, s, l, u, a, j, S, q, h, \n, b, W, f,...
1    [,, o, k, s, G, l, Y, u, a, h, \n, b, M, P, f,...
2    [,, o, k, D, s, G, l, Y, u, a, S, h, \n, b, W,...
3    [,, o, k, D, s, -, l, Y, u, a, S, q, h, b, \n,...
4    [,, o, k, D, s, -, l, Y, u, a, S, q, h, b, \n,...
Name: text, dtype: object

In [44]:
# Flatten the per-lyric character lists into one flat list of characters.
# A single comprehension pass replaces `Series.sum()`, which joined the lists
# with one `+` per row and so re-copied the ever-growing result each time.
# It also keeps the cell re-runnable: iterating an already-flat list of
# one-character strings yields the same list again.
unique_chars = [char for chars in unique_chars for char in chars]

In [45]:
# Collapse the flattened characters to the distinct set. np.unique hands back
# its result already in ascending order, so materialising it as a list gives
# the sorted vocabulary directly.
unique_chars = list(np.unique(unique_chars))

In [46]:
# Break each lyric into its lines at '\n', then trim surrounding whitespace
# from every line. Blank lines survive as empty strings.
dat_split = (
    dat['text']
    .str.split('\n')
    .apply(lambda lines: [line.strip() for line in lines])
)

In [47]:
dat_split.head()

0    [Look at her face, it's a wonderful face, And ...
1    [Take it easy with me, please, Touch me gently...
2    [I'll never know why I had to go, Why I had to...
3    [Making somebody happy is a question of give a...
4    [Making somebody happy is a question of give a...
Name: text, dtype: object

In [48]:
def create_sentence_sum(sens):
    """Flatten per-song line lists into one Series of individual lines.

    Parameters
    ----------
    sens : pandas.Series (or any iterable) of lists
        Each element is the list of lines belonging to one song.

    Returns
    -------
    pandas.Series
        Every line, in the original order, with a fresh 0..n-1 index.
        An empty input gives an empty object-dtype Series.
    """
    # Walk each inner list exactly once. The earlier `sens.sum()` concatenated
    # the lists with repeated `+` (copying the growing result on every row) and
    # evaluated to the scalar 0 for an empty Series, yielding a bogus row [0].
    sen_list = [sen for song_lines in sens for sen in song_lines]
    if not sen_list:
        return pd.Series([], dtype=object)
    return pd.Series(sen_list)

In [49]:
# Pair each line with the line that follows it inside the same song:
# the encoder side keeps every line except the last, the decoder side
# keeps every line except the first.
encoder_input_sens = dat_split.map(lambda lines: lines[:-1])
decoder_input_sens = dat_split.map(lambda lines: lines[1:])
# Built with the same slice as decoder_input_sens.
decoder_target_sens = dat_split.map(lambda lines: lines[1:])

In [50]:
# Flatten the per-song lists so that each element is one line of text.
# Both sides are flattened in the same song order, so position i on the
# encoder side lines up with position i on the decoder side.
encoder_input_sens, decoder_input_sens = (
    create_sentence_sum(per_song_lines)
    for per_song_lines in (encoder_input_sens, decoder_input_sens)
)

In [51]:
encoder_input_sens.apply(lambda sen: len(sen)).describe()

count    397639.000000
mean         26.012974
std          16.452444
min           0.000000
25%          16.000000
50%          26.000000
75%          36.000000
max          78.000000
dtype: float64

In [52]:
decoder_input_sens.apply(lambda sen: len(sen)).describe()

count    397639.000000
mean         25.358546
std          16.600439
min           0.000000
25%          14.000000
50%          26.000000
75%          36.000000
max          78.000000
dtype: float64

In [53]:
encoder_input_sens.head()

0             Look at her face, it's a wonderful face
1                And it means something special to me
2    Look at the way that she smiles when she sees me
3                        How lucky can one fellow be?
4                                                    
dtype: object

In [54]:
def pad_data(input_lists, length=40):
    """Turn every sentence into a fixed-width list of tokens.

    Each sentence in ``input_lists`` becomes a list of exactly ``length``
    items: its leading tokens (single characters when the sentence is a
    string) copied in order, empty-string tokens replaced by ``' '``,
    anything beyond ``length`` dropped, and unused trailing slots left
    as ``' '``.

    Returns a ``pd.Series`` holding one such list per input sentence.
    """
    def _fixed_width(sentence):
        # Begin with an all-blank row and overwrite the leading slots.
        row = [' '] * length
        # zip ends at whichever is exhausted first: the row or the sentence.
        for slot, token in zip(range(length), sentence):
            row[slot] = ' ' if token == '' else token
        return row

    return pd.Series([_fixed_width(sentence) for sentence in input_lists])

In [55]:
# Convert every line into a fixed-width list of characters, using pad_data's
# default width of 40 for both the encoder and the decoder side.
encoder_input_data, decoder_input_data = (
    pad_data(sentences)
    for sentences in (encoder_input_sens, decoder_input_sens)
)

In [56]:
encoder_input_data.head()

0    [L, o, o, k,  , a, t,  , h, e, r,  , f, a, c, ...
1    [A, n, d,  , i, t,  , m, e, a, n, s,  , s, o, ...
2    [L, o, o, k,  , a, t,  , t, h, e,  , w, a, y, ...
3    [H, o, w,  , l, u, c, k, y,  , c, a, n,  , o, ...
4    [ ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  , ...
dtype: object

In [57]:
# Hand the sorted character vocabulary to the project helper for saving.
# NOTE(review): the destination path and file format are chosen inside
# utils.save_unique_tokens and are not visible here — confirm in utils.py.
utils.save_unique_tokens(unique_chars)

In [58]:
# Build two lookup tables from the vocabulary. `word2ind` is indexed by a
# character in later cells (e.g. word2ind[' ']) to obtain its integer id;
# `ind2word` is presumably the reverse mapping — confirm in utils.py.
word2ind, ind2word = utils.get_index_word_map(unique_chars)

In [59]:
# Pass both lookup tables to the project helper for saving.
# NOTE(review): where and in what format they are written is decided inside
# utils.save_index_word_map — confirm in utils.py.
utils.save_index_word_map(word2ind, ind2word)

In [60]:
# Translate the padded encoder character lists into integer ids via word2ind.
# The rows displayed after this cell are lists of ints; the exact handling of
# characters missing from word2ind lives in utils.tokenise_cleaned_data — confirm there.
encoder_input_tokenised = utils.tokenise_cleaned_data(encoder_input_data, word2ind)

In [61]:
encoder_input_tokenised.head()

0    [33, 64, 64, 60, 1, 50, 69, 1, 57, 54, 67, 1, ...
1    [22, 63, 53, 1, 58, 69, 1, 62, 54, 50, 63, 68,...
2    [33, 64, 64, 60, 1, 50, 69, 1, 69, 57, 54, 1, ...
3    [29, 64, 72, 1, 61, 70, 52, 60, 74, 1, 52, 50,...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
dtype: object

In [62]:
# Translate the padded decoder character lists into integer ids via word2ind,
# using the same helper and mapping as the encoder side.
# NOTE(review): behaviour for characters absent from word2ind is defined in utils.py.
decoder_input_tokenised = utils.tokenise_cleaned_data(decoder_input_data, word2ind)

In [63]:
decoder_input_tokenised.head()

0    [22, 63, 53, 1, 58, 69, 1, 62, 54, 50, 63, 68,...
1    [33, 64, 64, 60, 1, 50, 69, 1, 69, 57, 54, 1, ...
2    [29, 64, 72, 1, 61, 70, 52, 60, 74, 1, 52, 50,...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [40, 57, 54, 4, 68, 1, 59, 70, 68, 69, 1, 62, ...
dtype: object

In [64]:
len(decoder_input_tokenised.loc[0])

40

In [65]:
# Decoder target: the decoder ids as they stand right now (before a later cell
# prepends the start id) followed by the id of '\n' as a closing marker.
# This cell therefore has to run before the start id is added to the input.
decoder_target_tokenised = decoder_input_tokenised.apply(
    lambda ids: [*ids, word2ind['\n']]
)

In [66]:
decoder_target_tokenised.head()

0    [22, 63, 53, 1, 58, 69, 1, 62, 54, 50, 63, 68,...
1    [33, 64, 64, 60, 1, 50, 69, 1, 69, 57, 54, 1, ...
2    [29, 64, 72, 1, 61, 70, 52, 60, 74, 1, 52, 50,...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [40, 57, 54, 4, 68, 1, 59, 70, 68, 69, 1, 62, ...
dtype: object

In [67]:
len(decoder_target_tokenised.loc[0])

41

In [68]:
# Shift the decoder input right by one position by putting the id of ' ' in
# front of every row. The same character is also used as padding by pad_data.
# Re-running this cell prepends another id each time.
decoder_input_tokenised = decoder_input_tokenised.apply(
    lambda ids: [word2ind[' '], *ids]
)

In [69]:
decoder_input_tokenised.head()

0    [1, 22, 63, 53, 1, 58, 69, 1, 62, 54, 50, 63, ...
1    [1, 33, 64, 64, 60, 1, 50, 69, 1, 69, 57, 54, ...
2    [1, 29, 64, 72, 1, 61, 70, 52, 60, 74, 1, 52, ...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [1, 40, 57, 54, 4, 68, 1, 59, 70, 68, 69, 1, 6...
dtype: object

In [70]:
len(decoder_input_tokenised.iloc[0])

41

In [71]:
# Expand each Series of equal-length id lists into a 2-D frame with one row
# per line of text and one column per character position. The three source
# objects are captured before any name is rebound, so the results match
# three separate assignments.
encoder_input_tokenised, decoder_input_tokenised, decoder_target_tokenised = (
    pd.DataFrame(id_lists.values.tolist())
    for id_lists in (
        encoder_input_tokenised,
        decoder_input_tokenised,
        decoder_target_tokenised,
    )
)

In [72]:
encoder_input_tokenised.shape

(397639, 40)

In [73]:
decoder_input_tokenised.shape

(397639, 41)

In [74]:
decoder_target_tokenised.shape

(397639, 41)

In [75]:
# Write the three id matrices to CSV files in the working directory,
# leaving out the row index.
for _frame, _csv_name in (
    (encoder_input_tokenised, 'encoder_input.csv'),
    (decoder_input_tokenised, 'decoder_input.csv'),
    (decoder_target_tokenised, 'decoder_target.csv'),
):
    _frame.to_csv(_csv_name, index=False)