In [1]:
%matplotlib inline
import importlib
import numpy as np
import pandas as pd
import json
import utils

In [2]:
# Re-import the local `utils` module so edits to utils.py take effect without a kernel restart.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/encoder-decoder/utils.py'>

In [3]:
# Load only the lyrics column of the first 100 rows, then discard repeated lyrics.
dat = (
    pd.read_csv('../songdata.csv', usecols=['text'], nrows=100)
    .drop_duplicates()
)

In [4]:
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [5]:
# Append the end-of-song marker exactly once per lyric.  The `endswith` guard makes
# this cell idempotent: re-running it in a live kernel no longer produces
# '(end)(end)' at the end of every lyric.
END_MARKER = '(end)'
missing_marker = ~dat['text'].str.endswith(END_MARKER, na=False)
dat.loc[missing_marker, 'text'] += END_MARKER

In [6]:
# One list of distinct characters per song (set() removes repeats within a lyric).
unique_chars = dat['text'].map(lambda lyric: list(set(lyric)))

In [7]:
unique_chars.head()

0    [n, h, ?, m, S, \n, ,, I, b, H, v,  , e, j, s,...
1    [n, h, m, G, \n, ,, J, O, I, b, v,  , Y, e, s,...
2    [n, h, m, S, G, D, \n, ,, J, N, I, b, H, v,  ,...
3    [n, h, m, S, D, \n, ,, N, O, I, b, v,  , Y, e,...
4    [n, h, m, S, D, \n, ,, N, O, I, b, v,  , Y, e,...
Name: text, dtype: object

In [8]:
# Flatten the per-song character lists into one list.  A comprehension is linear in
# the total number of characters, whereas Series.sum() adds the lists pairwise
# (re-copying the growing result each time) and returns the int 0 for an empty Series.
# It also tolerates an accidental re-run, because a flat list of characters flattens
# to itself.
unique_chars = [char for song_chars in unique_chars for char in song_chars]

In [9]:
# np.unique both de-duplicates and sorts; list() turns the resulting array into a list.
unique_chars = list(np.unique(unique_chars))

In [10]:
# Break each lyric into its lines and trim surrounding whitespace from every line.
dat_split = dat['text'].str.split('\n').map(
    lambda lines: [line.strip() for line in lines]
)

In [11]:
dat_split.head()

0    [Look at her face, it's a wonderful face, And ...
1    [Take it easy with me, please, Touch me gently...
2    [I'll never know why I had to go, Why I had to...
3    [Making somebody happy is a question of give a...
4    [Making somebody happy is a question of give a...
Name: text, dtype: object

In [12]:
def create_sentence_sum(sens):
    """Flatten a Series of sentence lists into a single Series of sentences.

    Parameters
    ----------
    sens : pd.Series (or any iterable) of lists
        Each element holds the sentences belonging to one song.

    Returns
    -------
    pd.Series
        Every sentence, in the original order, on a fresh 0..n-1 index.
        Empty input yields an empty Series.
    """
    # A comprehension is linear in the number of sentences; Series.sum() would
    # concatenate the lists pairwise and return the int 0 when `sens` is empty,
    # which then became a bogus one-row Series.
    sen_list = [sen for song_sens in sens for sen in song_sens]
    if not sen_list:
        # Pin the dtype so the empty result does not depend on the pandas version.
        return pd.Series(sen_list, dtype=object)
    return pd.Series(sen_list)

In [13]:
# Pair every line with its successor inside a song: the encoder side drops the last
# line, the decoder side drops the first.  decoder_target_sens is built the same way
# as decoder_input_sens and is not referenced again in the cells visible here.
encoder_input_sens = dat_split.map(lambda lines: lines[:-1])
decoder_input_sens = dat_split.map(lambda lines: lines[1:])
decoder_target_sens = dat_split.map(lambda lines: lines[1:])

In [14]:
# Collapse the per-song line lists into flat Series with one line per row.
encoder_input_sens, decoder_input_sens = map(
    create_sentence_sum, (encoder_input_sens, decoder_input_sens)
)

In [15]:
# Distribution of encoder line lengths, in characters.
encoder_input_sens.map(len).describe()

count    4462.000000
mean       27.096369
std        16.112879
min         0.000000
25%        17.000000
50%        27.000000
75%        38.000000
max        72.000000
dtype: float64

In [16]:
# Distribution of decoder line lengths, in characters.
decoder_input_sens.map(len).describe()

count    4462.000000
mean       26.411699
std        16.253475
min         0.000000
25%        16.000000
50%        27.000000
75%        37.000000
max        72.000000
dtype: float64

In [17]:
encoder_input_sens.head()

0             Look at her face, it's a wonderful face
1                And it means something special to me
2    Look at the way that she smiles when she sees me
3                        How lucky can one fellow be?
4                                                    
dtype: object

In [18]:
def pad_data(input_lists, length=40):
    """Force every sequence to exactly `length` items.

    Sequences longer than `length` are truncated, shorter ones are right-padded
    with a single space, and any item equal to '' is replaced by a space.

    Parameters
    ----------
    input_lists : iterable of iterables
        For example a Series of strings, which are handled character by character.
    length : int, default 40
        Number of items in every output row.

    Returns
    -------
    pd.Series
        One list of `length` items per input sequence.
    """
    def _fit(sen):
        # Zipping against range(length) stops after `length` items (truncation).
        kept = [item if item != '' else ' ' for _, item in zip(range(length), sen)]
        # Top up with spaces; a non-positive count adds nothing.
        return kept + [' '] * (length - len(kept))

    return pd.Series([_fit(sen) for sen in input_lists])

In [19]:
# Fix every line to pad_data's default of 40 characters: longer lines are cut off,
# shorter ones are right-padded with spaces.
encoder_input_data = pad_data(encoder_input_sens)
decoder_input_data = pad_data(decoder_input_sens)

In [20]:
encoder_input_data.head()

0    [L, o, o, k,  , a, t,  , h, e, r,  , f, a, c, ...
1    [A, n, d,  , i, t,  , m, e, a, n, s,  , s, o, ...
2    [L, o, o, k,  , a, t,  , t, h, e,  , w, a, y, ...
3    [H, o, w,  , l, u, c, k, y,  , c, a, n,  , o, ...
4    [ ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  , ...
dtype: object

In [21]:
# Hand the character vocabulary to utils for saving; destination and file format are
# decided inside utils.save_unique_tokens (not visible in this notebook).
utils.save_unique_tokens(unique_chars)

In [22]:
# Build the lookup tables used for tokenising.  word2ind is indexed by character and
# yields an integer id further down; ind2word is presumably the inverse mapping —
# TODO confirm in utils.py.
word2ind, ind2word = utils.get_index_word_map(unique_chars)

In [23]:
# Save both lookup tables through utils (destination/format defined in utils.py — not shown here).
utils.save_index_word_map(word2ind, ind2word)

In [24]:
# Convert the padded encoder character lists to integer ids using word2ind
# (the conversion itself lives in utils.tokenise_cleaned_data).
encoder_input_tokenised = utils.tokenise_cleaned_data(encoder_input_data, word2ind)

In [25]:
encoder_input_tokenised.head()

0    [25, 54, 54, 50, 1, 40, 59, 1, 47, 44, 57, 1, ...
1    [14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, 58,...
2    [25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, 1, ...
3    [21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, 40,...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
dtype: object

In [26]:
# Convert the padded decoder character lists to integer ids using word2ind
# (the conversion itself lives in utils.tokenise_cleaned_data).
decoder_input_tokenised = utils.tokenise_cleaned_data(decoder_input_data, word2ind)

In [27]:
decoder_input_tokenised.head()

0    [14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, 58,...
1    [25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, 1, ...
2    [21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, 40,...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [32, 47, 44, 4, 58, 1, 49, 60, 58, 59, 1, 52, ...
dtype: object

In [28]:
len(decoder_input_tokenised.loc[0])

40

In [29]:
# Targets are the decoder token lists with the newline id appended as the last token.
# This must run before decoder_input_tokenised receives its leading start token.
decoder_target_tokenised = decoder_input_tokenised.map(
    lambda ids: [*ids, word2ind['\n']]
)

In [30]:
decoder_target_tokenised.head()

0    [14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, 58,...
1    [25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, 1, ...
2    [21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, 40,...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [32, 47, 44, 4, 58, 1, 49, 60, 58, 59, 1, 52, ...
dtype: object

In [31]:
len(decoder_target_tokenised.loc[0])

41

In [32]:
# Shift the decoder inputs right by one position by prepending the space id.
# NOTE: not idempotent — re-running this cell prepends one more token each time.
decoder_input_tokenised = decoder_input_tokenised.map(
    lambda ids: [word2ind[' '], *ids]
)

In [33]:
decoder_input_tokenised.head()

0    [1, 14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, ...
1    [1, 25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, ...
2    [1, 21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, ...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [1, 32, 47, 44, 4, 58, 1, 49, 60, 58, 59, 1, 5...
dtype: object

In [34]:
len(decoder_input_tokenised.iloc[0])

41

In [35]:
def _to_frame(token_rows):
    """Expand a collection of token lists into a DataFrame, one column per token position."""
    return pd.DataFrame(token_rows.values.tolist())


encoder_input_tokenised = _to_frame(encoder_input_tokenised)
decoder_input_tokenised = _to_frame(decoder_input_tokenised)
decoder_target_tokenised = _to_frame(decoder_target_tokenised)

In [36]:
encoder_input_tokenised.shape

(4462, 40)

In [37]:
decoder_input_tokenised.shape

(4462, 41)

In [38]:
decoder_target_tokenised.shape

(4462, 41)

In [39]:
# Write the three token matrices to the working directory, without the row index.
for csv_name, frame in (
    ('encoder_input.csv', encoder_input_tokenised),
    ('decoder_input.csv', decoder_input_tokenised),
    ('decoder_target.csv', decoder_target_tokenised),
):
    frame.to_csv(csv_name, index=False)