In [1]:
%matplotlib inline
import importlib
import numpy as np
import pandas as pd
import json
import utils

In [2]:
# Reload utils so edits made to utils.py after the first import take effect
# without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/home/fei/Documents/projects/lyrics/encoder-decoder/utils.py'>

In [36]:
# Read only the lyrics column of the first 100 rows, then drop exact duplicate
# lyrics. The original row labels are kept, so the index may have gaps.
dat = (
    pd.read_csv('../songdata.csv', usecols=['text'], nrows=100)
    .drop_duplicates()
)

In [37]:
# Preview the first rows of the de-duplicated lyrics frame.
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [38]:
# Tag every song with the '(end)' marker exactly once. A bare `+=` appends a
# further copy each time the cell is re-executed; here only rows that do not
# already end with the marker are touched, so re-running the cell is harmless.
# `na=False` keeps the mask strictly boolean even if a lyric is missing.
dat.loc[~dat['text'].str.endswith('(end)', na=False), 'text'] += '(end)'

In [39]:
# For each song, collect the distinct characters that occur in its lyrics.
unique_chars = dat['text'].map(lambda lyric: list(set(lyric)))

In [40]:
# Inspect the per-song character lists.
unique_chars.head()

0    [t, w, ),  , s, l, f, W, g, A, i, u, a, e, \n,...
1    [t, w, ),  , s, l, f, g, A, i, u, a, e, T, \n,...
2    [t, w, ),  , s, B, l, W, f, g, A, i, u, a, e, ...
3    [t, w, ),  , s, B, l, f, W, g, A, i, u, a, e, ...
4    [t, w, ),  , s, B, l, f, W, g, A, i, u, a, e, ...
Name: text, dtype: object

In [41]:
# Flatten the per-song character lists into one flat list.
# Series.sum() does this by repeated list concatenation (quadratic in the total
# length), returns the scalar 0 for an empty Series, and raises if this cell is
# re-run on the already-flattened list. A nested comprehension avoids all three
# and yields the same list in the same order.
unique_chars = [ch for song_chars in unique_chars for ch in song_chars]

In [42]:
# De-duplicate across songs. np.unique already returns its result in sorted
# order, so converting to a list gives the final sorted vocabulary.
unique_chars = list(np.unique(unique_chars))

In [43]:
# Break each song into its newline-delimited lines and trim the whitespace
# around every line.
dat_split = dat['text'].str.split('\n').map(
    lambda lines: [line.strip() for line in lines]
)

In [44]:
# Inspect the per-song lists of stripped lines.
dat_split.head()

0    [Look at her face, it's a wonderful face, And ...
1    [Take it easy with me, please, Touch me gently...
2    [I'll never know why I had to go, Why I had to...
3    [Making somebody happy is a question of give a...
4    [Making somebody happy is a question of give a...
Name: text, dtype: object

In [48]:
def create_sentence_sum(sens):
    """Flatten a Series of lists into a single Series of their elements.

    Parameters
    ----------
    sens : pandas.Series (or any iterable) of lists
        One list of lyric lines per song.

    Returns
    -------
    pandas.Series
        Every element of every list, in order, on a fresh 0..n-1 index.
        An empty input yields an empty Series.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from itertools import chain

    # chain.from_iterable is linear; Series.sum() concatenates list by list
    # (quadratic) and returns the scalar 0 when there is nothing to sum.
    sen_list = list(chain.from_iterable(sens))
    # An explicit object dtype for the empty case avoids pandas' default-dtype
    # guess for an empty Series; non-empty data is inferred exactly as before.
    return pd.Series(sen_list, dtype=None if sen_list else object)

In [49]:
# Build line -> next-line pairs per song: the encoder side takes every line
# except the last, the decoder side every line except the first, so position i
# of the encoder list pairs with position i of the decoder list.
encoder_input_sens = dat_split.map(lambda lines: lines[:-1])
decoder_input_sens = dat_split.map(lambda lines: lines[1:])
# Same slice as the decoder input, bound to its own name.
decoder_target_sens = dat_split.map(lambda lines: lines[1:])

In [50]:
# Flatten the per-song line lists into one Series of lines each.
# Both names are rebound from "Series of lists" to "Series of strings", so
# re-running this cell without first re-running the slicing cell would feed
# strings, not lists, into create_sentence_sum.
encoder_input_sens = create_sentence_sum(encoder_input_sens)
decoder_input_sens = create_sentence_sum(decoder_input_sens)

In [51]:
# Distribution of encoder line lengths in characters (informs the pad width).
encoder_input_sens.map(len).describe()

count    4462.000000
mean       27.096369
std        16.112879
min         0.000000
25%        17.000000
50%        27.000000
75%        38.000000
max        72.000000
dtype: float64

In [52]:
# Distribution of decoder line lengths in characters (informs the pad width).
decoder_input_sens.map(len).describe()

count    4462.000000
mean       26.411699
std        16.253475
min         0.000000
25%        16.000000
50%        27.000000
75%        37.000000
max        72.000000
dtype: float64

In [53]:
# Inspect the flattened encoder lines; blank lyric lines appear as empty strings.
encoder_input_sens.head()

0             Look at her face, it's a wonderful face
1                And it means something special to me
2    Look at the way that she smiles when she sees me
3                        How lucky can one fellow be?
4                                                    
dtype: object

In [70]:
def pad_data(input_lists, length=75):
    """Convert each sequence into a fixed-width list of tokens.

    Parameters
    ----------
    input_lists : iterable of iterables
        Each item is iterated token by token (a string yields its characters).
    length : int, default 75
        Width of every output row.

    Returns
    -------
    pandas.Series
        One list of exactly `length` tokens per input item. Tokens beyond
        `length` are dropped, empty-string tokens become a single space, and
        short rows are filled with spaces on the right.
    """
    filler = ' '
    padded_rows = []

    for sequence in input_lists:
        row = []
        for position, token in enumerate(sequence):
            if position >= length:
                break  # anything past the fixed width is dropped
            row.append(filler if token == '' else token)
        # Top the row up with filler until it is exactly `length` long.
        row.extend(filler for _ in range(length - len(row)))
        padded_rows.append(row)

    return pd.Series(padded_rows)

In [71]:
# Turn every line into a fixed-width list of characters using pad_data's
# default width of 75 (the length summaries above report a maximum of 72).
encoder_input_data = pad_data(encoder_input_sens)
decoder_input_data = pad_data(decoder_input_sens)

In [73]:
# Inspect the padded character lists.
encoder_input_data.head()

0    [L, o, o, k,  , a, t,  , h, e, r,  , f, a, c, ...
1    [A, n, d,  , i, t,  , m, e, a, n, s,  , s, o, ...
2    [L, o, o, k,  , a, t,  , t, h, e,  , w, a, y, ...
3    [H, o, w,  , l, u, c, k, y,  , c, a, n,  , o, ...
4    [ ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  ,  , ...
dtype: object

In [74]:
# Save the sorted character vocabulary through the project helper; where and in
# what format it is written is decided inside utils (not visible here).
utils.save_unique_tokens(unique_chars)

In [75]:
# Build the two lookups from the vocabulary — presumably token -> index and
# index -> token, judging by the names and by word2ind being indexed with
# characters further down. Despite the "word" naming, tokens are single characters.
word2ind, ind2word = utils.get_index_word_map(unique_chars)

In [76]:
# Persist both lookups through the project helper (destination is decided inside utils).
utils.save_index_word_map(word2ind, ind2word)

In [77]:
# Encode the padded encoder lines with word2ind via the project helper; the
# preview below shows one list of integer ids per line.
encoder_input_tokenised = utils.tokenise_cleaned_data(encoder_input_data, word2ind)

In [78]:
# Inspect the encoded encoder rows.
encoder_input_tokenised.head()

0    [25, 54, 54, 50, 1, 40, 59, 1, 47, 44, 57, 1, ...
1    [14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, 58,...
2    [25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, 1, ...
3    [21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, 40,...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
dtype: object

In [79]:
# Encode the padded decoder lines the same way. No start or end token has been
# added at this point.
decoder_input_tokenised = utils.tokenise_cleaned_data(decoder_input_data, word2ind)

In [80]:
# Inspect the encoded decoder rows.
decoder_input_tokenised.head()

0    [14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, 58,...
1    [25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, 1, ...
2    [21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, 40,...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [32, 47, 44, 4, 58, 1, 49, 60, 58, 59, 1, 52, ...
dtype: object

In [81]:
# Sanity check: before any start/end token is added, a decoder row has the padded width.
len(decoder_input_tokenised.loc[0])

75

In [82]:
# Targets = decoder ids followed by the newline id, which lands after the
# padding and makes each row one token longer. This has to run while
# decoder_input_tokenised is still un-prefixed, i.e. before the cell that
# prepends the space id to it.
decoder_target_tokenised = decoder_input_tokenised.map(
    lambda ids: ids + [word2ind['\n']]
)

In [83]:
# Inspect the target rows.
decoder_target_tokenised.head()

0    [14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, 58,...
1    [25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, 1, ...
2    [21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, 40,...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [32, 47, 44, 4, 58, 1, 49, 60, 58, 59, 1, 52, ...
dtype: object

In [84]:
# Sanity check: a target row is one token longer than the padded width.
len(decoder_target_tokenised.loc[0])

76

In [85]:
# Prepend the space id as the start token, so the decoder input is the target
# shifted right by one position. The name is rebound in place: every re-run of
# this cell prepends one more token.
decoder_input_tokenised = decoder_input_tokenised.map(
    lambda ids: [word2ind[' ']] + ids
)

In [86]:
# Inspect the decoder rows after the start token was prepended.
decoder_input_tokenised.head()

0    [1, 14, 53, 43, 1, 48, 59, 1, 52, 44, 40, 53, ...
1    [1, 25, 54, 54, 50, 1, 40, 59, 1, 59, 47, 44, ...
2    [1, 21, 54, 62, 1, 51, 60, 42, 50, 64, 1, 42, ...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [1, 32, 47, 44, 4, 58, 1, 49, 60, 58, 59, 1, 5...
dtype: object

In [87]:
# Sanity check: the decoder input row is now one token longer than the padded width.
len(decoder_input_tokenised.iloc[0])

76

In [88]:
# Expand each Series of equal-length id lists into a 2-D frame:
# one row per line, one column per token position.
encoder_input_tokenised = pd.DataFrame(encoder_input_tokenised.values.tolist())
decoder_input_tokenised = pd.DataFrame(decoder_input_tokenised.values.tolist())
decoder_target_tokenised = pd.DataFrame(decoder_target_tokenised.values.tolist())

In [89]:
# Shape check: (number of lines, encoder row width).
encoder_input_tokenised.shape

(4462, 75)

In [90]:
# Shape check: (number of lines, decoder input row width including the start token).
decoder_input_tokenised.shape

(4462, 76)

In [91]:
# Shape check: (number of lines, target row width including the trailing newline id).
decoder_target_tokenised.shape

(4462, 76)

In [92]:
# Write the three id matrices as CSV files, using paths relative to the current
# working directory. index=False omits the row labels; the header row of column
# labels is still written.
encoder_input_tokenised.to_csv('encoder_input.csv', index=False)
decoder_input_tokenised.to_csv('decoder_input.csv', index=False)
decoder_target_tokenised.to_csv('decoder_target.csv', index=False)