In [26]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load the lyrics: only the `text` column of songdata.csv is read (usecols).
dat = pd.read_csv('songdata.csv', usecols=['text'])

In [3]:
# Preview the first rows of the loaded frame.
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [4]:
def data_cleaning(dat):
    """Lower-case, normalise and tokenise the ``text`` column of ``dat``.

    The steps, in order:

    1. lower-case the text;
    2. surround newlines, digits and the listed punctuation with spaces so
       each becomes its own token;
    3. turn two consecutive apostrophes into one double quote;
    4. expand a trailing ``in'`` (e.g. ``walkin'``) to ``ing`` when it is
       followed by whitespace or ends the text;
    5. surround every remaining apostrophe with spaces;
    6. collapse runs of spaces and split on a single space.

    Side effect: the columns ``cleaned_text`` and ``text_list`` are added to
    ``dat`` itself, so pass a copy if the caller's frame must stay untouched.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame with a string column named ``text``.

    Returns
    -------
    pandas.Series
        The ``text_list`` column: one list of string tokens per row.
    """
    # ``regex`` is passed explicitly on every call: the pandas default changed
    # to ``False`` in 2.0, which rejects callable replacements and would treat
    # the patterns below as literal text.
    text = dat['text'].str.lower()

    # Inside the character class, ``,-.`` is a range that covers exactly
    # ',', '-' and '.'.
    text = text.str.replace(
        r'[\n!"\(\),-.0-9:?\[\]]',
        lambda m: ' ' + m.group(0) + ' ',
        regex=True,
    )

    # NOTE(review): this runs after the punctuation padding above, so a double
    # quote produced here is not space-padded - confirm that is intended.
    text = text.str.replace("''", '"', regex=False)

    text = text.str.replace(
        r"\w+in'$|\w+in'\s",
        lambda m: m.group(0).replace("'", 'g'),
        regex=True,
    )
    text = text.str.replace("'", " ' ", regex=False)
    text = text.str.replace(r' {2,}', ' ', regex=True)

    dat['cleaned_text'] = text
    # Splitting on a single space yields an empty-string token wherever the
    # cleaned text starts or ends with a space.
    dat['text_list'] = text.str.split(' ')
    return dat.loc[:, 'text_list']

In [5]:
# Tokenise only the first 1000 rows; .copy() keeps the columns that
# data_cleaning adds off the original `dat`.
cleaned = data_cleaning(dat.head(1000).copy())

In [6]:
# Preview the token lists produced for the first rows.
cleaned.head()

0    [look, at, her, face, ,, it, ', s, a, wonderfu...
1    [take, it, easy, with, me, ,, please, \n, touc...
2    [i, ', ll, never, know, why, i, had, to, go, \...
3    [making, somebody, happy, is, a, question, of,...
4    [making, somebody, happy, is, a, question, of,...
Name: text_list, dtype: object

In [17]:
def get_unique_tokens(tokens):
    """Return ``(count, vocabulary)`` for an iterable of tokens.

    ``vocabulary`` is the list of distinct tokens in ascending sort order and
    ``count`` is its length.
    """
    vocabulary = list(set(tokens))
    vocabulary.sort()
    return len(vocabulary), vocabulary

In [19]:
# Flatten the per-song token lists into one list. A comprehension is linear in
# the number of tokens, whereas summing lists re-copies the accumulated result
# at every step, and it yields [] (not 0) when `cleaned` is empty.
tokens = [token for song_tokens in cleaned for token in song_tokens]

In [20]:
# Build the sorted vocabulary and its size from the flat token list.
vocab_size, unique_tokens = get_unique_tokens(tokens)

In [43]:
# Number of distinct tokens in the cleaned subset.
print(vocab_size)

8762


In [21]:
def save_unique_tokens(unique_tokens, filename='unique_tokens'):
    """Write the vocabulary to ``filename`` as CSV, one token per row.

    No index column and no header row are written. An ``AssertionError`` is
    raised if ``unique_tokens`` contains duplicates.
    """
    distinct_count = len(set(unique_tokens))
    assert len(unique_tokens) == distinct_count, 'Make sure all tokens unique!'
    pd.Series(unique_tokens).to_csv(filename, index=False, header=None)

In [22]:
# Persist the vocabulary to the default path 'unique_tokens'.
save_unique_tokens(unique_tokens)

In [24]:
def get_index_word_map(unique_tokens):
    """Build the two lookup tables between tokens and their positions.

    Returns ``(word2ind, ind2word)``: ``word2ind`` maps each token to its
    position in ``unique_tokens`` and ``ind2word`` is the inverse mapping.
    An ``AssertionError`` is raised if ``unique_tokens`` contains duplicates.
    """
    # With duplicates the two mappings would no longer be inverses.
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    word2ind = {token: position for position, token in enumerate(unique_tokens)}
    ind2word = dict(enumerate(unique_tokens))
    return word2ind, ind2word

In [25]:
# Token -> index and index -> token lookups over the sorted vocabulary.
word2ind, ind2word = get_index_word_map(unique_tokens)

In [27]:
def save_dict(dict2save, filename):
    """Serialise ``dict2save`` as JSON into ``filename``, overwriting it."""
    with open(filename, 'w') as handle:
        json.dump(dict2save, handle)

def save_index_word_map(word2ind, ind2word, word2ind_filename='word2ind', ind2word_filename='ind2word'):
    """Write both lookup tables to disk as JSON, ``word2ind`` first.

    Note that JSON object keys are always strings: if ``ind2word`` has integer
    keys they are stored as strings and must be converted back after loading.
    """
    outputs = (
        (word2ind, word2ind_filename),
        (ind2word, ind2word_filename),
    )
    for mapping, target in outputs:
        save_dict(mapping, target)

In [30]:
# Persist both mappings as JSON to the default paths 'word2ind' and 'ind2word'.
save_index_word_map(word2ind, ind2word)

In [31]:
# Load the space-separated GloVe file with the word (first column) as index.
# quoting=3 is csv.QUOTE_NONE, so quote characters in the file are kept as
# ordinary text; na_filter=False stops pandas from converting fields such as
# "nan" or "null" into missing values.
glove = pd.read_table('glove.6B.50d.txt', sep=' ', header=None, quoting=3, na_filter=False, index_col=0)

In [40]:
def create_emb(ind2word, imported_emb, oov_scale=0.6, seed=None):
    """Build an embedding matrix aligned with the vocabulary indices.

    Row ``i`` holds the vector of ``ind2word[i]`` taken from ``imported_emb``.
    Words missing from ``imported_emb`` get a vector drawn from a zero-mean
    normal distribution with standard deviation ``oov_scale``.

    Parameters
    ----------
    ind2word : dict
        Maps every integer in ``range(len(ind2word))`` to a word.
    imported_emb : pandas.DataFrame
        Pre-trained vectors, looked up by word via ``.loc``; its number of
        columns sets the embedding width.
    oov_scale : float, default 0.6
        Standard deviation of the random vectors for missing words.
    seed : int or None, default None
        When given, missing-word vectors come from a private
        ``numpy.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` the global NumPy random state is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(ind2word), imported_emb.shape[1])``.
    """
    vocab_size = len(ind2word)
    n_fact = imported_emb.shape[1]
    # The ``np.random`` module and a ``RandomState`` both expose ``.normal``.
    sampler = np.random if seed is None else np.random.RandomState(seed)
    emb = np.zeros((vocab_size, n_fact))
    for i in range(vocab_size):
        word = ind2word[i]
        try:
            vector = imported_emb.loc[word]
        except KeyError:
            # Word has no pre-trained vector: fall back to random initialisation.
            vector = sampler.normal(scale=oov_scale, size=(n_fact,))
        emb[i] = vector
    return emb

In [41]:
# One row per vocabulary index: the GloVe vector where the word is present in
# `glove`, a random vector otherwise.
emb = create_emb(ind2word, glove)

In [46]:
def save_embedding(embedding, filename='embedding.csv'):
    """Write the embedding matrix to ``filename`` as comma-separated text."""
    np.savetxt(fname=filename, X=embedding, delimiter=',')

In [47]:
# Persist the embedding matrix to the default path 'embedding.csv'.
save_embedding(emb)

In [49]:
def tokenise_cleaned_data(dat, word2ind):
    """Replace every word in each token list of ``dat`` with its index.

    ``dat`` is a Series of word lists; the result is a Series of integer lists
    of the same lengths. A word absent from ``word2ind`` raises ``KeyError``.
    """
    def encode(words):
        indices = []
        for word in words:
            indices.append(word2ind[word])
        return indices

    return dat.apply(encode)

In [50]:
# Convert every token list into the matching list of vocabulary indices.
tokenised = tokenise_cleaned_data(cleaned, word2ind)

In [51]:
# Preview the index-encoded songs.
tokenised.head()

0    [4413, 397, 3515, 2562, 8, 3926, 5, 6415, 25, ...
1    [7600, 3926, 2324, 8594, 4666, 8, 5694, 1, 791...
2    [3730, 5, 4375, 5069, 4149, 8534, 3730, 3356, ...
3    [4552, 7073, 3402, 3914, 25, 5975, 5199, 3128,...
4    [4552, 7073, 3402, 3914, 25, 5975, 5199, 3128,...
Name: text_list, dtype: object

In [56]:
def transform_text(text_list, input_length=10):
    """Slice ``text_list`` into overlapping windows of ``input_length + 1`` items.

    Consecutive windows are shifted by one position. A sequence with at most
    ``input_length`` items yields an empty list.
    """
    window = input_length + 1
    n_windows = len(text_list) - input_length
    return [text_list[start:start + window] for start in range(n_windows)]

In [57]:
def make_training_samples(pd_series):
    """Turn a Series of token-id lists into one frame of fixed-width windows.

    Each list is windowed with ``transform_text`` (default ``input_length``)
    and all windows are stacked, in order, as the rows of the result.

    Parameters
    ----------
    pd_series : pandas.Series
        One list of token ids per entry.

    Returns
    -------
    pandas.DataFrame
        One row per window; empty when no window is produced.
    """
    transformed_texts = pd_series.apply(transform_text)
    # Flatten in a single pass. Summing the lists re-copies the accumulated
    # result for every entry (quadratic) and returns 0 for an empty Series.
    samples = [window for windows in transformed_texts for window in windows]
    return pd.DataFrame(samples)

In [58]:
# Build the sliding-window training rows from the index-encoded songs.
samples = make_training_samples(tokenised)

In [59]:
# (number of windows, window width); the width is input_length + 1, i.e. 11
# with transform_text's default input_length of 10.
samples.shape

(293470, 11)

In [60]:
# Preview the first windows; consecutive rows are shifted by one token.
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,4413,397,3515,2562,8,3926,5,6415,25,8618,2562
1,397,3515,2562,8,3926,5,6415,25,8618,2562,1
2,3515,2562,8,3926,5,6415,25,8618,2562,1,261
3,2562,8,3926,5,6415,25,8618,2562,1,261,3926
4,8,3926,5,6415,25,8618,2562,1,261,3926,4677


In [61]:
def save_training_samples(samples, filename='train.csv'):
    """Write the training windows to ``filename`` as CSV.

    No index column and no header row are written.

    Parameters
    ----------
    samples : pandas.DataFrame
        One training window per row.
    filename : str, default 'train.csv'
        Destination path. It was previously ignored in favour of a hard-coded
        'train.csv'; it is now honoured.
    """
    samples.to_csv(filename, index=False, header=None)

In [62]:
# Persist the training windows using the default filename 'train.csv'.
save_training_samples(samples)