In [68]:
import numpy as np
import pandas as pd
import json

In [69]:
dat = pd.read_csv('songdata.csv', usecols=['text'])

In [70]:
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [71]:
apos_end_pattern = r"'( cause| d| em| ll| m| n| re| s| til| till| twas| ve) (?!')"
apos_start_pattern = r" (d |j |l |ol |y )'"
apos_double_pattern = r" ' n ' "
def data_cleaning(dat):
    dat.loc[:, 'cleaned_text'] = dat.text.str.lower()
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r'[\n!"\(\),-.0-9:?\[\]]', lambda x: ' '+x.group(0)+' ')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("''", '"')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("'", " ' ")
    # in' to ing
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r"\w+in'$|\w+in'\s", lambda m: m.group(0).replace("'", 'g'))
    # recover 'cause, 'd, 'em, 'll, 'm, 'n, 're, 's, 'til, 'till, 'twas
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_end_pattern, lambda m: m.group(0)[:1]+m.group(0)[2:])
    # recover d', j', l', ol', y'
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_start_pattern, lambda m: m.group(0)[:-2]+m.group(0)[-1:])
    # recover 'n'
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_double_pattern, lambda m: m.group(0)[:2]+m.group(0)[3]+m.group(0)[-2:])
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r' {2,}', ' ')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.strip()
    dat.loc[:, 'text_list'] = dat.cleaned_text.str.split(' ')
    return dat.loc[:, 'text_list']

In [72]:
cleaned = data_cleaning(dat.head(2000).copy())

In [73]:
cleaned.head()

0    [look, at, her, face, ,, it, 's, a, wonderful,...
1    [take, it, easy, with, me, ,, please, \n, touc...
2    [i, 'll, never, know, why, i, had, to, go, \n,...
3    [making, somebody, happy, is, a, question, of,...
4    [making, somebody, happy, is, a, question, of,...
Name: text_list, dtype: object

In [74]:
def get_unique_tokens(tokens):
    unique_tokens = sorted(list(set(tokens)))
    return len(unique_tokens), unique_tokens

In [75]:
tokens = cleaned.sum()

In [76]:
vocab_size, unique_tokens = get_unique_tokens(tokens)

In [77]:
vocab_size

13063

In [78]:
def save_unique_tokens(unique_tokens, filename='unique_tokens'):
    # check if all unique
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    unique_token_series = pd.Series(unique_tokens)
    unique_token_series.to_csv(filename, index=False, header=None)

In [79]:
save_unique_tokens(unique_tokens)

In [80]:
def get_index_word_map(unique_tokens):
    # check if all unique
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    word2ind = {}
    ind2word = {}
    for ind, word in enumerate(unique_tokens):
        word2ind[word] = ind
        ind2word[ind] = word
    return word2ind, ind2word

In [81]:
word2ind, ind2word = get_index_word_map(unique_tokens)

In [82]:
def save_dict(dict2save, filename):
    with open(filename, 'w') as f:
        json.dump(dict2save, f)

def save_index_word_map(word2ind, ind2word, word2ind_filename='word2ind', ind2word_filename='ind2word'):
    save_dict(word2ind, word2ind_filename)
    save_dict(ind2word, ind2word_filename)

In [83]:
save_index_word_map(word2ind, ind2word)

In [84]:
glove = pd.read_table('glove.6B.50d.txt', sep=' ', header=None, quoting=3, na_filter=False, index_col=0)

In [85]:
def create_emb(ind2word, imported_emb):
    vocab_size = len(ind2word)
    n_fact = imported_emb.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    for i in range(vocab_size):
        word = ind2word[i]
        try:
            emb[i] = imported_emb.loc[word]
        except KeyError:
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))
    return emb

In [86]:
emb = create_emb(ind2word, glove)

In [87]:
def save_embedding(embedding, filename='embedding.csv'):
    np.savetxt(filename, embedding, delimiter=',')

In [88]:
save_embedding(emb)

In [89]:
def tokenise_cleaned_data(dat, word2ind):
    return dat.apply(lambda words: [word2ind[word] for word in words])

In [90]:
tokenised = tokenise_cleaned_data(cleaned, word2ind)

In [91]:
tokenised.head()

0    [6615, 586, 5250, 3882, 21, 5866, 14, 38, 1285...
1    [11363, 5866, 3515, 12826, 7004, 21, 8515, 0, ...
2    [5552, 9, 7609, 6197, 12739, 5552, 5013, 11731...
3    [6831, 10578, 5089, 5848, 38, 8960, 7801, 4686...
4    [6831, 10578, 5089, 5848, 38, 8960, 7801, 4686...
Name: text_list, dtype: object

In [92]:
def transform_text(text_list, input_length=10):
    list_length = len(text_list)
    res_list = []
    for i in range(0, list_length-input_length):
        res_list.append(text_list[i:i+input_length+1])
    return res_list

In [93]:
def make_training_samples(pd_series):
    transformed_texts = pd_series.apply(transform_text)
    samples = transformed_texts.sum()
    return pd.DataFrame(samples)

In [94]:
samples = make_training_samples(tokenised)

In [95]:
samples.shape

(555801, 11)

In [96]:
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,6615,586,5250,3882,21,5866,14,38,12859,3882,0
1,586,5250,3882,21,5866,14,38,12859,3882,0,386
2,5250,3882,21,5866,14,38,12859,3882,0,386,5866
3,3882,21,5866,14,38,12859,3882,0,386,5866,7018
4,21,5866,14,38,12859,3882,0,386,5866,7018,10587


In [97]:
def save_training_samples(samples, filename='train.csv'):
    samples.to_csv('train.csv', index=False, header=None)

In [98]:
save_training_samples(samples)