In [1]:
import numpy as np
import pandas as pd
import json

In [48]:
dat = pd.read_csv('songdata.csv', usecols=['text'])

In [49]:
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [50]:
apos_end_pattern = r"'( cause| d| em| ll| m| n| re| s| til| till| twas| ve) (?!')"
apos_start_pattern = r" (d |j |l |ol |y )'"
apos_double_pattern = r" ' n ' "
def data_cleaning(dat):
    dat.loc[:, 'cleaned_text'] = dat.text.str.lower()
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("''cause", "'cause")
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("''", '"')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r'[\n!"\(\),-.0-9:?\[\]]', lambda x: ' '+x.group(0)+' ')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("'", " ' ")
    # in' to ing
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r"\w+in'$|\w+in'\s", lambda m: m.group(0).replace("'", 'g'))
    # recover 'cause, 'd, 'em, 'll, 'm, 'n, 're, 's, 'til, 'till, 'twas
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_end_pattern, lambda m: m.group(0)[:1]+m.group(0)[2:])
    # recover d', j', l', ol', y'
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_start_pattern, lambda m: m.group(0)[:-2]+m.group(0)[-1:])
    # recover 'n'
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_double_pattern, lambda m: m.group(0)[:2]+m.group(0)[3]+m.group(0)[-2:])
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r' {2,}', ' ')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.strip()
    dat.loc[:, 'text_list'] = dat.cleaned_text.str.split(' ')
    return dat.loc[:, 'text_list']

In [51]:
cleaned = data_cleaning(dat.head(10000).copy())

In [52]:
cleaned.head()

0    [look, at, her, face, ,, it, 's, a, wonderful,...
1    [take, it, easy, with, me, ,, please, \n, touc...
2    [i, 'll, never, know, why, i, had, to, go, \n,...
3    [making, somebody, happy, is, a, question, of,...
4    [making, somebody, happy, is, a, question, of,...
Name: text_list, dtype: object

In [53]:
def get_unique_tokens(tokens):
    unique_tokens = sorted(list(set(tokens)))
    return len(unique_tokens), unique_tokens

In [54]:
tokens = cleaned.sum()

In [55]:
vocab_size, unique_tokens = get_unique_tokens(tokens)

In [56]:
vocab_size

32930

In [57]:
unique_tokens[:5]

['\n', '!', '"', "'", "'cause"]

In [58]:
def save_unique_tokens(unique_tokens, filename='unique_tokens'):
    # check if all unique
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    unique_token_series = pd.Series(unique_tokens)
    unique_token_series.to_csv(filename, index=False, header=None)

In [59]:
save_unique_tokens(unique_tokens)

In [60]:
def get_index_word_map(unique_tokens):
    # check if all unique
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    word2ind = {}
    ind2word = {}
    for ind, word in enumerate(unique_tokens):
        word2ind[word] = ind
        ind2word[ind] = word
    return word2ind, ind2word

In [61]:
word2ind, ind2word = get_index_word_map(unique_tokens)

In [62]:
def save_dict(dict2save, filename):
    with open(filename, 'w') as f:
        json.dump(dict2save, f)

def save_index_word_map(word2ind, ind2word, word2ind_filename='word2ind', ind2word_filename='ind2word'):
    save_dict(word2ind, word2ind_filename)
    save_dict(ind2word, ind2word_filename)

In [63]:
save_index_word_map(word2ind, ind2word)

In [64]:
glove = pd.read_table('glove.6B.50d.txt', sep=' ', header=None, quoting=3, na_filter=False, index_col=0)

In [65]:
def create_emb(ind2word, imported_emb):
    vocab_size = len(ind2word)
    n_fact = imported_emb.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    for i in range(vocab_size):
        word = ind2word[i]
        try:
            emb[i] = imported_emb.loc[word]
        except KeyError:
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))
    return emb

In [66]:
emb = create_emb(ind2word, glove)

In [67]:
def save_embedding(embedding, filename='embedding.csv'):
    np.savetxt(filename, embedding, delimiter=',')

In [68]:
save_embedding(emb)

In [69]:
def tokenise_cleaned_data(dat, word2ind):
    return dat.apply(lambda words: [word2ind[word] for word in words])

In [70]:
tokenised = tokenise_cleaned_data(cleaned, word2ind)

In [71]:
tokenised.head()

0    [16657, 1464, 13086, 9847, 19, 14707, 12, 36, ...
1    [28629, 14707, 8824, 32363, 17689, 19, 21701, ...
2    [13846, 7, 19356, 15648, 32161, 13846, 12503, ...
3    [17183, 26796, 12700, 14662, 36, 22803, 19884,...
4    [17183, 26796, 12700, 14662, 36, 22803, 19884,...
Name: text_list, dtype: object

In [72]:
def transform_text(text_list, input_length=10):
    list_length = len(text_list)
    res_list = []
    for i in range(0, list_length-input_length):
        res_list.append(text_list[i:i+input_length+1])
    return res_list

In [73]:
def make_training_samples(pd_series):
    transformed_texts = pd_series.apply(transform_text)
    samples = transformed_texts.sum()
    return pd.DataFrame(samples)

In [74]:
samples = make_training_samples(tokenised)

In [75]:
samples.shape

(2780161, 11)

In [76]:
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,16657,1464,13086,9847,19,14707,12,36,32437,9847,0
1,1464,13086,9847,19,14707,12,36,32437,9847,0,912
2,13086,9847,19,14707,12,36,32437,9847,0,912,14707
3,9847,19,14707,12,36,32437,9847,0,912,14707,17710
4,19,14707,12,36,32437,9847,0,912,14707,17710,26809


In [77]:
def save_training_samples(samples, filename='train.csv'):
    samples.to_csv('train.csv', index=False, header=None)

In [78]:
save_training_samples(samples)