In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import json

In [2]:
dat = pd.read_csv('eng_songdata.csv', usecols=['text']).drop_duplicates()

In [3]:
dat.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [4]:
apos_end_pattern = r"'( cause| d| em| ll| m| n| re| s| til| till| twas| ve) (?!')"
apos_start_pattern = r" (d |j |l |ol |y )'"
apos_double_pattern = r" ' n ' "
def data_cleaning(dat):
    dat.loc[:, 'cleaned_text'] = dat.text.str.lower()
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("''cause", "'cause")
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("n't", " n't")
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("''", '"')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r'[\n!"\(\),-.0-9:?\[\]]', lambda x: ' '+x.group(0)+' ')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace("'", " ' ")
    # in' to ing
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r"\w+in'$|\w+in'\s", lambda m: m.group(0).replace("'", 'g'))
    # recover n't
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r" n ' t ", " n't ")
    # recover 'cause, 'd, 'em, 'll, 'm, 'n, 're, 's, 'til, 'till, 'twas
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_end_pattern, lambda m: m.group(0)[:1]+m.group(0)[2:])
    # recover d', j', l', ol', y'
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_start_pattern, lambda m: m.group(0)[:-2]+m.group(0)[-1:])
    # recover 'n'
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(apos_double_pattern, lambda m: m.group(0)[:2]+m.group(0)[3]+m.group(0)[-2:])
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.replace(r' {2,}', ' ')
    dat.loc[:, 'cleaned_text'] = dat.cleaned_text.str.strip()
    dat.loc[:, 'text_list'] = dat.cleaned_text.str.split(' ')
    return dat.loc[:, 'text_list']

In [5]:
glove = pd.read_table('glove.6B.50d.txt', sep=' ', header=None, quoting=3, na_filter=False, index_col=0)

In [6]:
# glove_words = set(glove.index.tolist())

In [7]:
cleaned = data_cleaning(dat.copy())

In [8]:
cleaned.head()

0    [look, at, her, face, ,, it, 's, a, wonderful,...
1    [take, it, easy, with, me, ,, please, \n, touc...
2    [i, 'll, never, know, why, i, had, to, go, \n,...
3    [making, somebody, happy, is, a, question, of,...
4    [making, somebody, happy, is, a, question, of,...
Name: text_list, dtype: object

In [9]:
# try to remove non-english songs
# n_eng = cleaned.apply(lambda x: len(set(x) & glove_words))

In [10]:
#dat.loc[n_eng > 20].to_csv('eng_songdata.csv', index=False)

In [11]:
cleaned_sets = cleaned.apply(set)

In [25]:
def get_unique_tokens(text_sets):
    tokens = set()
    for ind, item in text_sets.iteritems():
        tokens = tokens | item
    return len(tokens), sorted(list(tokens))

In [26]:
vocab_size, unique_tokens = get_unique_tokens(cleaned_sets)

In [27]:
vocab_size

81452

In [28]:
unique_tokens[:5]

['\n', '!', '"', "'", "'cause"]

In [29]:
def save_unique_tokens(unique_tokens, filename='unique_tokens'):
    # check if all unique
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    unique_token_series = pd.Series(unique_tokens)
    unique_token_series.to_csv(filename, index=False, header=None)

In [30]:
save_unique_tokens(unique_tokens)

In [31]:
def get_index_word_map(unique_tokens):
    # check if all unique
    assert len(unique_tokens) == len(set(unique_tokens)), 'Make sure all tokens unique!'
    word2ind = {}
    ind2word = {}
    for ind, word in enumerate(unique_tokens):
        word2ind[word] = ind
        ind2word[ind] = word
    return word2ind, ind2word

In [32]:
word2ind, ind2word = get_index_word_map(unique_tokens)

In [33]:
def save_dict(dict2save, filename):
    with open(filename, 'w') as f:
        json.dump(dict2save, f)

def save_index_word_map(word2ind, ind2word, word2ind_filename='word2ind', ind2word_filename='ind2word'):
    save_dict(word2ind, word2ind_filename)
    save_dict(ind2word, ind2word_filename)

In [34]:
save_index_word_map(word2ind, ind2word)

In [35]:
def create_emb(ind2word, imported_emb):
    vocab_size = len(ind2word)
    n_fact = imported_emb.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    for i in range(vocab_size):
        word = ind2word[i]
        try:
            emb[i] = imported_emb.loc[word]
        except KeyError:
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))
    return emb

In [36]:
emb = create_emb(ind2word, glove)

In [37]:
def save_embedding(embedding, filename='embedding.csv'):
    np.savetxt(filename, embedding, delimiter=',')

In [38]:
save_embedding(emb)

In [39]:
def tokenise_cleaned_data(dat, word2ind):
    return dat.apply(lambda words: [word2ind[word] for word in words])

In [40]:
tokenised = tokenise_cleaned_data(cleaned, word2ind)

In [41]:
tokenised.head()

0    [41434, 3741, 31893, 23851, 19, 36007, 12, 36,...
1    [70486, 36007, 21300, 79589, 44029, 19, 54233,...
2    [33764, 7, 48341, 38656, 79162, 33764, 30328, ...
3    [42765, 66351, 30916, 35884, 36, 56819, 49792,...
4    [42765, 66351, 30916, 35884, 36, 56819, 49792,...
Name: text_list, dtype: object

In [42]:
def transform_text(text_list, input_length=10):
    list_length = len(text_list)
    res_list = []
    for i in range(0, list_length-input_length):
        res_list.append(text_list[i:i+input_length+1])
    return res_list

In [43]:
def make_training_samples(pd_series, input_length=10):
    transformed_texts = pd_series.apply(transform_text, input_length=input_length)
    samples = transformed_texts.sum()
    return pd.DataFrame(samples)

In [44]:
samples = make_training_samples(tokenised, input_length=20)

In [45]:
samples.shape

(15674922, 21)

In [46]:
samples.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,41434,3741,31893,23851,19,36007,12,36,79788,23851,...,2348,36007,44070,66380,66904,72665,44029,0,41434,3741
1,3741,31893,23851,19,36007,12,36,79788,23851,0,...,36007,44070,66380,66904,72665,44029,0,41434,3741,71714
2,31893,23851,19,36007,12,36,79788,23851,0,2348,...,44070,66380,66904,72665,44029,0,41434,3741,71714,78316
3,23851,19,36007,12,36,79788,23851,0,2348,36007,...,66380,66904,72665,44029,0,41434,3741,71714,78316,71689
4,19,36007,12,36,79788,23851,0,2348,36007,44070,...,66904,72665,44029,0,41434,3741,71714,78316,71689,63594


In [47]:
def save_training_samples(samples, filename='train.csv'):
    samples.to_csv('train.csv', index=False, header=None)

In [None]:
save_training_samples(samples)