In [1]:
# https://www.kaggle.com/c/text-normalization-challenge-english-language

In [1]:
import pickle as pkl
import numpy as np
from pathlib import Path
import csv
from sklearn.model_selection import train_test_split
import itertools
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm import tqdm

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
PATH = Path('DATA')
NUM_CLASSES = 100
MAX_LEN = 50

## Read data

In [3]:
x, y = [], []
with open(PATH/'kaggle_data/en_train.csv', newline='', encoding='utf8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for i, row in enumerate(spamreader):
        if i > 0:
            x.append(row[3])
            y.append(row[4])

print('Num examples:', len(x), len(y))
print('Num classes: ', len(set(y)))


#x_trn, x_val, y_trn, y_val = train_test_split(x, y, test_size=0.05, random_state=42)
#pkl.dump([x_trn, y_trn, x_val, y_val], open(PATH/'xt_yt_xv_yv.pkl', 'wb'))

Num examples: 9918441 9918441
Num classes:  471587


## Process data

#### Inspect data

In [50]:
print('\n############## Train data:')
for i in range(10, 15):
    print(x[i], '-', y[i])


############## Train data:
2006 - two thousand six
IUCN - i u c n
Red - Red
List - List
of - of


In [51]:
# Get longest words
print(max([len(x) for x in x]))
print(max([len(x) for x in y]))
# Clearly has some abnormally long sequences 

1057
3767


In [52]:
sorted_len_x = sorted(x, key=len)
sorted_len_y = sorted(y, key=len)

In [53]:
# The outputs contain links and very long numbers, the character based model should have around 130 max characters
print(sorted_len_x[-650])
print(sorted_len_y[-650])

http://cfpub.epa.gov/ncea/iris/index.cfm
nine trillion seven hundred eighty billion eight hundred four million seven hundred thirty seven thousand seven hundred forty six


#### Tokenize data

In [4]:
x_t = []
y_t = []

for it, (xi, yi) in tqdm(enumerate(zip(x, y))):
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    if len(xi) <= MAX_LEN and len(yi) <= MAX_LEN:
        yi = '\t' + yi + '\n'
        y_t.append(yi)
        x_t.append(xi)
            
tokenizer = Tokenizer(num_words=NUM_CLASSES, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
          lower=True, split='', char_level=True, oov_token='OOV')

tokenizer.fit_on_texts(x_t + y_t)
x_t = tokenizer.texts_to_sequences(x_t)
y_t = tokenizer.texts_to_sequences(y_t)
print(tokenizer.word_index['p'])

9918441it [00:08, 1192725.56it/s]


19


In [5]:
# Pre pad x and post pad y
x_t_p = pad_sequences(x_t, maxlen=MAX_LEN+2, dtype='int32', padding='pre', truncating='post', value=0)
y_t_p = pad_sequences(y_t, maxlen=MAX_LEN+2, dtype='int32', padding='post', truncating='post', value=0)

In [6]:
print(tokenizer.word_index['\t'])
ind=8
print(x_t_p[ind], x_t_p[ind].shape)
print(y_t_p[ind], y_t_p[ind].shape)

3
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  6 15  6  7  5 12  6
 15  2  6  2] (52,)
[ 3  6 15  6  7  5 12  6 15  2  6  2  4  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0] (52,)


In [134]:
encoder_input_data = x_t_p
decoder_input_data = y_t_p
decoder_target_data = np.zeros_like(decoder_input_data)

for i, val in enumerate(decoder_input_data):
    decoder_target_data[i][1:] = decoder_input_data[i][:-1]

In [139]:
print(decoder_input_data[0])
print(decoder_target_data[0])

[ 3 25 10  8 13 13  6  7  5  6  8 11  8  6  4  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]
[ 0  3 25 10  8 13 13  6  7  5  6  8 11  8  6  4  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]


In [143]:
np.savez(open(PATH/'pkl/enci_deci_dect.npz', 'wb'), encoder_input_data, decoder_input_data, decoder_target_data)

In [129]:
# x_trn, x_val, y_trn, y_val = train_test_split(x_p_t, y_p_t, test_size=0.05, random_state=42)

In [135]:
# pkl.dump([x_trn, y_trn, x_val, y_val], open(PATH/'xt_yt_xv_yv.pkl', 'wb'), protocol=4)

## Load data

In [146]:
with open(PATH/'pkl/enci_deci_dect.npz', 'rb') as f:
    encoder_input_data, decoder_input_data, decoder_target_data = np.load(f)

In [10]:
with open(PATH/'pkl/xt_yt_xv_yv.pkl', 'rb') as f:
    x_trn, y_trn, x_val, y_val = pkl.load(f)

In [None]:
for i, (input_text, target_text) in enumerate(zip(x_t, y_t)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.