# Boilerplate

In [1]:
# files
TRAINING_DIRECTORY = 'cnn/summary/'
EXTENSION = '.question'
N_TRAIN_FILES = 40000
N_TEST_FILES = 500

# tokenization
FILTERS = ''
PAD_CHAR = '@pad'
INPUT_END_CHAR = '@input-end'
TARGET_END_CHAR = '@target-end'
OOV_CHAR = '@unk'
PAD_TOKEN = 0
INPUT_END_TOKEN = 1
TARGET_END_TOKEN = 2
OOV_TOKEN = 3

# MODEL_PARAMS
MAX_INPUT_LEN = 150
MAX_TARGET_LEN = 20
MAX_TOTAL_LEN = MAX_INPUT_LEN + MAX_TARGET_LEN
NUM_WORDS = 15000

# Read in files

In [2]:
import glob

In [3]:
FILES = glob.glob('%s/*%s' % (TRAINING_DIRECTORY, EXTENSION))
print(len(FILES))
FILES

380298


['cnn/summary/239c17682016c0c244cdf46200c5f740c8a2c9db.question',
 'cnn/summary/65c7b28e3ba0dacc5a85ab3ea1c9a635744bdf83.question',
 'cnn/summary/dc65fd491d2ac61c83037e5c1338cf6de19f95ee.question',
 'cnn/summary/603b08f191ac8bf06df441619dd912f446179065.question',
 'cnn/summary/0b3c640ebc44567eb3beb8b8297638d7090e8384.question',
 'cnn/summary/a3c5386bc005ef369b70267a5b4248b37052b64a.question',
 'cnn/summary/4dfd23aa3de5205cfe91b091b5365994105d29ba.question',
 'cnn/summary/36b0c8f1069460e5d997ad66c4ebff1c350c5000.question',
 'cnn/summary/18062cf32b6e91bbfa45e8645317499d417e2d87.question',
 'cnn/summary/b01d1b7f5bc30e17d727de4d8d85245b8ff2ffd0.question',
 'cnn/summary/530c4400a0a83900c2810e2795607af94a2447a5.question',
 'cnn/summary/9f17668a9cd8170fdad9788ed64e489e468cf9c3.question',
 'cnn/summary/3eb19c15e3b126ef702183f031e6bedfd8276ed9.question',
 'cnn/summary/198bf4f7ed30c58ad769029dc18f28c843172ae3.question',
 'cnn/summary/20ba1ddd872afe8f10f0242838ff76ede6e1d200.question',
 'cnn/summ

In [4]:
TRAIN_FILES = FILES[:N_TRAIN_FILES]
TEST_FILES = FILES[N_TRAIN_FILES:N_TRAIN_FILES+N_TEST_FILES]

# Define batch generator

In [5]:
import random
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from data import BaseBatchGenerator

class SummaryBatchGenerator(BaseBatchGenerator):
    def __init__(self, tokenizer, one_hot_encoder, num_words, max_input_len,
                 max_target_len, sentence_len, input_end_token, target_end_token):
        self.tokenizer = tokenizer
        self.one_hot_encoder = one_hot_encoder
        self.num_words = num_words
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len
        self.sentence_len = sentence_len
        self.input_end_token = input_end_token
        self.target_end_token = target_end_token

    def generate_batches(self, steps, batch_size, n_batches):
        for i in range(n_batches):
            start, stop = i*batch_size, (i+1)*batch_size
            X = np.array(steps[start:stop])
            y = self.one_hot_encoder([[i] for s in X for i in s])
            y = y.reshape((batch_size, self.sentence_len+1, self.num_words))
            yield X[:,:-1], y[:,1:,:]

    def generate_steps(self, item):
        training_example = self.process_file(item)
        if len(training_example.target_tokens) > self.max_target_len:
            return []
        example = training_example.input_tokens[:self.max_input_len] \
                + [self.input_end_token] \
                + training_example.target_tokens \
                + [self.target_end_token]
        # add 1 to sentence_len since we shift output one step forward to prevent
        # model from attending to future time steps
        example = pad_sequences([example], maxlen=self.sentence_len, padding='post')[0]
        example = pad_sequences([example], maxlen=self.sentence_len+1, padding='pre')[0]
        return [example]

    def process_file(self, file):
        input_text, target_text = self.read_file(file)
        input_tokens, target_tokens = self.tokenizer([input_text])[0], self.tokenizer([target_text])[0]
        training_example = TrainingExample(file, input_text, target_text,
                                           input_tokens, target_tokens)
        return training_example

    @staticmethod
    def read_file(file):
        with open(file) as f:
            context, target = f.read().split('\t')
        return context, target

    def batches_per_epoch(self, items, batch_size):
        steps = sum(len(self.process_file(f).target_tokens) for f in items)
        return steps // batch_size
    

class TrainingExample:
    def __init__(self, item, input_text, target_text, input_tokens,
                 target_tokens):
        self.item = item
        self.input_text = input_text
        self.target_text = target_text
        self.input_tokens = input_tokens
        self.target_tokens = target_tokens

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Initialize Tokenizer

In [6]:
from tokenization import Tokenizer

In [7]:
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters=FILTERS,
    oov_token=OOV_CHAR)

In [8]:
text_gen = (text for f in TRAIN_FILES+TEST_FILES for text in SummaryBatchGenerator.read_file(f))

In [9]:
%%time
tokenizer.fit_on_texts(text_gen)

CPU times: user 18.6 s, sys: 224 ms, total: 18.9 s
Wall time: 19.3 s


In [10]:
print('TOKENIZER.num_words', tokenizer.num_words)
print('len(TOKENIZER.word_index)', len(tokenizer.word_index))
tokenizer.word_index

TOKENIZER.num_words 15000
len(TOKENIZER.word_index) 79411


{'pizzerias': 77992,
 'filthy': 12388,
 'better': 366,
 'dadaist': 72640,
 'mathematician': 29038,
 'aog': 52206,
 '2.25': 42060,
 'superbike': 56481,
 'fleetingly': 66552,
 'poles': 11875,
 'fatality': 11757,
 'leaked': 4258,
 'lunchbox': 44291,
 'outdoor': 5302,
 'warden': 16579,
 'author': 1741,
 'monosodium': 58794,
 'riotously': 55513,
 'balatkar': 61815,
 'waggling': 51872,
 'comeback': 3273,
 'redditor': 46966,
 'ruined': 8875,
 '1995/1996': 44218,
 'recalibrate': 24838,
 'multi-successful': 66088,
 'entomologist': 39313,
 'total': 1248,
 '9647': 63448,
 'auteur': 25587,
 'shakira': 55926,
 'launching': 3955,
 '20.32': 65186,
 'pro-reformist': 55080,
 'bandit': 18539,
 'polycarbonate': 33801,
 'baseboards': 54528,
 'glared': 31503,
 'misdirects': 47173,
 'economize': 46096,
 'verbosity': 74985,
 'delays': 3943,
 're-assertive': 44084,
 '12th': 4402,
 'grievances': 7940,
 'recidivism': 21734,
 'accuser': 15529,
 'smithing': 56213,
 'twisters': 16109,
 'ouster': 4946,
 'institutio

In [11]:
index_to_word = {v: k for k, v in tokenizer.word_index.items()}
index_to_word[PAD_TOKEN] = PAD_CHAR
index_to_word[INPUT_END_TOKEN] = INPUT_END_CHAR
index_to_word[TARGET_END_TOKEN] = TARGET_END_CHAR

In [12]:
sorted(index_to_word.items(), key=lambda x: x[0])

[(0, '@pad'),
 (1, '@input-end'),
 (2, '@target-end'),
 (3, '@unk'),
 (4, 'the'),
 (5, ','),
 (6, '.'),
 (7, 'to'),
 (8, '"'),
 (9, 'of'),
 (10, 'and'),
 (11, 'a'),
 (12, 'in'),
 (13, "'s"),
 (14, 'that'),
 (15, 'for'),
 (16, 'is'),
 (17, '-'),
 (18, 'on'),
 (19, 'it'),
 (20, 'said'),
 (21, 'was'),
 (22, 'he'),
 (23, 'with'),
 (24, 'as'),
 (25, 'i'),
 (26, 'his'),
 (27, '--'),
 (28, '@entity1'),
 (29, 'at'),
 (30, 'have'),
 (31, 'from'),
 (32, 'but'),
 (33, 'are'),
 (34, 'be'),
 (35, 'has'),
 (36, 'by'),
 (37, 'this'),
 (38, 'not'),
 (39, '@entity2'),
 (40, 'an'),
 (41, 'we'),
 (42, 'they'),
 (43, '@entity3'),
 (44, '@entity0'),
 (45, 'who'),
 (46, '@entity4'),
 (47, 'will'),
 (48, ')'),
 (49, '('),
 (50, "n't"),
 (51, 'you'),
 (52, ':'),
 (53, 'were'),
 (54, 'their'),
 (55, 'had'),
 (56, '@entity5'),
 (57, 'she'),
 (58, 'been'),
 (59, 'about'),
 (60, 'one'),
 (61, 'more'),
 (62, 'after'),
 (63, 'her'),
 (64, '@entity6'),
 (65, 'or'),
 (66, 'there'),
 (67, 'when'),
 (68, 'people'),
 (6

In [13]:
tokenizer.num_words = min(len(tokenizer.word_index)+1, tokenizer.num_words)
NUM_WORDS = tokenizer.num_words

In [14]:
TOKENIZER = tokenizer.texts_to_sequences
ONE_HOT_ENCODER = tokenizer.sequences_to_matrix

# Take a look at batches

In [15]:
batch_gen = SummaryBatchGenerator(
    tokenizer=TOKENIZER,
    one_hot_encoder=ONE_HOT_ENCODER,
    num_words=NUM_WORDS,
    max_input_len=MAX_INPUT_LEN,
    max_target_len=MAX_TARGET_LEN,
    sentence_len=MAX_TOTAL_LEN,
    input_end_token=INPUT_END_TOKEN,
    target_end_token=TARGET_END_TOKEN
).generate_forever(TRAIN_FILES, batch_size=32)

In [16]:
X, y = next(batch_gen)

In [17]:
X.shape, y.shape

((32, 170), (32, 170, 15000))

In [18]:
X

array([[   0,   49,   44, ...,   17,  531,    2],
       [   0, 1847, 6449, ...,    0,    0,    0],
       [   0,   49,   44, ...,    0,    0,    0],
       ...,
       [   0,   49,   44, ...,    0,    0,    0],
       [   0,   44,   49, ...,    0,    0,    0],
       [   0,   12,  464, ..., 3202,    2,    0]], dtype=int32)

In [19]:
y

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [20]:
print('\n\n'.join([' '.join([index_to_word[i] for i in x]) for x in X]))

@pad ( @entity0 ) -- @entity2 's tennis shoes turned into dancing shoes when he returned to action at the @entity5 . the world @entity6 polished off @entity9 's @entity8 6 - 2 6 - 1 in under an hour on tuesday and then @unk on center court to the hit song , " get lucky " by @entity12 . @entity2 has n't always had the fans on his side -- the crowd was firmly against him at the @entity16 in may -- but his @unk moves @unk him to the spectators in @entity19 . " it was a little agreement i had with my friends when we had vacation time a few weeks ago , " @entity2 was quoted as saying by the @entity21 's website . " we listened to that song quite a lot . " we had a lot of dancing going on . i said , @input-end @entity90 loses at the women 's event in @entity91 after claiming the first set 6 - 0 @target-end

@pad tensions escalated in the @entity2 region this week after @entity3 , @entity4 and the @entity5 were involved in a series of potentially explosive confrontations over disputed territo

In [21]:
# only one per row
import numpy as np
ys = np.argwhere(y[0] == 1)

In [22]:
import numpy as np
for j in range(0, len(y), 5):
    ys = np.argwhere(y[j] == 1)
    assert len(ys) == len({row for row, idx in ys})
    print(j)
    print(' '.join(index_to_word[idx] for row, idx in ys))
    print('\n')

0
( @entity0 ) -- @entity2 's tennis shoes turned into dancing shoes when he returned to action at the @entity5 . the world @entity6 polished off @entity9 's @entity8 6 - 2 6 - 1 in under an hour on tuesday and then @unk on center court to the hit song , " get lucky " by @entity12 . @entity2 has n't always had the fans on his side -- the crowd was firmly against him at the @entity16 in may -- but his @unk moves @unk him to the spectators in @entity19 . " it was a little agreement i had with my friends when we had vacation time a few weeks ago , " @entity2 was quoted as saying by the @entity21 's website . " we listened to that song quite a lot . " we had a lot of dancing going on . i said , @input-end @entity90 loses at the women 's event in @entity91 after claiming the first set 6 - 0 @target-end @pad


5
@entity1 , @entity0 ( @entity2 ) -- early on , @entity3 knew exactly what she wanted to do in life . dressed in her high school soccer uniform , @entity3 said she wanted to be a @ent

# Training

In [23]:
from keras.optimizers import adam
from keras.losses import categorical_crossentropy
# from evaluate import rouge_l_fscore

In [24]:
# model architecture
N_HEADS = 8
N_LAYERS = 5
D_MODEL = 64*N_HEADS
SENTENCE_LEN = MAX_TOTAL_LEN
VOCAB_SIZE = NUM_WORDS
DROPOUT = 0.1

# batch training
N_EPOCHS = 1000
WARMUP_STEPS = 200
BATCH_SIZE = 32
CALLBACKS = []
METRICS = [categorical_crossentropy]

# optimization
# https://arxiv.org/pdf/1804.00247.pdf
OPTIMIZER = adam(lr=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9, clipnorm=2.0)

In [25]:
batch_gen = SummaryBatchGenerator(
    tokenizer=TOKENIZER,
    one_hot_encoder=ONE_HOT_ENCODER,
    num_words=NUM_WORDS,
    max_input_len=MAX_INPUT_LEN,
    max_target_len=MAX_TARGET_LEN,
    sentence_len=MAX_TOTAL_LEN,
    input_end_token=INPUT_END_TOKEN,
    target_end_token=TARGET_END_TOKEN
)

In [26]:
%%time
TRAIN_STEPS_PER_EPOCH = batch_gen.batches_per_epoch(TRAIN_FILES, batch_size=BATCH_SIZE)
print('train steps per epoch', TRAIN_STEPS_PER_EPOCH)

train steps per epoch 15574
CPU times: user 15.6 s, sys: 248 ms, total: 15.9 s
Wall time: 15.9 s


In [27]:
%%time
TEST_STEPS_PER_EPOCH = batch_gen.batches_per_epoch(TEST_FILES, batch_size=BATCH_SIZE)
print('train steps per epoch', TEST_STEPS_PER_EPOCH)

train steps per epoch 197
CPU times: user 216 ms, sys: 4 ms, total: 220 ms
Wall time: 218 ms


In [28]:
Xy_train = batch_gen.generate_forever(TRAIN_FILES, batch_size=BATCH_SIZE)
Xy_test = batch_gen.generate_forever(TEST_FILES, batch_size=BATCH_SIZE)

In [29]:
from model_decoder import TransformerDecoder
model = TransformerDecoder(
    n_heads=N_HEADS,
    decoder_layers=N_LAYERS,
    d_model=D_MODEL,
    vocab_size=VOCAB_SIZE,
    sequence_len=SENTENCE_LEN,
    dropout=DROPOUT)

In [30]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 170, 512)     7680000     input[0][0]                      
__________________________________________________________________________________________________
positional_encoding_1 (Position (None, 170, 512)     0           embedding[0][0]                  
__________________________________________________________________________________________________
embedding_scalar (Scalar)       (None, 170, 512)     0           positional_encoding_1[0][0]      
__________________________________________________________________________________________________
dropout_1 

In [31]:
import keras.backend as K
def perplexity(y_true, y_pred):
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    return K.exp(cross_entropy)
LOSS = perplexity

In [32]:
class LRScheduler:
    def __init__(self, d_model, warmup_steps):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.epoch = 1

    def lr(self, epoch):
        lr = self.d_model**-.5 * min(self.epoch**-.5, epoch*(self.warmup_steps**-1.5))
        self.epoch += 1
        return lr
lr_scheduler = LRScheduler(D_MODEL, WARMUP_STEPS).lr

In [33]:
from keras.callbacks import LearningRateScheduler
# CALLBACKS.append(LearningRateScheduler(lr_scheduler))

In [34]:
model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)

In [None]:
# print loss value references
def display_loss_reference():
    format_ = 'loss: %20s\tentropy: %20s'
    upper_limit = np.log(VOCAB_SIZE)
    print(format_ % (np.exp(upper_limit), upper_limit))
    for i in reversed(range(int(np.floor(upper_limit))+1)):
        print(format_ % (np.exp(i), i))
display_loss_reference()

loss:   15000.000000000004	entropy:    9.615805480084347
loss:    8103.083927575384	entropy:                    9
loss:   2980.9579870417283	entropy:                    8
loss:   1096.6331584284585	entropy:                    7
loss:    403.4287934927351	entropy:                    6
loss:    148.4131591025766	entropy:                    5
loss:   54.598150033144236	entropy:                    4
loss:   20.085536923187668	entropy:                    3
loss:     7.38905609893065	entropy:                    2
loss:    2.718281828459045	entropy:                    1
loss:                  1.0	entropy:                    0


In [None]:
model.fit_generator(
    Xy_train,
    steps_per_epoch=TRAIN_STEPS_PER_EPOCH,
    epochs=N_EPOCHS,
    validation_data=Xy_test,
    validation_steps=TEST_STEPS_PER_EPOCH,
    callbacks=CALLBACKS)

Epoch 1/1000