# Boilerplate

In [None]:
# files
TRAINING_DIRECTORY = 'cnn/summary/'
EXTENSION = '.question'
N_TRAIN_FILES = 20000
N_TEST_FILES = 500

# tokenization
FILTERS = ''
PAD_CHAR = '@pad'
INPUT_END_CHAR = '@input-end'
TARGET_END_CHAR = '@target-end'
OOV_CHAR = '@unk'
PAD_TOKEN = 0
INPUT_END_TOKEN = 1
TARGET_END_TOKEN = 2
OOV_TOKEN = 3

# MODEL_PARAMS
MAX_INPUT_LEN = 150
MAX_TARGET_LEN = 20
MAX_TOTAL_LEN = MAX_INPUT_LEN + MAX_TARGET_LEN
NUM_WORDS = 15000

# Read in files

In [None]:
import glob

In [None]:
FILES = glob.glob('%s/*%s' % (TRAINING_DIRECTORY, EXTENSION))
print(len(FILES))
FILES

In [None]:
TRAIN_FILES = FILES[:N_TRAIN_FILES]
TEST_FILES = FILES[N_TRAIN_FILES:N_TRAIN_FILES+N_TEST_FILES]

# Define batch generator

In [5]:
import random
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from data import BaseBatchGenerator

class SummaryBatchGenerator(BaseBatchGenerator):
    def __init__(self, tokenizer, one_hot_encoder, num_words, max_input_len,
                 max_target_len, sentence_len, input_end_token, target_end_token):
        self.tokenizer = tokenizer
        self.one_hot_encoder = one_hot_encoder
        self.num_words = num_words
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len
        self.sentence_len = sentence_len
        self.input_end_token = input_end_token
        self.target_end_token = target_end_token

    def generate_batches(self, steps, batch_size, n_batches):
        for i in range(n_batches):
            start, stop = i*batch_size, (i+1)*batch_size
            X = np.array(steps[start:stop])
            y = self.one_hot_encoder([[i] for s in X for i in s])
            y = y.reshape((batch_size, self.sentence_len+1, self.num_words))
            yield X[:,:-1], y[:,1:,:]

    def generate_steps(self, item):
        training_example = self.process_file(item)
        if len(training_example.target_tokens) > self.max_target_len:
            return []
        example = training_example.input_tokens[:self.max_input_len] \
                + [self.input_end_token] \
                + training_example.target_tokens \
                + [self.target_end_token]
        # add 1 to sentence_len since we shift output one step forward to prevent
        # model from attending to future time steps
        example = pad_sequences([example], maxlen=self.sentence_len, padding='post')[0]
        example = pad_sequences([example], maxlen=self.sentence_len+1, padding='pre')[0]
        return [example]

    def process_file(self, file):
        input_text, target_text = self.read_file(file)
        input_tokens, target_tokens = self.tokenizer([input_text])[0], self.tokenizer([target_text])[0]
        training_example = TrainingExample(file, input_text, target_text,
                                           input_tokens, target_tokens)
        return training_example

    @staticmethod
    def read_file(file):
        with open(file) as f:
            context, target = f.read().split('\t')
        return context, target

    def batches_per_epoch(self, items, batch_size):
        steps = sum(len(self.process_file(f).target_tokens) for f in items)
        return steps // batch_size
    

class TrainingExample:
    def __init__(self, item, input_text, target_text, input_tokens,
                 target_tokens):
        self.item = item
        self.input_text = input_text
        self.target_text = target_text
        self.input_tokens = input_tokens
        self.target_tokens = target_tokens

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Initialize Tokenizer

In [6]:
from tokenization import Tokenizer

In [7]:
TOKENIZER = Tokenizer(
    num_words=NUM_WORDS,
    filters=FILTERS,
    oov_token=OOV_CHAR)

In [8]:
text_gen = (text for f in TRAIN_FILES+TEST_FILES for text in SummaryBatchGenerator.read_file(f))

In [9]:
%%time
TOKENIZER.fit_on_texts(text_gen)

CPU times: user 12.2 s, sys: 396 ms, total: 12.6 s
Wall time: 16.8 s


In [10]:
print('TOKENIZER.num_words', TOKENIZER.num_words)
print('len(TOKENIZER.word_index)', len(TOKENIZER.word_index))
TOKENIZER.word_index

TOKENIZER.num_words 15000
len(TOKENIZER.word_index) 65190


{'6:00': 34974,
 'abideth': 55279,
 'crewmen': 35763,
 'anti-prostitution': 54409,
 'abides': 30580,
 '9pm': 38516,
 'marionettes': 40398,
 'tardiness': 32313,
 'frenetic': 23067,
 'usury': 35853,
 'attested': 44226,
 'spiked': 12415,
 'peers': 6101,
 'strict': 4210,
 'appearing': 4302,
 '1711': 35838,
 'harper': 45392,
 'sightseers': 49472,
 'administrating': 50632,
 'un-erotic': 57859,
 'pitchers': 17111,
 '75.6': 52537,
 'walkabout': 62861,
 'housemate': 33629,
 'spiritualized': 54500,
 're-attached': 42155,
 'anniversary': 2084,
 'rated': 4693,
 'digits': 11113,
 'vendor': 9366,
 'decadents': 49544,
 'neo-conservatism': 37501,
 'stared': 13100,
 'ballpoint': 45853,
 'astrobiologists': 46836,
 '@entity540': 53627,
 'tenuousness': 52768,
 'epidemiological': 33203,
 'leaky': 30491,
 '8.5': 12203,
 'ex-beatle': 36369,
 'emptor': 45698,
 'assaulted': 5841,
 'unpolished': 64784,
 'gallingly': 58703,
 'scrapbooking': 42434,
 'crabs': 17658,
 'dois': 43927,
 'divisive': 7298,
 'strangulati

In [11]:
index_to_word = {v: k for k, v in TOKENIZER.word_index.items()}
index_to_word[PAD_TOKEN] = PAD_CHAR
index_to_word[INPUT_END_TOKEN] = INPUT_END_CHAR
index_to_word[TARGET_END_TOKEN] = TARGET_END_CHAR

In [12]:
sorted(index_to_word.items(), key=lambda x: x[0])

[(0, '@pad'),
 (1, '@input-end'),
 (2, '@target-end'),
 (3, '@unk'),
 (4, 'the'),
 (5, ','),
 (6, '.'),
 (7, 'to'),
 (8, '"'),
 (9, 'of'),
 (10, 'and'),
 (11, 'a'),
 (12, 'in'),
 (13, "'s"),
 (14, 'that'),
 (15, 'for'),
 (16, 'is'),
 (17, '-'),
 (18, 'on'),
 (19, 'it'),
 (20, 'said'),
 (21, 'was'),
 (22, 'he'),
 (23, 'with'),
 (24, 'as'),
 (25, 'i'),
 (26, 'his'),
 (27, '@entity1'),
 (28, '--'),
 (29, 'at'),
 (30, 'have'),
 (31, 'from'),
 (32, 'but'),
 (33, 'are'),
 (34, 'be'),
 (35, 'has'),
 (36, 'by'),
 (37, 'this'),
 (38, 'not'),
 (39, '@entity2'),
 (40, 'an'),
 (41, 'we'),
 (42, 'they'),
 (43, '@entity0'),
 (44, '@entity3'),
 (45, 'who'),
 (46, '@entity4'),
 (47, 'will'),
 (48, ')'),
 (49, '('),
 (50, "n't"),
 (51, 'you'),
 (52, ':'),
 (53, 'were'),
 (54, 'their'),
 (55, 'had'),
 (56, '@entity5'),
 (57, 'she'),
 (58, 'been'),
 (59, 'about'),
 (60, 'one'),
 (61, 'after'),
 (62, 'more'),
 (63, '@entity6'),
 (64, 'her'),
 (65, 'or'),
 (66, 'there'),
 (67, 'when'),
 (68, 'people'),
 (6

In [13]:
TOKENIZER.num_words = min(len(TOKENIZER.word_index)+1, TOKENIZER.num_words)

# Take a look at batches

In [14]:
batch_gen = SummaryBatchGenerator(
    tokenizer=TOKENIZER.texts_to_sequences,
    one_hot_encoder=TOKENIZER.sequences_to_matrix,
    num_words=TOKENIZER.num_words,
    max_input_len=MAX_INPUT_LEN,
    max_target_len=MAX_TARGET_LEN,
    sentence_len=MAX_TOTAL_LEN,
    input_end_token=INPUT_END_TOKEN,
    target_end_token=TARGET_END_TOKEN
).generate_forever(TRAIN_FILES, batch_size=32)

In [15]:
X, y = next(batch_gen)

In [16]:
X.shape, y.shape

((32, 170), (32, 170, 15000))

In [17]:
X

array([[  0,  43,  49, ...,   2,   0,   0],
       [  0,  49,  43, ...,   2,   0,   0],
       [  0,  49,  27, ...,   0,   0,   0],
       ...,
       [  0,  49,  43, ...,   0,   0,   0],
       [  0,  49,  43, ...,   0,   0,   0],
       [  0, 119,  27, ...,   0,   0,   0]], dtype=int32)

In [18]:
y

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [19]:
print('\n\n'.join([' '.join([index_to_word[i] for i in x]) for x in X]))


@pad ( @entity0 ) -- blood and urine transported from @entity3 to @entity4 . it is a 5,000 - mile transatlantic commute , but that 's the anti-doping plan @unk will implement to ensure a clean @entity8 in 2014 . football 's global governing body was forced to take the step after the @entity12 ( @entity12 ) suspended testing at a laboratory in @entity3 . the @entity16 laboratory had its accreditation to conduct anti-doping tests taken away by @entity12 in august after failing to meet the organization 's @entity19 ( @entity19 ) . " following the recent decision by @entity12 to revoke the accreditation of the @entity16 laboratory in @entity3 ... @entity7 has decided to use the @entity12 accredited laboratory in @entity4 , @entity24 , " read a @entity7 statement . " as @entity12 confirmed that following due process the @entity16 laboratory would not be able to achieve @unk in time @input-end plan put in place after @entity12 ( @entity12 ) suspended testing at lab in @entity3 @target-end @

In [20]:
# only one per row
import numpy as np
ys = np.argwhere(y[0] == 1)

In [21]:
import numpy as np
for j in range(0, len(y), 5):
    ys = np.argwhere(y[j] == 1)
    assert len(ys) == len({row for row, idx in ys})
    print(j)
    print(' '.join(index_to_word[idx] for row, idx in ys))
    print('\n')

0


5
( @entity0 ) -- on thursday , president @entity1 is scheduled to deliver a major speech in @entity3 about his administration 's counterterrorism policies , focusing on the rationale and legal framework for the controversial @entity9 drone program and his plans to wind down the prison camp at @entity12 . so we thought it might be useful to examine some common myths about the drone program and the prison population at @entity12 . 1 . drone strikes largely target the leaders of terrorist groups that threaten the @entity21 . in fact , of the thousands who have been killed in @entity9 drone strikes in @entity24 , only 37 were leaders of @entity26 or affiliated organizations , according to a tally by the @entity29 . and even if we add to that list the leaders of the @entity34 who have been killed in drone strikes , only 2 % of the @input-end @entity336 : it 's a myth that all those at @entity12 are too dangerous to release @target-end @pad @pad


10
( @entity0 ) -- a central @entity2 p

# Training

In [22]:
from keras.optimizers import adam
from keras.losses import categorical_crossentropy
# from evaluate import rouge_l_fscore

In [23]:
# model architecture
N_HEADS = 8
N_LAYERS = 4
D_MODEL = 64*N_HEADS
SENTENCE_LEN = MAX_TOTAL_LEN
VOCAB_SIZE = TOKENIZER.num_words
DROPOUT = 0.1

# batch training
N_EPOCHS = 1000
WARMUP_STEPS = 200
BATCH_SIZE = 32
CALLBACKS = []
METRICS = [categorical_crossentropy]

# optimization
# https://arxiv.org/pdf/1804.00247.pdf
OPTIMIZER = adam(lr=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9, clipnorm=2.0)

In [24]:
batch_gen = SummaryBatchGenerator(
    tokenizer=TOKENIZER.texts_to_sequences,
    one_hot_encoder=TOKENIZER.sequences_to_matrix,
    num_words=TOKENIZER.num_words,
    max_input_len=MAX_INPUT_LEN,
    max_target_len=MAX_TARGET_LEN,
    sentence_len=MAX_TOTAL_LEN,
    input_end_token=INPUT_END_TOKEN,
    target_end_token=TARGET_END_TOKEN
)

In [25]:
%%time
TRAIN_STEPS_PER_EPOCH = batch_gen.batches_per_epoch(TRAIN_FILES, batch_size=BATCH_SIZE)
print('train steps per epoch', TRAIN_STEPS_PER_EPOCH)

train steps per epoch 7782
CPU times: user 7.76 s, sys: 116 ms, total: 7.88 s
Wall time: 7.88 s


In [26]:
%%time
TEST_STEPS_PER_EPOCH = batch_gen.batches_per_epoch(TEST_FILES, batch_size=BATCH_SIZE)
print('train steps per epoch', TEST_STEPS_PER_EPOCH)

train steps per epoch 195
CPU times: user 208 ms, sys: 0 ns, total: 208 ms
Wall time: 207 ms


In [27]:
Xy_train = batch_gen.generate_forever(TRAIN_FILES, batch_size=BATCH_SIZE)
Xy_test = batch_gen.generate_forever(TEST_FILES, batch_size=BATCH_SIZE)

In [28]:
from model_decoder import TransformerDecoder
model = TransformerDecoder(
    n_heads=N_HEADS,
    decoder_layers=N_LAYERS,
    d_model=D_MODEL,
    vocab_size=VOCAB_SIZE,
    sequence_len=SENTENCE_LEN,
    dropout=DROPOUT)

In [29]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 170, 512)     7680000     input[0][0]                      
__________________________________________________________________________________________________
positional_encoding_1 (Position (None, 170, 512)     0           embedding[0][0]                  
__________________________________________________________________________________________________
embedding_scalar (Scalar)       (None, 170, 512)     0           positional_encoding_1[0][0]      
__________________________________________________________________________________________________
dropout_1 

In [30]:
import keras.backend as K
def perplexity(y_true, y_pred):
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    return K.exp(cross_entropy)
LOSS = perplexity

In [31]:
class LRScheduler:
    def __init__(self, d_model, warmup_steps):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.epoch = 1

    def lr(self, epoch):
        lr = self.d_model**-.5 * min(self.epoch**-.5, epoch*(self.warmup_steps**-1.5))
        self.epoch += 1
        return lr
# lr_scheduler = LRScheduler(D_MODEL, WARMUP_STEPS).lr

In [32]:
from keras.callbacks import LearningRateScheduler
# CALLBACKS.append(LearningRateScheduler(lr_scheduler))

In [33]:
model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)

In [None]:
# print loss value references
def display_loss_reference():
    format_ = 'loss: %20s\tentropy: %20s'
    upper_limit = np.log(VOCAB_SIZE)
    for i in range(int(np.floor(upper_limit))+1):
        print(format_ % (np.exp(i), i))
    print(format_ % (np.exp(upper_limit), upper_limit))
display_loss_reference()

loss:                  1.0	entropy:                    0
loss:    2.718281828459045	entropy:                    1
loss:     7.38905609893065	entropy:                    2
loss:   20.085536923187668	entropy:                    3
loss:   54.598150033144236	entropy:                    4
loss:    148.4131591025766	entropy:                    5
loss:    403.4287934927351	entropy:                    6
loss:   1096.6331584284585	entropy:                    7
loss:   2980.9579870417283	entropy:                    8
loss:    8103.083927575384	entropy:                    9
loss:   15000.000000000004	entropy:    9.615805480084347


In [None]:
model.fit_generator(
    Xy_train,
    steps_per_epoch=TRAIN_STEPS_PER_EPOCH,
    epochs=N_EPOCHS,
    validation_data=Xy_test,
    validation_steps=TEST_STEPS_PER_EPOCH,
    callbacks=CALLBACKS)

Epoch 1/1000
  27/7782 [..............................] - ETA: 2:16:08 - loss: 15023.4550 - categorical_crossentropy: 9.6161