In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [6]:
# Project-local modules. They are imported as modules first so that
# importlib.reload() can be applied to them below.
import train
import model
import evaluate
import beam_search
import data_util.data
import data_util.batcher
import data_util.config


# Re-import the project modules so that edits made to the .py files are picked
# up without restarting the kernel.
# NOTE(review): the reload order is kept exactly as written; it presumably
# matters when one module imports names from another (e.g. train from model)
# -- confirm before reordering.
importlib.reload(model)
importlib.reload(train)
importlib.reload(evaluate)
importlib.reload(beam_search)
importlib.reload(data_util.config)
importlib.reload(data_util.data)
importlib.reload(data_util.batcher)

# Star imports executed after the reloads so the notebook namespace is filled
# from the freshly reloaded modules.
# NOTE(review): later cells use bare names such as `config`, `random`, `T`,
# `data`, `Batcher`, `Train`, `Evaluate`, `Model`, `BertVocab` and
# `BertSummarizer` that are never bound explicitly in this notebook; they
# presumably arrive through these star imports -- verify, and prefer explicit
# imports so the origin of each name is visible.
from model import *
from train import *
from beam_search import *
from evaluate import *
from data_util.data import *
from data_util.batcher import *


In [7]:
# Load the task-sentence / summary pairs from a tab-separated file.
# The location can be overridden with the CONTEXT_TASK_DATA_PATH environment
# variable so the notebook is runnable on machines other than the original
# author's; when the variable is unset the original path is used unchanged.
data_path = os.environ.get(
    'CONTEXT_TASK_DATA_PATH',
    '/Users/rowancassius/Desktop/capstone/data/context_task_data.tsv',
)
dat = pd.read_csv(data_path, sep='\t')

# 80/20 train/test split with a fixed random_state so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    dat.TaskSentence.values,
    dat.Summary.values,
    test_size=0.2,
    random_state=0,
    shuffle=True
)

# Fit the tokenizer on the training sentences and training summaries only.
# The filter list is the set of characters the tokenizer strips from the text.
tokenizer = Tokenizer(lower=True, filters='!"#$%&()*+,./:;<=>?[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(list(x_train)+list(y_train))

# Normalise the training text by round-tripping it through the tokenizer
# (text -> id sequences -> text). Besides lowercasing, this also removes the
# filtered characters listed above.
# NOTE(review): x_test / y_test are not normalised here and are not used by
# any cell visible in this notebook.
x_train = tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(x_train))
y_train = tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(y_train))

# (source sentence, target summary) pairs consumed by the batchers below.
examples = list(zip(x_train, y_train))

In [8]:
# Seed every random number generator from one named constant so the three
# calls cannot drift apart when the seed is changed.
# `random` and `T` are not imported explicitly in this notebook; they are
# presumably provided by the project star imports (T looks like torch) --
# TODO confirm.
SEED = 123

random.seed(SEED)
T.manual_seed(SEED)
if T.cuda.is_available():
    # Only touch the CUDA generators when a GPU is actually present.
    T.cuda.manual_seed_all(SEED)
    
class Namespace:
    """Plain attribute bag: every keyword argument becomes an instance attribute.

    Used in this notebook to hand option sets (e.g. ``opt.task``,
    ``opt.load_model``) to the training and evaluation classes.
    """

    def __init__(self, **attrs):
        # Copy the keyword arguments straight into the instance dictionary.
        vars(self).update(attrs)

In [9]:
# Standard (non-BERT) setup.

# Vocabulary built from the words the tokenizer was fitted on. max_size is set
# to the number of those words, so the size cap equals the tokenizer's word count.
word_index = tokenizer.word_index
vocab = data.Vocab(words=word_index.keys(), max_size=len(word_index))

# Training batcher over all (sentence, summary) pairs: batches of 32,
# single_pass disabled.
batcher = Batcher(
    examples=examples, vocab=vocab, mode='train', batch_size=32, single_pass=False
)

max_size of vocab was specified as 2829; we now have 2829 words. Stopping reading.
Finished constructing vocabulary of 2829 total words. Last word added: foundation


In [None]:
# Training options for the LSTM model. The on/off switches are the strings
# "yes"/"no" (presumably that is the form Train expects -- verify).
train_mle, train_rl = "yes", "no"
mle_weight = 1.0
rl_weight = 1 - mle_weight  # the two loss weights sum to 1, so this is 0.0 here
load_model = new_lr = None  # no checkpoint to load, no learning-rate override

# Bundle the options into one object, with the attributes in their original order.
opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

train_processor = Train(vocab, batcher, opt)

In [None]:
# train_processor.setup_train(model=Model)
# train_processor.train_one_batch(train_processor.batcher.next_batch(), 1)

In [None]:
# Evaluation setup for the LSTM model.
# The checkpoint path is resolved relative to config.log_root; judging by the
# file name this is presumably the iteration-500 checkpoint -- verify that it exists.
task = "validate"
load_model = os.path.join(config.log_root, "data/saved_models/0000500.tar")

opt = Namespace(task=task, load_model=load_model)

# Dedicated batcher for decoding. It gets its own name (mirroring
# `bert_decode_batcher` in the BERT section) instead of rebinding `batcher`,
# so re-running the training cell afterwards still picks up the full training
# batcher rather than this 20-example, single-pass one.
# NOTE(review): examples[:20] are *training* pairs and mode is 'train'; the
# held-out x_test / y_test split is not used here -- confirm this is an
# intentional sanity check rather than the real validation run.
decode_batcher = Batcher(
    examples=examples[:20],
    vocab=vocab,
    mode='train',
    batch_size=10,
    single_pass=True)

eval_processor = Evaluate(vocab, decode_batcher, opt)

In [None]:
# Run evaluation with the LSTM model class; the call returns the decoded
# sentences together with their reference sentences.
decoded_sents, ref_sents = eval_processor.evaluate_batch(model=Model)

In [None]:
# Display the sentences decoded by the LSTM model.
decoded_sents

In [None]:
# Display the reference sentences returned alongside the decoded ones.
ref_sents

## Now Repeating with the BERT model

In [10]:
# BERT vocabulary: constructed with no arguments, i.e. it is not derived from
# the Keras tokenizer fitted above.
bert_vocab = BertVocab()

# Training batcher for the BERT model: same examples and settings as the
# standard batcher (batches of 32, single_pass disabled), but using bert_vocab.
bert_batcher = Batcher(
    examples=examples,
    vocab=bert_vocab, 
    mode='train', 
    batch_size=32, 
    single_pass=False)

In [11]:
# Training options for the BERT model; same values as the LSTM run. The on/off
# switches are the strings "yes"/"no" (presumably the form Train expects -- verify).
train_mle, train_rl = "yes", "no"
mle_weight = 1.0
rl_weight = 1 - mle_weight  # the two loss weights sum to 1, so this is 0.0 here
load_model = new_lr = None  # no checkpoint to load, no learning-rate override

# Bundle the options into one object, with the attributes in their original order.
opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

bert_train_processor = Train(bert_vocab, bert_batcher, opt)

In [12]:
# bert_train_processor.setup_train(model=BertSummarizer)
# bert_train_processor.train_one_batch(bert_train_processor.batcher.next_batch(), 1)

In [13]:
# Redirect checkpoint saving to a BERT-specific directory.
# NOTE(review): this mutates the shared `config` object, so any cell run
# afterwards (including the LSTM cells above, if re-run) sees this path until
# the config module is reloaded.
config.save_model_path = "data/bert_saved_models"

# Short smoke-test training run: report every iteration, save every 2nd.
# NOTE(review): the recorded output shows three iterations (iter 1..3) for
# n_iters=2 -- check trainIters for an off-by-one in its loop bound.
bert_train_processor.trainIters(n_iters=2, model=BertSummarizer, report_every=1, save_every = 2)

0
iter: 1 mle_loss: 7.244 reward: 0.0000
1
iter: 2 mle_loss: 6.429 reward: 0.0000
model saved at: 
 data/bert_saved_models/0000002.tar
2
iter: 3 mle_loss: 6.389 reward: 0.0000


Yay!

In [14]:
# Evaluation setup for the BERT model, using the checkpoint written by the
# training run above (0000002.tar), resolved relative to config.log_root.
# NOTE(review): the checkpoint was saved under the relative path
# "data/bert_saved_models" but is loaded from config.log_root joined with that
# path; these agree only if log_root resolves to the working directory -- verify.
task = "validate"
load_model = os.path.join(config.log_root, "data/bert_saved_models/0000002.tar")

opt = Namespace(task = task, load_model = load_model)

# Separate single-pass batcher for decoding: first 20 examples, batches of 10.
# NOTE(review): examples[:20] are *training* pairs and mode is 'train'; the
# held-out x_test / y_test split is not used -- confirm this is intentional.
# NOTE(review): the recorded output shows the batcher's background thread
# ending with "RuntimeError: generator raised StopIteration" once the examples
# are exhausted (raised from data_util/batcher.py); the generator there should
# return instead of letting StopIteration escape.
bert_decode_batcher = Batcher(
    examples=examples[:20],
    vocab=bert_vocab, 
    mode='train', 
    batch_size=10, 
    single_pass=True)

# NOTE(review): rebinds `eval_processor`, replacing the LSTM evaluator created
# earlier in the notebook.
eval_processor = Evaluate(bert_vocab, bert_decode_batcher, opt) 

example_generator completed reading all examples. No more data.


Exception in thread Thread-8:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 276, in text_generator
    e = next(example_generator) # e is a tf.Example
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 210, in fill_example_queue
    (article, abstract) = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



In [15]:
# Run evaluation with the BERT summarizer class; the call returns the decoded
# sentences together with their reference sentences.
# NOTE(review): the recorded output is dominated by raw tensor dumps,
# presumably debug prints inside the decoding code -- consider removing them
# there or silencing this cell with %%capture.
decoded_sents, ref_sents = eval_processor.evaluate_batch(model=BertSummarizer)

Summarizing Batch...
tensor([[    1,  3531],
        [    1, 19274],
        [    1,  6413],
        [    1, 30522]])
tensor([[    1,  2021],
        [    1, 30522],
        [    1,  2145],
        [    1,  2182]])
tensor([[    1, 30522],
        [    1,  3531],
        [    1,  3828],
        [    1,  2023]])
tensor([[   1, 5736],
        [   1, 2202],
        [   1, 1037],
        [   1, 4248]])
tensor([[    1,  3531],
        [    1, 10373],
        [    1,  2033],
        [    1,  2054]])
tensor([[   1, 2064],
        [   1, 2017],
        [   1, 2689],
        [   1, 2009]])
tensor([[   1, 3531],
        [   1, 2507],
        [   1, 2033],
        [   1, 2115]])
tensor([[   1, 2065],
        [   1, 3100],
        [   1, 3531],
        [   1, 3143]])
tensor([[   1, 3531],
        [   1, 2131],
        [   1, 2007],
        [   1, 2014]])
tensor([[    1,  3531],
        [    1, 10639],
        [    1,  2007],
        [    1,  2928]])
tensor([[    1,  3531,  2065],
        [    1,  3

tensor([[   1, 3531, 2065, 2065, 2065, 2065, 2003, 2003],
        [   1, 3531, 2065, 2065, 2065, 2003, 2003, 2003],
        [   1, 3531, 2065, 2065, 2065, 2065, 2065, 2003],
        [   1, 3531, 2065, 2065, 2065, 2003, 2065, 2003]])
tensor([[    1,  2021,  1037,  1037,  1037,  1037,  1037,  1037],
        [    1, 30522,  1037,  1037,  1037,  1037,  1037,  1037],
        [    1,  2145,  1037,  1037,  1037,  1037,  1037,  1037],
        [    1,  2021,  1037,  1037,  1037,  1037,  1037, 30523]])
tensor([[    1, 30522,  3828,  1999,  2017,  2031,  2031,  2031],
        [    1, 30522,  3828,  2373,  2017,  2031,  2031,  2031],
        [    1, 30522,  3828,  2373, 14176,  2031,  2031,  2031],
        [    1, 30522,  3828,  1999, 14176,  2031,  2031,  2031]])
tensor([[   1, 5736, 2298, 2023, 2017, 2017, 2017, 2017],
        [   1, 5736, 2298, 2052, 2017, 2017, 2017, 2017],
        [   1, 5736, 4248, 2023, 2017, 2017, 2017, 2017],
        [   1, 5736, 4248, 2052, 2017, 2017, 2017, 2017]])
tens

tensor([[   1, 3531, 2065, 2065, 2065, 2065, 2003, 2003, 2003, 2003, 2003],
        [   1, 3531, 2065, 2065, 2065, 2003, 2003, 2003, 2003, 2003, 2003],
        [   1, 3531, 2065, 2065, 2065, 2065, 2065, 2003, 2003, 2003, 2003],
        [   1, 3531, 2065, 2065, 2065, 2003, 2065, 2003, 2003, 2003, 2003]])
tensor([[    1,  2021,  1037,  1037,  1037,  1037,  1037,  1037,  1037,  1037,
          1037],
        [    1, 30522,  1037,  1037,  1037,  1037,  1037,  1037,  1037,  1037,
          1037],
        [    1,  2145,  1037,  1037,  1037,  1037,  1037,  1037,  1037,  1037,
          1037],
        [    1,  2021,  1037,  1037,  1037,  1037,  1037,  1037,  1037,  1037,
         30523]])
tensor([[    1, 30522,  3828,  1999,  2017,  2031,  2031,  2031,  2031,  2031,
          2031],
        [    1, 30522,  3828,  2373, 14176,  2031,  2031,  2031,  2031,  2031,
          2031],
        [    1, 30522,  3828,  2373,  2017,  2031,  2031,  2031,  2031,  2031,
          2031],
        [    1, 30522,

tensor([[   1, 2065, 2017, 2017, 2017, 2017],
        [   1, 2065, 2017, 2017, 2017, 2033],
        [   1, 2065, 2017, 2017, 2017, 2024],
        [   1, 2065, 2017, 2017, 2024, 2017]])
tensor([[   1, 2320, 2000, 2000, 2000, 2000],
        [   1, 2320, 1996, 2000, 2000, 2000],
        [   1, 2320, 2000, 2000, 2000, 1996],
        [   1, 2320, 2000, 1996, 2000, 2000]])
tensor([[    1, 30522, 30522, 30522,  2033,  2017],
        [    1, 30522, 30522, 30522,  1998,  2017],
        [    1, 30522, 30522, 30522,  1998,  2065],
        [    1, 30522, 30522, 30522,  1998,  2031]])
tensor([[   1, 3531, 3531, 3531, 1997, 2344],
        [   1, 3531, 3531, 3531, 1997, 2006],
        [   1, 3531, 3531, 3531, 1997, 2064],
        [   1, 3531, 3531, 2033, 1997, 2344]])
tensor([[   1, 3531, 1996, 1996, 1996, 1996],
        [   1, 2191, 1996, 1996, 1996, 1996],
        [   1, 3056, 1996, 1996, 1996, 1996],
        [   1, 3531, 3531, 1996, 1996, 1996]])
tensor([[   1, 3531, 2004, 2004, 2004, 2004],
     

tensor([[   1, 2065, 2017, 2017, 2017, 2017, 2033, 2033, 2033, 2033],
        [   1, 2065, 2017, 2017, 2017, 2033, 2033, 2033, 2033, 2033],
        [   1, 2065, 2017, 2017, 2017, 2024, 2033, 2033, 2033, 2033],
        [   1, 2065, 2017, 2017, 2024, 2017, 2033, 2033, 2033, 2033]])
tensor([[   1, 2320, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000],
        [   1, 2320, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 1996],
        [   1, 2320, 2000, 2000, 2000, 2000, 2000, 2000, 1996, 2000],
        [   1, 2320, 2000, 2000, 2000, 2000, 2000, 1996, 2000, 2000]])
tensor([[    1, 30522, 30522, 30522,  2033,  2017,  2055, 11968, 11968, 11968],
        [    1, 30522, 30522, 30522,  1998,  2031,  2055, 11968, 11968, 11968],
        [    1, 30522, 30522, 30522,  1998,  2017, 30523, 11968, 11968, 11968],
        [    1, 30522, 30522, 30522,  1998,  2017,  2055, 11968, 11968, 11968]])
tensor([[    1,  3531,  3531,  3531,  1997,  2344,  2012, 19575, 19575, 19575],
        [    1,  3531,  3531,  3531, 

In [16]:
# Display the sentences decoded by the BERT summarizer.
decoded_sents

['please if if if if is is is is is',
 'but a a a a a a a a a',
 'twanda save in you have have have have have have',
 'kate look this you you you you you you you',
 'please you out out out out out out out out',
 'can my my calendar calendar calendar calendar calendar calendar calendar',
 'please comments asap asap asap asap asap asap asap asap',
 'if send send send send send send send send send',
 'please with ondetails ondetails ondetails ondetails ondetails ondetails ondetails ondetails',
 'please mark holsworth holsworth holsworth holsworth holsworth holsworth holsworth holsworth',
 'if you you you you me me me me me',
 'once to to to to to to to to to',
 'dari dari dari and you heartburn par par par par',
 'please please please of order at discount discount discount discount',
 'please the the the the the the the the the',
 'please as as as as as as as as as',
 'remind you you you you you you you you you',
 'please the review review review review review review review review',
 'wou

In [17]:
# Display the reference sentences returned alongside the decoded ones.
ref_sents

["insert appropriate seller's payment",
 'call sender',
 'save power point',
 'examine this',
 'send findings to sender',
 'change calendar',
 'respond to sender with comments about it',
 'complete document and return to sender',
 'give patti ondetails',
 'communicate with mark holsworth',
 'reply to sender about ocr program',
 'inform sender of needs to load the gtcs',
 'review suggested mark up',
 'give number of copies to sender',
 'verify pg e topock curves are flat to socal',
 'contact sender with further comments and to coordinate execution',
 'remind sender when in banff louise',
 'look at book review',
 'return toaster to sender',
 'forward it to jon']