In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [36]:
import train
import model
import evaluate
import beam_search
import data_util.data
import data_util.batcher
import data_util.config


# Pick up on-disk edits to the project modules without restarting the kernel.
# importlib.reload() does not refresh objects that another module obtained via
# `from X import Y`, so the low-level data_util modules are reloaded first and
# the modules expected to import from them afterwards.
# NOTE(review): the dependency order below (data_util -> model -> beam_search
# -> train -> evaluate) is assumed; confirm against the modules' own imports.
importlib.reload(data_util.config)
importlib.reload(data_util.data)
importlib.reload(data_util.batcher)
importlib.reload(model)
importlib.reload(beam_search)
importlib.reload(train)
importlib.reload(evaluate)

# Star imports are kept because later cells rely on names they expose
# (e.g. Model, Train, Evaluate, Batcher, BertVocab, BertSummarizer).
from model import *
from train import *
from beam_search import *
from evaluate import *
from data_util.data import *
from data_util.batcher import *

# Later cells reference the modules themselves (`config.log_root`,
# `data.Vocab`); bind them explicitly instead of relying on a star import
# happening to re-export them.
from data_util import config, data


In [3]:
# Load the real data (tab-separated, with TaskSentence and Summary columns).
# The location can be overridden through the CONTEXT_TASK_DATA environment
# variable; the default is the original author's local path.
data_path = os.environ.get(
    'CONTEXT_TASK_DATA',
    '/Users/rowancassius/Desktop/capstone/data/context_task_data.tsv',
)
dat = pd.read_csv(data_path, sep='\t')

# test/train split (fixed random_state keeps the split reproducible)
x_train, x_test, y_train, y_test = train_test_split(
    dat.TaskSentence.values,
    dat.Summary.values,
    test_size=0.2,
    random_state=0,
    shuffle=True
)

# Fit the tokenizer on the training sentences and training summaries only.
tokenizer = Tokenizer(lower=True, filters='!"#$%&()*+,./:;<=>?[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(list(x_train)+list(y_train))

# Normalize the training text by round-tripping it through the tokenizer:
# this lowercases it and removes the characters listed in `filters`.
# NOTE(review): x_test / y_test are NOT normalized here.
x_train = tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(x_train))
y_train = tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(y_train))

# (task sentence, summary) training pairs handed to the Batcher later on.
examples = list(zip(x_train, y_train))

In [4]:
# `random` and `T` are not imported anywhere in this notebook; they were only
# reachable through the project's star imports. Import them explicitly so this
# cell does not depend on what those modules happen to re-export.
# NOTE(review): assumes `T` is the project's alias for torch — confirm.
import random
import torch as T

# One seed shared by every RNG seeded in this cell.
SEED = 123

random.seed(SEED)
T.manual_seed(SEED)
if T.cuda.is_available():
    # Seed every visible CUDA device as well.
    T.cuda.manual_seed_all(SEED)
    
class Namespace:
    """Plain attribute container.

    Every keyword argument passed to the constructor is stored as an
    instance attribute of the same name.
    """

    def __init__(self, **kwargs):
        # vars(self) is the instance dictionary; merge all options into it.
        attributes = vars(self)
        attributes.update(kwargs)

In [37]:
# Standard (non-BERT) pipeline.
# Vocabulary built from the words the tokenizer indexed, capped at the number
# of indexed words.
# NOTE(review): the recorded output reports "Stopping reading" at max_size;
# if Vocab counts reserved tokens against max_size, the last few words may be
# left out — confirm in data_util/data.py.
vocab = data.Vocab(words=tokenizer.word_index.keys(),
                   max_size=len(tokenizer.word_index))

# Training-mode batcher over all examples, 32 per batch, not single-pass.
batcher = Batcher(examples=examples, vocab=vocab, mode='train',
                  batch_size=32, single_pass=False)

max_size of vocab was specified as 2829; we now have 2829 words. Stopping reading.
Finished constructing vocabulary of 2829 total words. Last word added: foundation
INFO:tensorflow:Bucket queue size: 0, Input queue size: 439
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [6]:
# Option values handed to Train: MLE flag "yes", RL flag "no".
train_mle, train_rl = "yes", "no"

# The two loss weights sum to one; with mle_weight = 1.0 rl_weight is 0.0.
mle_weight = 1.0
rl_weight = 1 - mle_weight

# No checkpoint path and no learning-rate override are supplied.
load_model = None
new_lr = None

opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

train_processor = Train(vocab, batcher, opt)

In [7]:
# train_processor.setup_train(model=Model)
# train_processor.train_one_batch(train_processor.batcher.next_batch(), 1)

In [38]:
# Evaluation options: run the "validate" task against a saved checkpoint
# (presumably the one written at training iteration 500 — confirm).
task = "validate"
load_model = os.path.join(config.log_root, "data/saved_models/0000500.tar")

opt = Namespace(task = task, load_model = load_model)

# Dedicated batcher for evaluation. It is bound to its own name so that the
# training `batcher` is not overwritten: re-running the Train cell afterwards
# would otherwise silently pick up this 20-example, single-pass batcher.
# NOTE(review): `examples` holds the training pairs and mode is 'train' even
# though this feeds evaluation — confirm both are intended.
decode_batcher = Batcher(
    examples=examples[:20],
    vocab=vocab, 
    mode='train', 
    batch_size=10, 
    single_pass=True)

eval_processor = Evaluate(vocab, decode_batcher, opt) 

example_generator completed reading all examples. No more data.


Exception in thread Thread-38:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 273, in text_generator
    e = next(example_generator) # e is a tf.Example
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 207, in fill_example_queue
    (article, abstract) = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [39]:
# Run the evaluator with the LSTM `Model` class; the call returns two values,
# bound here as the decoded sentences and the reference sentences.
decoded_sents, ref_sents = eval_processor.evaluate_batch(model=Model)

INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
Summarizing Batch...
x_t before: torch.Size([40])
x_t: torch.Size([40, 256])
x_t before: torch.Size([40])
x_t: torch.Size([40, 256])
x_t before: torch.Size([40])
x_t: torch.Size([40, 256])
x_t before: torch.Size([24])
x_t: torch.Size([24, 256])
x_t before: torch.Size([16])
x_t: torch.Size([16, 256])
Summarizing Batch...
x_t before: torch.Size([40])
x_t: torch.Size([40, 256])
x_t before: torch.Size([40])
x_t: torch.Size([40, 256])
x_t before: torch.Size([40])
x_t: torch.Size([40, 256])
x_t before: torch.Size([32])
x_t: torch.Size([32, 256])
x_t before: torch.Size([32])
x_t: torch.Size([32, 256])
x_t before: torch.Size([16])
x_t: torch.Size([16, 256])
x_t before: torch.Size([12])
x_t: torch.Size([12, 256])
x_t before: torch.Size([12])
x_t: torch.Size([12, 256])
x_t before: torch.Size([8])
x_t: torch.Size([8, 256])
INFO:tensorflow:Finished reading dataset in single_pass mode.
INFO:tensorflow:Bucket queue size: 1000, Input qu

## Repeating the experiment with the BERT model

In [29]:
# Vocabulary object constructed from the 'bert-base-uncased' identifier.
bert_vocab = BertVocab('bert-base-uncased')

# Training-mode batcher over all examples, 32 per batch, not single-pass,
# this time using the BERT vocabulary.
bert_batcher = Batcher(examples=examples, vocab=bert_vocab, mode='train',
                       batch_size=32, single_pass=False)

INFO:tensorflow:Bucket queue size: 18, Input queue size: 111


In [35]:
# Option values handed to Train for the BERT run: MLE flag "yes", RL flag "no".
train_mle, train_rl = "yes", "no"

# The two loss weights sum to one; with mle_weight = 1.0 rl_weight is 0.0.
mle_weight = 1.0
rl_weight = 1 - mle_weight

# No checkpoint path and no learning-rate override are supplied.
load_model = None
new_lr = None

opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

bert_train_processor = Train(bert_vocab, bert_batcher, opt)

INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [None]:
# bert_train_processor.setup_train(model=BertSummarizer)
# bert_train_processor.train_one_batch(bert_train_processor.batcher.next_batch(), 1)

In [14]:
# Train the BERT summarizer for 50 iterations, reporting every 5 and saving
# every 10 (see the "iter:" and "model saved at:" lines in the recorded output).
# NOTE(review): the recorded run ends with
# "NameError: name 'exit' is not defined" — presumably raised inside
# trainIters; confirm in train.py.
bert_train_processor.trainIters(n_iters=50, model=BertSummarizer, report_every=5, save_every = 10)

0
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
1
2
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
3
4
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
iter: 5 mle_loss: 6.355 reward: 0.0000
5
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
6
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
7
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
8
9
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
iter: 10 mle_loss: 4.720 reward: 0.0000
model saved at: 
 data/saved_models/0000010.tar
10
11
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
12
13
14
INFO:tensorflow:Bucket queue size: 1000, Input queue size:

NameError: name 'exit' is not defined

Yay!

In [30]:
# Evaluation options for the BERT run: "validate" task plus the checkpoint
# file to load (path is relative to config.log_root).
task, load_model = "validate", os.path.join(config.log_root, "data/saved_models/0000020.tar")
opt = Namespace(task=task, load_model=load_model)

# Separate single-pass batcher over the first 20 examples, 10 per batch,
# used only for decoding.
bert_decode_batcher = Batcher(examples=examples[:20], vocab=bert_vocab,
                              mode='train', batch_size=10, single_pass=True)

eval_processor = Evaluate(bert_vocab, bert_decode_batcher, opt)

example_generator completed reading all examples. No more data.


Exception in thread Thread-33:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 273, in text_generator
    e = next(example_generator) # e is a tf.Example
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 207, in fill_example_queue
    (article, abstract) = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [32]:
# Run the evaluator with the `BertSummarizer` class; the call returns two
# values, bound here as the decoded sentences and the reference sentences.
# NOTE(review): in the recorded run both lists came back empty — investigate
# before relying on these results.
decoded_sents, ref_sents = eval_processor.evaluate_batch(model=BertSummarizer)

INFO:tensorflow:Finished reading dataset in single_pass mode.


In [33]:
# Display the decoded sentences returned by the BERT evaluation call.
decoded_sents

[]

INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [34]:
# Display the reference sentences returned by the BERT evaluation call.
ref_sents

[]