In [4]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [233]:

import model
import train
import evaluate
import train_util
import data_util.data
import data_util.batcher
import data_util.config

importlib.reload(model)
importlib.reload(train)
importlib.reload(evaluate)
importlib.reload(train_util)
importlib.reload(data_util.config)
importlib.reload(data_util.data)
importlib.reload(data_util.batcher)

from evaluate import *
from model import *
from train import *
from train_util import *
from data_util.data import *
from data_util.batcher import *


In [196]:
# load real data
data_path = '/Users/rowancassius/Desktop/capstone/data/context_task_data.tsv'
dat = pd.read_csv(data_path, sep='\t')

# fill nas
dat.fillna('', inplace=True)

# train/test split
np.random.seed(111)
dat = dat.sample(frac=1)
train_size = int(.8*dat.shape[0])
train_data = dat[:train_size]
test_data = dat[train_size:]

# fit tokenizer
FILTERS = '!"&()*+,:;<=>?[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(lower=True, filters=FILTERS)
tokenizer.fit_on_texts(train_data.Context.values + train_data.TaskSentence.values + train_data.Summary.values)
tokenizer.texts_to_texts = lambda texts: tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(texts))

# transform texts
def prep_data(data):
    for c in ['Context', 'TaskSentence', 'Summary']:
        data[c] = tokenizer.texts_to_texts(data[c])
    return data

train_data = prep_data(train_data)
test_data = prep_data(test_data)

In [234]:
vocab = data.Vocab(
    words = tokenizer.word_index.keys(), 
    max_size=len(tokenizer.word_index))

max_size of vocab was specified as 6001; we now have 6001 words. Stopping reading.
Finished constructing vocabulary of 6001 total words. Last word added: 1991/1998


In [236]:
task_batcher = TaskBatcher(
    examples=train_data.to_dict('records')[:10],
    vocab=vocab,
    mode='train',
    batch_size=32,
    single_pass=False
)

In [237]:
random.seed(123)
T.manual_seed(123)
if T.cuda.is_available():
    T.cuda.manual_seed_all(123)
    
class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

In [238]:
train_mle = "yes"
train_rl = "no"
mle_weight = 1.0
load_model = None
new_lr = None
rl_weight = 1 - mle_weight

opt = Namespace(train_mle = train_mle, 
                train_rl = train_rl, 
                mle_weight = mle_weight, 
                load_model = load_model,
                new_lr = new_lr, 
                rl_weight = rl_weight)


train_processor = TaskTrain(vocab, task_batcher, opt)

In [239]:
config.save_model_path = "data/saved_models"

train_processor.trainIters(n_iters=100, model=TaskModel, report_every=1, save_every = 20)

iter: 1 mle_loss: 5.020 reward: 0.0000
iter: 2 mle_loss: 5.086 reward: 0.0000
iter: 3 mle_loss: 4.896 reward: 0.0000
iter: 4 mle_loss: 4.849 reward: 0.0000
iter: 5 mle_loss: 4.763 reward: 0.0000
iter: 6 mle_loss: 4.753 reward: 0.0000
iter: 7 mle_loss: 4.578 reward: 0.0000
iter: 8 mle_loss: 3.832 reward: 0.0000
iter: 9 mle_loss: 3.738 reward: 0.0000
iter: 10 mle_loss: 3.250 reward: 0.0000
iter: 11 mle_loss: 3.102 reward: 0.0000
iter: 12 mle_loss: 3.121 reward: 0.0000
iter: 13 mle_loss: 3.075 reward: 0.0000
iter: 14 mle_loss: 3.025 reward: 0.0000
iter: 15 mle_loss: 2.972 reward: 0.0000
iter: 16 mle_loss: 2.879 reward: 0.0000
iter: 17 mle_loss: 2.850 reward: 0.0000
iter: 18 mle_loss: 2.740 reward: 0.0000
iter: 19 mle_loss: 2.680 reward: 0.0000
iter: 20 mle_loss: 2.592 reward: 0.0000
model saved at: 
 data/saved_models/0000020.tar
iter: 21 mle_loss: 2.517 reward: 0.0000
iter: 22 mle_loss: 2.500 reward: 0.0000
iter: 23 mle_loss: 2.355 reward: 0.0000
iter: 24 mle_loss: 2.298 reward: 0.0000
i

Decoding Time

In [240]:
task = "validate"
load_model = os.path.join(config.log_root, "data/saved_models/0000100.tar")

opt = Namespace(task = task, load_model = load_model)

# new batcher for evaluation
task_batcher = TaskBatcher(
    examples=train_data.to_dict('records')[:20],
    vocab=vocab, 
    mode='train', 
    batch_size=10, 
    single_pass=True)

eval_processor = TaskEvaluate(vocab, task_batcher, opt) 

example_generator completed reading all examples. No more data.


Exception in thread Thread-104:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 390, in text_generator
    example = next(example_generator)
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 371, in fill_example_queue
    context, task, summary = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



In [241]:
decoded_sents, ref_sents = eval_processor.evaluate_batch(model=TaskModel)

torch.Size([10, 25, 512])
Summarizing Batch...
tensor([[  2,  36],
        [  2, 137],
        [  2,  22],
        [  2,  12]])
tensor([[  2,  45],
        [  2, 137],
        [  2,  22],
        [  2,  12]])
tensor([[  2,  82],
        [  2,  32],
        [  2, 137],
        [  2,  22]])
tensor([[  2, 163],
        [  2, 137],
        [  2,  18],
        [  2,  22]])
tensor([[  2,  32],
        [  2,  11],
        [  2, 103],
        [  2, 137]])
tensor([[  2, 929],
        [  2, 137],
        [  2,  22],
        [  2, 745]])
tensor([[  2, 496],
        [  2,  21],
        [  2, 137],
        [  2,   7]])
tensor([[  2, 137],
        [  2,   5],
        [  2,  22],
        [  2,  12]])
tensor([[  2,  45],
        [  2, 137],
        [  2,  22],
        [  2,  82]])
tensor([[  2, 137],
        [  2,  78],
        [  2,  22],
        [  2,  82]])
tensor([[  2,  36, 176],
        [  2, 137, 176],
        [  2,  22, 176],
        [  2,  12, 176]])
tensor([[   2,   45,   86],
        [   2,

tensor([[   2,  111],
        [   2,  748],
        [   2,    4],
        [   2, 1093]])
tensor([[   2,  281],
        [   2,   28],
        [   2,  533],
        [   2, 2083]])
tensor([[  2, 309],
        [  2, 616],
        [  2, 824],
        [  2, 120]])
tensor([[   2,  619],
        [   2,  137],
        [   2,    7],
        [   2, 2092]])
tensor([[   2, 3266],
        [   2,   24],
        [   2,  111],
        [   2,    4]])
tensor([[   2,   88],
        [   2,  137],
        [   2,    7],
        [   2, 3267]])
tensor([[   2, 1100],
        [   2,    8],
        [   2,  137],
        [   2,  151]])
tensor([[   2, 1595],
        [   2,  137],
        [   2,   82],
        [   2,   13]])
tensor([[   2,   28],
        [   2, 1287],
        [   2,  124],
        [   2,   62]])
tensor([[  2, 267],
        [  2, 137],
        [  2,   5],
        [  2,   7]])
tensor([[   2,  111,   13],
        [   2,  748,   13],
        [   2,    4,   13],
        [   2, 1093,   13]])
tensor([[   2

tensor([[   2,  309, 1097,    5,  101,   19,   39,   39],
        [   2,  309, 1097,    5,  101,  101,   39,   39],
        [   2,  309, 1097,    5,  101,    4,   39,   39],
        [   2,  309,    5,    5,  101,    4,   39,   39]])
tensor([[   2,  619,    5, 2093,   19, 2095, 2095, 2095],
        [   2,  619,    5, 2093,   19,   19, 2095, 2095],
        [   2,  619,    5, 2093, 2094, 2095, 2095, 2095],
        [   2,  619,    5, 2093,   19, 2095, 2095,   28]])
tensor([[   2, 3266,   18,   18,    5,   93,   93,   93],
        [   2, 3266,   18,    5,    5,   93,   93,   93],
        [   2, 3266,   18,   18,    5,   93,   93,  153],
        [   2, 3266,   62,   18,    5,   93,   93,   93]])
tensor([[  2,  88,  12,  14, 441, 682, 682,   6],
        [  2,  88,  14,  14, 441, 682, 682,   6],
        [  2,  88,  11,  14,  14, 441, 682,   6],
        [  2,  88,  11,  14, 441, 441, 682,   6]])
tensor([[  2,  28,   5,   5, 617, 498,   8,   3],
        [  2,  28,   5,   5, 617,  29,   8,   3],


In [242]:
decoded_sents

['review change order',
 'forward draft of a a conveyance document to bob kahn',
 'contact sender with redlines against prior drafts',
 'update personal gis file',
 'send message to unsubscribe@ashford com to unsubscribe',
 'welcome stewart to the ebs team',
 'direct sender to someone to assist in making an offer',
 'reply to this email with cancel',
 'forward this to public affairs group',
 'reply to henri@hearme com',
 'them in the event you',
 'presentation to to coordinate order',
 'available moment to look look attached attached attached attached attached',
 'rsvp to eva at epollard@houstonenergy epollard@houstonenergy epollard@houstonenergy epollard@houstonenergy',
 'commerce your your to make make make make',
 'provide this with nymex quote quote and and',
 'agreed put on york york',
 'ak copy to me a',
 'call to to leave where you',
 'prepare a to the the attached form form']

In [232]:
ref_sents

['review change order',
 'forward draft of a conveyance document to bob kahn',
 'contact sender with redlines against prior drafts',
 'update personal gis file',
 'send message to unsubscribe@ashford com to unsubscribe',
 'welcome stewart to the ebs team',
 'direct sender to someone to assist in making an offer',
 'reply to this email with cancel',
 'forward this to public affairs group',
 'reply to henri@hearme com',
 'contact barbara for coordination of legal support',
 'coordinate with shirley at x3-5290',
 'examine membership form',
 'rsvp to eva pollard at epollard@houstonenergy org',
 'help sender',
 'send sender a nymex quote and basis quote',
 'put pressure on new york',
 'forward a copy of the contract with ena and ak to sender',
 'call sender',
 'prepare a draft isda pursuant and deliver to agustin perez']

..
..


In [154]:
examples = list(zip(train_data.TaskSentence, train_data.Summary))

In [155]:
batcher = Batcher(
    examples=examples[:10],
    vocab=vocab, 
    mode='train', 
    batch_size=32, 
    single_pass=False)

In [156]:
# Make Vocab
vocab = data.Vocab(
    words = tokenizer.word_index.keys(), 
    max_size=len(tokenizer.word_index))

# Make Batcher
batcher = Batcher(
    examples=examples,
    vocab=vocab, 
    mode='train', 
    batch_size=32, 
    single_pass=False)

max_size of vocab was specified as 6001; we now have 6001 words. Stopping reading.
Finished constructing vocabulary of 6001 total words. Last word added: 1991/1998


In [157]:
train_mle = "yes"
train_rl = "no"
mle_weight = 1.0
load_model = None
new_lr = None
rl_weight = 1 - mle_weight

opt = Namespace(train_mle = train_mle, 
                train_rl = train_rl, 
                mle_weight = mle_weight, 
                load_model = load_model,
                new_lr = new_lr, 
                rl_weight = rl_weight)


train_processor = Train(vocab, batcher, opt)

In [158]:
config.save_model_path = "data/saved_models"

train_processor.trainIters(n_iters=2, model=Model, report_every=1, save_every = 2)

0
iter: 1 mle_loss: 5.241 reward: 0.0000
1
iter: 2 mle_loss: 5.354 reward: 0.0000
model saved at: 
 data/saved_models/0000002.tar
2
iter: 3 mle_loss: 5.859 reward: 0.0000


In [None]:
task = "validate"
load_model = os.path.join(config.log_root, "data/saved_models/0000500.tar")

opt = Namespace(task = task, load_model = load_model)

# new batcher for evaluation
batcher = Batcher(
    examples=examples[:20],
    vocab=vocab, 
    mode='train', 
    batch_size=10, 
    single_pass=True)

eval_processor = Evaluate(vocab, batcher, opt) 

In [None]:
decoded_sents, ref_sents, scores = eval_processor.evaluate_batch(print_sents = True)

In [None]:
scores

In [None]:
ref_sents

In [None]:
decoded_sents

In [None]:
from setuptools import setup, find_packages


In [None]:
find_packages()

In [None]:
d