In [19]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [20]:
import model
import train
import evaluate
import train_util
import data_util.data
import data_util.batcher
import data_util.config
import data_util.preprocess

# Re-execute the project modules so that edits made on disk since the kernel
# started are picked up without a restart. This runs before the
# `from ... import *` lines that follow, so those star imports bind names from
# the freshly reloaded modules.
# NOTE(review): reload order can matter if these modules do `from x import y`
# on each other (e.g. `train` is reloaded before `data_util.config`) — confirm
# that no module ends up holding stale objects.
importlib.reload(data_util.preprocess)
importlib.reload(train)
importlib.reload(model)
importlib.reload(evaluate)
importlib.reload(train_util)
importlib.reload(data_util.config)
importlib.reload(data_util.data)
importlib.reload(data_util.batcher)

from train import *
from evaluate import *
from model import *
from train_util import *
from data_util.data import *
from data_util.batcher import *
from data_util.preprocess import *

In [21]:
# Load the tab-separated context/task dataset stored under config.log_root.
data_path = os.path.join(config.log_root, 'data/context_task_data.tsv')
dat = pd.read_csv(data_path, sep='\t')

# Replace missing values with empty strings (presumably so the text cleaners
# always receive a str — confirm). Rebinding instead of inplace=True keeps the
# statement free of hidden in-place mutation.
dat = dat.fillna('')

# Shuffle the rows reproducibly: with no random_state given, DataFrame.sample
# draws from NumPy's global RNG, which is seeded immediately before.
np.random.seed(111)
dat = dat.sample(frac=1)

# 80/20 train/test split on the shuffled rows. The explicit .copy() makes each
# split an independent frame, so later column assignments on a split never go
# through pandas' chained-assignment / SettingWithCopyWarning path (which the
# global warnings.filterwarnings('ignore') would otherwise silently hide).
train_size = int(.8 * dat.shape[0])
train_data = dat.iloc[:train_size].copy()
test_data = dat.iloc[train_size:].copy()


In [22]:
# Process the data
def prep_data(df):
    """Apply the project text cleaners to the three text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Context', 'TaskSentence' and 'Summary'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in which every value of 'Context'
        and 'TaskSentence' has been passed through ``article_process_text``
        and every value of 'Summary' through ``summary_process_text``.
        The input ``df`` is not modified.

    Raises
    ------
    KeyError
        If one of the three columns is missing.
    """
    # assign() builds a new frame, so a slice/view passed in by the caller is
    # never written to; the cleaners are passed directly (no wrapper lambdas).
    return df.assign(
        Context=df['Context'].map(article_process_text),
        TaskSentence=df['TaskSentence'].map(article_process_text),
        Summary=df['Summary'].map(summary_process_text),
    )
# Clean the text columns of both splits; each name is rebound to the frame
# returned by prep_data.
train_data = prep_data(train_data)
test_data = prep_data(test_data)

In [5]:
# Build the vocabulary object from the vocab file stored under config.log_root
# (the captured output reports 6639 words for this run).
vocab = Vocab(os.path.join(config.log_root, 'data/vocab/vocab.txt'))

Finished constructing vocabulary of 6639 total words. Last word added: LANGUAGE


In [6]:
# Seed Python's `random` module and the library bound to `T` (its default
# generator, plus every CUDA device when CUDA is available).
# NOTE(review): neither `random` nor `T` is imported explicitly in this
# notebook; they presumably arrive through the star imports (T looks like
# torch) — confirm.
random.seed(123)
T.manual_seed(123)
if T.cuda.is_available():
    T.cuda.manual_seed_all(123)
    
class Namespace:
    """Minimal attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance dictionary, one entry per keyword.
        instance_dict = vars(self)
        for key, value in kwargs.items():
            instance_dict[key] = value

In [14]:
# --- Training options --------------------------------------------------------
# NOTE(review): the two switches are the strings "yes"/"no", not booleans; how
# TaskTrain interprets them is not visible in this notebook — confirm before
# changing their type.
train_mle, train_rl = "yes", "no"
mle_weight = 1.0
load_model = new_lr = None      # both left unset for this run
rl_weight = 1 - mle_weight      # the two loss weights always sum to 1

# Bundle the options into the attribute bag handed to TaskTrain.
opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

# --- Batchers ----------------------------------------------------------------
# Training batcher: every training row as a record dict, batches of 32.
task_batcher = TaskBatcher(
    examples=train_data.to_dict('records'),
    vocab=vocab,
    mode='train',
    batch_size=32,
    single_pass=False,
)

# Validation batcher: the first 200 test rows, batches of 50.
# NOTE(review): these rows come from test_data, the same frame used for the
# final evaluation later in the notebook; selecting a checkpoint on them leaks
# test information — consider a separate validation split.
val_task_batcher = TaskBatcher(
    examples=test_data.head(200).to_dict('records'),
    vocab=vocab,
    mode='train',
    batch_size=50,
    single_pass=False,
)

# --- Trainer -----------------------------------------------------------------
train_processor = TaskTrain(vocab, task_batcher, opt, TaskModel, val_task_batcher)

In [15]:
# Load pre-trained embedding weights into the model.
# NOTE(review): the file name is relative and, unlike the data paths in this
# notebook, is not joined with config.log_root here; where it is resolved
# depends on load_embeddings / the kernel's working directory — confirm.
train_processor.model.load_embeddings("embedding_6639_200.tar")

In [16]:
# Checkpoint location used by training (the captured output prints
# "model saved at: data/saved_models/<iter>.tar").
# NOTE(review): this path is relative, whereas the evaluation cell loads the
# checkpoint from os.path.join(config.log_root, "data/saved_models/..."); the
# two only agree when the working directory equals config.log_root — confirm.
config.save_model_path = "data/saved_models"

# Run 200 training iterations. In the captured output a loss line is printed
# every iteration and a checkpoint is written every 20; mle_loss_val stays at
# the placeholder -100.0000 until iteration 20 and then changes only every 20
# iterations, so it appears to be refreshed on the save schedule.
mle_losses = train_processor.trainIters(n_iters=200, report_every=1, save_every = 20)

iter: 1 mle_loss: 6.156 mle_loss_val: -100.0000
iter: 2 mle_loss: 5.996 mle_loss_val: -100.0000
iter: 3 mle_loss: 6.573 mle_loss_val: -100.0000
iter: 4 mle_loss: 5.907 mle_loss_val: -100.0000
iter: 5 mle_loss: 5.893 mle_loss_val: -100.0000
iter: 6 mle_loss: 5.372 mle_loss_val: -100.0000
iter: 7 mle_loss: 4.603 mle_loss_val: -100.0000
iter: 8 mle_loss: 4.012 mle_loss_val: -100.0000
iter: 9 mle_loss: 4.355 mle_loss_val: -100.0000
iter: 10 mle_loss: 3.586 mle_loss_val: -100.0000
iter: 11 mle_loss: 3.572 mle_loss_val: -100.0000
iter: 12 mle_loss: 3.766 mle_loss_val: -100.0000
iter: 13 mle_loss: 3.235 mle_loss_val: -100.0000
iter: 14 mle_loss: 2.986 mle_loss_val: -100.0000
iter: 15 mle_loss: 3.070 mle_loss_val: -100.0000
iter: 16 mle_loss: 3.145 mle_loss_val: -100.0000
iter: 17 mle_loss: 3.135 mle_loss_val: -100.0000
iter: 18 mle_loss: 3.109 mle_loss_val: -100.0000
iter: 19 mle_loss: 2.955 mle_loss_val: -100.0000
iter: 20 mle_loss: 3.421 mle_loss_val: 3.0396
model saved at: 
 data/saved_mod

iter: 169 mle_loss: 2.004 mle_loss_val: 2.2549
iter: 170 mle_loss: 2.059 mle_loss_val: 2.2549
iter: 171 mle_loss: 1.989 mle_loss_val: 2.2549
iter: 172 mle_loss: 2.192 mle_loss_val: 2.2549
iter: 173 mle_loss: 2.089 mle_loss_val: 2.2549
iter: 174 mle_loss: 1.996 mle_loss_val: 2.2549
iter: 175 mle_loss: 1.869 mle_loss_val: 2.2549
iter: 176 mle_loss: 2.119 mle_loss_val: 2.2549
iter: 177 mle_loss: 2.182 mle_loss_val: 2.2549
iter: 178 mle_loss: 2.271 mle_loss_val: 2.2549
iter: 179 mle_loss: 2.300 mle_loss_val: 2.2549
iter: 180 mle_loss: 2.272 mle_loss_val: 2.2279
model saved at: 
 data/saved_models/0000180.tar
iter: 181 mle_loss: 2.091 mle_loss_val: 2.2279
iter: 182 mle_loss: 1.780 mle_loss_val: 2.2279
iter: 183 mle_loss: 2.153 mle_loss_val: 2.2279
iter: 184 mle_loss: 1.972 mle_loss_val: 2.2279
iter: 185 mle_loss: 1.965 mle_loss_val: 2.2279
iter: 186 mle_loss: 1.742 mle_loss_val: 2.2279
iter: 187 mle_loss: 2.026 mle_loss_val: 2.2279
iter: 188 mle_loss: 1.765 mle_loss_val: 2.2279
iter: 189 ml

In [18]:
mle_losses

{'train': [(1, 6.156216144561768),
  (2, 5.995687961578369),
  (3, 6.572842597961426),
  (4, 5.907210350036621),
  (5, 5.893364429473877),
  (6, 5.371551036834717),
  (7, 4.602599143981934),
  (8, 4.011568546295166),
  (9, 4.354945659637451),
  (10, 3.58575439453125),
  (11, 3.5724947452545166),
  (12, 3.7657856941223145),
  (13, 3.235222816467285),
  (14, 2.985779047012329),
  (15, 3.070305585861206),
  (16, 3.145054578781128),
  (17, 3.135206937789917),
  (18, 3.108992099761963),
  (19, 2.955451488494873),
  (20, 3.421107769012451),
  (21, 2.831132650375366),
  (22, 2.7401914596557617),
  (23, 3.0186681747436523),
  (24, 3.007930278778076),
  (25, 2.887131929397583),
  (26, 2.494403123855591),
  (27, 2.9594056606292725),
  (28, 3.0613701343536377),
  (29, 2.967100143432617),
  (30, 2.8152413368225098),
  (31, 2.9568252563476562),
  (32, 2.9920711517333984),
  (33, 2.743544578552246),
  (34, 2.7561545372009277),
  (35, 2.779202461242676),
  (36, 3.0034172534942627),
  (37, 2.878787517

Decoding Time

In [23]:
# Evaluation options. Note that `opt` and `task_batcher` are rebound here and
# shadow the objects of the same name created for training.
task = "validate"

load_model = os.path.join(config.log_root, "data/saved_models/0000180.tar") # checkpoint file written at iteration 180 (a file, not a directory)

opt = Namespace(task = task, load_model = load_model) # options bag handed to TaskEvaluate


# New batcher for evaluation over the whole test split.
# - single_pass=True: in the captured output the batcher's background thread
#   ends with "RuntimeError: generator raised StopIteration" once the examples
#   are exhausted; that raise originates in data_util/batcher.py
#   (text_generator / fill_example_queue), not in this cell.
# - batch_size=188: the run's output shows 3 batches and 564 decoded
#   sentences (3 x 188).
# NOTE(review): mode='train' is passed even though this batcher is used for
# evaluation — confirm that is what TaskBatcher expects here.
task_batcher = TaskBatcher( # batching object
    examples=test_data.to_dict('records'),
    vocab=vocab, 
    mode='train', 
    batch_size=188, 
    single_pass=True)

eval_processor = TaskEvaluate(vocab, task_batcher, opt, TaskModel) # evaluation object

In [24]:
def count_parameters(model):
    """Return the total element count of the trainable parameters of ``model``.

    ``model`` must expose ``parameters()``; only parameters whose
    ``requires_grad`` is truthy contribute their ``numel()`` to the total.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

example_generator completed reading all examples. No more data.


Exception in thread Thread-20:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 444, in text_generator
    example = next(example_generator)
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 425, in fill_example_queue
    context, task, summary = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



In [25]:
count_parameters(eval_processor.model)

14911184

In [26]:
# Run decoding over the evaluation batcher and unpack the four returned values.
# Going by the target names these are the decoded summaries, the reference
# summaries, the task sentences and the contexts; the following cells show the
# first two have the same length (564 in this run).
decoded_sents, ref_sents, task_sents, context_sents = eval_processor.evaluate_batch()

Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
INFO:tensorflow:Finished reading dataset in single_pass mode.


In [27]:
len(ref_sents)

564

In [28]:
len(decoded_sents)

564

In [29]:
# Score the decoded summaries against the references. With avg = True the
# result is a single dict holding one f/p/r triple per metric (rouge-1,
# rouge-2, rouge-l), as the captured output of the next cell shows.
# NOTE(review): `Rouge` is not imported explicitly in this notebook; it
# presumably comes from one of the star imports — confirm.
scores = Rouge().get_scores(decoded_sents, ref_sents, avg = True)

In [30]:
scores

{'rouge-1': {'f': 0.5231746168243703,
  'p': 0.6946175278622088,
  'r': 0.46071446617050515},
 'rouge-2': {'f': 0.30014812891335485,
  'p': 0.4127659574468086,
  'r': 0.2661482553104892},
 'rouge-l': {'f': 0.5232927180716093,
  'p': 0.7001730834177643,
  'r': 0.4579418682242525}}

In [31]:
# Show full cell contents instead of truncating long strings.
# pandas >= 1.0 expects None for "no limit"; the old -1 spelling was
# deprecated there and is rejected by pandas 2.x. Fall back to -1 only for
# old pandas versions whose validator does not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [32]:
# Side-by-side table of reference vs. decoded summaries; display the first rows.
df = pd.DataFrame({'ref': ref_sents, 'decoded': decoded_sents})
df.head()

Unnamed: 0,ref,decoded
0,review details,review details
1,send comments from legal department to SENDER,send those legal department
2,update SENDER of changes,bring up to date
3,give SENDER weather data,offer
4,check if deals can be rolled or extended,perform this


In [None]:
# Persist the reference/decoded table under config.log_root (header row, no index).
# NOTE(review): the file is written tab-separated (sep = '\t') despite its
# .csv extension — readers must pass sep='\t' when loading it.
df.to_csv(os.path.join(config.log_root, 'data/test_results_2.csv'), sep = '\t', header=True, index=False)

In [None]:
'packet' in vocab._word_to_id

In [None]:
'interview' in vocab._word_to_id

In [None]:
# Full reference/decoded comparison table.
# NOTE(review): ref_sents / decoded_sents come from the evaluation run whose
# batcher was built from test_data, so these are test-set (not training)
# examples; this also renders every row rather than a sample.
pd.DataFrame({'ref': ref_sents, 'decoded': decoded_sents})

In [None]:
test_data.head()

In [None]:
test_data

In [None]:
ref_sents

..
..


In [None]:
from nltk.corpus import words

In [None]:
len(words.words())

In [None]:
import nltk

In [None]:
# Print every word yielded by nltk.corpus.wordnet.words(), one per line.
# NOTE(review): scratch cell — this floods the notebook output; a bounded
# sample (e.g. via itertools.islice) would be easier to read.
for w in nltk.corpus.wordnet.words(): print(w)

In [None]:
len(list(nltk.corpus.wordnet.words()))