In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
import model
import train
import evaluate
import train_util
import data_util.data
import data_util.batcher
import data_util.config
import data_util.preprocess

importlib.reload(train)
importlib.reload(model)
importlib.reload(evaluate)
importlib.reload(train_util)
importlib.reload(data_util.config)
importlib.reload(data_util.data)
importlib.reload(data_util.batcher)
importlib.reload(data_util.preprocess)

from train import *
from evaluate import *
from model import *
from train_util import *
from data_util.data import *
from data_util.batcher import *
from data_util.preprocess import *

In [3]:
# load real data
data_path = '/Users/rowancassius/Desktop/capstone/data/context_task_data_fresh.tsv'
dat = pd.read_csv(data_path, sep='\t')

# fill nas
dat.fillna('', inplace=True)

# train/test split
np.random.seed(111)
dat = dat.sample(frac=1)

train_size = int(.8*dat.shape[0])
train_data = dat[:train_size]
test_data = dat[train_size:]


In [4]:
# Process the the data
def prep_data(df):
    df['Context'] = df['Context'].map(lambda x: article_process_text(x))
    df['TaskSentence'] = df['TaskSentence'].map(lambda x: article_process_text(x))
    df['Summary'] = df['Summary'].map(lambda x: summary_process_text(x))
    return df
train_data = prep_data(train_data)
test_data = prep_data(test_data)

In [5]:
train_data

Unnamed: 0,#,Task,Context,TaskSentence,Summary,Labeler,NoRequestInContext,Urgent,NotRequest,Unsure/Discuss,RandomNumber
1083,5187,Please give me a shout with,Attached is the file that I use from Storey to...,Please give me a shout with any questions.,contact SENDER with questions,Natalie,1,0,1,,0.053197
384,385,Please respond whether we have your most updat...,"Hi, I'm Shawna with Icon. I am updating our da...",Please respond whether we have your most updat...,check whether information is updated,Rowan,,0,,,0.282344
1348,5452,plz call when you can.,,plz call when you can 415-7827-822 .,call SENDER at 415-7827-822,Natalie,1,0,0,,0.779285
535,1727,"Karen, please call me when you receive this em...",,"Karen, please call me when you receive this em...",call SENDER,Percy,,,1,,0.929963
694,3461,Please look over the attached at your convenie...,I have them revised the document to reflect th...,Please look over the attached at your convenie...,look over attached documents,George,,,,,0.989056
...,...,...,...,...,...,...,...,...,...,...,...
1241,5345,IMAGE To receive our e-mails in a text-only fo...,Your kind of clothes. Online. All the time. IM...,please reply to this message and type change t...,reply to SENDER and type change to text,Natalie,1,0,1,,0.308842
305,306,"In light of this, please make sure your Confir...","Diane or Patrick, I need help from one of you ...","In light of this, please make sure your Confir...",ensure confirmation copy includes the GTC,Rowan,1,0,,,0.580914
247,248,Please review the attached non-standard discou...,"I'm OK on this, with one small change. Please ...",Please review the attached non-standard discou...,review attached letter,Rowan,,0,,,0.149520
181,182,"Please, check the logic of my",Headcount gives the allocations by major busin...,"Please, check the logic of my modifications I ...",check logic of modifications,Rowan,,,,,0.305235


In [6]:
vocab = Vocab.from_vocab_file(os.path.join(config.log_root, 'data/vocab.txt'))

Finished constructing vocabulary of 4654 total words. Last word added: included


In [7]:
vocab.word2id('now')

2732

In [7]:
random.seed(123)
T.manual_seed(123)
if T.cuda.is_available():
    T.cuda.manual_seed_all(123)
    
class Namespace:
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

In [36]:
train_mle = "yes"
train_rl = "no"
mle_weight = 1.0
load_model = None
new_lr = None
rl_weight = 1 - mle_weight

opt = Namespace(train_mle = train_mle, 
                train_rl = train_rl, 
                mle_weight = mle_weight, 
                load_model = load_model,
                new_lr = new_lr, 
                rl_weight = rl_weight)

task_batcher = TaskBatcher(
    examples=train_data.to_dict('records'),
    vocab=vocab,
    mode='train',
    batch_size=32,
    single_pass=False
)

val_task_batcher = TaskBatcher( # Batching obj
    examples=train_data.to_dict('records')[:200],
    vocab=vocab, 
    mode='train', 
    batch_size=50, 
    single_pass=False
)


train_processor = TaskTrain(vocab, task_batcher, opt, TaskModel, val_task_batcher)

In [9]:
config.save_model_path = "data/lstm_seg_ent_2"

mle_losses = train_processor.trainIters(n_iters=200, report_every=1, save_every = 20)

iter: 1 mle_loss: 6.290 mle_loss_val: -100.0000
iter: 2 mle_loss: 5.847 mle_loss_val: -100.0000
iter: 3 mle_loss: 6.003 mle_loss_val: -100.0000
iter: 4 mle_loss: 5.632 mle_loss_val: -100.0000
iter: 5 mle_loss: 5.439 mle_loss_val: -100.0000
iter: 6 mle_loss: 5.186 mle_loss_val: -100.0000
iter: 7 mle_loss: 4.832 mle_loss_val: -100.0000
iter: 8 mle_loss: 4.664 mle_loss_val: -100.0000
iter: 9 mle_loss: 4.020 mle_loss_val: -100.0000
-------------------Keyboard Interrupt------------------


NameError: name 'exit' is not defined

Decoding Time

In [40]:
config.save_model_path = "data/lstm_seg_ent_2"

task = "validate"

load_model = os.path.join(config.log_root, "data/lstm_seg_ent_2/0000200.tar") # model directory

opt = Namespace(task = task, load_model = load_model) # opt


# new batcher for evaluation
task_batcher = TaskBatcher( # Batching obj
    examples=test_data.to_dict('records'),
    vocab=vocab, 
    mode='train', 
    batch_size=10, 
    single_pass=True)

eval_processor = TaskEvaluate(vocab, task_batcher, opt, TaskModel) # Evaluation object

model_path /Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data/lstm_seg_ent_2/0000200.tar
example_generator completed reading all examples. No more data.


Exception in thread Thread-20:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 443, in text_generator
    example = next(example_generator)
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 424, in fill_example_queue
    context, task, summary = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



In [42]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [44]:
count_parameters(eval_processor.model)

14413103

In [41]:
eval_processor.model.state_dict()

OrderedDict([('encoder.lstm.weight_ih_l0',
              tensor([[-0.0954,  0.0502,  0.1390,  ...,  0.0518, -0.0395,  0.0514],
                      [ 0.0487,  0.0290, -0.0217,  ...,  0.0148, -0.0193,  0.0253],
                      [-0.0725,  0.0572,  0.1141,  ...,  0.0124, -0.0181,  0.0087],
                      ...,
                      [-0.1301,  0.1141,  0.1222,  ...,  0.0576, -0.0336,  0.0744],
                      [-0.1076,  0.0807,  0.0551,  ..., -0.0027, -0.0076,  0.0451],
                      [-0.1604,  0.1239,  0.1439,  ...,  0.0289, -0.0160,  0.0375]])),
             ('encoder.lstm.weight_hh_l0',
              tensor([[ 0.0335, -0.0072, -0.0085,  ..., -0.0220, -0.0239,  0.0236],
                      [ 0.0034, -0.0056,  0.0198,  ...,  0.0085, -0.0093,  0.0221],
                      [-0.0015,  0.0040,  0.0262,  ..., -0.0231,  0.0076, -0.0066],
                      ...,
                      [ 0.0188, -0.0020, -0.0154,  ..., -0.0133, -0.0255,  0.0245],
                 

In [26]:
decoded_sents, ref_sents, article_sents = eval_processor.evaluate_batch()

Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
example_generator completed reading all examples. No more data.


Exception in thread Thread-12:
Traceback (most recent call last):
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 443, in text_generator
    example = next(example_generator)
StopIteration

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/Users/rowancassius/opt/anaconda3/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rowancassius/Desktop/capstone/LSTM_Summarizer/data_util/batcher.py", line 424, in fill_example_queue
    context, task, summary = next(input_gen) # read the next example from file. article and abstract are both strings.
RuntimeError: generator raised StopIteration



Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
Summarizing Batch...
INFO:tensorflow:Finished reading dataset in single_pass mode.


In [31]:
len(ref_sents)

290

In [32]:
len(decoded_sents)

290

In [33]:
scores = Rouge().get_scores(decoded_sents, ref_sents, avg = True)

In [34]:
scores

{'rouge-1': {'f': 0.0871131137765286,
  'p': 0.13652983032293375,
  'r': 0.07235935126401657},
 'rouge-2': {'f': 0.02123266716332826,
  'p': 0.034140667761357414,
  'r': 0.017825412135756964},
 'rouge-l': {'f': 0.09327039381614881,
  'p': 0.15944581280788178,
  'r': 0.07215619629412733}}

In [23]:
pd.set_option('display.max_colwidth', -1)

In [24]:
df = pd.DataFrame({'ref': ref_sents, 'decoded': decoded_sents})
df.head()

Unnamed: 0,ref,decoded
0,tell Tom to clarify the cp,Kate Tom Alonso please
1,determine how renewal contract work will be coordinated,with with CDC
2,forward information to beneficial people,Cynthia Kase Cynthia Kase please
3,review draft of 20/20 Term Sheet,be a Letter of Intent please
4,forward things to SENDER,John John Sam


In [None]:
df.to_csv(os.path.join(config.log_root, 'data/test_results_2.csv'), sep = '\t', header=True, index=False)

In [None]:
'packet' in vocab._word_to_id

In [None]:
'interview' in vocab._word_to_id

In [None]:
# some training examples
pd.DataFrame({'ref': ref_sents, 'decoded': decoded_sents})

In [None]:
test_data.head()

In [None]:
test_data

In [None]:
ref_sents

..
..


In [None]:
from nltk.corpus import words

In [None]:
len(words.words())

In [None]:
import nltk

In [None]:
for w in nltk.corpus.wordnet.words(): print(w)

In [None]:
len(list(nltk.corpus.wordnet.words()))