In [1]:
# Outside imports
import os
import importlib
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
import model
import train
import evaluate
import train_util
import data_util.data
import data_util.batcher
import data_util.config
import data_util.preprocess

# Re-import the project modules so that edits made on disk since the kernel
# started are picked up. The tuple keeps the notebook's original reload order.
# NOTE(review): `train` is reloaded before `model`, `train_util` and the
# `data_util` modules; if it binds names from them at import time it may keep
# stale objects — confirm whether dependencies should be reloaded first.
_modules_to_reload = (
    data_util.preprocess,
    train,
    model,
    evaluate,
    train_util,
    data_util.config,
    data_util.data,
    data_util.batcher,
)
for _module in _modules_to_reload:
    importlib.reload(_module)

from train import *
from evaluate import *
from model import *
from train_util import *
from data_util.data import *
from data_util.batcher import *
from data_util.preprocess import *

In [3]:
# Load the tab-separated context/task dataset from under the project log root.
data_path = os.path.join(config.log_root, 'data/context_task_data.tsv')
dat = pd.read_csv(data_path, sep='\t')

# Replace missing values with empty strings (rebinding rather than mutating
# in place, so the statement reads as a plain transformation).
dat = dat.fillna('')

# Shuffle the rows reproducibly: with no `random_state`, `sample` draws from
# NumPy's global RNG, which is seeded on the line just above it.
np.random.seed(111)
dat = dat.sample(frac=1)

# Split the shuffled rows: the first TRAIN_FRACTION for training, the rest
# for testing.
TRAIN_FRACTION = .8
train_size = int(TRAIN_FRACTION * dat.shape[0])

# `.copy()` makes both frames independent of `dat`; without it the later
# column assignments write into slices of `dat` (chained assignment), which
# pandas only tolerates with a warning that this notebook silences.
train_data = dat[:train_size].copy()
test_data = dat[train_size:].copy()


In [4]:
# Process the data
def prep_data(df):
    """Return a copy of ``df`` with its three text columns preprocessed.

    ``Context`` and ``TaskSentence`` are passed value-by-value through
    ``article_process_text`` and ``Summary`` through ``summary_process_text``.
    The input frame is not modified, so calling this on a slice of another
    frame neither writes through to the parent nor re-processes text that was
    already processed when the cell is run again on the original input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Context``, ``TaskSentence`` and ``Summary``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    return df.assign(
        Context=df['Context'].map(article_process_text),
        TaskSentence=df['TaskSentence'].map(article_process_text),
        Summary=df['Summary'].map(summary_process_text),
    )
# Preprocess the text columns of both splits (training first, then test).
train_data, test_data = map(prep_data, (train_data, test_data))

In [5]:
# Build the vocabulary object from the vocab file stored under the log root.
vocab_path = os.path.join(config.log_root, 'data/vocab/vocab.txt')
vocab = Vocab(vocab_path)

Finished constructing vocabulary of 6639 total words. Last word added: LANGUAGE


In [6]:
# Seed `random` and the `T` library (CPU, plus every CUDA device when one is
# available) with a single shared value so repeated runs start from the same
# state.
# NOTE(review): neither `random` nor `T` is imported explicitly in this
# notebook; presumably both arrive through the star imports — confirm.
SEED = 123
random.seed(SEED)
T.manual_seed(SEED)
if T.cuda.is_available():
    T.cuda.manual_seed_all(SEED)
    
class Namespace:
    """Minimal attribute bag: each keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        # Copy every keyword onto the instance under the same name.
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

In [7]:
# Options handed to the trainer: MLE training switched on, RL switched off,
# no checkpoint to load and no learning-rate override.
mle_weight = 1.0
rl_weight = 1 - mle_weight  # the RL weight is whatever the MLE weight leaves over
train_mle, train_rl = "yes", "no"
load_model = new_lr = None

opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

# Batcher over every training record, in batches of 32.
task_batcher = TaskBatcher(
    examples=train_data.to_dict('records'),
    vocab=vocab,
    mode='train',
    batch_size=32,
    single_pass=False,
)

# Batcher used for the validation loss, built from the first N_VAL_EXAMPLES
# test rows. The frame is sliced *before* conversion so record dicts are only
# built for the rows that are actually kept.
# NOTE(review): these rows also belong to the test set that is scored later in
# this notebook, so the validation loss is not independent of the final
# evaluation — confirm this is intended.
N_VAL_EXAMPLES = 200
val_task_batcher = TaskBatcher(  # Batching obj
    examples=test_data.head(N_VAL_EXAMPLES).to_dict('records'),
    vocab=vocab,
    mode='train',
    batch_size=50,
    single_pass=False,
)

# Trainer wired to the vocabulary, both batchers, the options and the model class.
train_processor = TaskTrain(vocab, task_batcher, opt, TaskModel, val_task_batcher)

In [8]:
# Load the pre-trained embedding weights into the model.
# NOTE(review): this file name is relative to the working directory, unlike
# the data paths elsewhere in the notebook that are joined onto
# `config.log_root` — confirm the file is expected next to the notebook.
embedding_weights_file = "embedding_6639_200.tar"
train_processor.model.load_embeddings(embedding_weights_file)

In [9]:
# Directory that this run's checkpoints are written to.
config.save_model_path = "data/saved_models_2"

# Train for 400 iterations, reporting after every iteration and saving a
# checkpoint every 30; the returned value is kept for inspection below.
mle_losses = train_processor.trainIters(
    n_iters=400,
    report_every=1,
    save_every=30,
)

iter: 1 mle_loss: 6.149 mle_loss_val: -100.0000
iter: 2 mle_loss: 5.980 mle_loss_val: -100.0000
iter: 3 mle_loss: 6.558 mle_loss_val: -100.0000
iter: 4 mle_loss: 5.892 mle_loss_val: -100.0000
iter: 5 mle_loss: 5.856 mle_loss_val: -100.0000
iter: 6 mle_loss: 5.299 mle_loss_val: -100.0000
iter: 7 mle_loss: 4.500 mle_loss_val: -100.0000
iter: 8 mle_loss: 3.893 mle_loss_val: -100.0000
iter: 9 mle_loss: 4.333 mle_loss_val: -100.0000
iter: 10 mle_loss: 3.549 mle_loss_val: -100.0000
iter: 11 mle_loss: 3.579 mle_loss_val: -100.0000
iter: 12 mle_loss: 3.731 mle_loss_val: -100.0000
iter: 13 mle_loss: 3.141 mle_loss_val: -100.0000
iter: 14 mle_loss: 2.942 mle_loss_val: -100.0000
iter: 15 mle_loss: 3.059 mle_loss_val: -100.0000
iter: 16 mle_loss: 3.126 mle_loss_val: -100.0000
iter: 17 mle_loss: 3.153 mle_loss_val: -100.0000
iter: 18 mle_loss: 3.091 mle_loss_val: -100.0000
iter: 19 mle_loss: 2.932 mle_loss_val: -100.0000
iter: 20 mle_loss: 3.393 mle_loss_val: -100.0000
iter: 21 mle_loss: 2.895 mle_

iter: 171 mle_loss: 1.922 mle_loss_val: 2.3227
iter: 172 mle_loss: 2.458 mle_loss_val: 2.3227
iter: 173 mle_loss: 2.036 mle_loss_val: 2.3227
iter: 174 mle_loss: 1.980 mle_loss_val: 2.3227
iter: 175 mle_loss: 1.925 mle_loss_val: 2.3227
iter: 176 mle_loss: 2.023 mle_loss_val: 2.3227
iter: 177 mle_loss: 2.171 mle_loss_val: 2.3227
iter: 178 mle_loss: 2.416 mle_loss_val: 2.3227
iter: 179 mle_loss: 2.232 mle_loss_val: 2.3227
iter: 180 mle_loss: 2.334 mle_loss_val: 2.2281
model saved at: 
 data/saved_models_2/0000180.tar
iter: 181 mle_loss: 1.984 mle_loss_val: 2.2281
iter: 182 mle_loss: 1.831 mle_loss_val: 2.2281
iter: 183 mle_loss: 2.108 mle_loss_val: 2.2281
iter: 184 mle_loss: 2.007 mle_loss_val: 2.2281
iter: 185 mle_loss: 1.906 mle_loss_val: 2.2281
iter: 186 mle_loss: 1.683 mle_loss_val: 2.2281
iter: 187 mle_loss: 2.036 mle_loss_val: 2.2281
iter: 188 mle_loss: 1.854 mle_loss_val: 2.2281
iter: 189 mle_loss: 2.279 mle_loss_val: 2.2281
iter: 190 mle_loss: 2.226 mle_loss_val: 2.2281
iter: 191 

iter: 339 mle_loss: 1.957 mle_loss_val: 2.2192
iter: 340 mle_loss: 1.374 mle_loss_val: 2.2192
iter: 341 mle_loss: 1.766 mle_loss_val: 2.2192
iter: 342 mle_loss: 1.481 mle_loss_val: 2.2192
iter: 343 mle_loss: 1.551 mle_loss_val: 2.2192
iter: 344 mle_loss: 1.827 mle_loss_val: 2.2192
iter: 345 mle_loss: 1.796 mle_loss_val: 2.2192
iter: 346 mle_loss: 1.816 mle_loss_val: 2.2192
iter: 347 mle_loss: 1.753 mle_loss_val: 2.2192
iter: 348 mle_loss: 1.531 mle_loss_val: 2.2192
iter: 349 mle_loss: 1.910 mle_loss_val: 2.2192
iter: 350 mle_loss: 1.387 mle_loss_val: 2.2192
iter: 351 mle_loss: 1.484 mle_loss_val: 2.2192
iter: 352 mle_loss: 1.711 mle_loss_val: 2.2192
iter: 353 mle_loss: 1.713 mle_loss_val: 2.2192
iter: 354 mle_loss: 1.635 mle_loss_val: 2.2192
iter: 355 mle_loss: 1.595 mle_loss_val: 2.2192
iter: 356 mle_loss: 1.910 mle_loss_val: 2.2192
iter: 357 mle_loss: 1.750 mle_loss_val: 2.2192
iter: 358 mle_loss: 1.698 mle_loss_val: 2.2192
iter: 359 mle_loss: 1.894 mle_loss_val: 2.2192
iter: 360 mle

In [None]:
# Display the value returned by `trainIters` for the run above.
mle_losses

Decoding Time

In [None]:
task = "validate"

# Checkpoint to evaluate. Training writes its checkpoints under
# `config.save_model_path` (its log reports data/saved_models_2/0000180.tar),
# so the path is built from that same setting; the previous hard-coded
# "data/saved_models" directory did not match where this run saved.
load_model = os.path.join(config.save_model_path, "0000180.tar")  # model checkpoint file

opt = Namespace(task=task, load_model=load_model)  # opt


# new batcher for evaluation: a single pass over every test record.
# NOTE(review): `mode='train'` is kept from the original cell — confirm the
# batcher has no dedicated evaluation mode that should be used here.
task_batcher = TaskBatcher(  # Batching obj
    examples=test_data.to_dict('records'),
    vocab=vocab,
    mode='train',
    batch_size=188,
    single_pass=True)

eval_processor = TaskEvaluate(vocab, task_batcher, opt, TaskModel)  # Evaluation object

In [None]:
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Iterates ``model.parameters()`` and adds up ``numel()`` for every
    parameter whose ``requires_grad`` flag is truthy; frozen parameters are
    skipped. A model with no such parameters yields 0.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
# Number of trainable parameters in the model held by the evaluation object.
count_parameters(eval_processor.model)

In [None]:
# Run the evaluator and unpack its four results.
# NOTE(review): judging by the names these are the decoded outputs, reference
# summaries, task sentences and contexts — confirm against `evaluate_batch`.
(
    decoded_sents,
    ref_sents,
    task_sents,
    context_sents,
) = eval_processor.evaluate_batch()

In [None]:
# How many reference sentences came back from the evaluator.
len(ref_sents)

In [None]:
# How many decoded sentences came back; compare with the reference count above.
len(decoded_sents)

In [None]:
# Score the decoded sentences against the references, averaged over all pairs.
rouge_scorer = Rouge()
scores = rouge_scorer.get_scores(decoded_sents, ref_sents, avg=True)

In [None]:
# Display the averaged scores computed in the previous cell.
scores

In [None]:
# Show full cell contents instead of truncating long strings. `None` is the
# supported "no limit" value; passing -1 is deprecated and newer pandas
# releases reject it.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer for this option and use -1
    # to mean "no limit".
    pd.set_option('display.max_colwidth', -1)

In [None]:
# Put each reference next to its decoded counterpart and preview the first rows.
df = pd.DataFrame(dict(ref=ref_sents, decoded=decoded_sents))
df.head()

In [None]:
# Write the reference/decoded pairs under the log root, with a header row and
# without the index.
# NOTE(review): the file is tab-separated although its name ends in `.csv`.
results_path = os.path.join(config.log_root, 'data/test_results_2.csv')
df.to_csv(results_path, sep='\t', header=True, index=False)

In [None]:
# Spot check: is 'packet' a key of the vocab's word-to-id mapping?
# NOTE(review): `_word_to_id` is a private attribute of the vocab object.
'packet' in vocab._word_to_id

In [None]:
# Spot check: is 'interview' a key of the vocab's word-to-id mapping?
'interview' in vocab._word_to_id

Checking certain words for presence in the vocab:

In [None]:
# Words to look up in the vocab in the next cell.
words = ["ensure", "indicate", "turn", "open", "add"]

In [None]:
# One boolean per entry of `words`: True when the word is in the vocab.
[w in vocab._word_to_id for w in words]

In [None]:
# First space-delimited token of every training summary (the text before the
# first space, or the whole string when it contains no space).
action_words = train_data['Summary'].map(lambda summary: summary.partition(' ')[0])

In [None]:
# Check the dtype of the extracted first words.
action_words.dtype

In [None]:
def _is_vocab_word(word):
    """Return True when ``word`` is a key of the vocab's word-to-id mapping."""
    return word in vocab._word_to_id


# Boolean mask over the training rows: True where the summary's first word is
# in the vocab.
in_vocab = action_words.map(_is_vocab_word)

In [None]:
# Total number of first words checked (one per training summary).
len(in_vocab)

In [None]:
# Number of first words that are in the vocab (True counts as 1).
in_vocab.sum()

In [None]:
# First words that are NOT in the vocab.
action_words[~in_vocab]

In [None]:
# Test rows whose summary starts with a word that is missing from the vocab.
# `in_vocab` is built from `train_data`, so its index holds training rows only
# and cannot be aligned with `test_data`; build a mask from the test rows
# themselves instead.
test_action_words = test_data['Summary'].map(lambda x: x.split(' ', 1)[0])
test_in_vocab = test_action_words.map(lambda w: w in vocab._word_to_id)
test_data[~test_in_vocab]

Of the 2256 verbs that start a summary, 2240 are in the vocab.

In [None]:
# Training set: first words of summaries that are not in the vocab
# (`action_words` and `in_vocab` are both derived from `train_data`).
action_words[~in_vocab]

In [None]:
# How many out-of-vocab-first-word training rows each labeler contributed.
train_data.loc[~in_vocab, 'Labeler'].value_counts()

In [None]:
# The training summaries whose first word is not in the vocab.
train_data.loc[~in_vocab, 'Summary']