In [1]:
# Outside imports
import os
import importlib
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so this applies to all of them).
# NOTE(review): this also hides deprecation notices from the libraries imported
# below; narrow the filter if those are worth seeing.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [103]:
import train
import model
import evaluate
import data_util.data
import data_util.batcher
import data_util.config

# Re-execute the project modules so edits made on disk are picked up without
# restarting the kernel. The order matches the original sequence of reloads.
for _module in (
    model,
    train,
    evaluate,
    data_util.config,
    data_util.data,
    data_util.batcher,
):
    importlib.reload(_module)

from model import *
from train import *
from evaluate import *
from data_util.data import *
from data_util.batcher import *


In [3]:
# Load the task-sentence / summary corpus (tab-separated).
# The location can be overridden through the CONTEXT_TASK_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset it falls back to the original local path, so existing runs are unchanged.
data_path = os.environ.get(
    'CONTEXT_TASK_DATA_PATH',
    '/Users/rowancassius/Desktop/capstone/data/context_task_data.tsv',
)
dat = pd.read_csv(data_path, sep='\t')

# Train/test split: 20% held out, shuffled with a fixed random_state so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    dat.TaskSentence.values,
    dat.Summary.values,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

# Fit the tokenizer on the training sentences and training summaries only
# (the held-out split is not used to build the word index).
# The filter string lists the characters stripped from the text; apostrophes,
# hyphens and '@' are not in it, so they are kept inside tokens.
tokenizer = Tokenizer(lower=True, filters='!"#$%&()*+,./:;<=>?[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(list(x_train) + list(y_train))

# Normalize the training text by round-tripping it through the tokenizer
# (text -> id sequences -> text). Because the tokenizer was built with
# lower=True and the filter set above, this lowercases the text AND removes the
# filtered punctuation, not just the case.
# Note: x_test / y_test are left as raw, un-normalized strings.
x_train = tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(x_train))
y_train = tokenizer.sequences_to_texts(tokenizer.texts_to_sequences(y_train))

# (source sentence, target summary) pairs consumed by the batchers below.
examples = list(zip(x_train, y_train))

In [4]:
# Seed the Python and torch random number generators with one shared value.
seed_value = 123
random.seed(seed_value)
T.manual_seed(seed_value)
# Only touch the CUDA generators when a GPU is actually available.
if T.cuda.is_available():
    T.cuda.manual_seed_all(seed_value)
    
class Namespace:
    """Minimal attribute bag: every keyword argument becomes an instance attribute.

    Used in this notebook to hand a set of options to ``Train`` as a single
    object with attribute access.
    """

    def __init__(self, **kwargs):
        """Store each ``name=value`` pair from ``kwargs`` as ``self.name``."""
        vars(self).update(kwargs)

In [5]:
# Baseline (non-BERT) pipeline.
# Vocabulary: built from every word in the Keras tokenizer's word index, with
# max_size set to exactly that number of words.
vocab = data.Vocab(words=tokenizer.word_index.keys(),
                   max_size=len(tokenizer.word_index))

# Batcher over the (sentence, summary) training pairs, 32 examples per batch.
# single_pass=False: presumably the batcher keeps cycling over the examples
# instead of stopping after one epoch; confirm in data_util.batcher.
batcher = Batcher(examples=examples,
                  vocab=vocab,
                  mode='train',
                  batch_size=32,
                  single_pass=False)

max_size of vocab was specified as 2829; we now have 2829 words. Stopping reading.
Finished constructing vocabulary of 2829 total words. Last word added: foundation
INFO:tensorflow:Bucket queue size: 12, Input queue size: 547


In [89]:
# Training options for the baseline model.
# The "yes"/"no" strings and the weights below are passed through to Train
# unchanged; mle_weight = 1.0 makes rl_weight evaluate to 0.0.
# (train_mle / train_rl presumably toggle the maximum-likelihood and
# reinforcement-learning objectives; confirm in train.py.)
train_mle, train_rl = "yes", "no"
mle_weight = 1.0
rl_weight = 1 - mle_weight
load_model = None
new_lr = None

# Bundle the options into a single object with attribute access.
opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

# Trainer wired to the baseline vocabulary and batcher.
train_processor = Train(vocab, batcher, opt)

INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [90]:
# Prepare the baseline trainer, passing the Model class as `model`.
# What setup_train builds or loads is defined in train.py and is not visible
# in this notebook.
train_processor.setup_train(model=Model)

0

In [91]:
# Smoke test: take the next batch from the trainer's own batcher and pass it to
# train_one_batch. The call's return value is displayed as the cell output.
# The second positional argument is 1, presumably the iteration number;
# confirm against Train.train_one_batch.
train_processor.train_one_batch(train_processor.batcher.next_batch(), 1)

x_t shape: torch.Size([32])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
POST embed x_t shape torch.Size([32, 256])
final_dist: torch.Size([32, 50000])
x_t multinomial: torch.Size([32])
x_t shape: torch.Size([32])
tensor([  0, 294, 500,  87,   0, 152,  28,  96, 324,  67,  16, 121,  63,  13,
         19,  55, 289, 774,  67, 200, 169,  19, 270, 229,  82, 208,  74,  33,
        169,  75,  19,  58])
POST embed x_t shape torch.Size([32, 256])
final_dist: torch.Size([32, 50000])
x_t multinomial: torch.Size([32])
x_t shape: torch.Size([32])
tensor([ 151,    9,   13,   10,   86,    0,   59,   11, 2174,  173,    9, 1817,
           0,   61,  163,   17,  971,    0,  112,    0,   37,    9,  979,  230,
         962,   89,   48,    4,    0,    6,   28,    0])
POST embed x_t shape torch.Size([32, 256])
final_dist: torch.Size([32, 50000])
x_t multinomial: torch.Size([32])
x_t shape: torch.Size([32])
tensor([ 13,  20, 975,  86,   4,  

(6.453962326049805, 0)

INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000


In [12]:
# BERT pipeline.
# Vocabulary wrapper constructed from the 'bert-base-uncased' identifier.
bert_vocab = BertVocab('bert-base-uncased')

# Batcher over the same training pairs as the baseline, but using the BERT
# vocabulary; 32 examples per batch.
bert_batcher = Batcher(examples=examples,
                       vocab=bert_vocab,
                       mode='train',
                       batch_size=32,
                       single_pass=False)

INFO:tensorflow:Bucket queue size: 21, Input queue size: 352


In [104]:
# Training options for the BERT model: same values as the baseline run.
# Note: this rebinds the notebook-level names train_mle, train_rl, mle_weight,
# load_model, new_lr, rl_weight and opt.
train_mle, train_rl = "yes", "no"
mle_weight = 1.0
rl_weight = 1 - mle_weight  # evaluates to 0.0 with mle_weight = 1.0
load_model = None
new_lr = None

# Bundle the options into a single object with attribute access.
opt = Namespace(
    train_mle=train_mle,
    train_rl=train_rl,
    mle_weight=mle_weight,
    load_model=load_model,
    new_lr=new_lr,
    rl_weight=rl_weight,
)

# Trainer wired to the BERT vocabulary and batcher.
bert_train_processor = Train(bert_vocab, bert_batcher, opt)

In [105]:
# bert_train_processor.setup_train(model=BertSummarizer)
# bert_train_processor.train_one_batch(bert_train_processor.batcher.next_batch(), 1)

In [None]:
# Run the BERT trainer with n_iters=5, passing the BertSummarizer class as `model`.
# NOTE(review): the explicit setup_train call for this trainer is commented out
# in the previous cell, so trainIters presumably performs the setup itself;
# confirm in train.py.
bert_train_processor.trainIters(n_iters=5, model=BertSummarizer)

0
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
INFO:tensorflow:Bucket queue size: 1000, Input queue size: 32000
1
2
