In [4]:
import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [5]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  8


In [6]:
# path = '/tmp/punctuator_data'
path = '/home/jupyter'
with open(os.path.join(path, 'train.txt'), 'r') as f:
    target_str = f.read()
    target_text = target_str.split('\n')

with open(os.path.join(path, 'train_nopunc.txt'), 'r') as f:
    input_str = f.read()
    input_text = input_str.split('\n')

In [7]:
type(target_str)

str

In [8]:
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [9]:
print(target_text[:3])

['Rachel clasped her hands together and slowed her pace.', 'Accepted crystallographic symbolism has been used;', "They found a large welcoming group - El Paso policemen, Border Patrol, sheriff's deputies, and FBI men, who surged around the plane with rifles and submachine guns."]


In [10]:
''.join(sorted(set(target_str)))

'\n !"$%&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz{}'

In [11]:
''.join(sorted(set(input_str)))

'\n 0123456789abcdefghijklmnopqrstuvwxyz'

In [12]:
def tokenize(text):
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
    text_tokenizer.fit_on_texts(text)

    tensor = text_tokenizer.texts_to_sequences(text)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

    return tensor, text_tokenizer

In [13]:
input_tensor_train, input_tokenizer = tokenize(input_text)

In [14]:
target_tensor_train, target_tokenizer = tokenize(target_text)

In [15]:
max_length_targ, max_length_inp = target_tensor_train.shape[1], input_tensor_train.shape[1]

In [16]:
target_tensor_train.shape

(46839, 864)

In [17]:
target_tensor_train

array([[48,  4, 13, ...,  0,  0,  0],
       [27, 13, 13, ...,  0,  0,  0],
       [26, 10,  2, ...,  0,  0,  0],
       ...,
       [30,  4, 15, ...,  0,  0,  0],
       [53,  5, 10, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int32)

In [18]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))

In [19]:
convert(input_tokenizer, input_tensor_train[0])

9 ----> r
4 ----> a
13 ----> c
10 ----> h
2 ----> e
11 ----> l
1 ---->  
13 ----> c
11 ----> l
4 ----> a
8 ----> s
17 ----> p
2 ----> e
12 ----> d
1 ---->  
10 ----> h
2 ----> e
9 ----> r
1 ---->  
10 ----> h
4 ----> a
7 ----> n
12 ----> d
8 ----> s
1 ---->  
3 ----> t
5 ----> o
18 ----> g
2 ----> e
3 ----> t
10 ----> h
2 ----> e
9 ----> r
1 ---->  
4 ----> a
7 ----> n
12 ----> d
1 ---->  
8 ----> s
11 ----> l
5 ----> o
19 ----> w
2 ----> e
12 ----> d
1 ---->  
10 ----> h
2 ----> e
9 ----> r
1 ---->  
17 ----> p
4 ----> a
13 ----> c
2 ----> e


In [20]:
convert(target_tokenizer, target_tensor_train[0])

48 ----> R
4 ----> a
13 ----> c
10 ----> h
2 ----> e
11 ----> l
1 ---->  
13 ----> c
11 ----> l
4 ----> a
8 ----> s
17 ----> p
2 ----> e
12 ----> d
1 ---->  
10 ----> h
2 ----> e
9 ----> r
1 ---->  
10 ----> h
4 ----> a
7 ----> n
12 ----> d
8 ----> s
1 ---->  
3 ----> t
5 ----> o
18 ----> g
2 ----> e
3 ----> t
10 ----> h
2 ----> e
9 ----> r
1 ---->  
4 ----> a
7 ----> n
12 ----> d
1 ---->  
8 ----> s
11 ----> l
5 ----> o
19 ----> w
2 ----> e
12 ----> d
1 ---->  
10 ----> h
2 ----> e
9 ----> r
1 ---->  
17 ----> p
4 ----> a
13 ----> c
2 ----> e
23 ----> .


In [21]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 4
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 64
units = 128
vocab_inp_size = len(input_tokenizer.word_index) + 1
vocab_targ_size = len(target_tokenizer.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [22]:
dataset

<BatchDataset shapes: ((4, 843), (4, 864)), types: (tf.int32, tf.int32)>

In [23]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([4, 843]), TensorShape([4, 864]))

In [24]:
example_input_batch

<tf.Tensor: shape=(4, 843), dtype=int32, numpy=
array([[ 3, 10,  2, ...,  0,  0,  0],
       [14, 17,  1, ...,  0,  0,  0],
       [25,  4, 13, ...,  0,  0,  0],
       [ 6,  3,  1, ...,  0,  0,  0]], dtype=int32)>

In [25]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [26]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [27]:
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [28]:
sample_hidden.shape

TensorShape([4, 128])

In [29]:
sample_output.shape  # batch size, sequence length, units

TensorShape([4, 843, 128])

In [30]:
sample_hidden.shape

TensorShape([4, 128])

In [31]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    
    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        return context_vector, attention_weights

In [32]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

In [33]:
attention_result.shape  # batch size, units

TensorShape([4, 128])

In [34]:
attention_weights.shape  # batch_size, sequence_length, 1

TensorShape([4, 843, 1])

In [35]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        self.attention = BahdanauAttention(self.dec_units)
    
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        x = self.embedding(x)
        
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        x = self.fc(output)
        
        return x, state, attention_weights

In [36]:
decoder = Decoder(vocab_targ_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

In [37]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

In [38]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [39]:
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [40]:
@tf.function
def train_step(inp, targ, enc_hidden):
    start = time.time()
    loss = 0
    
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        
        dec_hidden = enc_hidden
        
        # TODO: check if decoder initialization makes sense
        dec_input = tf.expand_dims([1] * BATCH_SIZE, 1)  # initialize decoder input
        
        # teacher forcing
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            
            loss += loss_function(targ[:, t], predictions)
            
            dec_input = tf.expand_dims(targ[:, t], 1)
            
        batch_loss = (loss / int(targ.shape[1]))
        
        variables = encoder.trainable_variables + decoder.trainable_variables
        
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        print(time.time() - start, ' sec')
        return batch_loss

In [None]:
%%time
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
            
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

354.39831733703613  sec


In [None]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    
    
    inputs = [input_tokenizer.word_index[i] for i in sentence]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([1], 0)
    
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result += target_tokenizer.index_word[predicted_id] + ' '
        
        dec_input = tf.expand_dims([predicted_id], 0)
        
    return result, sentence, attention_plot


In [None]:
def punctuate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: ', sentence)
    print('Output: ', result)

In [None]:
punctuate('hello there')

In [None]:
punctuate('every few minutes she would awaken for a moment to review things stowey yes was on his way south and the two boys were away in school and nothing was burning on the stove and lucretia was coming for dinner and bringing three guests of hers')