In [1]:
import tensorflow as tf
# tf.config.run_functions_eagerly(True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn
import datetime, pathlib, io, os, time, random, re, string
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense, Flatten, InputLayer, BatchNormalization, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dropout, Conv1D
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorboard.plugins import projector

# Dataset

In [2]:
# !wget https://www.manythings.org/anki/fra-eng.zip

# Data Processing

In [3]:
text_dataset = tf.data.TextLineDataset("localdata/Section7/fra.txt")
text_dataset

2023-10-25 13:30:14.759144: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-10-25 13:30:14.759166: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-10-25 13:30:14.759169: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-10-25 13:30:14.759201: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-25 13:30:14.759213: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [4]:
for i in text_dataset.take(3):
    print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)


In [5]:
VOCAB_SIZE = 20000
ENGLISH_SEQUENCE_LENGTH = 64
FRENCH_SEQUENCE_LENGTH = 64
EMBEDDING_DIM = 300
BATCH_SIZE = 64

In [6]:
english_vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=ENGLISH_SEQUENCE_LENGTH
)

In [7]:
french_vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=FRENCH_SEQUENCE_LENGTH
)

In [8]:
def selector(input_text):
    split_text = tf.strings.split(input_text, '\t')
    return { 'input_1': split_text[0:1], 'input_2': 'starttoken ' + split_text[1:2] },  split_text[1:2] + ' endtoken'

In [9]:
split_dataset = text_dataset.map(selector)
split_dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'input_2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [10]:
for i in split_dataset.take(3):
    print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ! endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche. endtoken'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ! endtoken'], dtype=object)>)


In [11]:
def seperator(input_text):
    split_text = tf.strings.split(input_text, '\t')
    return split_text[0:1], 'starttoken ' + split_text[1:2] + ' endtoken'

In [12]:
init_dataset = text_dataset.map(seperator)
init_dataset

<_MapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None))>

In [13]:
for i in init_dataset.take(3):
    print(i)

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Va ! endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken Marche. endtoken'], dtype=object)>)
(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'starttoken En route ! endtoken'], dtype=object)>)


### vocabulary

In [14]:
english_training_data = init_dataset.map(lambda x, y: x)

In [15]:
for i in english_training_data.take(1):
    print(i)

tf.Tensor([b'Go.'], shape=(1,), dtype=string)


In [16]:
english_vectorize_layer.adapt(english_training_data)

2023-10-25 13:30:17.929624: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [17]:
french_training_data = init_dataset.map(lambda x, y: y)
french_vectorize_layer.adapt(french_training_data)

### tokenization

In [18]:
def vectorizer(inputs, output):
    return {
        'input_1': english_vectorize_layer(inputs['input_1']),
        'input_2': french_vectorize_layer(inputs['input_2']) }, french_vectorize_layer(output)

In [19]:
dataset = split_dataset.map(vectorizer)
dataset

<_MapDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [20]:
french_vectorize_layer.get_vocabulary()[3]

'endtoken'

In [21]:
for i in dataset.take(3):
    print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[44,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[  2, 103,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[103,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,

### shuffle and make as batch

In [22]:
dataset = dataset.shuffle(2038)
print(dataset)

<_ShuffleDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>


In [23]:
dataset = dataset.unbatch()
dataset

<_UnbatchDataset element_spec=({'input_1': TensorSpec(shape=(64,), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(64,), dtype=tf.int64, name=None)}, TensorSpec(shape=(64,), dtype=tf.int64, name=None))>

In [24]:
dataset = dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)
dataset

<_PrefetchDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [25]:
NUM_BATCHES = int(200000 / BATCH_SIZE)
NUM_BATCHES

3125

In [26]:
train_dataset = dataset.take(int(0.9 * NUM_BATCHES))
val_dataset = dataset.skip(int(0.9 * NUM_BATCHES))
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, 64), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.int64, name=None))>

In [27]:
for i in train_dataset.take(1):
    print(i)

({'input_1': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[    2, 10668,     0, ...,     0,     0,     0],
       [    2,   143,    44, ...,     0,     0,     0],
       [   20,  1157,     0, ...,     0,     0,     0],
       ...,
       [ 2340,     0,     0, ...,     0,     0,     0],
       [   20,  1351,     0, ...,     0,     0,     0],
       [   46,   173,     0, ...,     0,     0,     0]])>, 'input_2': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[    2,    24,    38, ...,     0,     0,     0],
       [    2,     4,   127, ...,     0,     0,     0],
       [    2,    24,    11, ...,     0,     0,     0],
       ...,
       [    2, 19347,     0, ...,     0,     0,     0],
       [    2,     4,    25, ...,     0,     0,     0],
       [    2,    72,    49, ...,     0,     0,     0]])>}, <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[   24,    38,    22, ...,     0,     0,     0],
       [    4,   127,   215, ...,     0,     0,     0],
       [   24,   

# Model

In [62]:
x = tf.keras.models.Sequential([Embedding(VOCAB_SIZE, EMBEDDING_DIM)])
x.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, None, 256)         5120000   
                                                                 
Total params: 5120000 (19.53 MB)
Trainable params: 5120000 (19.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.units = units

    def build(self, input_shape):
        self.embedding = Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = LSTM(self.units, return_sequences=True)

    def call(self, x):
        x = self.embedding(x)
        output = self.lstm(x)
        return output

In [51]:
EMBEDDING_DIM = 256
HIDDEN_UNITS = 256 # MUST BE SAME AS EMBEDDING DIM
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(tf.zeros([BATCH_SIZE, ENGLISH_SEQUENCE_LENGTH]))
encoder_output.shape # (BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_UNITS)

TensorShape([64, 64, 256])

In [47]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        self.w_1 = tf.keras.layers.Dense(self.units)
        self.w_2 = tf.keras.layers.Dense(self.units)
        self.w = tf.keras.layers.Dense(1)

    def call(self, prev_dec_state, enc_states):
        # print("prev_dec_state.shape=", prev_dec_state.shape) # BATCH_SIZE, HIDDEN_UNITS
        # print("enc_states.shape=", enc_states.shape) # BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_UNITS
        # print("tf.expand_dims(prev_dec_state, -2).shape=", tf.expand_dims(prev_dec_state, -2).shape)
        
        scores = self.w(
            tf.nn.tanh(
                self.w_1(tf.expand_dims(prev_dec_state, -2)) +
                self.w_2(enc_states)
            )
        )
        # print("scores.shape=", scores.shape) # BATCH_SIZE, SEQUENCE_LENGTH, 1

        attention_weights = tf.nn.softmax(scores, axis=1)
        # print("attention_weights.shape=", attention_weights.shape) # BATCH_SIZE, SEQUENCE_LENGTH, 1
        
        context_vector = attention_weights * enc_states
        # print("1.context_vector.shape=", context_vector.shape) # BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_UNITS
        
        context_vector = tf.reduce_sum(context_vector, axis=1)
        # print("2.context_vector.shape=", context_vector.shape) # BATCH_SIZE, HIDDEN_UNITS
        
        return context_vector#, attention_weights

In [48]:
bahdanau_attention = BahdanauAttention(HIDDEN_UNITS)
context_vector = bahdanau_attention(tf.zeros([BATCH_SIZE, 32]), tf.zeros([BATCH_SIZE, FRENCH_SEQUENCE_LENGTH, 32]))
print(context_vector.shape)

(64, 32)


In [63]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, sequence_length):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.dec_units = dec_units
        self.sequence_length = sequence_length

    def build(self, input_shape):
        self.dense = Dense(self.vocab_size, activation="softmax")
        self.gru = GRU(self.dec_units, return_sequences=True, return_state=True)
        self.attention = BahdanauAttention(self.dec_units)
        self.embedding = Embedding(self.vocab_size, self.embedding_dim)

    def call(self, x, hidden, shifted_target): # x is encoders' states, hidden is initial hidden state (for decoder)
        # print("x.shape=", x.shape) # BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_UNITS
        # print("hidden.shape=", hidden.shape) # BATCH_SIZE, HIDDEN_UNITS
        
        outputs = []
        context_vectors = []
        shifted_target = self.embedding(shifted_target)
        # print("shifted_targets.shape=", shifted_target.shape) # BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIM

        for t in range(0, self.sequence_length):
            context_vector = self.attention(hidden, x)
            # print(str(t), ".context_vector.shape=", context_vector.shape) # BATCH_SIZE, HIDDEN_UNITS
            # print(str(t), ".shifted_target[:, t].shape=", shifted_target[:, t].shape) # BATCH_SIZE, HIDDEN_UNITS
            # print(str(t), ".attention_weights.shape=", attention_weights.shape)
            dec_input = context_vector + shifted_target[:, t]
            # print(str(t), ".dec_input.shape=", dec_input.shape) # BATCH_SIZE, HIDDEN_UNITS
            output, hidden = self.gru(tf.expand_dims(dec_input, 1))
            # print(str(t), ".output.shape=", output.shape) # BATCH_SIZE, 1, HIDDEN_UNITS
            # print(str(t), ".hidden.shape=", hidden.shape) # BATCH_SIZE, HIDDEN_UNITS
            # print(str(t), ".output[:, 0].shape=", output[:, 0].shape) # BATCH_SIZE, HIDDEN_UNITS
            outputs.append(output[:, 0])

        outputs = tf.convert_to_tensor(outputs)
        # print("1.outputs.shape=", outputs.shape) # SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_UNITS
        outputs = tf.transpose(outputs, perm=[1,0,2])
        # print("2.outputs.shape=", outputs.shape) # BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_UNITS
        outputs = self.dense(outputs)
        # print("3.outputs.shape=", outputs.shape) # BATCHS_SIZE, SEQUENCE_LENGTH, VOCAB_SIZE
        return outputs

In [64]:
decoder = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
print(decoder(encoder_output, tf.zeros([BATCH_SIZE, HIDDEN_UNITS]), tf.zeros([BATCH_SIZE, FRENCH_SEQUENCE_LENGTH])).shape)

(64, 64, 20000)


In [66]:
### Encoder
input = Input(shape=(ENGLISH_SEQUENCE_LENGTH, ), dtype="int64", name="input_1")
encoder = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS)
encoder_output = encoder(input)

### Decoder
shifted_target = Input(shape=(FRENCH_SEQUENCE_LENGTH,), dtype="int64", name="input_2")
decodere = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_UNITS, FRENCH_SEQUENCE_LENGTH)
decoder_output = decoder(encoder_output, tf.zeros([1, HIDDEN_UNITS]), shifted_target)

### Output
bahdanau = tf.keras.models.Model([input, shifted_target], decoder_output)
bahdanau.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 encoder_4 (Encoder)         (None, 64, 256)              5645312   ['input_1[0][0]']             
                                                                                                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 decoder_7 (Decoder)         (None, 64, 20000)            1078659   ['encoder_4[0][0]',           
                                                          3          'input_2[0][0]']         

In [67]:
bahdanau.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(5e-4)
                )



In [68]:
history = bahdanau.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2)

Epoch 1/2


2023-10-25 15:11:11.222092: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp_10.


    218/Unknown - 210s 657ms/step - loss: 1.6263

KeyboardInterrupt: 