<a href="https://colab.research.google.com/github/h4ck4l1/datasets/blob/main/NLP_with_RNN_and_Attention/Spa_to_En_NeuralTranslationNetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# from google.colab import auth
# auth.authenticate_user()
import os,warnings
import numpy as np
from zipfile import ZipFile
# Silence TensorFlow's native (C++) logging. The variable TensorFlow reads is
# TF_CPP_MIN_LOG_LEVEL, and it has to be set before `import tensorflow` runs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
# Hide Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow import keras
# Restrict TensorFlow's Python-side logger to messages at ERROR level.
tf.get_logger().setLevel("ERROR")

In [2]:
# Distribution strategy used by the rest of the notebook.
# Disabled alternative: connect to a TPU and build a TPUStrategy instead.
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(resolver)
# tf.tpu.experimental.initialize_tpu_system(resolver)
# strategy = tf.distribute.TPUStrategy(resolver)
# Active choice: CentralStorageStrategy with its default arguments.
strategy = tf.distribute.experimental.CentralStorageStrategy()

In [3]:
with tf.device("/job:localhost"):
    # Download the English/Spanish sentence-pair archive and unpack it locally.
    file_path = keras.utils.get_file(fname="/content/spa_to_en.zip",origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip")
    with ZipFile(file_path,"r") as f:
        f.extractall("/content/en_to_spa")
    # Decode explicitly as UTF-8: the text contains non-ASCII characters
    # (e.g. "¿", "¡"), and without `encoding` the result depends on the
    # platform's locale default.
    with open("/content/en_to_spa/spa-eng/spa.txt","r",encoding="utf-8") as f:
        text = f.read()
    # Drop the inverted Spanish punctuation marks before tokenisation.
    new_text = text.replace("¿","").replace("¡","")
    # Each line is "<english>\t<spanish>"; split into parallel tuples.
    full_text = [line.split("\t") for line in new_text.splitlines()]
    en_text,es_text = zip(*full_text)
    # NOTE(review): the pairs are later split into train/validation purely by
    # position, with no shuffling. If the file is ordered (e.g. by sentence
    # length) the validation set will not be representative — consider a seeded
    # shuffle of `full_text` here.
    total_size = len(en_text) # number of sentence pairs (originally noted as 189117 — TODO confirm)

In [4]:
def get_layers(vocab_size=1000,sequence_length=50):
    """Build and adapt the English and Spanish text-vectorization layers.

    Both layers are adapted on the notebook-level `en_text` / `es_text`
    corpora. The Spanish layer is adapted on sentences wrapped in "sos" /
    "eos" so that both marker tokens end up in its vocabulary.

    Args:
        vocab_size: maximum vocabulary size passed to each TextVectorization layer.
        sequence_length: fixed output length (in tokens) of every vectorized
            sentence. Previously this argument was ignored and 50 was hard-coded.

    Returns:
        Tuple `(en_vec_layer, es_vec_layer)` of adapted layers.
    """
    en_vec_layer = keras.layers.TextVectorization(vocab_size,output_sequence_length=sequence_length)
    es_vec_layer = keras.layers.TextVectorization(vocab_size,output_sequence_length=sequence_length)
    en_vec_layer.adapt(en_text)
    es_vec_layer.adapt([f"sos {sentence} eos" for sentence in es_text])
    return en_vec_layer,es_vec_layer

In [5]:
def get_dataset(en_text,es_text,es_vec_layer,train_size):
    """Split the parallel corpus by position into training and validation sets.

    The first `train_size` pairs form the training set, the remainder the
    validation set. Encoder inputs are the raw English strings, decoder inputs
    are the Spanish strings prefixed with "sos", and targets are the Spanish
    strings suffixed with "eos" and passed through `es_vec_layer`.

    Returns:
        `((X_train, X_dec_train), y_train, (X_valid, X_dec_valid), y_valid)`.
    """
    en_head, en_tail = en_text[:train_size], en_text[train_size:]
    es_head, es_tail = es_text[:train_size], es_text[train_size:]

    def decoder_inputs(sentences):
        # Raw strings with the start-of-sequence marker prepended.
        return tf.constant([f"sos {sentence}" for sentence in sentences])

    def targets(sentences):
        # Token ids of the sentences with the end-of-sequence marker appended.
        return es_vec_layer([f"{sentence} eos" for sentence in sentences])

    train_inputs = (tf.constant(en_head), decoder_inputs(es_head))
    valid_inputs = (tf.constant(en_tail), decoder_inputs(es_tail))
    train_targets = targets(es_head)
    valid_targets = targets(es_tail)
    return train_inputs, train_targets, valid_inputs, valid_targets

In [6]:
class NLP(keras.Model):
    """Encoder-decoder LSTM translation model that takes raw strings as input.

    The model vectorizes and embeds both input texts, encodes the first with an
    LSTM, and uses the encoder's final states to initialise a decoder LSTM over
    the second. A softmax Dense layer yields a distribution over `vocab_size`
    tokens at every decoder time step.
    """

    def __init__(self,en_vec_layer,es_vec_layer,vocab_size=1000,embed_size=128,mask_zero=True,**kwargs):
        """Create the model's layers.

        Args:
            en_vec_layer: vectorization layer applied to the encoder-side text.
            es_vec_layer: vectorization layer applied to the decoder-side text.
            vocab_size: embedding input size and number of output classes;
                should match the vocabulary size of the vectorization layers.
            embed_size: dimensionality of both embedding layers.
            mask_zero: when True (default), token id 0 — the padding id produced
                by the vectorization layers — is masked, so padded time steps
                are skipped by the LSTMs and excluded from loss/metrics. Pass
                False to restore the previous unmasked behaviour.
            **kwargs: forwarded to `keras.Model`.
        """

        super(NLP,self).__init__(**kwargs)
        self.en_vec_layer = en_vec_layer
        self.es_vec_layer = es_vec_layer
        # Without masking, the encoder's final state is taken after all the
        # trailing padding steps and the reported accuracy is dominated by
        # correctly predicted padding.
        self.en_embed = keras.layers.Embedding(vocab_size,embed_size,mask_zero=mask_zero)
        self.es_embed = keras.layers.Embedding(vocab_size,embed_size,mask_zero=mask_zero)
        # The encoder only needs to hand over its final states.
        self.en_encoder = keras.layers.LSTM(512,return_state=True)
        # The decoder emits one output per time step.
        self.es_decoder = keras.layers.LSTM(512,return_sequences=True)
        self.out = keras.layers.Dense(vocab_size,"softmax")

    def call(self,inputs):
        """Run the forward pass.

        Args:
            inputs: pair `(encoder_text, decoder_text)` of string tensors; in
                this notebook these are English sentences and Spanish sentences
                prefixed with "sos".

        Returns:
            Softmax probabilities over the vocabulary for each decoder time step.
        """

        en_input = inputs[0]
        es_input = inputs[1]
        en_encoded_out = self.en_vec_layer(en_input)
        es_encoded_out = self.es_vec_layer(es_input)
        en_embed_out = self.en_embed(en_encoded_out)
        es_embed_out = self.es_embed(es_encoded_out)
        # The encoder returns (output, state_h, state_c); only the states are used.
        encoder_out,*en_state = self.en_encoder(en_embed_out)
        decoder_out = self.es_decoder(es_embed_out,initial_state=en_state)
        dense_out = self.out(decoder_out)
        return dense_out

In [7]:
# Build the vectorizers, the datasets and the model inside the strategy scope.
with strategy.scope():
    # The first `train_size` sentence pairs are used for training, the rest for validation.
    train_size = 100_000
    valid_size = total_size-train_size
    # presumably 50 examples per replica x 8 replicas — TODO confirm
    BATCH_SIZE = 50*8
    en_vec_layer,es_vec_layer = get_layers()
    X_train,y_train,X_valid,y_valid = get_dataset(en_text,es_text,es_vec_layer,train_size=train_size)
    nlp_model = NLP(en_vec_layer,es_vec_layer)
    # Targets are integer token ids, hence the sparse cross-entropy loss.
    nlp_model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
        steps_per_execution=10
    )
    # Number of whole batches per epoch; floor division leaves any partial
    # final batch out of the step count.
    train_steps = train_size//BATCH_SIZE
    valid_steps = valid_size//BATCH_SIZE

In [8]:
# Train for 10 epochs, validating on the held-out pairs after each epoch.
history = nlp_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=BATCH_SIZE,
    validation_data=(X_valid,y_valid),
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Mount Google Drive (Colab only) so the trained model can be written to it.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [15]:
# Persist the full model to Drive, replacing any earlier copy.
nlp_model.save(
    "/content/drive/MyDrive/Colab Notebooks/NLP_practice/nlp_model",
    save_format="tf",
    overwrite=True,
)



In [16]:
# Also store the weights on their own, replacing any earlier checkpoint.
nlp_model.save_weights(
    "/content/drive/MyDrive/Colab Notebooks/NLP_practice/nlp_model_weights",
    save_format="tf",
    overwrite=True,
)

In [17]:
# Flush pending writes to Drive and unmount it.
drive.flush_and_unmount()