In [1]:
import string
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Path to translation file
# NOTE(review): the file is named deu.txt and the cleaned target sentences
# displayed further down are German, although downstream variable names say
# "spanish"/"spa" — confirm which language pair is intended.
path_to_data = '/kaggle/input/spanish-hindi/deu.txt'

# Read file; the context manager closes the handle even if read() raises.
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Parse data: one tab-separated record per line.
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data]
# Drop records that lack both a source and a target column (for example the
# empty string left after a trailing newline); they would otherwise raise
# IndexError when pair[1] is read in a later cell.
pairs = [pair for pair in pairs if len(pair) >= 2]
# Keep only the first 50,000 pairs.
pairs = pairs[:50000]

2024-03-02 07:59:31.721804: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-02 07:59:31.721965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-02 07:59:31.852252: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Number of sentence pairs kept after the truncation in the loading cell.
len(pairs)

50000

In [3]:
def clean_sentence(sentence):
    """Return ``sentence`` lower-cased with punctuation removed.

    Every character in ``string.punctuation`` plus the inverted marks
    "¡" and "¿" is deleted; all other characters (letters, digits,
    whitespace) are left in place.
    """
    # Characters to delete: ASCII punctuation plus the two inverted marks.
    chars_to_drop = string.punctuation + "¡" + '¿'
    deletion_table = str.maketrans('', '', chars_to_drop)
    return sentence.lower().translate(deletion_table)

In [4]:
def tokenize(sentences):
    """Fit a fresh ``Tokenizer`` on ``sentences`` and encode them.

    Returns a 2-tuple: the sentences converted with ``texts_to_sequences``
    and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    sequences = fitted_tokenizer.texts_to_sequences(sentences)
    return sequences, fitted_tokenizer

In [5]:
# English side: clean the first tab-separated column of every pair.
eng_sentencs = list(map(clean_sentence, (pair[0] for pair in pairs)))

In [7]:
# Preview the first 100 cleaned English sentences.
eng_sentencs[:100]

['go',
 'hi',
 'hi',
 'run',
 'run',
 'wow',
 'wow',
 'duck',
 'fire',
 'help',
 'help',
 'stay',
 'stop',
 'stop',
 'wait',
 'wait',
 'begin',
 'do it',
 'do it',
 'go on',
 'hello',
 'hello',
 'hello',
 'hurry',
 'hurry',
 'i hid',
 'i hid',
 'i ran',
 'i see',
 'i see',
 'i try',
 'i try',
 'i won',
 'i won',
 'i won',
 'oh no',
 'relax',
 'shoot',
 'shoot',
 'smile',
 'sorry',
 'ask me',
 'ask me',
 'ask me',
 'attack',
 'attack',
 'buy it',
 'cheers',
 'eat it',
 'eat up',
 'eat up',
 'eat up',
 'exhale',
 'freeze',
 'freeze',
 'go now',
 'got it',
 'got it',
 'got it',
 'got it',
 'got it',
 'got it',
 'he ran',
 'he ran',
 'hop in',
 'hop in',
 'hug me',
 'hug me',
 'hug me',
 'i care',
 'i fell',
 'i fell',
 'i fell',
 'i fell',
 'i fell',
 'i fled',
 'i fled',
 'i know',
 'i lied',
 'i lost',
 'i paid',
 'i paid',
 'i pass',
 'i sang',
 'i spit',
 'i spit',
 'i swim',
 'i wept',
 'i wept',
 'im 19',
 'im 19',
 'im ok',
 'im ok',
 'im up',
 'im up',
 'inhale',
 'listen',
 'no w

In [8]:
# Other-language side: clean the second tab-separated column of every pair.
span_sentencs = list(map(clean_sentence, (pair[1] for pair in pairs)))

In [9]:
# Preview the first five cleaned sentences from the second column.
span_sentencs[:5]

['geh', 'hallo', 'grüß gott', 'lauf', 'lauf']

In [11]:
# Tokenize words: each call fits its own tokenizer and returns
# (list of integer sequences, fitted tokenizer) for that side of the corpus.
spa_text_tokenized, spa_text_tokenizer = tokenize(span_sentencs)
eng_text_tokenized, eng_text_tokenizer = tokenize(eng_sentencs)

In [12]:
# Preview the integer sequences produced for the first five sentences.
spa_text_tokenized[:5]

[[137], [577], [2199, 833], [1554], [1554]]

In [13]:
# Report the token count of the longest tokenized sentence on each side.
print('Maximum length spanish sentence: {}'.format(max(map(len, spa_text_tokenized))))
print('Maximum length english sentence: {}'.format(max(map(len, eng_text_tokenized))))

Maximum length spanish sentence: 12
Maximum length english sentence: 6


In [14]:
# Check language length
# The +1 leaves room for index 0, which the padded sequences use as filler,
# so each *_vocab value is one larger than the number of distinct words.
# NOTE(review): the messages below therefore report "unique words" as
# word count + 1 — confirm whether that is the intended figure.
spanish_vocab = len(spa_text_tokenizer.word_index) + 1
english_vocab = len(eng_text_tokenizer.word_index) + 1
print("Spanish vocabulary is of {} unique words".format(spanish_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Spanish vocabulary is of 10077 unique words
English vocabulary is of 6084 unique words


In [15]:
# Length (in tokens) of the longest sentence on each side; used as the
# padding width and as the model's sequence lengths.
max_spanish_len = max(map(len, spa_text_tokenized))
max_english_len = max(map(len, eng_text_tokenized))

In [16]:
# Right-pad every sequence with zeros up to the longest sentence of its side.
spa_pad_sentence = pad_sequences(spa_text_tokenized, maxlen=max_spanish_len, padding="post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen=max_english_len, padding="post")

In [17]:
# Sanity check: report the longest row length in each padded array.
print('Maximum length spanish sentence: {}'.format(max(map(len, spa_pad_sentence))))
print('Maximum length english sentence: {}'.format(max(map(len, eng_pad_sentence))))

Maximum length spanish sentence: 12
Maximum length english sentence: 6


In [18]:
# Inspect the first padded English sequence (zeros are padding).
eng_pad_sentence[0]

array([29,  0,  0,  0,  0,  0], dtype=int32)

In [19]:
# Reshape data: append a trailing axis of size 1 to each padded array.
# The ndim guard keeps this cell idempotent — without it, every re-run
# would append yet another size-1 axis to the already reshaped arrays.
if spa_pad_sentence.ndim == 2:
    spa_pad_sentence = spa_pad_sentence.reshape(*spa_pad_sentence.shape, 1)
if eng_pad_sentence.ndim == 2:
    eng_pad_sentence = eng_pad_sentence.reshape(*eng_pad_sentence.shape, 1)

The code above appends a trailing axis of size 1 to each padded sequence, so that, for example, array([35,  0,  0,  0,  0,  0,  0,  0], dtype=int32) becomes
array([[35],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0]], dtype=int32)

In [20]:
# Inspect the first English sequence again, now that the reshape cell has run.
eng_pad_sentence[0]

array([[29],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0]], dtype=int32)

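In the code below, the first line declares the model's input: a tensor holding one padded source sentence of length `max_spanish_len` per sample. The second line maps each token index in that sentence to a 128-dimensional embedding vector.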

In [21]:
# Symbolic input: one padded source sequence of max_spanish_len token ids per sample.
input_sequence = Input(shape=(max_spanish_len,))
# Look up a 128-dimensional vector for every token id of the source vocabulary.
# NOTE(review): mask_zero is not set, so padding id 0 is embedded like any
# other token — confirm this is intended.
embedding = Embedding(input_dim=spanish_vocab, output_dim=128,)(input_sequence)

In [22]:
# Inspect the symbolic input tensor created above.
input_sequence

<KerasTensor shape=(None, 12), dtype=float32, sparse=None, name=keras_tensor>

In [23]:
# Encoder: 64-unit LSTM; return_sequences=False keeps only the final output
# vector instead of one output per time step.
encoder = LSTM(64, return_sequences=False)(embedding)


As the image shows, the encoder's hidden vector is repeated n times so that every time step of the decoder LSTM receives the same vector. The `RepeatVector` layer does exactly this: as its name implies, it repeats the vector it receives, and the only parameter we need to define is n, the number of repetitions. This number is equal to the number of time steps of the decoder, in other words the maximum English sentence length, 6.

In [24]:
# Repeat the encoder vector once per decoder time step (max_english_len copies).
r_vec = RepeatVector(max_english_len)(encoder)


In [25]:
# Decoder: 64-unit LSTM configured with dropout=0.2; return_sequences=True
# makes it emit one output per time step.
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)


In [26]:
# Apply the same Dense layer at every time step, producing english_vocab
# scores per step (softmax is applied when the model is assembled).
logits = TimeDistributed(Dense(english_vocab))(decoder)


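Embedding layer size: input_dim = spanish_vocab, and each token index is mapped to a 128-dimensional vector, so the layer holds spanish_vocab × 128 weights.
With the vocabulary size printed above (10077) that is 10077 × 128 = 1,289,856 parameters.
(The figure 15804 × 128 = 2,022,912 quoted here previously does not match the 10077-word vocabulary reported by this notebook.)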

In [27]:
# Assemble the encoder-decoder: softmax over the per-step logits gives a
# probability distribution over the English vocabulary at every time step.
enc_dec_model = Model(
    inputs=input_sequence,
    outputs=Activation('softmax')(logits),
)
# Integer targets, hence the sparse categorical cross-entropy loss.
enc_dec_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

In [28]:
# Train on the full padded corpus (no validation split is configured here).
model_results = enc_dec_model.fit(
    x=spa_pad_sentence,
    y=eng_pad_sentence,
    batch_size=30,
    epochs=50,
)


Epoch 1/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 13ms/step - accuracy: 0.4106 - loss: 4.4831
Epoch 2/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.4633 - loss: 3.4820
Epoch 3/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.5062 - loss: 3.1144
Epoch 4/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.5395 - loss: 2.7990
Epoch 5/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.5741 - loss: 2.5111
Epoch 6/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.5952 - loss: 2.3097
Epoch 7/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.6137 - loss: 2.1378
Epoch 8/50
[1m1667/1667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 13ms/step - accuracy: 0.6279 - loss: 1.9973
Epoch 9/

In [37]:

def logits_to_sentence(logits, tokenizer):
    """Decode per-step scores into a space-joined string of words.

    ``logits`` is a 2-D array-like; the arg-max along axis 1 selects one
    index per row. Each index is looked up in the inverse of
    ``tokenizer.word_index``; index 0 maps to the empty string, so rows
    that select 0 still contribute a separator to the result.
    """
    # Invert the word -> index mapping and give index 0 an empty word.
    vocabulary = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    vocabulary[0] = ''

    words = []
    for token_id in np.argmax(logits, axis=1):
        words.append(vocabulary[token_id])
    return ' '.join(words)

# Spot-check one training example: show both reference sentences, then the
# model's decoded prediction for the same row.
index = 9898
print("The english sentence is: {}".format(eng_sentencs[index]))
print("The spanish sentence is: {}".format(span_sentencs[index]))
print('The predicted sentence is :')
# Slice index:index+1 keeps the batch axis; [0] takes the single result back out.
print(logits_to_sentence(enc_dec_model.predict(spa_pad_sentence[index:index+1])[0], eng_text_tokenizer))

The english sentence is: im taking off
The spanish sentence is: ich fliege los
The predicted sentence is :
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
im going go   


In [44]:
# Lower-cased ad-hoc test sentence.
s = "oye vamos a viajar hoy".lower()

In [47]:
# NOTE(review): the recorded output below this cell is None, yet the cell that
# defines s assigns a string, and the execution counts skip from 44 to 47 —
# s was presumably rebound by cells that are no longer in the notebook.
# Re-run from a fresh kernel to refresh the output.
print(s)

None
