In [1]:
import os
import pickle
import re
from string import digits

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, GRU
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

%matplotlib inline

# Experiment name; later cells use it to build the 'models/<model_name>/...' output paths
model_name = 'PORT_GRU'

2023-03-20 17:01:42.391527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-20 17:01:43.633949: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-20 17:01:43.634118: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [None]:
# Read the Portuguese word list into a single-column frame.
# Passing `names` makes pandas treat the file as header-less.
# dtype=str + keep_default_na=False keep every entry as text: with the defaults,
# entries such as "null", "nan" or "NA" are parsed as float NaN and the
# string clean-up that follows would raise on those rows.
lines = pd.read_table('br-utf8.txt', names=['words'], dtype=str, keep_default_na=False)
lines.shape

In [None]:
# Eyeball five randomly drawn rows as a sanity check on the load
lines.sample(n=5)

In [None]:
# Normalise the words with vectorised string methods (one pass each, no Python-level lambdas):
# lowercase everything and drop apostrophes (a literal match, so no regex is needed).
words_norm = lines.words.str.lower().str.replace("'", '', regex=False)

# Wrap every word in the start ('<') and end ('>') markers used by the decoder.
# Words that already carry both markers are left alone so that re-running this
# cell does not produce '<<word>>'.
already_wrapped = (
    words_norm.str.startswith('<', na=False) & words_norm.str.endswith('>', na=False)
)
lines.words = words_norm.where(already_wrapped, '<' + words_norm + '>')

lines.sample(5)

In [None]:
# Every distinct character that occurs in the (already wrapped) words
all_chars = {char for word in lines.words for char in word}
# '_' is added explicitly so it has an index even if no word in the list contains it
all_chars.add('_')

# Length of the longest word, markers included; the batch generator pads to this length
max_length_word = max((len(word) for word in lines.words), default=0)

# +1 because index 0 is never assigned to a character (it is the padding value
# that the Embedding layers mask with mask_zero=True)
num_chars = len(all_chars) + 1

# Dicts to transform chars into indices and vice-versa.
# Iterating the set directly would give a different char -> index mapping in every
# kernel session (string hashing is randomised); sorting makes the mapping reproducible.
char_token_index = {char: i for i, char in enumerate(sorted(all_chars), start=1)}
reverse_char_token_index = {i: char for char, i in char_token_index.items()}

# Persist the mapping next to the model; create the folder first so a fresh
# checkout does not fail with FileNotFoundError.
os.makedirs('models/' + model_name, exist_ok=True)
with open('models/'+model_name+'/char_to_index.pkl', 'wb') as fp:
    pickle.dump(char_token_index, fp)
    print('Dictionary saved successfully to file')

In [None]:
# Hold out 20% of the words for validation.
# The model is an autoencoder (targets == inputs), so a single array is split;
# random_state pins the shuffle so that Restart & Run All reproduces the same split.
X_train, X_test = train_test_split(lines.words, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

In [None]:
# Two extra marker words appended to the training set only
flags = pd.Series(['<start_>', '<_end>'])
# Drop any copies of the flags that are already present before appending, so
# re-running this cell does not keep growing X_train with duplicates.
X_train = pd.concat([X_train[~X_train.isin(flags)], flags], axis=0)
# Autoencoder set-up: the targets are the inputs themselves
y_train, y_test = X_train, X_test

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size=128):
    ''' Endlessly yield ([encoder_input, decoder_input], decoder_target) batches.

    X, y       -- sliceable sequences of input / target words (same length expected).
    batch_size -- maximum number of words per batch; the last batch of each pass
                  over the data holds only the remaining words instead of being
                  padded with empty rows.

    Per batch of n words:
      encoder_input  (n, max_length_word)            char indices, 0-padded
      decoder_input  (n, max_length_word)            target chars without the last one
      decoder_target (n, max_length_word, num_chars) one-hot target chars without the
                                                     first one (shifted one step left)

    Raises ValueError (on the first next()) if X is empty or batch_size < 1,
    because the loop below would otherwise spin forever without yielding.
    '''
    if len(X) == 0:
        raise ValueError('generate_batch needs at least one sample')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    while True:
        # iterate from batch to batch
        for start in range(0, len(X), batch_size):
            batch_inputs = X[start:start + batch_size]
            batch_targets = y[start:start + batch_size]
            # size the arrays to the real batch so a short final batch has no phantom rows
            n_rows = len(batch_inputs)

            encoder_input_data = np.zeros((n_rows, max_length_word), dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_word), dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_word, num_chars), dtype='float32')

            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                for t, char in enumerate(input_text):
                    encoder_input_data[i, t] = char_token_index[char]  # encoder input seq

                last_step = len(target_text) - 1
                for t, char in enumerate(target_text):
                    token = char_token_index[char]
                    if t < last_step:
                        # decoder input: everything except the final character
                        decoder_input_data[i, t] = token
                    if t > 0:
                        # decoder target: one-hot, skips the first character,
                        # offset by one timestep relative to the decoder input
                        decoder_target_data[i, t - 1, token] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
# One size for both the character embeddings and the GRU hidden state
latent_dim = 50

# Encoder: character indices -> embeddings -> GRU.
# return_state=True exposes the final hidden state, which seeds the decoder.
encoder_inputs = Input(shape=(None,))
encoder_emb = Embedding(
    input_dim=num_chars,
    output_dim=latent_dim,
    mask_zero=True,
)(encoder_inputs)
encoder_gru = GRU(units=latent_dim, return_state=True)
encoder_outputs, encoder_state = encoder_gru(encoder_emb)

In [None]:
# Decoder layers are kept in named variables because the inference decoder
# built later reuses exactly these (trained) layers.
decoder_inputs = Input(shape=(None,))
decoder_emb_layer = Embedding(input_dim=num_chars, output_dim=latent_dim, mask_zero=True)
decoder_gru = GRU(units=latent_dim, return_sequences=True, return_state=True)
# Softmax over the character vocabulary at every time step
decoder_dense = Dense(units=num_chars, activation='softmax')

# Wire them up, starting the GRU from `encoder_state`
decoder_emb = decoder_emb_layer(decoder_inputs)
decoder_outputs, _ = decoder_gru(decoder_emb, initial_state=encoder_state)
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: [`encoder_input_data`, `decoder_input_data`] -> `decoder_target_data`
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Targets are one-hot encoded, hence categorical cross-entropy
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

plot_model(model, show_shapes=True, show_layer_activations=True)

In [None]:
# Training hyper-parameters
batch_size = 64
epochs = 10

# Dataset sizes, used to derive the number of generator steps per epoch
train_samples, val_samples = len(X_train), len(X_test)

In [None]:
# The generators never terminate, so Keras has to be told how many batches make up an epoch
train_batches = generate_batch(X_train, y_train, batch_size=batch_size)
val_batches = generate_batch(X_test, y_test, batch_size=batch_size)

model.fit(
    train_batches,
    epochs=epochs,
    steps_per_epoch=train_samples // batch_size,
    validation_data=val_batches,
    validation_steps=val_samples // batch_size,
)

In [None]:
# Persist the trained encoder-decoder under the experiment's model folder
model.save(f'models/{model_name}/por_enc_dec')

In [None]:
# Print the layer-by-layer summary of the training model
model.summary()

In [None]:
# Inference encoder: maps an input sequence to its final GRU state (the "thought vector")
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_state)
plot_model(encoder_model)

In [None]:
# Inference decoder, built from the same layers as the training decoder.
# The GRU state is fed in from outside so decoding can advance one step at a time.
decoder_state_input = Input(shape=(latent_dim,))

# Embed the decoder tokens, then run the GRU starting from the supplied state
decoder_emb2 = decoder_emb_layer(decoder_inputs)
decoder_outputs2, decoder_state2 = decoder_gru(
    decoder_emb2,
    initial_state=decoder_state_input,
)

# Softmax over the character vocabulary for the next character
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Returns both the probabilities and the updated state to feed into the next step
decoder_model = Model(
    inputs=[decoder_inputs, decoder_state_input],
    outputs=[decoder_outputs2, decoder_state2],
)
plot_model(decoder_model)

In [None]:
def decode_sequence(input_seq, max_decoded_len=50):
    """Greedily decode one encoded word, character by character.

    input_seq       -- encoder input for a single sample (batch of size 1).
    max_decoded_len -- safety cap: decoding stops once the output holds more
                       than this many characters (default 50, as before).

    Returns the decoded string. It includes the closing '>' when the model
    emitted one; the leading '<' that seeds the decoder is not included.
    """
    # Encode the input into the initial decoder state
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Length-1 target sequence holding the start character
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = char_token_index['<']

    decoded_sentence = ''
    while True:
        output_tokens, states_value = decoder_model.predict([target_seq, states_value], verbose=0)

        # Greedy choice: most probable character at the last time step
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is the padding slot and has no character; looking it up
        # directly would raise KeyError, so treat it as end of output.
        sampled_char = reverse_char_token_index.get(sampled_token_index)
        if sampled_char is None:
            break
        decoded_sentence += sampled_char

        # Exit condition: either find the stop token or exceed the length cap
        if sampled_char == '>' or len(decoded_sentence) > max_decoded_len:
            break

        # Feed the sampled character back in as the next decoder input
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence

In [None]:
# Run the first held-out words through the autoencoder and compare input vs. reconstruction.
# batch_size=1 makes the generator walk X_test in order, one word per step.
test_gen = generate_batch(X_test, y_test, batch_size=1)
# Never ask for more words than the test set holds
for k in range(min(15, len(X_test))):
    (input_seq, actual_output), _ = next(test_gen)
    decoded_sentence = decode_sequence(input_seq)
    print('-----------------------------------------')
    print('Input word:', X_test.iloc[k])
    print('Expected output:', y_test.iloc[k])
    print('Predicted output:', decoded_sentence)

In [None]:
# Hand-picked probe words; for each one, report whether it is in the word list
# and what the autoencoder reconstructs from it
unknown = pd.Series(['<nintendo>', '<arretado>', '<estrombofone>','<tankar>', '<_end>'])
train_gen = generate_batch(unknown, unknown, batch_size=1)
for k, word in enumerate(unknown):
    (input_seq, actual_output), _ = next(train_gen)
    decoded_sentence = decode_sequence(input_seq)
    is_known = word in lines.words.values
    print('In set: ', is_known)
    print('Input: ', word)
    print('Output: ', decoded_sentence)
    print('-------------------------')