In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, SimpleRNN, Embedding, Dense, Attention
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

# Load your dataset
base_dir = 'datasets/mt/'
train_data = pd.read_csv(base_dir + 'train.csv')
val_data = pd.read_csv(base_dir + 'valid.csv')
test_data = pd.read_csv(base_dir + 'test.csv')
# Ensure your dataset has 'source' and 'target' columns
train_source_texts = train_data['indonesian'].values
train_target_texts = train_data['english'].values

val_source_texts = val_data['indonesian'].values
val_target_texts = val_data['english'].values

test_source_texts = test_data['indonesian'].values
test_target_texts = test_data['english'].values

2024-11-02 16:06:10.906623: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-02 16:06:11.304843: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-02 16:06:11.304943: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-02 16:06:11.376623: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-02 16:06:11.519201: I tensorflow/core/platform/cpu_feature_guar

In [2]:
!pip install tensorflow==2.15.0

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting numpy<2.0.0,>=1.23.5 (from tensorflow==2.15.0)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.15.0)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15.0)
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collect

In [2]:
import tensorflow as tf
print(tf.sysconfig.get_build_info()['cuda_version'])
print(tf.sysconfig.get_build_info()['cudnn_version'])
print(tf.config.list_physical_devices('GPU'))
print(tf.test.is_built_with_cuda())

12.2
8
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
True


In [3]:
tf.sysconfig.get_build_info() 

OrderedDict([('cpu_compiler', '/usr/lib/llvm-17/bin/clang'),
             ('cuda_compute_capabilities',
              ['sm_50', 'sm_60', 'sm_70', 'sm_75', 'compute_80']),
             ('cuda_version', '12.2'),
             ('cudnn_version', '8'),
             ('is_cuda_build', True),
             ('is_rocm_build', False),
             ('is_tensorrt_build', True)])

In [4]:
print(tf.__version__)

2.15.0


In [7]:
# Tokenization (Source and Target)
source_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()

source_tokenizer.fit_on_texts(train_source_texts)
target_tokenizer.fit_on_texts(train_target_texts)

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Convert texts to sequences
train_source_sequences = source_tokenizer.texts_to_sequences(train_source_texts)
train_target_sequences = target_tokenizer.texts_to_sequences(train_target_texts)

val_source_sequences = source_tokenizer.texts_to_sequences(val_source_texts)
val_target_sequences = target_tokenizer.texts_to_sequences(val_target_texts)

test_source_sequences = source_tokenizer.texts_to_sequences(test_source_texts)
test_target_sequences = target_tokenizer.texts_to_sequences(test_target_texts)

In [8]:
print(len(train_target_sequences[1]))
print(len(val_target_sequences[0]))

20
19


In [9]:
# Pad sequences to ensure uniform input size
max_source_length = 77
max_target_length = 77   #to include EOS or shifted tokens
X_train_source = pad_sequences(train_source_sequences, maxlen=max_source_length, padding='post')
X_train_target = pad_sequences(train_target_sequences, maxlen=max_target_length, padding='post')

X_val_source = pad_sequences(val_source_sequences, maxlen=max_source_length, padding='post')
X_val_target = pad_sequences(val_target_sequences, maxlen=max_target_length, padding='post')

X_test_source = pad_sequences(test_source_sequences, maxlen=max_source_length, padding='post')
X_test_target = pad_sequences(test_target_sequences, maxlen=max_target_length, padding='post')


In [10]:
# Shift target sequences for teacher forcing
train_target_sequences_shifted = np.zeros_like(X_train_target)
train_target_sequences_shifted[:, :-1] = X_train_target[:, 1:]

# One-hot encode target sequences
y_train = X_train_target[:, 1:]
y_train = to_categorical(train_target_sequences_shifted, num_classes=target_vocab_size)

val_target_sequences_shifted = np.zeros_like(X_val_target)
val_target_sequences_shifted[:, :-1] = X_val_target[:, 1:]

y_val = X_val_target[:, 1:]  # Exclude the first token
y_val = to_categorical(val_target_sequences_shifted, num_classes=target_vocab_size)

In [11]:
# Check dimensions
print(f'X_train_source shape: {X_train_source.shape}')
print(f'X_val_source shape: {X_val_source.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_val shape: {y_val.shape}')

X_train_source shape: (500, 77)
X_val_source shape: (100, 77)
y_train shape: (500, 77, 2863)
y_val shape: (100, 77, 2863)


In [12]:
import numpy as np
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Model

# Hyperparameters
EMBEDDING_SIZE = 256       # Size of the embedding layer
RNN_UNITS = 256            # Number of RNN units
max_source_length = 77    # Max length of source sequences
max_target_length = 77    # Max length of target sequences

# Encoder Model
encoder_inputs = Input(shape=(max_source_length,))
encoder_embedding_layer = Embedding(source_vocab_size, EMBEDDING_SIZE)(encoder_inputs)
encoder = SimpleRNN(RNN_UNITS, return_state=True)
encoder_outputs, h = encoder(encoder_embedding_layer)
encoder_states = h

# Decoder Model
decoder_inputs = Input(shape=(1,))  # Input shape for a single time step
decoder_embedding_layer = Embedding(target_vocab_size, EMBEDDING_SIZE)(decoder_inputs)
decoder = SimpleRNN(RNN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder(decoder_embedding_layer, initial_state=encoder_states)

# Output Layer
output_dense_layer = Dense(target_vocab_size, activation='softmax')
outputs = output_dense_layer(decoder_outputs)

# Complete Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])


2024-11-02 16:24:14.012085: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-02 16:24:14.012391: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-11-02 16:24:14.012604: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [13]:
print(decoder_outputs.shape)
print(outputs.shape)

(None, 1, 256)
(None, 1, 2863)


In [14]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Add model checkpoint to save the best model
model_checkpoint = ModelCheckpoint(
    'mt_best_val_accuracy_{val_accuracy:.4f}.keras',  # File path to save the model
    monitor='val_accuracy',  # Monitor validation loss
    save_best_only=True,  # Save only the best model
    mode='max',  # Minimize validation loss
    verbose=1  # Print a message when the model is saved
)

: 

In [None]:
# Fit the model with the ModelCheckpoint callback
history = model.fit([X_train_source, X_train_target], y_train,
                    epochs=50, batch_size=64,
                    validation_data=([X_val_source, X_val_target], y_val),
                    callbacks=[model_checkpoint])

Epoch 1/50


In [None]:
import matplotlib.pyplot as plt

# Retrieve accuracy and loss from the history object
history_dict = history.history
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(accuracy) + 1)

# Plot training and validation accuracy
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs, accuracy, 'bo-', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'ro-', label='Validation accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot training and validation loss
plt.subplot(1, 2, 2)
plt.plot(epochs, loss, 'bo-', label='Training loss')
plt.plot(epochs, val_loss, 'ro-', label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Show the plots
plt.tight_layout()
plt.show()


In [None]:
!pip install nltk

In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import corpus_bleu

# Load the trained model
model = load_model('mt_best_val_accuracy_0.7217.keras')

# Generate predictions
predicted_probabilities = model.predict([X_val_source, X_val_target])
predicted_sequences = np.argmax(predicted_probabilities, axis=-1)

# Convert predictions and references to lists of sentences
predicted_sentences = []
reference_sentences = []

for i in range(len(predicted_sequences)):
    predicted_sent = predicted_sequences[i]
    reference_sent = X_test_target[i]

    # Remove padding (assuming padding is done with a specific token, e.g., 0)
    predicted_sent = [word for word in predicted_sent if word != 0]
    reference_sent = [word for word in reference_sent if word != 0]

    predicted_sentences.append(predicted_sent)
    reference_sentences.append([reference_sent])  # NLTK expects a list of references

# Calculate BLEU score
bleu_score = corpus_bleu(reference_sentences, predicted_sentences)
print(f'BLEU Score: {bleu_score:.4f}')


In [None]:
# Evaluation on the test set
test_target_sequences_shifted = np.zeros_like(X_test_target)
test_target_sequences_shifted[:, :-1] = X_test_target[:, 1:]
y_test = to_categorical(test_target_sequences_shifted, num_classes=target_vocab_size)
test_loss, test_accuracy = model.evaluate([X_test_source, X_test_target], y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
word2idx_inputs = source_tokenizer.word_index
word2idx_outputs = target_tokenizer.word_index

# Check the current max index in word2idx_outputs
max_index = max(word2idx_outputs.values())

# Add the <sos> token with the next available index
word2idx_outputs['<sos>'] = max_index + 1

# Optional: Ensure idx2word_target is updated accordingly
idx2word_target = {v: k for k, v in word2idx_outputs.items()}


idx2word_input = {v:k for k, v in word2idx_inputs.items()}

In [None]:
word2idx_outputs

In [None]:
import numpy as np
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Model

# Hyperparameters (same as training)
EMBEDDING_SIZE = 256       # Size of the embedding layer
RNN_UNITS = 256            # Number of RNN units
max_source_length = 77      # Max length of source sequences
max_target_length = 77      # Max length of target sequences

# --- Encoder Inference Model ---

# Encoder inputs (same shape as training)
encoder_inputs = Input(shape=(max_source_length,))
encoder_embedding_layer = Embedding(source_vocab_size, EMBEDDING_SIZE)(encoder_inputs)
encoder_rnn = SimpleRNN(RNN_UNITS, return_state=True)

# Get encoder's hidden state
encoder_outputs, encoder_state_h = encoder_rnn(encoder_embedding_layer)
encoder_model = Model(encoder_inputs, encoder_state_h)  # Encoder model returns hidden state

# --- Decoder Inference Model ---

# Decoder inputs (single time step input for inference)
decoder_inputs_single = Input(shape=(1,))
decoder_state_input_h = Input(shape=(RNN_UNITS,))  # Hidden state input for inference

# Embedding layer
decoder_embedding_layer = Embedding(target_vocab_size, EMBEDDING_SIZE)(decoder_inputs_single)

# RNN layer (with previous hidden state as initial state)
decoder_rnn = SimpleRNN(RNN_UNITS, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_h = decoder_rnn(decoder_embedding_layer, initial_state=decoder_state_input_h)

# Dense softmax layer to predict the next token
output_dense_layer = Dense(target_vocab_size, activation='softmax')
decoder_outputs = output_dense_layer(decoder_outputs)

# Define the decoder inference model, which outputs predicted token and new hidden state
decoder_model = Model(
    [decoder_inputs_single, decoder_state_input_h],
    [decoder_outputs, decoder_state_h]
)

# --- Function to Generate Translations ---

def translate_sentence(input_seq):
    # Encode the input sequence to get the initial hidden state
    state_value = encoder_model.predict(input_seq)

    # Initialize the target sequence with the start token <sos>
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<sos>']  # Or another start token

    output_sentence = []
    stop_condition = False
    while not stop_condition:
        # Predict the next token and the next hidden state
        output_tokens, h = decoder_model.predict([target_seq, state_value])

        # Get the index of the predicted token
        predicted_token_index = np.argmax(output_tokens[0, -1, :])
        predicted_word = idx2word_target.get(predicted_token_index, '')

        # Append predicted word to the output sentence
        if predicted_word == '<eos>' or len(output_sentence) >= max_target_length:
            stop_condition = True
        else:
            output_sentence.append(predicted_word)

        # Update the target sequence (current predicted token becomes next input)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = predicted_token_index

        # Update the hidden state for the next time step
        state_value = h

    return ' '.join(output_sentence)

# Test the translation with an example input sentence
input_seq = X_test_source[0:1]  # Example input
translated_sentence = translate_sentence(input_seq)
# Assuming X_test_source is a numpy array or a list containing your test sentences
# You may want to iterate through the input sequences and print them

for i, input_sentence in enumerate(input_seq):
    # If the input is tokenized (numerical indices), you may want to reverse it to words
    input_sentence_words = ' '.join(idx2word_input.get(token, '') for token in input_sentence if token != 0)  # Assuming 0 is padding
    print(f'Input Sentence {i + 1}: {input_sentence_words}')

print('Predicted Translation:', translated_sentence)
