In [37]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, SimpleRNN, Embedding, Dense, Attention
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers

# Load your dataset
train_data = pd.read_csv('datasets/mt/train.csv')
val_data = pd.read_csv('datasets/mt/valid.csv')
test_data = pd.read_csv('datasets/mt/test.csv')
# Ensure your dataset has 'source' and 'target' columns
train_source_texts = train_data['indonesian'].values
train_target_texts = train_data['english'].values

val_source_texts = val_data['indonesian'].values
val_target_texts = val_data['english'].values

test_source_texts = test_data['indonesian'].values
test_target_texts = test_data['english'].values

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [2]:
# Tokenization (Source and Target)
source_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()

source_tokenizer.fit_on_texts(train_source_texts)
target_tokenizer.fit_on_texts(train_target_texts)

source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Convert texts to sequences
train_source_sequences = source_tokenizer.texts_to_sequences(train_source_texts)
train_target_sequences = target_tokenizer.texts_to_sequences(train_target_texts)

val_source_sequences = source_tokenizer.texts_to_sequences(val_source_texts)
val_target_sequences = target_tokenizer.texts_to_sequences(val_target_texts)

test_source_sequences = source_tokenizer.texts_to_sequences(test_source_texts)
test_target_sequences = target_tokenizer.texts_to_sequences(test_target_texts)

In [3]:
print(len(train_target_sequences[1]))
print(len(val_target_sequences[0]))

20
19


In [4]:
# Pad sequences to ensure uniform input size
max_source_length = 77
max_target_length = 77   #to include EOS or shifted tokens
X_train_source = pad_sequences(train_source_sequences, maxlen=max_source_length, padding='post')
X_train_target = pad_sequences(train_target_sequences, maxlen=max_target_length, padding='post')

X_val_source = pad_sequences(val_source_sequences, maxlen=max_source_length, padding='post')
X_val_target = pad_sequences(val_target_sequences, maxlen=max_target_length, padding='post')

X_test_source = pad_sequences(test_source_sequences, maxlen=max_source_length, padding='post')
X_test_target = pad_sequences(test_target_sequences, maxlen=max_target_length, padding='post')


In [5]:
# Shift target sequences for teacher forcing
train_target_sequences_shifted = np.zeros_like(X_train_target)
train_target_sequences_shifted[:, :-1] = X_train_target[:, 1:]

# One-hot encode target sequences
y_train = X_train_target[:, 1:]
y_train = to_categorical(train_target_sequences_shifted, num_classes=target_vocab_size)

val_target_sequences_shifted = np.zeros_like(X_val_target)
val_target_sequences_shifted[:, :-1] = X_val_target[:, 1:]

y_val = X_val_target[:, 1:]  # Exclude the first token
y_val = to_categorical(val_target_sequences_shifted, num_classes=target_vocab_size)

In [6]:
# Check dimensions
print(f'X_train_source shape: {X_train_source.shape}')
print(f'X_val_source shape: {X_val_source.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_val shape: {y_val.shape}')

X_train_source shape: (500, 77)
X_val_source shape: (100, 77)
y_train shape: (500, 77, 2863)
y_val shape: (100, 77, 2863)


In [7]:
NUM_SENTENCES = 20000 # Use only the first 20,000 records.
MAX_NUM_WORDS = 20000 # Use 20,000 words for tokenizing
MAX_SENT_LEN = 50

EMBEDDING_SIZE = 100

LSTM_NEURONS = 100

BATCH_SIZE = 64
EPOCHS = 5

In [8]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

encoder_embedding_layer = Embedding(source_vocab_size, EMBEDDING_SIZE, input_length=target_vocab_size)
decoder_embedding_layer = Embedding(target_vocab_size, LSTM_NEURONS)

encoder_inputs = Input(shape=(max_source_length,))
encoder_inputs_emb = encoder_embedding_layer(encoder_inputs)
encoder = LSTM(LSTM_NEURONS, return_state=True)
encoder_outputs, h, c = encoder(encoder_inputs_emb)
encoder_states = [h, c]

decoder_inputs = Input(shape=(max_target_length,))
decoder_inputs_emb = decoder_embedding_layer(decoder_inputs)
decoder = LSTM(LSTM_NEURONS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_inputs_emb, initial_state=encoder_states)

output_dense_layer = Dense(target_vocab_size, activation='softmax')
outputs = output_dense_layer(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])



In [9]:
print(decoder_outputs.shape)
print(outputs.shape)

(None, 77, 100)
(None, 77, 2863)


In [10]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Add model checkpoint to save the best model
model_checkpoint = ModelCheckpoint(
    'mt_best_val_accuracy_{val_accuracy:.4f}.keras',  # File path to save the model
    monitor='val_accuracy',  # Monitor validation loss
    save_best_only=True,  # Save only the best model
    mode='max',  # Minimize validation loss
    verbose=1  # Print a message when the model is saved
)

In [None]:
# Fit the model with the ModelCheckpoint callback
history = model.fit([X_train_source, X_train_target], y_train,
                    epochs=50, batch_size=64,
                    validation_data=([X_val_source, X_val_target], y_val),
                    callbacks=[model_checkpoint])

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 883ms/step - accuracy: 0.4200 - loss: 7.7159
Epoch 1: val_accuracy improved from -inf to 0.70753, saving model to mt_best_val_accuracy_0.7075.keras
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.4347 - loss: 7.6763 - val_accuracy: 0.7075 - val_loss: 5.9281
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974ms/step - accuracy: 0.6387 - loss: 5.6703
Epoch 2: val_accuracy did not improve from 0.70753
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1s/step - accuracy: 0.6386 - loss: 5.6281 - val_accuracy: 0.7075 - val_loss: 4.0480
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 751ms/step - accuracy: 0.6361 - loss: 4.1612
Epoch 3: val_accuracy did not improve from 0.70753
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 843ms/step - accuracy: 0.6362 - loss: 4.1394 - val_accuracy: 0.7075 - val_loss: 3.0957

In [None]:
# Evaluation on the test set
y_test = to_categorical(X_test_target[:, 1:], num_classes=target_vocab_size)
test_loss, test_accuracy = model.evaluate([X_test_source, X_test_target], y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')