In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
# Last expression of the cell: display the installed TensorFlow version.
tf.__version__

'2.15.0'

In [None]:
# Fetch the dataset file from Google Drive by file id using the gdown CLI.
!gdown 1hRsS4M9qQXrAMj53KFLiqUnpw1km-_2a

Downloading...
From: https://drive.google.com/uc?id=1hRsS4M9qQXrAMj53KFLiqUnpw1km-_2a
To: /content/eng-indo-augmented.txt
  0% 0.00/1.15M [00:00<?, ?B/s]100% 1.15M/1.15M [00:00<00:00, 106MB/s]


In [None]:
# Helper for loading the raw text document
def load_doc(url):
  """Read a headerless, tab-separated text file into a DataFrame.

  Args:
    url: Path, URL or file-like object accepted by ``pandas.read_csv``.

  Returns:
    A ``pandas.DataFrame`` with integer column labels (0, 1, ...), because
    the file is parsed without a header row.
  """
  return pd.read_csv(url, sep="\t", header=None)

In [None]:
 # Load document txt
# doc = load_doc("dataset/ind.txt")
# Read the corpus, keep only its first two columns and name them by language.
doc = load_doc("eng-indo-augmented.txt").iloc[:, :2]
doc.columns = ["english", "indonesian"]
# Preview the first rows.
doc.head(20)

Unnamed: 0,english,indonesian
0,run !,lari !
1,who ?,siapa ?
2,wow !,wow !
3,help !,tolong !
4,jump !,lompat !
5,jump .,loncat .
6,stop !,berhenti !
7,wait !,tunggu !
8,wait .,tunggu .
9,hurry !,cepatlah !


In [None]:
# Take at most 6500 sentence pairs from the dataset to avoid a crash
# (the reason given in the original note).
# random_state pins the sample so that everything derived from it (tokenizer
# vocabulary, padded shapes, train/test split) is reproducible across kernel
# restarts; min() avoids the ValueError that DataFrame.sample raises when more
# rows are requested than the frame contains.
doc = doc.sample(n=min(6500, len(doc)), random_state=42)

In [None]:
# One Keras word-level tokenizer per language (Indonesian first, then English),
# both with the library's default settings.
tokenizer_id, tokenizer_en = (
    tf.keras.preprocessing.text.Tokenizer(),
    tf.keras.preprocessing.text.Tokenizer(),
)

In [None]:
# Build each tokenizer's vocabulary from its own language column.
tokenizer_en.fit_on_texts(doc["english"])
tokenizer_id.fit_on_texts(doc["indonesian"])

In [None]:
# Convert every sentence into its list of integer word indices and store the
# result next to the text, one new column per language.
doc["id_indices"] = tokenizer_id.texts_to_sequences(doc["indonesian"])
doc["en_indices"] = tokenizer_en.texts_to_sequences(doc["english"])

In [None]:
# Preview the frame again, now including the id_indices / en_indices columns.
doc.head(20)

Unnamed: 0,english,indonesian,id_indices,en_indices
11668,ken started to descend the stairs .,ken mulai menuruni tangganya .,"[29, 200, 343, 1204]","[39, 426, 1, 1169, 3, 228]"
3711,i was born on march .,saya lahir pada tanggal maret .,"[15, 971, 51, 1464, 1876]","[2, 13, 988, 30, 1711]"
7682,michael must be asleep .,michael pasti tertidur .,"[41, 201, 1205]","[50, 169, 23, 1170]"
1167,this is my brother .,ini adikku .,"[9, 820]","[24, 6, 58, 311]"
7151,maria was injured .,maria telah terluka .,"[42, 60, 281]","[51, 13, 520]"
12860,vijay went splashing through the shallows .,vijay pergi melewati sungai yang dangkal denga...,"[48, 21, 366, 384, 4, 972, 11, 170, 344]","[56, 119, 989, 359, 3, 990]"
8087,darren speaks very fast .,darren berbicara dengan sangat cepat .,"[30, 70, 11, 49, 209]","[40, 403, 78, 298]"
6884,mia is tense .,mia tidak tenang .,"[28, 1, 714]","[37, 6, 991]"
6594,due to my lumbago i won t be able to attend ko...,karena sakit punggungku kambuh aku tidak bisa ...,"[171, 163, 1877, 1878, 3, 1, 24, 1206, 1879, 1...","[1712, 1, 58, 1713, 2, 187, 7, 23, 258, 1, 139..."
9552,i d like to speak with mommy .,saya ingin berbicara dengan ibu .,"[15, 18, 70, 11, 26]","[2, 94, 34, 1, 154, 35, 32]"


In [None]:
# Padding: append zeros after each index list ("post") so that all sequences of
# a language end up with the same length.
padded_id_indices, padded_en_indices = (
    tf.keras.preprocessing.sequence.pad_sequences(doc[column], padding="post")
    for column in ("id_indices", "en_indices")
)

In [None]:
# Inspect the shape of the padded Indonesian index matrix.
padded_id_indices.shape

(6500, 25)

In [None]:
# Inspect the shape of the padded English index matrix.
padded_en_indices.shape

(6500, 24)

In [None]:
# One-hot encode the padded English targets for the categorical cross-entropy loss.
# num_classes is pinned to the English vocabulary size + 1 (the extra slot is
# index 0, used for padding) instead of being inferred from the largest index
# present in the data, so the one-hot depth always matches an output layer
# sized from the tokenizer vocabulary.
binarized_en_indices = tf.keras.utils.to_categorical(
    padded_en_indices, num_classes=len(tokenizer_en.word_index) + 1
)
binarized_en_indices.shape

(6500, 25, 2577)

In [None]:
# Wrap inputs (padded Indonesian indices) and targets (one-hot English) as
# tf.data datasets that yield one sentence per element.
id_ds, en_ds = (
    tf.data.Dataset.from_tensor_slices(array)
    for array in (padded_id_indices, binarized_en_indices)
)

In [None]:
# Pair each Indonesian input with its English target as (input, target) elements.
tf_ds = tf.data.Dataset.zip((id_ds, en_ds))

In [None]:
# Look at the first (input, target) pair to sanity-check the pipeline.
id_tensor, en_tensor = next(iter(tf_ds))
print(id_tensor, en_tensor, sep="\n")

tf.Tensor(
[339 135  10   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0], shape=(19,), dtype=int32)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]], shape=(25, 2577), dtype=float32)


In [None]:
# Batch size used when the train/test datasets are batched.
BATCH_SIZE = 32
# Vocabulary size per language = number of distinct words each tokenizer indexed.
vocab_size_id, vocab_size_en = (
    len(tokenizer.word_index) for tokenizer in (tokenizer_id, tokenizer_en)
)

In [None]:
# Train/test split: the first 70% of the elements train the model, the rest
# are held out for validation.
TAKE_SIZE = int(0.7*len(doc))

# Each subset is shuffled with a buffer as large as the subset itself and then
# grouped into batches of BATCH_SIZE.
train_data = tf_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(BATCH_SIZE)
test_data = tf_ds.skip(TAKE_SIZE).shuffle(len(doc) - TAKE_SIZE).batch(BATCH_SIZE)

In [None]:
# Create the model: an encoder-decoder stack assembled layer by layer.
model = tf.keras.Sequential()

# Embedding of the Indonesian token ids; mask_zero=True masks the 0 padding index.
model.add(tf.keras.layers.Embedding(vocab_size_id+1, 64, mask_zero=True))

# First bidirectional LSTM keeps the full sequence for the layer stacked on top.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))

# Second bidirectional LSTM returns only its final output (one vector per sentence).
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=False)))

# Repeat that vector once per target time step (length of the one-hot English targets).
model.add(tf.keras.layers.RepeatVector(binarized_en_indices.shape[1]))

# Decoder LSTM emitting one hidden state per target time step.
model.add(tf.keras.layers.LSTM(32, return_sequences=True))

# Per-time-step softmax over the English vocabulary (+1 for index 0).
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size_en+1, activation="softmax")))

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          191424    
                                                                 
 bidirectional (Bidirection  (None, None, 128)         66048     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 repeat_vector (RepeatVecto  (None, 24, 128)           0         
 r)                                                              
                                                                 
 lstm_2 (LSTM)               (None, 24, 32)            20608     
                                                        

In [None]:
# "Random" prediction to test our model
# Pull one training batch and run the model on it (before the training cell
# further down) to check that the prediction shape lines up with the target shape.
input_text, output_text = next(iter(train_data))
print(input_text.numpy().shape)
print(model.predict(input_text).shape)
print(output_text.numpy().shape)

(32, 25)
(32, 24, 2575)
(32, 24, 2575)


In [None]:
# Compile with a default-configured Adam optimizer; the targets are one-hot, so
# the categorical (not sparse) cross-entropy loss and accuracy metric are used.
optimizer = tf.keras.optimizers.Adam()

model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
    optimizer=optimizer,
)

In [None]:
# from keras import callbacks

# early_stop = callbacks.EarlyStopping(
#     monitor="val_loss",
#     mode="min",
#     patience=3,
#     restore_best_weights=True
# )

In [None]:
# Train for 200 epochs, evaluating on the held-out batches after every epoch.
# No callbacks are passed and verbosity is left at its default.
history = model.fit(
    train_data,
    epochs=200,
    validation_data=test_data,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200


In [None]:
# Testing a translation: grab a single held-out batch and predict on it.
input_text, translation = next(iter(test_data.take(1)))
pred = model.predict(input_text)



In [None]:
# Turn the Indonesian input indices of the test batch back into sentences.
tokenizer_id.sequences_to_texts(input_text.numpy())

['mata ayah merah dan ia terlihat sangat lelah',
 'apa david tahu apa yang harus ia lakukan',
 'polisi melihat pelaku kriminal itu melarikan diri',
 'aku sudah tidak lagi mempercayai dia',
 'kami tidak mengerti kenapa tom melakukan apa yang dilakukannya',
 'anda punya apa',
 'bisakah saya meminjam mobil anda sebentar',
 'carrie sampai tidak sanggup melihat ke arah mary',
 'apa masalahmu',
 'vijay benar benar kelelahan',
 'jenis makanan apa yang harus ku bawa',
 'kim sampai di rumah lewat tengah malam',
 'nancy tidak tahu cara berbicara bahasa perancis',
 'kurasa mia akan segera kembali',
 'aku pikir kita akan baik baik saja',
 'payung ini berapa harganya',
 'lee tidak lemah',
 'kamu tahu semua yang kulakukan john',
 'kau terlambat untuk bekerja',
 'angin sedang berhembus dari utara',
 'tom tidak akan duduk',
 'akan kutunggu di halte bus',
 'maria masih ada waktu',
 'aku akan membawanya kembali',
 'ibu masih tidak mengangkat teleponnya',
 'sarah mendorong kursi roda melewati lorong',
 '

In [None]:
# Reference translations: take the position of the 1 in each one-hot target
# row to recover the word index, then map the indices back to English words.
tokenizer_en.sequences_to_texts(np.argmax(translation.numpy(), axis=-1))

['daddy s eyes are red and he looks very tired',
 'did david know what to do',
 'the police caught sight of the criminal running away',
 'i don t trust him any longer',
 'we re not entirely sure why tom did what he did',
 'what do you have',
 'can i use your car for a little while',
 'carrie couldn t even look at mary',
 'what is your problem',
 'vijay was all worn out',
 'what kind of food do you want me to bring',
 'kim got home after midnight',
 'nancy doesn t know how to speak french',
 'i think mia will return soon',
 'i think we ll be ok',
 'how much is this umbrella',
 'lee s not weak',
 'you know everything i do john',
 'you were late for work',
 'the wind is blowing from the north',
 'tom wouldn t sit down',
 'i ll wait at the bus stop',
 'maria still has some time',
 'i ll bring it back',
 'mommy is still not answering his phone',
 'sarah pushed the wheelchair down the hall',
 'you may choose whichever book you like',
 'eat something',
 'you are tired aren t you',
 'carrie re

In [None]:
# Model translations: pick the most probable word index at every time step and
# map the indices back to English words.
tokenizer_en.sequences_to_texts(np.argmax(pred, axis=-1))

['lim s i s t s been been very today',
 'we could what what do do',
 'that has been no quieted three mother years',
 'i shouldn t like any any longer',
 'we may to entirely sure he he he he did it',
 'are you you',
 'have you to to to go yourself',
 'darren shouldn t where look mary mary',
 'is re happy',
 'lim was completely worn out',
 'she was a a a s s s eat',
 'david got at the midnight',
 'carrie doesn t know that to here be',
 'i think i will return soon',
 'i don t think i about about leave',
 'what does lim might possibly happen',
 'lim isn t weak',
 'you know i do do that',
 'can to be fight alone',
 'lee is broke at water',
 'tom won t be be',
 'i d to to the do do do the the the help',
 'john is is in time',
 'i m the back',
 'michael is not not answering his phone',
 'tom pushed the wheelchair through the hall',
 'don you you you you look about tom',
 'where s to',
 'you ve afraid tom t you',
 'darren received a than mary mary',
 'what s the matter',
 'i saw up in the hall

In [None]:
# Save the trained model twice: once to a single .keras file and once to a
# directory (path without extension), then zip that directory so it can be
# downloaded as one file.
model.save('machine_translation_k1.keras')
model.save('machine_translation_k1')
!zip -r 'machine_translation_k1.zip' 'machine_translation_k1'

  adding: machine_translation_k1/ (stored 0%)
  adding: machine_translation_k1/variables/ (stored 0%)
  adding: machine_translation_k1/variables/variables.index (deflated 68%)
  adding: machine_translation_k1/variables/variables.data-00000-of-00001 (deflated 10%)
  adding: machine_translation_k1/keras_metadata.pb (deflated 93%)
  adding: machine_translation_k1/saved_model.pb (deflated 90%)
  adding: machine_translation_k1/assets/ (stored 0%)
  adding: machine_translation_k1/fingerprint.pb (stored 0%)


In [None]:
import pickle

# Persist only the per-epoch metrics dict (history.history). Pickling the
# History callback object itself also serializes the model it keeps a reference
# to, which bloats the file and makes loading it depend on a compatible
# TensorFlow/Keras installation. The plain dict loads with pickle alone.
with open('history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

In [None]:
from google.colab import files

In [None]:
# Download the saved artifacts (both model exports and the pickled history)
# through the Colab files helper.
files.download('machine_translation_k1.keras')

files.download('machine_translation_k1.zip')

files.download('history.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the previously saved model
!gdown 1HdpNKrbgxNbv5VbCxmlIvesiPqRrPkd-

Downloading...
From: https://drive.google.com/uc?id=1HdpNKrbgxNbv5VbCxmlIvesiPqRrPkd-
To: /content/machine_translation_k1.keras
100% 5.62M/5.62M [00:00<00:00, 22.0MB/s]


In [None]:
# Download history.pkl (the pickled training history saved earlier in this notebook)
!gdown 1TozLqc1I1-1JLy89QzieDgX1sdiAXKq_

Downloading...
From: https://drive.google.com/uc?id=1TozLqc1I1-1JLy89QzieDgX1sdiAXKq_
To: /content/history.pkl
100% 5.62M/5.62M [00:00<00:00, 17.0MB/s]


In [None]:
# Reload the trained model from the .keras file fetched by the gdown cell above.
# The original statement stopped at `tf.` and could not run.
model = tf.keras.models.load_model('machine_translation_k1.keras')

AttributeError: ignored