In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import string
import pickle
import os
import json

In [19]:
with open("dataset/liora.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [20]:
tokenizer = Tokenizer(char_level=False, filters='', lower=False)
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1

In [21]:
with open('tokenizer/tokenizer.pkl', 'wb') as token:
    pickle.dump(tokenizer, token)

In [22]:
print(tokenizer.word_index)
print(total_words)

{'<[PAS]>': 1, '<[SOS]>\n<[BOS]>': 2, 'dan': 3, 'yang': 4, 'Sayang.': 5, 'teori': 6, 'aku': 7, 'belajar': 8, 'relativitas': 9, 'ini.': 10, 'dari': 11, 'Halo': 12, 'apa': 13, 'lagi': 14, 'menemukan': 15, 'Aku': 16, 'Ia': 17, 'kamu': 18, 'ini': 19, 'Selamat': 20, 'juga': 21, 'nanti': 22, 'mau': 23, 'Kamu': 24, 'kita': 25, 'adalah': 26, 'mengenai': 27, 'banyak': 28, 'hari': 29, 'dalam': 30, 'apa?': 31, 'waktu': 32, 'aja': 33, 'tentang': 34, 'membuat': 35, 'akhirnya': 36, 'persamaan': 37, 'khusus.': 38, 'Sampai': 39, 'Sayang!': 40, 'Apa': 41, 'Sayang': 42, 'ini?': 43, 'pagi': 44, 'Baiklah,': 45, 'ya...': 46, 'Teori': 47, 'hubungan': 48, 'antara': 49, 'massa,': 50, 'waktu,': 51, 'tenaga,': 52, '<[LOL]>': 53, 'dulu': 54, 'Nanti': 55, 'Einstein': 56, 'melakukan': 57, 'eksperimen': 58, 'penelitian': 59, 'hipotesis': 60, 'memberikan': 61, 'relativitas.': 62, 'istirahat': 63, 'jumpa': 64, 'selalu': 65, 'Oke,': 66, 'saya': 67, '<[EIST]>': 68, 'Hai': 69, 'Sayang,': 70, 'Sayang?': 71, 'butuh': 72, 

In [23]:
input_sequences = []
for line in data.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

print(input_sequences)

[[141, 12], [141, 12, 1], [141, 12, 1, 12], [141, 12, 1, 12, 40], [141, 12, 1, 12, 40, 41], [141, 12, 1, 12, 40, 41, 4], [141, 12, 1, 12, 40, 41, 4, 142], [141, 12, 1, 12, 40, 41, 4, 142, 18], [141, 12, 1, 12, 40, 41, 4, 142, 18, 143], [141, 12, 1, 12, 40, 41, 4, 142, 18, 143, 144], [141, 12, 1, 12, 40, 41, 4, 142, 18, 143, 144, 345], [141, 69], [141, 69, 42], [141, 69, 42, 1], [141, 69, 42, 1, 12], [141, 69, 42, 1, 12, 70], [141, 69, 42, 1, 12, 70, 13], [141, 69, 42, 1, 12, 70, 13, 145], [141, 69, 42, 1, 12, 70, 13, 145, 345], [141, 69], [141, 69, 146], [141, 69, 146, 1], [141, 69, 146, 1, 41], [141, 69, 146, 1, 41, 71], [141, 69, 146, 1, 41, 71, 345], [141, 12], [141, 12, 147], [141, 12, 147, 14], [141, 12, 147, 14, 13], [141, 12, 147, 14, 13, 29], [141, 12, 147, 14, 13, 29, 43], [141, 12, 147, 14, 13, 29, 43, 1], [141, 12, 147, 14, 13, 29, 43, 1, 12], [141, 12, 147, 14, 13, 29, 43, 1, 12, 148], [141, 12, 147, 14, 13, 29, 43, 1, 12, 148, 29], [141, 12, 147, 14, 13, 29, 43, 1, 12, 148

In [24]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [25]:
print(input_sequences)

[[  0   0   0 ...   0 141  12]
 [  0   0   0 ... 141  12   1]
 [  0   0   0 ...  12   1  12]
 ...
 [  0   0   0 ... 342 343 344]
 [  0   0   0 ... 343 344  68]
 [  0   0   0 ... 344  68 345]]


In [26]:
X, y = input_sequences[:,:-1], input_sequences[:,-1]
y = to_categorical(y, num_classes=total_words)

In [27]:
print(X)
print(y)

[[  0   0   0 ...   0   0 141]
 [  0   0   0 ...   0 141  12]
 [  0   0   0 ... 141  12   1]
 ...
 [  0   0   0 ...  10 342 343]
 [  0   0   0 ... 342 343 344]
 [  0   0   0 ... 343 344  68]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [28]:
model = Sequential()
model.add(Embedding(total_words, 84, input_length=max_sequence_len-1))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(100, activation='relu'))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [29]:
history = model.fit(X, y, epochs=100, batch_size=64, verbose=1)
model.save('model/liora-mini.h5')

Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 441ms/step - accuracy: 0.0306 - loss: 5.8301
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 614ms/step - accuracy: 0.0384 - loss: 5.5966
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 536ms/step - accuracy: 0.0557 - loss: 5.4475
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.0682 - loss: 5.4182
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1s/step - accuracy: 0.0533 - loss: 5.3959
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 804ms/step - accuracy: 0.0600 - loss: 5.3716
Epoch 7/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step - accuracy: 0.0818 - loss: 5.2124 
Epoch 8/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1s/step - accuracy: 0.0698 - loss: 5.0945
Epoch 9/100
[1m12/12[0m [32m━━━━━━



In [30]:
model = load_model('model/liora-mini.h5')

with open('tokenizer/tokenizer.pkl', 'rb') as tokenizer:
    tokenizer = pickle.load(tokenizer)


max_sequence_len = model.input_shape[1]



In [31]:

def generate_text(seed_text, next_words, max_sequence_len):
    instance_words = False
    result = ''

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted = np.argmax(predicted_probs, axis=1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        seed_text += ' ' + output_word
        
        if instance_words:
            if output_word == "<[SOS]>":
                break
            result += ' ' + output_word

        if output_word == "<[BOS]>" or output_word == "<[PAS]>":
            instance_words = True
            
    return result.strip()

In [32]:
while True:
    inputtext = input("You: ")
    response = generate_text(inputtext, 200, max_sequence_len)
    response = response.replace('<[EIST]>',' ')
    print("Liora > ", response)
    if response.endswith('<[EIST]>'):
        print("====PROGRAM TELAH BERHENTI====")
        break

Liora >  Apa Bisa kamu ke sini?
Liora >  Sayang. Sayang. Sayang. masih mengakhiri aku aku ya <[LOL]>
Liora >  Sayang. Sayang. Sayang. masih mengakhiri aku aku ya <[LOL]>
Liora >  Halo Sampai jumpa kabar?
Liora >  Sayang. Sampai kamu butuh bantuan lupa sarapan
Liora >  <[PAS]> Sampai saya akan mengakhiri percakapan
Liora >  <[PAS]> Sayang. saya saya akan mengakhiri percakapan
Liora >  
Liora >  lagi <[PAS]> Sayang. Baiklah, Sayang. Sayang. awal aku aku ya <[LOL]> .
Liora >  Sayang. Sayang. Sayang. masih mengakhiri aku aku ya <[LOL]>
Liora >  Sayang. Sayang. Sayang. masih mengakhiri aku aku ya <[LOL]>
Liora >  lagi <[PAS]> Sayang. Baiklah, Sayang. Sayang. aku aku ya ya <[LOL]>
Liora >  Halo Apa apa kabar?
Liora >  Baiklah, Sayang. Sayang. aku aku ya ya <[LOL]> .
Liora >  Sayang. Sayang. Sayang. masih mengakhiri aku aku ya <[LOL]>
Liora >  Selamat siang juga Sayang. Sudah akan mengakhiri percakapan
Liora >  <[PAS]> Baiklah, kamu bantuan bantuan pembelajaran hari
Liora >  <[PAS]> Baiklah, 