In [1]:
import pandas as pd
human_data = pd.read_table("data/human_text_indo.txt", header=None)
human_data.columns=["human"]
robot_data = pd.read_table("data/robot_text_indo.txt", header=None)
robot_data.columns=["robot"]

In [18]:
data = {'human':human_data.human, 'robot':robot_data.robot}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,human,robot
0,[mulai],Hai apa kabar ! ? 😁😁
1,Oh terima kasih ! saya baik-baik saja. ini ada...,ini sudah sore!
2,bagaimana perasaanmu hari ini ? ceritakan kepa...,"nama saya rdany, tetapi Anda bisa memanggil sa..."
3,berapa banyak teman virtual yang kamu punya?,saya punya banyak ! tetapi tidak cukup untuk s...
4,apakah itu dilarang bagi Anda untuk memberi ta...,"saya telah berbicara dengan 143 pengguna, meng..."


In [19]:
# mengganti kata didalam kurung siku [...] menjadi kata hai
import re
df.human = df.human.apply(lambda x : re.sub(r"\[w+\]","hai", x))
df.robot = df.robot.apply(lambda x : re.sub(r"\[w+\]","hai", x))

In [20]:
#koversi ke huruf kecil (Casefolding)
df.human = df.human.apply(lambda x: x.lower())
df.robot = df.robot.apply(lambda x: x.lower())

In [21]:
#hapus tanda baca (stopword removal)
import string
exclude = set(string.punctuation)
df.human = df.human.apply(lambda x : ''.join(ch for ch in x if ch not in exclude))
df.robot = df.robot.apply(lambda x : ''.join(ch for ch in x if ch not in exclude))

In [22]:
#hapus angka
remove_digits = str.maketrans('','',string.digits)
df.human = df.human.apply(lambda x: x.translate(remove_digits))
df.robot = df.robot.apply(lambda x: x.translate(remove_digits))

In [23]:
#hapus emoticon
df.human = df.human.apply(lambda x: x.encode('ascii','ignore').decode('ascii'))
df.robot = df.robot.apply(lambda x: x.encode('ascii','ignore').decode('ascii'))

In [24]:
#split data
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.1)

In [25]:
df_train.shape

(2126, 2)

In [26]:
df_test.shape

(237, 2)

In [27]:
#buat vocab dari data training
vocabulary = set ()
for idx, row in df_train.iterrows():
    sent = row.human + ' '+row.robot
    [vocabulary.update(sent.split())]


print(f"Ukuran Vocab : {len(vocabulary)}")    

Ukuran Vocab : 3481


In [28]:
all_vocab = []

for idx, row in df_train.iterrows():
    sent = row.human + ' ' +row.robot
    [all_vocab.append(i) for i in sent.split()]

print(f"Jumlah Semua Token : {len(all_vocab)}")

Jumlah Semua Token : 32557


In [29]:
#hitung frekuensi vocab dan hapus yang tidak perlu (sedikit)
from collections import Counter

counter = Counter(all_vocab)

dic_ = dict(counter)
threshold = 3

sorted_dic = sorted(dic_.items(), reverse=True, key=lambda x: x[1])
sorted_dic = [x for x in sorted_dic if x[1] > threshold]
all_vocab = [x[0] for x in sorted_dic]
len(all_vocab) 

937

In [31]:
#buat dictionary word to idx & idx to word
ix = 1 
word_to_idx = {}
idx_to_word = {}

for e in all_vocab:
    word_to_idx[e] = ix
    idx_to_word[ix] = e
    ix += 1

In [32]:
#tambahkan "startseq" dan "endseq"
word_to_idx['startseq'] = 938
word_to_idx['endseq'] = 939

idx_to_word[938] = 'startseq'
idx_to_word[939] = 'endseq'

In [33]:
vocab_size = len(idx_to_word) + 1

In [34]:
#tambahkan "startseq" dan "endseq" di data train dibagian data robot
df_train.robot = df.robot.apply(lambda x: 'startseq '+ x + ' endseq')
df_train.head()

Unnamed: 0,human,robot
1703,hmm karena kamu selalu membaca pijatanku tapi ...,startseq saya mengerti sekarang saya bot dan ...
272,saya berharap begitu saya akan merekomendasik...,startseq terima kasih saya akan lihat endseq
1171,dapatkah anda membantu saya menulis program di...,startseq tentu apa yang ingin kamu lakukan e...
691,oke apakah anda tahu beberapa permainan,startseq saya tahu tetris bercanda endseq
2358,mulai f,startseq hai disini apa kabarmu endseq


In [35]:
#buat data generator x1 = input, x2=output, y=output Neural Network
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
def data_generator(train_df,word_to_idx,max_len,number_conversation):
    X1, X2, y = [],[],[]
    n = 0
    while True:
        for idx, row in train_df.iterrows():
            seq_human = [word_to_idx[word] for word in row ['human'].split() if word in word_to_idx]
            seq_human = pad_sequences([seq_human], maxlen=max_len, value=0, padding='post') [0]
            seq_robot = [word_to_idx[word] for word in row['robot'].split() if word in word_to_idx]

            for i in range(1, len(seq_robot)):
                in_seq = seq_robot[:i]
                out_seq = seq_robot[i]

                in_seq = pad_sequences([in_seq], maxlen=max_len, value=0, padding='post') [0]
                out_seq = to_categorical ([out_seq],num_classes=vocab_size)[0]

                X1.append(seq_human)
                X2.append(in_seq)
                y.append(out_seq)

            if n==number_conversation:
                #output [X1, X2, y]; yield lebih memori friendly dari array
                yield([np.array(X1),np.array(X2),np.array(y)])
                X1, X2, y = [], [], []
                n = 0

In [36]:
datasample = df_train.sample(2)
datagen = data_generator(datasample, word_to_idx, 50, len(datasample))

In [37]:
datasample

Unnamed: 0,human,robot
625,baik diberikan,startseq saya pikir kita bisa menjadi teman vi...
1733,ya kamu harus,startseq endseq


In [38]:
#training
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, Add
from tensorflow.keras.models import Model, load_model
max_len = 50
input_chat = Input(shape=(max_len))
input_x = Embedding(input_dim=vocab_size, output_dim=50, mask_zero=True)(input_chat)
#mencegah overfit
input_x = Dropout(0.3)(input_x)
input_x = LSTM(256)(input_x) 

In [39]:
output_chat = Input(shape=(max_len))
output_x = Embedding(input_dim=vocab_size, output_dim=50, mask_zero=True)(input_chat)
#mencegah overfit
output_x = Dropout(0.3)(output_x)
output_x = LSTM(256)(output_x)

In [40]:
#penyatuan model
decoder = Add()([input_x, output_x])
decoder = Dense(256, activation='relu')(decoder)
outputs = Dense(vocab_size, activation='softmax')(decoder)

model = Model(inputs=[input_chat, output_chat], outputs=outputs)

In [41]:
 model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 50)       47000       ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 50, 50)       47000       ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 50, 50)       0           ['embedding[0][0]']              
                                                                                              

In [42]:
model.compile(loss='categorical_crossentropy', optimizer="adam")

In [43]:
#konfigurasi training
#melakukan training per25 conversation dalam 1 data set
epochs = 10
number_conversation = 25
steps = len(df_train) // number_conversation

In [44]:
steps

85

In [92]:
generator_train = data_generator(df_train, word_to_idx, max_len, number_conversation)
model.fit(generator_train, epochs=3, steps_per_epoch=steps, verbose=1)