# Requirement
numpy==1.23.4

pandas==1.5.3

tensorflow==2.10.1

tqdm==4.65.0

# import library & parameters


In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

### Control tensorflow won't occupied all your GPU memory
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
### Configs --> Parameters you can attempt to tuning
MAXLEN = 128 # If you data has short sentence, please try lower MAXLEN in order to increase performance of model training
EPOCHS = 10000
BATCH_SIZE = 64
EMB_DIM = 100
UNIT = 128

# load Data

In [2]:
!git clone https://github.com/doudou030/C_Chat_Chatbot.git
!cp C_Chat_Chatbot/train_data/train.txt train.txt
!cp C_Chat_Chatbot/train_data/train.json train.json

Cloning into 'C_Chat_Chatbot'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 26 (delta 4), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (26/26), 6.21 MiB | 5.23 MiB/s, done.


In [3]:
q_data = []
a_data = []
f = open('/content/train.txt')
lines = f.readlines() #總共行數
for line in lines:
    line = line.strip()  
    if line.startswith('Q:'):
        q_data.append(line[3:])  
    elif line.startswith('A:'):
        a_data.append(line[3:])  
f.close


<function TextIOWrapper.close()>

In [4]:
q_data = np.array(q_data)
a_data = np.array(a_data)
q_data, a_data = shuffle(q_data,a_data)

In [5]:
q, a = [], []
for i in range(len(q_data)):
  seq_q, seq_a = q_data[i][0], a_data[i][0]
  q.append("".join(seq_q))
  a.append("".join(seq_a))

# Tokenizer

In [6]:
### Build character-based vocabulary
def tokenize_chinese(texts, voc, voc_ind):
    for t in tqdm(texts):
        for ch in str(t):
            if ch not in voc:
                voc[ch] = voc_ind
                voc_ind += 1 
    return voc, voc_ind

In [7]:
voc = {} # Vocabulary dictionary
voc_ind = 1 # vocabulary index start from 1, index 0 means nothing

voc, voc_ind = tokenize_chinese(q, voc, voc_ind)
voc, voc_ind = tokenize_chinese(a, voc, voc_ind)

100%|██████████| 49478/49478 [00:00<00:00, 3242488.88it/s]
100%|██████████| 49478/49478 [00:00<00:00, 2159881.91it/s]


# data preprocessing

In [8]:
def fit_sentence(sen, voc):
    res = []
    for i in sen:
        res.append(voc[i])
    return res

In [9]:
### Insert "Start Of Sentence" token into vocabulary
voc["<SOS>"] = len(voc)+1
### Insert "End Of Sentence" token into vocabulary
voc["<EOS>"] = len(voc)+1

In [10]:
q_x = []
ans_x, ans_y = [], []

In [11]:
### Question input
for i in tqdm(q):
    res = fit_sentence(i, voc)
    while len(res) < MAXLEN: ### If sentence is shorter than maxlen, append 0 until length reach maxlen
        res.append(0)
    q_x.append(res)

100%|██████████| 49478/49478 [00:01<00:00, 43921.10it/s]


In [12]:
### Answer input
for i in tqdm(a):
    res = fit_sentence(i, voc)
    res.insert(0,voc["<SOS>"])
    res.append(voc["<EOS>"])
    while len(res) < MAXLEN: ### If sentence is shorter than maxlen, append 0 until length reach maxlen
        res.append(0)
    ans_x.append(res)

100%|██████████| 49478/49478 [00:01<00:00, 37537.20it/s]


In [13]:
### Answer output
for i in ans_x:
    tmp = i[1:]
    tmp.append(0)
    ans_y.append(tmp)

In [14]:
### Turn into np.array for training
q_x = np.array(q_x)
ans_x = np.array(ans_x)
ans_y = np.array(ans_y)

# Model

In [15]:
def build_model(voc):
    Q_in = Input((MAXLEN,),name='Q_input')
    Q_emb = Embedding(len(voc)+1,EMB_DIM,mask_zero=True,name='Q_emb')(Q_in)
    Q_out, Q_h, Q_c = LSTM(UNIT,return_state=True,recurrent_dropout=0.2,name='Q_LSTM')(Q_emb)
    Q_state = [Q_h,Q_c]
    A_in = Input((MAXLEN,),name='A_input')
    A_emb = Embedding(len(voc)+1,EMB_DIM,mask_zero=True,name='A_emb')(A_in)
    A_out = LSTM(UNIT,return_sequences=True,recurrent_dropout=0.2,name='A_LSTM')(A_emb,initial_state=Q_state)
    output = Dense(len(voc)+1,activation='softmax',name='Output')(A_out)

    model = Model(inputs=[Q_in,A_in],outputs=output,name='Gossip_ChatBot')

    return model

In [16]:
model = build_model(voc)
model.summary()
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


Model: "Gossip_ChatBot"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Q_input (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 A_input (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 Q_emb (Embedding)              (None, 128, 100)     251100      ['Q_input[0][0]']                
                                                                                                  
 A_emb (Embedding)              (None, 128, 100)     251100      ['A_input[0][0]']                
                                                                                     

In [17]:
### Callbacks --> Checkpoint: Change file path to the directory where you want to save your model
checkpoint = ModelCheckpoint(filepath="./models/chatbot_LSTM.h5", monitor='accuracy',verbose=1,save_best_only=True,save_weights_only=True)
### Callbacks --> Earlystop: Monitor accuracy and decide whether to stop the training procedure
earlystop = EarlyStopping(monitor='accuracy',patience=3,verbose=1)

### If you have model trained before, you can load it back and continue previous training procedure
try:
    model.load_weights('chatbot.h5')
    print("Load model...")
### If you haven't train any model yet, train model from initial
except:
    print("Fail to load pretrained model...")

### Train your model
model.fit((q_x, ans_x), ans_y, batch_size=BATCH_SIZE,epochs=EPOCHS,callbacks=[checkpoint, earlystop],verbose=1)

Fail to load pre trained model...
Epoch 1/10000
Epoch 1: accuracy improved from -inf to 0.65981, saving model to ./models/chatbot_LSTM.h5
Epoch 2/10000
Epoch 2: accuracy improved from 0.65981 to 0.68107, saving model to ./models/chatbot_LSTM.h5
Epoch 3/10000
Epoch 3: accuracy did not improve from 0.68107
Epoch 4/10000
Epoch 4: accuracy improved from 0.68107 to 0.68114, saving model to ./models/chatbot_LSTM.h5
Epoch 5/10000
Epoch 5: accuracy did not improve from 0.68114
Epoch 6/10000
Epoch 6: accuracy did not improve from 0.68114
Epoch 7/10000
Epoch 7: accuracy did not improve from 0.68114
Epoch 7: early stopping


<keras.callbacks.History at 0x7f322cf1b640>

In [18]:
model.save('chatbot.h5')

# Inference 

In [22]:
#跑下面inference前,要先改你要測試的model，否則就是前方訓練好的model
#也可以不跑前方訓練，那就是這裡要記得import model
model = model.load_model('chatbot.h5')

AttributeError: ignored

In [23]:
from keras.preprocessing.text import Tokenizer

Q_max_length = 42 #q_data max = 42, a_data max char = 70
A_max_length = 70
start = '<sos>'
end = '<end>'
token = Tokenizer(char_level=True,filters='',oov_token='<unk>')
token.fit_on_texts(q)
token.fit_on_texts(a)
word_index = token.word_index
index_word = token.index_word

while True:
  question = input('來點動漫話題: ')
  if question == '滾': # 輸入 '滾' 即可結束Chat Bot
    print('...88')
    break
  ans_seq = ''
  cur_token = start
  word_count = 1
  target_seq = np.zeros((1,A_max_length),dtype='int64')
  target_seq[0,0] = word_index[start]
  q = fit_sentence(question,word_index,Q_max_length)
  q = np.squeeze(q)
  q = tf.expand_dims(q,0)
  while word_count < A_max_length:
    decoder_output = model.predict([q,target_seq])
    ind = np.argmax(decoder_output[0,word_count])
    cur_token = index_word[ind]
    if cur_token == end:
      break
    ans_seq += cur_token
    target_seq[0,word_count] = ind
    word_count += 1
  print(ans_seq)

來點動漫話題: ya


KeyError: ignored