### Import libraries

In [1]:
%pip install spacy
!python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.


2021-04-15 03:18:28.523054: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import numpy as np
import json
import re
import tensorflow as tf
import random
import spacy
import pandas as pd
import os
nlp = spacy.load('en_core_web_sm')

### Preprocessing data
In this part we clean the data, split it into input and target tensors, build a tokenizer dictionary, and turn the sentences into integer sequences.
The target tensor is a list of one-hot vectors, each as long as the number of unique intent titles.

In [3]:
def preprocessing(line):
    """Normalize a raw sentence before tokenization.

    Every character that is not an ASCII letter or one of ``. ? ! '`` is
    replaced by a space, then each run of spaces is collapsed into one.

    Args:
        line: Raw text to clean.

    Returns:
        The cleaned text. Leading/trailing spaces are collapsed, not stripped.
    """
    # The upper-case range must be A-Z, not A-z: A-z also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would be kept.
    line = re.sub(r'[^a-zA-Z.?!\']', ' ', line)
    line = re.sub(r'[ ]+', ' ', line)
    return line

### Inputs

In [4]:
# Cleaned request sentences and, index-aligned with them, their intent labels.
inputs = []
targets = []
# Intent names in the order they are first encountered.
classes = []
# Maps an intent name to the list of responses collected for it.
intent_doc = {}

In [5]:
# Walk the `data` directory and fill the accumulators from every CSV file:
#   - each non-empty `request` cell becomes a training sentence (inputs/targets)
#   - each non-empty `response` cell becomes a candidate reply (intent_doc)
for root, dirs, files in os.walk('data'):
    print(f"root={root}, dirs={dirs}, files={files}")
    for f in files:
        # Ignore anything that is not a CSV (checkpoints, hidden files, ...),
        # which would otherwise make read_csv fail or load garbage.
        if not f.lower().endswith('.csv'):
            continue
        path = os.path.join(root, f)
        dataframe = pd.read_csv(path)

        # The available columns are a property of the file, not of the row.
        has_request = 'request' in dataframe.columns
        has_response = 'response' in dataframe.columns

        for index, line in dataframe.iterrows():
            intent = line['intent']
            # A row without an intent label cannot be attached to any class.
            if pd.isna(intent):
                continue
            if intent not in intent_doc:
                classes.append(intent)
                intent_doc[intent] = []
            # Empty cells are read as NaN (a float); skip them instead of
            # handing a non-string to the regex-based preprocessing.
            if has_request and pd.notna(line['request']):
                inputs.append(preprocessing(str(line['request'])))
                targets.append(intent)
            if has_response and pd.notna(line['response']):
                intent_doc[intent].append(line['response'])


root=data, dirs=[], files=['Clever_request.csv', 'Clever_response.csv', 'CourtesyGoodBye_request.csv', 'CourtesyGoodBye_response.csv', 'CourtesyGreetingResponse_request.csv', 'CourtesyGreetingResponse_response.csv', 'CourtesyGreeting_request.csv', 'CourtesyGreeting_response.csv', 'covid_19_request.csv', 'covid_19_response.csv', 'CurrentHumanQuery_request.csv', 'CurrentHumanQuery_response.csv', 'GoodBye_request.csv', 'GoodBye_response.csv', 'Gossip_request.csv', 'Gossip_response.csv', 'GreetingResponse_request.csv', 'GreetingResponse_response.csv', 'Greeting_request.csv', 'Greeting_response.csv', 'Jokes_request.csv', 'Jokes_response.csv', 'NameQuery_request.csv', 'NameQuery_response.csv', 'NotTalking2U_request.csv', 'NotTalking2U_response.csv', 'PodBayDoorResponse_request.csv', 'PodBayDoorResponse_response.csv', 'PodBayDoor_request.csv', 'PodBayDoor_response.csv', 'RealNameQuery_request.csv', 'RealNameQuery_response.csv', 'SelfAware_request.csv', 'SelfAware_response.csv', 'Shutup_reques

In [6]:
# # get text and intent title from json data
# inputs, targets = [], []
# classes = []
# intent_doc = {}

# for intent in intents['intents']:
#     if intent['intent'] not in classes:
#         classes.append(intent['intent'])
#     if intent['intent'] not in intent_doc:
#         intent_doc[intent['intent']] = []
        
#     for text in intent['text']:
#         inputs.append(preprocessing(text))
#         targets.append(intent['intent'])
        
#     for response in intent['responses']:
#         intent_doc[intent['intent']].append(response)

In [7]:
def tokenize_data(input_list):
    """Fit a word-level tokenizer on the sentences and encode them.

    Args:
        input_list: Sentences to build the vocabulary from and to encode.

    Returns:
        A ``(tokenizer, input_seq)`` pair: the fitted Keras tokenizer and the
        integer-encoded sentences, padded at the front to a common length.
    """
    # filters='' keeps punctuation inside the tokens; words that are not in
    # the vocabulary are mapped to the '<unk>' entry.
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='')
    text_tokenizer.fit_on_texts(input_list)

    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        text_tokenizer.texts_to_sequences(input_list),
        padding='pre',
    )
    return text_tokenizer, padded_sequences

# preprocess input data: fit the tokenizer on the cleaned request sentences
# and keep both the tokenizer and the padded sequences for later cells
tokenizer, input_tensor = tokenize_data(inputs)

In [8]:
def create_categorical_target(targets):
    """One-hot encode intent labels.

    Labels receive consecutive integer ids in order of first appearance.

    Args:
        targets: Sequence of intent labels, one per training sentence.

    Returns:
        A ``(categorical_tensor, index_to_label)`` pair: the one-hot matrix
        produced by ``to_categorical`` (dtype ``int32``) and a dict mapping
        each integer id back to its label.
    """
    label_to_index = {}
    # len(label_to_index) is the next free id whenever a new label shows up.
    encoded = [
        label_to_index.setdefault(label, len(label_to_index))
        for label in targets
    ]

    one_hot = tf.keras.utils.to_categorical(
        encoded, num_classes=len(label_to_index), dtype='int32'
    )
    index_to_label = {index: label for label, index in label_to_index.items()}
    return one_hot, index_to_label

# preprocess output data: one-hot encode the intent labels and keep the
# id -> intent-name mapping needed to decode predictions
target_tensor, trg_index_word = create_categorical_target(targets)

In [9]:
# Sanity check: both tensors must have the same number of rows (one per sentence).
print('input shape: {} and output shape: {}'.format(input_tensor.shape, target_tensor.shape))

input shape: (212, 10) and output shape: (212, 27)


### Build the model

In [10]:
# hyperparameters
epochs=50  # maximum number of epochs passed to model.fit
vocab_size=len(tokenizer.word_index) + 1  # +1: Keras word ids start at 1, id 0 is the padding value
embed_dim=512  # output dimension of the Embedding layer
units=128  # LSTM units (per direction) and width of the hidden Dense layer
target_length=target_tensor.shape[1]  # width of the one-hot targets = size of the softmax layer

In [11]:
# build RNN Model with tensorflow
# Embedding -> bidirectional LSTM -> hidden Dense (ReLU) -> Dropout -> softmax
# over the intent classes.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, dropout=0.2)),
    tf.keras.layers.Dense(units, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(target_length, activation='softmax')
])

# Use `learning_rate`: the `lr` keyword is a deprecated alias that newer
# Keras releases no longer accept.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
# categorical_crossentropy matches the one-hot encoded targets.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 512)         134656    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               656384    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 27)                3483      
Total params: 827,419
Trainable params: 827,419
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Stop training once the monitored loss has not improved for 4 epochs.
# NOTE(review): no validation data is passed to fit, so 'loss' here is the
# training loss -- this limits training time, it does not guard against overfitting.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=4)

# train the model
model.fit(input_tensor, target_tensor, epochs=epochs, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


<tensorflow.python.keras.callbacks.History at 0x247c904f3d0>

In [13]:
#Define API

SyntaxError: invalid syntax (<ipython-input-13-03c05dcb3a43>, line 1)

In [None]:
def response(sentence):
    """Predict the intent of a user sentence and pick a reply for it.

    The sentence goes through the same pipeline as the training data:
    ``preprocessing`` for cleaning, then the fitted ``tokenizer`` for
    lower-casing, splitting and id lookup, then front padding. Encoding the
    text any other way yields tokens that do not line up with the vocabulary
    the model was trained on.

    Args:
        sentence: Raw text typed by the user.

    Returns:
        A ``(reply, intent)`` tuple: a reply chosen at random among the
        responses stored for the predicted intent, and the intent name.

    Raises:
        IndexError: If no response is stored for the predicted intent.
    """
    cleaned = preprocessing(str(sentence))

    # texts_to_sequences maps out-of-vocabulary words to the '<unk>' id.
    sent_seq = tokenizer.texts_to_sequences([cleaned])[0]

    # An empty sentence would give the model a zero-length sequence.
    if not sent_seq:
        sent_seq = [tokenizer.word_index['<unk>']]

    # Pad at the front up to the training width; never truncate longer input.
    sent_seq = tf.keras.preprocessing.sequence.pad_sequences(
        [sent_seq],
        maxlen=max(len(sent_seq), input_tensor.shape[1]),
        padding='pre',
    )

    # predict the category of the input sentence (inference mode: no dropout)
    pred = model(sent_seq, training=False)
    pred_class = np.argmax(pred.numpy(), axis=1)
    intent = trg_index_word[pred_class[0]]

    # choose a random response for the predicted intent
    return random.choice(intent_doc[intent]), intent


In [None]:

# chat with bot
# Interactive loop: read a line, stop on 'quit' (case-insensitive), otherwise
# print the reply and the predicted intent returned by response().
print("Note: Enter 'quit' to break the loop.")
while True:
    input_ = input('You: ')
    if input_.lower() == 'quit':
        break
    # res is the reply text, typ is the predicted intent name
    res, typ = response(input_)
    print('Bot: {} -- TYPE: {}'.format(res, typ))
    print()