In [1]:
# Here we train and save our chatbot classifier model

import re
from nltk.tokenize import word_tokenize
from gensim.models import word2vec

# Labelled training utterances, grouped by intent class.
# Trailing spaces inside some utterances are part of the data and kept as-is.
_utterances_by_class = (
    ("contact", (
        "quem é o responsável pelo sistema?",
        "qual a informação de contato?",
        "qual o e-mail ou telefone? ",
    )),
    ("products", (
        "quais os produtos que são vendidos? ",
        "vende carros novos e usados? ",
        "aluga ou vende carros? ",
    )),
    ("greetings", (
        "oi!",
        "olá, como vai você?",
        "bom dia!",
    )),
)

# One dict per example: the raw text, its class label, and a fresh empty list
# as placeholder for the "representation" entry.
chat_topics = [
    {"user_input": text, "class": label, "representation": []}
    for label, texts in _utterances_by_class
    for text in texts
]

# Collect the raw utterances from chat_topics; no normalization happens yet.
# `topics`, `training_corpus` and `sentence_tokens` all refer to the same list.
sentence_tokens = training_corpus = topics = [entry["user_input"] for entry in chat_topics]
print("Sentence tokens: ")
print(sentence_tokens)
print("\n\n")

# 1. Basic word tokens: split every utterance with nltk's word_tokenize
#    (punctuation is still present at this point; it is removed in step 2).
word_tokens = list(map(word_tokenize, sentence_tokens))
print("Word tokens: ")
print(word_tokens)
print("\n\n")

# 2. Remove punctuation / non-letter characters and convert all characters
# to lower case. Tokens that end up empty (pure punctuation such as "?" or "!")
# are dropped.
# The accented-letter ranges are split around U+00D7 (×) and U+00F7 (÷): both
# code points sit inside the plain À-Ý / à-ý ranges but are math symbols, not
# letters, so an unsplit range would let them through as "words".
non_letter_pattern = re.compile(r'[^A-Za-zÀ-ÖØ-Ýà-öø-ý]')

normalized_sentences = []
for sentence in word_tokens:
    cleaned_words = (non_letter_pattern.sub('', word).lower() for word in sentence)
    normalized_sentences.append([word for word in cleaned_words if word != ''])

print("\n\n")
print("Lower case, punctuation free word tokens")
print(normalized_sentences)

# 3. Stop word removal (currently disabled; the code below is kept commented out for reference):
#stop_words = ['a', 'as', 'e', 'o', 'os', 'da', 'de', 'do', 'um', 'uma']
#for word in stop_words:
#    for sentence in normalized_sentences:
#        if word in sentence:
#            print(word)
#            print(sentence)
#            sentence.remove(word)
#print("\n\n")
#print("Normalized text: ")
#print(normalized_sentences)

# 4. Building the word2vec model
# Model configuration
feature_size = 32  # size of vector representation
window_context = 3  # passed as `window`
min_word_count = 1  # passed as `min_count`
sample = 1e-3  # passed as `sample`
random_seed = 1  # fixed seed so the embeddings can be reproduced between runs

# A fixed seed alone is not enough for deterministic training: gensim also
# requires a single worker thread (and, across interpreter launches, a fixed
# PYTHONHASHSEED environment variable).
w2vec_repr = word2vec.Word2Vec(normalized_sentences, vector_size=feature_size,
                               window=window_context, min_count=min_word_count,
                               sample=sample, epochs=50,
                               seed=random_seed, workers=1)

# Here we save the word2vec model to be used in the chatbot:
w2vec_repr.save("word2vec_bot.model")

Sentence tokens: 
['quem é o responsável pelo sistema?', 'qual a informação de contato?', 'qual o e-mail ou telefone? ', 'quais os produtos que são vendidos? ', 'vende carros novos e usados? ', 'aluga ou vende carros? ', 'oi!', 'olá, como vai você?', 'bom dia!']



Word tokens: 
[['quem', 'é', 'o', 'responsável', 'pelo', 'sistema', '?'], ['qual', 'a', 'informação', 'de', 'contato', '?'], ['qual', 'o', 'e-mail', 'ou', 'telefone', '?'], ['quais', 'os', 'produtos', 'que', 'são', 'vendidos', '?'], ['vende', 'carros', 'novos', 'e', 'usados', '?'], ['aluga', 'ou', 'vende', 'carros', '?'], ['oi', '!'], ['olá', ',', 'como', 'vai', 'você', '?'], ['bom', 'dia', '!']]






Lower case, punctuation free word tokens
[['quem', 'é', 'o', 'responsável', 'pelo', 'sistema'], ['qual', 'a', 'informação', 'de', 'contato'], ['qual', 'o', 'email', 'ou', 'telefone'], ['quais', 'os', 'produtos', 'que', 'são', 'vendidos'], ['vende', 'carros', 'novos', 'e', 'usados'], ['aluga', 'ou', 'vende', 'carros'], ['oi'], 

In [2]:
import numpy as np

# Now we build the feature vector for each user input example: the mean of the
# word2vec vectors of its normalized words, stored under the "representation"
# key of the matching chat_topics entry.
for i, sentence in enumerate(normalized_sentences):
    if not sentence:
        # Dividing by len(sentence) == 0 would fill the vector with NaN, which
        # is only rejected later when the classifier is fitted. Fail here
        # instead, naming the offending example.
        raise ValueError(
            "Training example %r has no words left after normalization"
            % chat_topics[i]["user_input"])
    sentence_vector = np.zeros(w2vec_repr.vector_size)
    for word in sentence:
        sentence_vector += w2vec_repr.wv[word]
    sentence_vector /= len(sentence)
    chat_topics[i]["representation"] = sentence_vector
    


In [3]:
# Now we train our MLP model and save it using pickle
import pickle
from sklearn.neural_network import MLPClassifier
# Features (the "representation" of every example) and the matching class labels.
X = [example["representation"] for example in chat_topics]
Y = [example["class"] for example in chat_topics]

# MLP with two hidden layers (5 and 3 units), L-BFGS solver, fixed random_state.
classifier = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 3),
    random_state=1,
)
classifier.fit(X, Y)

# Persist the fitted classifier with pickle.
with open("mlp_classifier.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)

In [12]:
# Round-trip check: reload both saved models from disk before classifying.
# The pickle read here is the "mlp_classifier.pkl" file written by the
# training cell of this notebook.
with open("mlp_classifier.pkl", "rb") as classifier_file:
    loaded_classifier = pickle.load(classifier_file)
word2vec_model_loaded = word2vec.Word2Vec.load("word2vec_bot.model")
    
# Then we simulate user input:
user_input = "com quem eu falo? "
user_input_word_tokens = word_tokenize(user_input)

# Strip every non-letter character and lower-case each token; tokens that
# become empty (pure punctuation) are dropped.
# The accented-letter ranges are split around U+00D7 (×) and U+00F7 (÷), which
# sit inside the plain À-Ý / à-ý ranges but are math symbols, not letters.
user_input_normalized = []
for word in user_input_word_tokens:
    word = re.sub(r'[^A-Za-zÀ-ÖØ-Ýà-öø-ý]', '', word).lower()
    if word != '':
        user_input_normalized.append(word)
print(user_input_normalized)

# Now we build the word2vec representation: the mean of the vectors of the
# input words that exist in the loaded model's vocabulary. Words the model
# does not know are ignored.
known_words = [word for word in user_input_normalized
               if word in word2vec_model_loaded.wv]
nwords = len(known_words)
input_sentence_vector = np.zeros(word2vec_model_loaded.vector_size)
for word in known_words:
    input_sentence_vector += word2vec_model_loaded.wv[word]

if nwords == 0:
    # Averaging over zero words would divide by zero and produce an all-NaN
    # vector, which the classifier rejects. Report it instead of crashing.
    print("No known words in the user input; unable to classify it.")
else:
    input_sentence_vector /= nwords
    print(input_sentence_vector)

    # Let's test classification (the classifier expects a 2-D array: one row per sample):
    input_features = input_sentence_vector.reshape(1, -1)
    print( loaded_classifier.predict(input_features) )
    print( loaded_classifier.predict_proba(input_features) )

['com', 'quem', 'eu', 'falo']
[-2.07186881e-02  1.35313021e-02 -1.17272406e-03 -1.11813890e-02
  2.14475952e-02  1.21138496e-02 -1.19936625e-02  2.41497671e-03
  2.86840163e-02  2.43211538e-02  1.99904069e-02  1.45933731e-02
  7.23376125e-03 -5.83335757e-03 -1.98978763e-02 -9.40478989e-04
 -4.71610902e-03 -1.93193590e-03 -1.96045060e-02  2.35680304e-02
 -2.04276610e-02 -2.25921944e-02 -8.39035027e-03 -4.63318499e-03
 -2.37647332e-02  2.05050455e-03 -1.68453883e-02 -4.01476864e-03
 -2.30164919e-02  5.89921512e-03  1.01045361e-02 -3.15128200e-05]
['contact']
[[9.92921799e-01 7.06321400e-03 1.49870802e-05]]
