Our first approach is to generate, by hand, a simple collections of sentences organised by intents and associated answers.
We will then process that data to make it suitable for NLP applications, encode it using "bag of word" and train a neural network to predict user intent from an utterance.

Then, we wil used pre-trained word embeddings

Then, we will generate training data using OpenAI GPT-3 API and train it, using both bag of words and word embeddings
We can also try TF-IDF to compare it with the NN.

# Import / Generate data

In [23]:
import json
import pickle
import pandas as pd

data_file = open('intents.json').read()
intents = json.loads(data_file)


data = []
for intent in intents['intents']:
    for pattern in intent['patterns']:
        data.append([pattern, intent['tag']])

df = pd.DataFrame(data, columns=['text','intent'])

In [24]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("en_core_web_lg")

In [40]:
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Data pre-processing

In [49]:
# Helper function

def lematize_text(text, preprocessed=False):
    if not preprocessed:
        text = nlp(text)
    lematized_texts = [token.lemma_ for token in text
                               if not token.is_punct and not token.is_space and not token.like_url and not token.like_email]
    return lematized_texts

In [28]:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

for doc in nlp.pipe(df['text'].str.lower()):
    doc_bin.add(doc)

# save DocBin to a file on disc
file_name_spacy = 'preprocessed_documents.spacy'
doc_bin.to_disk(file_name_spacy)

In [29]:
#Load DocBin at later time or on different system from disc or bytes object
doc_bin = DocBin().from_disk(file_name_spacy)

In [30]:
doc_bin

<spacy.tokens._serialize.DocBin at 0x7ff668c148e0>

In [69]:
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))
print(docs)

31
[hi, hey, is anyone there ?, hello, good morning !, bye, see you later, goodbye, thanks, thank you, that's great, thanks for the help, perfect, thank you very much, who are you ?, what are you ?, what is this, what is your name ?, what should i call you ?, what is your name ?, could you help me ?, give me a hand please, can you help ?, what can you do for me ?, i need help, i want to print 46 pages of my_doc, can you help me get 64 pages of doc4 ?, get me 6 pages of my_file, print 78 pages from doc8, i have a complaint, i want to raise a complaint, i am not satisfied]


In [70]:
df["doc"] = docs

In [71]:
X_train = df["doc"].apply(lematize_text, args=(True,))

In [72]:
X_train

0                                             [hi]
1                                            [hey]
2                              [be, anyone, there]
3                                          [hello]
4                                  [good, morning]
5                                            [bye]
6                                [see, you, later]
7                                        [goodbye]
8                                          [thank]
9                                     [thank, you]
10                               [that, be, great]
11                         [thank, for, the, help]
12               [perfect, thank, you, very, much]
13                                  [who, be, you]
14                                 [what, be, you]
15                                [what, be, this]
16                          [what, be, your, name]
17                    [what, should, I, call, you]
18                          [what, be, your, name]
19                           [c

In [58]:
df["doc"] = [lematize_text(doc, preprocessed=True) for doc in docs]

In [41]:
# classifier to use
clf = tree.DecisionTreeClassifier()

vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, tokenizer=lambda x: x, max_features=3000)

pipeline = Pipeline([('vect', vectorizer), ('dectree', clf)])
parameters = {'dectree__max_depth':[4, 10]}

gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_.get_params()['dectree'])

Fitting 5 folds for each of 2 candidates, totalling 10 fits
DecisionTreeClassifier(max_depth=4)


In [48]:
lematize_text("I want to print a document")

['I', 'want', 'to', 'print', 'a', 'document']

In [45]:
nlp("I want to print a document")
lemnas =  [token.lemma_ for token in nlp("I want to print a document")
                               if not token.is_punct and not token.is_space and not token.like_url and not token.like_email]
gs_clf.predict(lemnas)

array([0, 0, 0, 0, 0, 0])

In [16]:
classes = df["intent"].unique()
words = set() # change words to vocab

In [17]:
pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(classes, open('classes.pkl', 'wb'))

In [19]:
documents = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))

In [20]:
classes

array(['greeting', 'goodbye', 'thanks', 'about', 'name', 'help',
       'printing_request', 'complaint'], dtype=object)

# Data preparation

In [None]:
# =============================================================
# Training datasets
# =============================================================
# create our training data
training = []
# create an empty array for our output
output_empty = [0] * len(classes)
# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # lemmatize each word - create base word, in attempt to represent related words
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]
    # create our bag of words array with 1, if word match found in current pattern
    for word in words:
        bag.append(1) if word in pattern_words else bag.append(0)

    # output is a '0' for each tag and '1' for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])
# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)
# create train and test lists. X - patterns, Y - intents
train_x = list(training[:, 0])
train_y = list(training[:, 1])
print("Training data created")
print(train_y)




# Model preparation

# Model training

# Model evaluation