Our first approach is to generate, by hand, a simple collections of sentences organised by intents and associated answers.
We will then process that data to make it suitable for NLP applications, encode it using "bag of word" and train a neural network to predict user intent from an utterance.

Then, we wil used pre-trained word embeddings

Then, we will generate training data using OpenAI GPT-3 API and train it, using both bag of words and word embeddings
We can also try TF-IDF to compare it with the NN.

# Import / Generate data

In [33]:
import json
import pickle
import pandas as pd
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from time import time
import numpy as np

In [None]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("en_core_web_lg")

In [4]:
data_file = open('intents.json').read()
intents = json.loads(data_file)


data = []
for intent in intents['intents']:
    for pattern in intent['patterns']:
        data.append([pattern, intent['tag']])

df = pd.DataFrame(data, columns=['text','intent'])

In [8]:
df.sample(10)

Unnamed: 0,text,intent
13,Who are you ?,about
29,I want to raise a complaint,complaint
12,"Perfect, thank you very much",thanks
2,Is anyone there ?,greeting
24,I want to print 46 pages of my_doc,printing_request
25,Can you help me get 64 pages of doc4 ?,printing_request
22,What can you do for me ?,help
14,What are you ?,about
27,Print 78 pages from doc8,printing_request
8,Thanks,thanks


# Data pre-processing

In [17]:
# Helper function

def lemmatize_text(text, preprocessed=False):
    if not preprocessed:
        text = nlp(text)
    lemmatized_texts = [token.lemma_ for token in text
                               if not token.is_punct and not token.is_space and not token.like_url and not token.like_email]
    return lemmatized_texts

In [11]:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

for doc in nlp.pipe(df['text']):
    doc_bin.add(doc)

# save DocBin to a file on disc
file_name_spacy = 'preprocessed_documents.spacy'
doc_bin.to_disk(file_name_spacy)

In [12]:
#Load DocBin at later time or on different system from disc or bytes object
doc_bin = DocBin().from_disk(file_name_spacy)

In [14]:
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))
print(docs)

31
[Hi, Hey, Is anyone there ?, Hello, Good morning !, Bye, See you later, Goodbye, Thanks, Thank you, That's great, Thanks for the help, Perfect, thank you very much, Who are you ?, What are you ?, What is this, What is your name ?, What should I call you ?, What is your name ?, Could you help me ?, Give me a hand please, Can you help ?, What can you do for me ?, I need help, I want to print 46 pages of my_doc, Can you help me get 64 pages of doc4 ?, Get me 6 pages of my_file, Print 78 pages from doc8, I have a complaint, I want to raise a complaint, I am not satisfied]


# Data preparation

In [31]:
df["doc"] = docs
X_train = df["doc"].apply(lemmatize_text, args=(True,))
y_train = df["intent"]
y_train

0             greeting
1             greeting
2             greeting
3             greeting
4             greeting
5              goodbye
6              goodbye
7              goodbye
8               thanks
9               thanks
10              thanks
11              thanks
12              thanks
13               about
14               about
15               about
16                name
17                name
18                name
19                help
20                help
21                help
22                help
23                help
24    printing_request
25    printing_request
26    printing_request
27    printing_request
28           complaint
29           complaint
30           complaint
Name: intent, dtype: object

In [35]:
X_train

0                                             [hi]
1                                            [hey]
2                              [be, anyone, there]
3                                          [hello]
4                                  [good, morning]
5                                            [bye]
6                                [see, you, later]
7                                        [goodbye]
8                                          [thank]
9                                     [thank, you]
10                               [that, be, great]
11                         [thank, for, the, help]
12               [perfect, thank, you, very, much]
13                                  [who, be, you]
14                                 [what, be, you]
15                                [what, be, this]
16                          [what, be, your, name]
17                    [what, should, I, call, you]
18                          [what, be, your, name]
19                           [c

# Model preparation

In [37]:
vect = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, tokenizer=lambda x: x, max_features=3000)

# classifier to use
clf = tree.DecisionTreeClassifier()

intent_clf = Pipeline([
    ('vect', vect),
    ('dectree', clf)])

parameters = {
    "dectree__max_depth": [4, 10],
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    "vect__use_idf": (True, False),
}

gs_clf = GridSearchCV(intent_clf, parameters, n_jobs=-1, verbose=1, cv=5)

print("Performing grid search...")
print("pipeline:", [name for name, _ in intent_clf.steps])
print("parameters:")
pprint(parameters)
t0 = time()

gs_clf.fit(X_train, y_train)

print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % gs_clf.best_score_)
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'dectree']
parameters:
{'dectree__max_depth': [4, 10],
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__use_idf': (True, False)}
Fitting 5 folds for each of 12 candidates, totalling 60 fits




done in 2.289s

Best score: 0.548
Best parameters set:
	dectree__max_depth: 10
	vect__ngram_range: (1, 2)
	vect__use_idf: True


In [48]:
lematize_text("I want to print a document")

['I', 'want', 'to', 'print', 'a', 'document']

In [16]:
classes = df["intent"].unique()
words = set() # change words to vocab

In [17]:
pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(classes, open('classes.pkl', 'wb'))

In [19]:
documents = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))

# Model evaluation