Our first approach is to generate, by hand, a simple collections of sentences organised by intents and associated answers.
We will then process that data to make it suitable for NLP applications, encode it using "bag of word" and train a neural network to predict user intent from an utterance.

Then, we wil used pre-trained word embeddings

Then, we will generate training data using OpenAI GPT-3 API and train it, using both bag of words and word embeddings
We can also try TF-IDF to compare it with the NN.

# Import / Generate data

In [48]:
import json
import pickle
import pandas as pd
from sklearn import tree, svm, naive_bayes
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from time import time
import numpy as np

In [4]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("en_core_web_lg")

In [5]:
data_file = open('intents.json').read()
intents = json.loads(data_file)


data = []
for intent in intents['intents']:
    for pattern in intent['patterns']:
        data.append([pattern, intent['tag']])

df = pd.DataFrame(data, columns=['text','intent'])

In [6]:
df.sample(10)

Unnamed: 0,text,intent
10,That's great,thanks
5,Bye,goodbye
3,Hello,greeting
14,What are you ?,about
8,Thanks,thanks
21,Can you help ?,help
12,"Perfect, thank you very much",thanks
6,See you later,goodbye
19,Could you help me ?,help
27,Print 78 pages from doc8,printing_request


# Data pre-processing

In [57]:
# Helper function

def lemmatize_text(text, preprocessed=False):
    if not preprocessed:
        text = nlp(text)
    lemmatized_texts = [token.lemma_ for token in text
                               if not token.is_punct and not token.is_space and not token.like_url and not token.like_email]
    return lemmatized_texts

In [None]:
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)

for doc in nlp.pipe(df['text']):
    doc_bin.add(doc)

# save DocBin to a file on disc
file_name_spacy = 'preprocessed_documents.spacy'
doc_bin.to_disk(file_name_spacy)

In [9]:
#Load DocBin at later time or on different system from disc or bytes object
file_name_spacy = 'preprocessed_documents.spacy'
doc_bin = DocBin().from_disk(file_name_spacy)

In [10]:
docs = list(doc_bin.get_docs(nlp.vocab))
print(len(docs))
print(docs)

31
[Hi, Hey, Is anyone there ?, Hello, Good morning !, Bye, See you later, Goodbye, Thanks, Thank you, That's great, Thanks for the help, Perfect, thank you very much, Who are you ?, What are you ?, What is this, What is your name ?, What should I call you ?, What is your name ?, Could you help me ?, Give me a hand please, Can you help ?, What can you do for me ?, I need help, I want to print 46 pages of my_doc, Can you help me get 64 pages of doc4 ?, Get me 6 pages of my_file, Print 78 pages from doc8, I have a complaint, I want to raise a complaint, I am not satisfied]


# Data preparation

In [51]:
df["doc"] = docs
X_train = df["doc"].apply(lemmatize_text, args=(True,)).str.join(" ")
y_train = df["intent"]

In [52]:
y_train

0             greeting
1             greeting
2             greeting
3             greeting
4             greeting
5              goodbye
6              goodbye
7              goodbye
8               thanks
9               thanks
10              thanks
11              thanks
12              thanks
13               about
14               about
15               about
16                name
17                name
18                name
19                help
20                help
21                help
22                help
23                help
24    printing_request
25    printing_request
26    printing_request
27    printing_request
28           complaint
29           complaint
30           complaint
Name: intent, dtype: object

# Models preparation

In [49]:
vect = TfidfVectorizer(ngram_range=(1, 2), lowercase=False, tokenizer=lambda x: x, max_features=3000)

# classifiers to use

from collections import defaultdict

gs_dict = defaultdict(dict)

dectree = tree.DecisionTreeClassifier()
svm = svm.SVC()
multi_nb = naive_bayes.MultinomialNB()

gs_dict['dectree']['pipeline'] = Pipeline([
    ('vect', vect),
    ('dectree', dectree)])
gs_dict['svm']['pipeline'] = Pipeline([
    ('vect', vect),
    ('svm', svm)])
gs_dict['multi_nb']['pipeline'] = Pipeline([
    ('vect', vect),
    ('multi_nb', multi_nb)])

gs_dict['dectree']['params'] = {
    "dectree__max_depth": [4, 10],
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    "vect__use_idf": (True, False),
}
gs_dict['svm']['params'] = {
    "svm__kernel": ["linear", "rbf"],
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    "vect__use_idf": (True, False),
}
gs_dict['multi_nb']['params'] = {
    "multi_nb__alpha": [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000],
    "vect__ngram_range": ((1, 1), (1, 2), (1,3)),
    "vect__use_idf": (True, False),
}

# Model Selection

In [53]:
def perform_grid_search(pipeline, parameters):
    gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=3, cv=5)

    print("\nPerforming grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()

    gs_clf.fit(X_train, y_train)

    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % gs_clf.best_score_)
    print("Best parameters set:")
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return gs_clf

In [54]:
def best_estimator_per_clf(gs_dict: defaultdict):
    for clf in dict(gs_dict):
        gs_dict[clf]['gs'] = perform_grid_search(
            gs_dict[clf]['pipeline'],
            gs_dict[clf]['params']
        )

In [55]:
best_estimator_per_clf(gs_dict)


Performing grid search...
pipeline: ['vect', 'dectree']
parameters:
{'dectree__max_depth': [4, 10],
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__use_idf': (True, False)}
Fitting 5 folds for each of 12 candidates, totalling 60 fits




done in 2.013s

Best score: 0.576
Best parameters set:
	dectree__max_depth: 10
	vect__ngram_range: (1, 3)
	vect__use_idf: True

Performing grid search...
pipeline: ['vect', 'svm']
parameters:
{'svm__kernel': ['linear', 'rbf'],
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__use_idf': (True, False)}
Fitting 5 folds for each of 12 candidates, totalling 60 fits
done in 0.088s

Best score: 0.548
Best parameters set:
	svm__kernel: 'linear'
	vect__ngram_range: (1, 3)
	vect__use_idf: True

Performing grid search...
pipeline: ['vect', 'multi_nb']
parameters:
{'multi_nb__alpha': [1e-05, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
 'vect__use_idf': (True, False)}
Fitting 5 folds for each of 48 candidates, totalling 240 fits




done in 0.269s

Best score: 0.610
Best parameters set:
	multi_nb__alpha: 1e-05
	vect__ngram_range: (1, 3)
	vect__use_idf: False


In [62]:
df_result = pd.DataFrame(gs_dict['multi_nb']['gs'].cv_results_)
df_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_multi_nb__alpha,param_vect__ngram_range,param_vect__use_idf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003502,0.000441,0.001913,0.000668,1e-05,"(1, 1)",True,"{'multi_nb__alpha': 1e-05, 'vect__ngram_range'...",0.714286,0.5,0.333333,0.5,0.333333,0.47619,0.140456,18
1,0.003189,0.000284,0.000959,0.000112,1e-05,"(1, 1)",False,"{'multi_nb__alpha': 1e-05, 'vect__ngram_range'...",0.714286,0.5,0.333333,0.5,0.333333,0.47619,0.140456,18
2,0.003631,0.000764,0.001189,0.000115,1e-05,"(1, 2)",True,"{'multi_nb__alpha': 1e-05, 'vect__ngram_range'...",0.714286,0.666667,0.333333,0.666667,0.5,0.57619,0.141582,9
3,0.003164,0.000533,0.000989,9.4e-05,1e-05,"(1, 2)",False,"{'multi_nb__alpha': 1e-05, 'vect__ngram_range'...",0.714286,0.666667,0.333333,0.666667,0.5,0.57619,0.141582,9
4,0.00558,0.000489,0.001746,0.000342,1e-05,"(1, 3)",True,"{'multi_nb__alpha': 1e-05, 'vect__ngram_range'...",0.571429,0.666667,0.333333,0.666667,0.666667,0.580952,0.129187,8
5,0.004397,0.000454,0.00128,0.000207,1e-05,"(1, 3)",False,"{'multi_nb__alpha': 1e-05, 'vect__ngram_range'...",0.714286,0.666667,0.333333,0.666667,0.666667,0.609524,0.139321,1
6,0.003728,0.000189,0.001219,0.000279,0.0001,"(1, 1)",True,"{'multi_nb__alpha': 0.0001, 'vect__ngram_range...",0.714286,0.5,0.333333,0.5,0.333333,0.47619,0.140456,18
7,0.002976,0.000171,0.000871,0.000212,0.0001,"(1, 1)",False,"{'multi_nb__alpha': 0.0001, 'vect__ngram_range...",0.714286,0.5,0.333333,0.5,0.333333,0.47619,0.140456,18
8,0.004129,0.000805,0.001384,0.000169,0.0001,"(1, 2)",True,"{'multi_nb__alpha': 0.0001, 'vect__ngram_range...",0.714286,0.666667,0.333333,0.666667,0.666667,0.609524,0.139321,1
9,0.003066,0.000466,0.001037,0.000132,0.0001,"(1, 2)",False,"{'multi_nb__alpha': 0.0001, 'vect__ngram_range...",0.714286,0.666667,0.333333,0.666667,0.5,0.57619,0.141582,9


In [70]:
import string

test = lemmatize_text("I want to print")
test = " ".join(test)
test

'I want to print'

In [71]:
predict = gs_dict['multi_nb']['gs'].predict([test])
predict

array(['printing_request'], dtype='<U16')

In [None]:
classes = df["intent"].unique()
words = set() # change words to vocab

In [None]:
pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(classes, open('classes.pkl', 'wb'))

In [None]:
documents = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))

# Model evaluation