# **HAI Coursework 1: AN INTERACTIVE NLP-BASED AI SYSTEM**

## Project Title: Chatbot Assistant for Computer Programmers

20204134 Chuang Caleb hcycc2

BSc Hons Computer Science with Artificial Intelligence

---


## **INITIALIZATION**

### Imports


In [59]:
# External python modules/packages
import nltk
from nltk.stem import WordNetLemmatizer

import json

# My own python modules
# from utils.my_global import *


### Loading Intents Data


In [60]:
data_file = open('intents.json').read()
intents = json.loads(data_file)
tok_doc = {}
classes = []
documents = []
vocabulary = []

## Pre-processing


In [61]:
from itertools import chain

# Tokenization and document/class tagging

"""
- tok_doc: class - tokenized phrases
- documents: pattern - its class
- classes: all classes 
"""

for intent in intents['intents']:

    # Tokenize every word
    tok_doc[intent['tag']] = list(chain.from_iterable([nltk.word_tokenize(pattern) for pattern in intent['patterns']]))

    for pattern in intent['patterns']:

        # adding documents
        documents.append((pattern, intent['tag']))

        # adding classes to our class list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

In [62]:
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')
ignore_words = [',']
ignore_words.extend(english_stopwords)
lemmatizer = WordNetLemmatizer()

# lemmatize, lowercase and remove stopwords
for intent in classes:
    tok_doc[intent] = [lemmatizer.lemmatize(word.lower()) for word in tok_doc[intent] if word not in ignore_words]

# vocabulary: distinct set of all words in documents
vocabulary = sorted(list(set(chain.from_iterable([tok_doc[intent] for intent in classes]))))

# sort classes
classes = sorted(list(set(classes)))
# documents = combination between patterns and intents
print(len(documents), "documents")
# classes = intents
print(len(classes), "classes", classes)
# words = all words, vocabulary
print(len(vocabulary), "unique lemmatized words", vocabulary)

47 documents
9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
72 unique lemmatized words ["'s", '?', 'adverse', 'anyone', 'awesome', 'behavior', 'blood', 'bye', 'causing', 'chatting', 'check', 'could', 'data', 'day', 'detail', 'dont', 'drug', 'entry', 'find', 'give', 'good', 'goodbye', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'history', 'hola', 'hospital', 'how', 'i', 'id', 'is', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'module', 'nearby', 'next', 'nice', 'offered', 'open', 'patient', 'pharmacy', 'pressure', 'provide', 'reaction', 'related', 'result', 'search', 'searching', 'see', 'show', 'suitable', 'support', 'task', 'thank', 'thanks', 'that', 'till', 'time', 'transfer', 'want', 'what', 'which']


### Create the bag-of-word model

In [66]:
import numpy as np

training = []
output_empty = [0] * len(classes)

print(f'VOCABULARY: {vocabulary[1:10]}')

bow = {}
for intent in classes:
    bow[intent] = np.zeros(len(vocabulary))
    for stem in tok_doc[intent]:
        index = vocabulary.index(stem)
        bow[intent][index] += 1
    print(f'{intent}: {bow[intent][1:10]}')

VOCABULARY: ['?', 'adverse', 'anyone', 'awesome', 'behavior', 'blood', 'bye', 'causing', 'chatting']
adverse_drug: [2. 5. 0. 0. 1. 0. 0. 1. 0.]
blood_pressure: [0. 0. 0. 0. 0. 5. 0. 0. 0.]
blood_pressure_search: [0. 0. 0. 0. 0. 5. 0. 0. 0.]
goodbye: [0. 0. 0. 0. 0. 0. 2. 0. 1.]
greeting: [1. 0. 1. 0. 0. 0. 0. 0. 0.]
hospital_search: [0. 0. 0. 0. 0. 0. 0. 0. 0.]
options: [4. 0. 0. 0. 0. 0. 0. 0. 0.]
pharmacy_search: [0. 0. 0. 0. 0. 0. 0. 0. 0.]
thanks: [0. 0. 0. 1. 0. 0. 0. 0. 0.]


Weighing function on bag of words model


In [68]:
from math import log10

def logfreq_weighting(vector):
    lf_vector = []
    for frequency in vector:
        lf_vector.append(log10(1+frequency))
    return np.array(lf_vector)

In [70]:
for intent in bow:
    bow[intent] = logfreq_weighting(bow[intent])
    # print(f'{intent}: {bow[intent][1:10]}')

Cosine similarity function

In [74]:
from scipy import spatial

def sim_cosine(vector_1, vector_2):
    similarity = 1 - spatial.distance.cosine(vector_1, vector_2)
    return similarity

In [83]:
query = "pharmacy"

# Tokenize
tokenizer = nltk.RegexpTokenizer(r"\w+")
tok_query = tokenizer.tokenize(query)

# Remove stopwords and lower casing
english_stopwords = stopwords.words('english')
filtered_query = [word.lower() for word in tok_query
                  if word.lower() not in english_stopwords]

# Stemming --> Lemmatising
stemmed_query = [lemmatizer.lemmatize(word) for word in filtered_query]

vector_query = np.zeros(len(vocabulary))
for stem in stemmed_query:
    try:
        index = vocabulary.index(stem)
        vector_query[index] += 1
    except ValueError:
        continue
vector_query = logfreq_weighting(vector_query)

for intent in bow.keys():
    similarity = sim_cosine(bow[intent], vector_query)
    print(f'Similarity with {intent}: {similarity}')

Similarity with adverse_drug: 0.0
Similarity with blood_pressure: 0.0
Similarity with blood_pressure_search: 0.0
Similarity with goodbye: 0.0
Similarity with greeting: 0.0
Similarity with hospital_search: 0.0
Similarity with options: 0.3631908228367414
Similarity with pharmacy_search: 0.0
Similarity with thanks: 0.0


---

## **TRAINING**

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
