In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import string
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from spellchecker import SpellChecker
cpu = torch.device('cpu')



# Eliot's Interactive Syllabus Chatbot

## Project Purpose

For my CS 472 final project, I developed an interactive chatbot that serves as an alternative version of the CS 472 syllabus. My chatbot can answer questions related to the class structure, class schedule, class policies, and other syllabus related topics in natural language. The main purpose of my chatbot is to understand and respond to students' natural language queries regarding the CS 472 course material. By offering an interactive way for students to access information about CS 472, I hope to enhance student understanding and engagement with the material, ultimately improving their overall learning experience.

Code inspired by tutorial created by [Patrick Loeber](https://www.youtube.com/playlist?list=PLqnslRFeH2UrFW4AUgn-eY37qOAWQpJyg)

<h4 style="color:red">UNCOMMENT THE LINES BELOW IF THIS IS YOUR FIRST TIME USING THIS NOTEBOOK</h4>

In [2]:
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')

## Preprocessing Techniques

##### How can we make our textual data easier to work with?
- Remove "stop words" that do not add meaning to the sentence
- Map synonyms to a single word to reduce the number of unique features and increase the frequency of important words
- Reduce the feature space by stemming each word to its root form
- Create a representation of the sentence using techniques such as bag of words or embeddings

#### First, let's define our stop words (meaningless words we want to remove)

We can modify the list of stop words provided by NLTK to fit our needs:

*Example: what sort of words are in the stop words list?*

In [3]:
stop_words = set(stopwords.words('english'))
'do' in stop_words, 'when' in stop_words

(True, True)

First, what words do we think will help clarify the meaning of sentences that we DO NOT want in our `stop_words` set?

In [4]:
# Question words and 'more' carry intent ("when is the exam?"), so they must survive
# stop-word filtering. Set difference is a no-op for words that are already gone, which
# keeps this cell safe to re-run (set.remove() raised KeyError on a second run).
stop_words -= {'where', 'when', 'what', 'who', 'how', 'more'}

Now, what words do we think will not clarify the meaning of sentences that we DO want in our `stop_words` set?

In [5]:
# Extra filler words to filter out on top of the NLTK list.
stop_words.update(['tell', 'know', 'cs', 'cis', 'academic'])

#### Spell checking

Spelling errors can be a major source of noise in our input data, which can make it more difficult for our model to identify relevant features and patterns. For instance, our model may learn that the term "professor" is significant, but it may not recognize that "profesor" has the same meaning.

To improve the quality of our input data, we can incorporate logic to spell check words:

In [6]:
spell_check = SpellChecker(language='en', case_sensitive=False)

*Example usage:*

In [7]:
# define a sample text with some spelling errors
text = "Thiss is a sampel text with some speling erors."

# split the text into individual words
words = text.split()

words = [spell_check.correction(word) for word in words]
words

['Thiss', 'is', 'a', 'sample', 'text', 'with', 'some', 'spelling', 'errors']

#### Synonym Replacement

Let's make a list of useful synonyms to help us replace them with a single word. This will help us reduce the number of features and highlight important words.

In [114]:
synonyms = {
    'location': {'room', 'classroom', 'building', 'place', 'site', 'hall'},
    'disability': {'injury', 'disorder', 'condition', 'impairment', 'impaired',
                  'injured', 'ptsd', 'anxiety', 'dyslexia', 'adhd', 'depression',
                  'illness', 'disease', 'autism'},
    'accomadate': {'support', 'supported', 'supporting', 'supports',
             'accomadate', 'accomadates', 'accommodation', 'accomadated' 
             'assistance', 'assist', 'assisting', 'assists'},
    'more': {'added', 'additional', 'extra'},
    'zoom': {'remote'},
    'ml': {'ai', 'rnn', 'cnn', 'PCA', 'dimensionality', 'regression', 'knn', 
           'svm', 'svms', 'gans', 'gan', 'optimization', 'cnns', 'transformers',
          'vaes', 'vae', 'regularization', 'gradient', 'neural', 'network'},
    'test': {'exam', 'test'},
    'dishonesty': {'dishonesty', 'cheating', 'plagiarize', 'copying', 'copied',
                   'collusion', 'plagiarism', 'lying', 'cheat'},
    '315': {'315', 'cs315', 'cis315', 'cs415', '415', '313', 'cs313', 'cis313', 
           'cis415', 'cs314', '314', 'cis314', 'cs212', 'cis212', '212',
           'cs211', 'cis211', '211', 'cs210', 'cis210', '210',
           'cs425', 'cis425', '425', 'cs471', 'cis471', '471', 
           'cs330', 'cis330', '330'},
    'course': {'cs472', 'class', 'course', '472', 'cis472', 'curriculum', 'lecture',
              'lectures'},
    'content': {'subject', 'topic', 'concepts'},
    'cs': {'cis', 'cs'},
    'you': {'ya', 'you'},
    'time': {'monday', 'tuesday', 'wednesday', 
             'thursday', 'friday','saturday', 'sunday',
             'tomorrow', 'yesterday', 'today', 'january', 'february', 
             'march', 'april', 'may', 'june', 'july', 'august',
             'september', 'october', 'november', 'december',
             'tonight', 'afternoon', 'tonight', 'morning'},
    'software': {'pytorch', 'tensorflow', 'numpy', 'pandas', 
               'language', 'framework', 'javascript', 'c',
              'sklearn', 'scikitlearn', 'keras', 'jupyter', 'python', 'r'},
    'sick': {'ill', 'fever', 'illness', 'flu', 'covid', 'covid19', 'nausea', 'cough'
            'headache', 'diarrhea', 'congestion', 'sickness'},
    'coding': {'programming', 'code', 'program', 'develop', 'developing'},
    'group': {'team'},
    'start': ['begin', 'commence'],
    'computer': {'pc', 'mac', 'laptop'},
    'employee': {'employee', 'worker', 'staff'},
    'ta': {'ge', 'steven', 'walton'},
    'good': {'great', 'nice', 'awesome', 'cool'},
    'goodbye': {'adieu', 'farewell', 
                'bye', 'adios', 'arrivederci', 
                'auf', 'ciao', 'later', 'peace', 
                'sayonara', 'see ya', 'ttyl', 
                'wiedersehen'},
    'grad': {'graduate'},
    'hi': {'hello', 'hey', 'hiya', 'hola', 'greetings', 'yo'},
    'i': {'myself'},
    'long': {'lengthy'},
    'prerequisites': {'prereqs', 'prerequisite'},
    'professor': {'teacher', 'instructor', 'prof', 'humphrey', 'shi'},
    'homework': {'hw', 'assignment'},
    'grade': {'evaluate', 'assess', 'score'},
    'require': {'need', 'necessary', "mandatory", "compulsory"}
}

Now that we have a dictionary that we can use to map synonyms to a single word, we can create a function to perform our synonym replacement!

In [115]:
def synonymReplacer(tokens, stemmed=False):
    """
    Replace each token that is a known synonym with its canonical word.

    The list is modified in place and also returned. Matching is done on the
    lower-cased token; tokens without a match are left untouched (original casing
    included).

    Args:
        tokens: list of word strings.
        stemmed: when True, the variants in `synonyms` are stemmed with the global
            `stemmer` before matching, so already-stemmed tokens can be matched.

    Returns:
        The same list object, with matched tokens replaced.
    """
    # Build the variant -> canonical lookup once per call instead of re-scanning (and,
    # when stemmed=True, re-stemming) the whole synonym table for every token.
    # setdefault keeps the first canonical word in dictionary order, which is the same
    # word the original first-match-then-break loop selected.
    lookup = {}
    for canonical, variants in synonyms.items():
        for variant in variants:
            key = stemmer.stem(variant) if stemmed else variant
            lookup.setdefault(key, canonical)

    for i, word in enumerate(tokens):
        canonical = lookup.get(word.lower())
        if canonical is not None:
            tokens[i] = canonical
    # return the modified list of tokens
    return tokens

*Example synonym replacement:*

In [116]:
synonymReplacer(["when", "does", "class", "begin"])

['when', 'does', 'course', 'start']

#### Stemming

Stemming reduces words to their root form by removing parts of the word like prefixes and suffixes. This also helps to reduce the feature space as well as increase the frequency of similar words (like "run", "running", and "runs").

In [117]:
# initialize the porter stemmer from NLTK
stemmer = PorterStemmer()

*Example of stemming*

In [118]:
[stemmer.stem(word) for word in ["run", "running", "runs"]]

['run', 'run', 'run']

#### A Preprocessing Function

Now we can combine the preprocessing techniques described above into a single function that we can use to remove noise and irrelevant information from our data.

The `preprocess` function in the cell below takes a sentence as input, removes punctuation, tokenizes it, corrects spelling, maps synonymous words to a single word, optionally removes stop words, stems each remaining word, and returns the preprocessed sentence as a list of words.

In [119]:
def preprocess(sentence, remove_stop=True):
    """
    Turn a raw sentence into a list of normalised, stemmed tokens.

    Steps, in order: strip punctuation, tokenize, spell-correct, replace synonyms,
    optionally drop stop words (when `remove_stop` is True), stem, and run a second
    synonym pass on the stems.
    """
    # drop every punctuation character in one pass
    cleaned = sentence.translate(str.maketrans('', '', string.punctuation))

    def _spell_fix(token):
        # the spell checker returns None when it has no suggestion; keep the token then
        suggestion = spell_check.correction(token)
        return token if suggestion is None else suggestion

    corrected = [_spell_fix(token) for token in nltk.word_tokenize(cleaned)]
    # first synonym pass, on the un-stemmed tokens
    corrected = synonymReplacer(corrected)
    if remove_stop:
        corrected = [tok for tok in corrected if tok.lower() not in stop_words]
    # stem, then catch synonyms that only match in stemmed form
    stems = [stemmer.stem(tok) for tok in corrected]
    return synonymReplacer(stems, stemmed=True)

*Preprocessing Example*

What exactly does our preprocessing do?

In [120]:
example = "Can you tell me who will be leeding the letures for CS 472?"
tokens = preprocess(example)
tokens

['who', 'lead', 'course', 'course']

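First, the `preprocess` function removes all punctuation marks from the sentence using the `string.punctuation` constant. Then, the sentence is tokenized into a list of words using the `nltk.word_tokenize` function. Each token is then spell-checked, and known synonyms are replaced with a single canonical word (here "letures" is corrected to "lectures" and then mapped to "course", just like "472").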

Next, the function removes stop words, which are common words that do not carry much meaning in the sentence, such as "a", "an", "the", "of", and so on. In this case, the function is using a pre-defined list of stop words to remove them from the list of tokens.

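After that, the function performs stemming on each word in the sentence, which involves converting the words into their root or base form, called their stem. The function uses a stemmer to perform this task, and then runs a second synonym replacement pass on the stems to catch synonyms that only match in their stemmed form.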

Finally, the preprocessed words are returned as a list.

#### A bag-of-words representation of a sentence

The main goal of the `bag_of_words` function is to convert a sentence into a numerical representation that captures the presence or absence of each known word in the vocabulary of known words (we will build our vocabulary soon!).

Here is how the function works:

The bag_of_words function takes a tokenized sentence and a list of all known words in the vocabulary as input, and creates a bag of words representation for the given sentence. It initializes the bag with zeros for each word in the vocabulary, and updates the bag with 1 for each word in the sentence that exists in the vocabulary. The function returns a numpy array representing the bag of words with 1 for each known word that exists in the sentence, 0 otherwise.

In [121]:
def bag_of_words(tokenized_sentence, words):
    """
    Encode a tokenized sentence as a binary presence vector over the vocabulary.

    Position i of the returned float32 array is 1 when words[i] occurs in
    tokenized_sentence and 0 otherwise; tokens outside the vocabulary are ignored.
    """
    presence = [1 if vocab_word in tokenized_sentence else 0 for vocab_word in words]
    return np.array(presence, dtype=np.float32)

In [122]:
tokens

['who', 'lead', 'course', 'course']

In [123]:
example_all_words = ['hello', 'tell', 'what', 'ten', 'lead', 'lectur', 'cs', 'who', '472']
bag_of_words(tokens, example_all_words)

array([0., 0., 0., 0., 1., 0., 0., 1., 0.], dtype=float32)

## Loading and Preprocessing the Data

We will now load our data from the intents file and preprocess its content using the logic we defined earlier.

In [124]:
# load the data file as a Python object; JSON is UTF-8 by specification, so the
# encoding is set explicitly instead of relying on the platform default
with open('intents.json', 'r', encoding='utf-8') as file:
    intents = json.load(file)

# empty lists for storing the tokenized words, tags, and (X, y) pairs
all_words = []  # no stop words
all_words_stop = []  # stop words included
tags = []
xy_pairs = []  # no stop words
xy_pairs_stop = []  # stop words included

# loop through each intent in the data file
for intent in intents['intents']:
    # every intent contributes one tag (the class label)
    tag = intent['tag']
    tags.append(tag)
    # loop through each pattern (example sentence) in the intent
    for pattern in intent['patterns']:
        # preprocess the pattern twice: once without and once with stop words
        words = preprocess(pattern)
        words_stop = preprocess(pattern, remove_stop=False)
        # collect the tokens for the two vocabularies
        all_words.extend(words)
        all_words_stop.extend(words_stop)
        # add the (words, tag) pair to our list of (X, y) pairs
        xy_pairs.append((words, tag))
        xy_pairs_stop.append((words_stop, tag))

#### Vocabulary Sets

Now that we have a list of all words, we have a vocabulary for our dataset.

In [125]:
# remove duplicate words and sort the list
all_words = sorted(set(all_words))
all_words_stop = sorted(set(all_words_stop))
# sort the list of tags
tags = sorted(set(tags))
len(all_words), len(all_words_stop)

(649, 733)

In [126]:
tags

['GE',
 'accomadations',
 'attendance',
 'cheatingConsequences',
 'cheatingPolicy',
 'cheatingReporting',
 'covid',
 'finalDetails',
 'goodbye',
 'grading',
 'greeting',
 'lab',
 'language',
 'lateWork',
 'lectureLocation',
 'lectures',
 'midtermContent',
 'midtermDetails',
 'officeHours',
 'prerequisites',
 'professor',
 'project',
 'projectGroups',
 'textbook',
 'thanks',
 'topics',
 'zoom']

#### Create the Training and Test Set

The code in the cell below builds the dataset by converting each input sentence in the original data to a bag of words representation using the `bag_of_words` function, and then converting the intent tags to integer labels that can be used for training. The resulting data is represented as two numpy arrays, `X_data1` and `y_data1`, which are split into a training set and a test set in the following cell.

In [127]:
# Encode every (tokens, tag) pair: tokens become a bag-of-words vector, the tag becomes
# its position in the sorted `tags` list (an integer class id for CrossEntropyLoss).
X_data1, y_data1 = [], []

for pattern_sentence, tag in xy_pairs:
    bag = bag_of_words(pattern_sentence, all_words)
    label = tags.index(tag)
    X_data1.append(bag)
    y_data1.append(label)

# stack the per-sentence results into numpy arrays
X_data1, y_data1 = np.array(X_data1), np.array(y_data1)

Now we can get training and testing data:

In [128]:
# split the data into 10% test set and 90% train set
# (random_state is fixed so the split, and therefore the reported accuracy, is reproducible)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_data1, y_data1, test_size=0.1, random_state=42)

## Create The Model 

**NOTE**: The hyperparameters below were not chosen at random, but through manual testing. For instance, I tried the following hidden layer sizes: 4, 8, 16, 32, 64, 128, 256. I found that 32 produced the best results.

#### Set Hyperparameters

In [158]:
# Hyperparameters 
# NOTE(review): the saved training logs further down report 500 epochs, so they were
# produced before num_epochs was set to 1000 — re-run to refresh them.
num_epochs = 1000  # number of full passes over the training loader
batch_size = 8  # samples per mini-batch in the training loaders
learning_rate = 0.001  # learning rate passed to the Adam optimizer
input_size1 = len(X_train1[0])  # length of one bag-of-words vector
hidden_size = 32  # width of the hidden layers in BasicNN
output_size = len(tags)  # one output score per intent tag
max_seq_length = 50  # NOTE(review): not referenced by the cells visible here — confirm it is still needed
print(input_size1, output_size)

649 27


#### Model architecture
In order to effectively capture the underlying patterns in language data, a model capable of learning complex relationships is required. However, it is important to strike a balance and avoid overfitting by not making the model more complex than necessary. To start, I chose to create a simple feed forward neural network.

My first model in the cell below uses three fully connected layers, providing the capability to learn complex patterns in the input data. A ReLU activation function follows each of the first two fully connected layers, introducing non-linearity into the model and improving its ability to model complex relationships. The third fully connected layer maps the learned features to the output classes, and no activation function is used for this layer. 

#### Addressing overfitting - dropout layers
Dropout is a regularization technique that randomly drops a certain percentage of the neurons in a neural network during training. By dropping out some neurons, the remaining neurons must learn to compensate for the missing ones, which encourages them to be more independent.

Essentially, adding dropout layers to my model will help it learn more generalized features and help prevent it from learning noise in the training data.

[source](https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/)

In [130]:
class BasicNN(nn.Module):
    """
    Feed-forward intent classifier.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    The final layer has no activation, so the output is raw class scores (logits).

    Args:
        input_size: number of input features per sample.
        hidden_size: width of both hidden layers.
        num_classes: number of output classes.
    """
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # the first fully connected layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        # the second fully connected layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # the third fully connected layer (hidden features -> class scores)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        # the ReLU activation function
        self.relu = nn.ReLU()
        # randomly zero 25% of activations during training to help prevent overfitting
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes) for a (batch, input_size) input."""
        # first hidden layer, followed by dropout on its activations
        out = self.relu(self.fc1(x))
        out = self.dropout(out)
        # second hidden layer, followed by dropout on its activations
        out = self.relu(self.fc2(out))
        out = self.dropout(out)
        # output layer: no activation function
        return self.fc3(out)

    def sentence_representation(self, X):
        """
        Placeholder, not implemented yet: always returns None.

        `self` was missing from the signature, so calling this on an instance raised
        TypeError instead of reaching the stub.
        """
        # FIXME
        return None

#### Load the Data

The `ChatDataset` class is a PyTorch dataset object that is designed to be used with PyTorch's DataLoader module to retrieve training data and corresponding labels for my chatbot application. The purpose of this class is to encapsulate the training data and labels as attributes of the object and to define the methods `__getitem__()` and `__len__()` to support indexing and length operations, respectively, on the dataset object. This class is an implementation of the Dataset abstract class in PyTorch and provides a consistent interface for loading training data for use with PyTorch models.

In [131]:
class ChatDataset(Dataset):
    """
    Minimal map-style Dataset that pairs each feature row with its label so that a
    DataLoader can index, batch and shuffle them.
    """
    def __init__(self, X_train, y_train):
        """Store the features and labels and cache the number of samples."""
        self.n_samples = len(X_train)
        self.x_data, self.y_data = X_train, y_train

    def __getitem__(self, index):
        """Return the (features, label) pair stored at position `index`."""
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

    def __len__(self):
        """Return how many samples the dataset holds."""
        return self.n_samples

This code creates a `ChatDataset` object with training data and corresponding labels `X_train` and `y_train`, and then uses a PyTorch DataLoader object to iterate over the dataset during model training. The `DataLoader` is configured to retrieve data in batches of size `batch_size`, to shuffle the data before each epoch of training, and to use 0 worker processes for data loading.

In [132]:
# Wrap the bag-of-words training split and serve it in shuffled mini-batches.
# drop_last discards a final partial batch; num_workers=0 loads in the main process.
training_dataset1 = ChatDataset(X_train1, y_train1)
train_loader1 = DataLoader(
    training_dataset1,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

## Train The Model

In [133]:
def trainer(model, criterion, optimizer, num_epochs, train_loader, bert=False):
    """
    Train `model` in place and print the loss of the last batch every 100 epochs.

    Relies on the notebook globals `device` (where training runs) and `cpu` (where the
    model is moved afterwards), plus `tokenizer` when bert=True.

    Args:
        model: the network to train; its parameters must be the ones in `optimizer`.
        criterion: loss taking (model output, integer labels).
        optimizer: optimizer over the model's parameters.
        num_epochs: number of passes over `train_loader`.
        train_loader: iterable of (inputs, labels) batches.
        bert: when True, the inputs are raw sentences that are tokenized here and the
            model is called as model(input_ids, attention_mask).
    """
    # move loss function and model to the training device
    criterion = criterion.to(device)
    model = model.to(device)
    # make sure dropout is active even if the model was switched to eval mode earlier
    model.train()

    loss = None  # stays None when the loader yields no batches
    for epoch in range(num_epochs): # loop over the specified number of epochs
        for (words, labels) in train_loader: # iterate over training data in batches
            labels = labels.to(dtype=torch.long).to(device)
            # forward pass
            if bert:
                # tokenize the batch of sentences (previously this read the undefined
                # name `inputs` and called the global `bert_classifier`, not `model`)
                encoded = tokenizer.batch_encode_plus(list(words), max_length=50,
                                                      padding='max_length', return_tensors='pt', truncation=True)
                input_ids = encoded['input_ids'].to(device)
                attention_mask = encoded['attention_mask'].to(device)
                output = model(input_ids, attention_mask)
            else:
                words = words.to(device) # load batch onto the training device
                output = model(words) # make predictions for given inputs

            loss = criterion(output, labels) # compare predictions to actual labels

            # Backward and optimize
            optimizer.zero_grad() # reset gradients
            loss.backward() # compute gradients using backpropagation
            optimizer.step() # update the model weights

        if loss is not None and (epoch+1) % 100 == 0: # print loss every 100 epochs
            print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # print the final loss after training is complete
    if loss is not None:
        print(f'final loss: {loss.item():.4f}')
    else:
        print('final loss: n/a (no batches were processed)')

    # move back to cpu to free up gpu memory
    criterion.to(cpu)
    model.to(cpu)

First, let's see whether the computer has a GPU available for use with PyTorch:

In [134]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

Now let's instantiate our model, loss function, and optimizer:

In [135]:
model1 = BasicNN(input_size1, hidden_size, output_size)
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=learning_rate)

Now we can train our model:

In [136]:
trainer(model1, criterion, optimizer, num_epochs, train_loader1)

Epoch [100/500], Loss: 0.0121
Epoch [200/500], Loss: 0.0033
Epoch [300/500], Loss: 0.0083
Epoch [400/500], Loss: 0.0004
Epoch [500/500], Loss: 0.0490
final loss: 0.0490


## Evaluate the Model

In [137]:
# Evaluation must see every held-out sample, so drop_last is False: with drop_last=True
# a final partial batch would be silently discarded as soon as a batch size is set.
testing_dataset1 = ChatDataset(X_test1, y_test1)
test_loader1 = DataLoader(dataset=testing_dataset1,
                          shuffle=False,
                          num_workers=0,
                          drop_last=False)

In [138]:
def evaluator(model, test_loader, bert=False):
    """
    Print the classification accuracy of `model` over `test_loader`.

    Relies on the notebook global `device`, plus `tokenizer` when bert=True. The model
    is switched to eval mode and is left on `device` afterwards.

    Args:
        model: the trained network.
        test_loader: iterable of (inputs, targets) batches.
        bert: when True, the inputs are raw sentences that are tokenized here and the
            model is called as model(input_ids, attention_mask).
    """
    # evaluation mode disables dropout
    model.eval()
    model = model.to(device)
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            if bert:
                # use the model that was passed in (previously the global
                # `bert_classifier` was called regardless of `model`)
                encoded = tokenizer.batch_encode_plus(list(inputs), max_length=50,
                                                      padding='max_length', return_tensors='pt', truncation=True)
                input_ids = encoded['input_ids'].to(device)
                attention_mask = encoded['attention_mask'].to(device)
                targets = targets.long().to(device)
                outputs = model(input_ids, attention_mask)
            else:
                # move the inputs and targets to the device
                inputs = inputs.to(device)
                targets = targets.to(device)
                # forward pass
                outputs = model(inputs)

            # the predicted class is the index of the highest score
            _, predictions = torch.max(outputs, 1)

            # update the total number of correct predictions and total number of samples
            total_correct += (predictions == targets).sum().item()
            total_samples += targets.size(0)

    # print the accuracy (an empty loader used to raise ZeroDivisionError here)
    if total_samples == 0:
        print('Accuracy: n/a (no samples were evaluated)')
        return
    print(f'Accuracy: {total_correct/total_samples*100:.2f}%')

In [139]:
evaluator(model1, test_loader1)

Accuracy: 91.60%


## Add an Embedding Layer to the Model

#### What are the downsides of a bag-of-words representation of a sentence?

The primary downside of the bag-of-words representation is that it assumes that each word occurs independently of all other words, which of course is unrealistic. Thus, the following issues arise:
- The bag-of-words representation cannot capture relationships between words
- The bag-of-words representation cannot handle words outside of its vocabulary

[Source](https://medium.com/swlh/word-embeddings-versus-bag-of-words-the-curious-case-of-recommender-systems-6ac1604d4424)

#### How does an embedding representation help to address these issues?

The embedding representation is capable of considering the context of a sentence a word is in. This can lead to improved performance as...
- Embedding representations allow for better generalization as they do not only rely on words appearing that they already know, but can recognize similar semantic patterns, too
- This means it can map words outside of its vocabulary by mapping these words to other words in the sentence

### Include Stop Words?

I am curious whether my network with an embedding layer will perform better with stop words included or excluded, so I will train both variants and compare their accuracies. 

#### A slightly modified preprocessing strategy

In [140]:
max_length_with_stopwords = 35
max_length_no_stopwords = 15

We first want to create a mapping between words in our vocabulary and their corresponding integer indices. 

NOTE: I also add a special null word to the mapping dictionary, which is represented by the string "123NULLWORD123". This token is used to pad sequences to a fixed length.

In [141]:
# with stop words: ids follow the sorted vocabulary order, and the null (padding)
# word takes the next free id
mapping1 = dict(zip(all_words_stop, range(len(all_words_stop))))
mapping1["123NULLWORD123"] = len(all_words_stop)
input_size2 = len(mapping1)

# without stopwords: same scheme over the stop-word-free vocabulary
mapping2 = dict(zip(all_words, range(len(all_words))))
mapping2["123NULLWORD123"] = len(all_words)
input_size3 = len(mapping2)
input_size3

650

Next, let's make a new function `word_mapper`, to assist with our preprocessing. `word_mapper` takes a sentence, the `mapping` dictionary, the size of the mapping (`input_size`), and a maximum sequence length as inputs. First, `word_mapper` maps each word in the sentence to its corresponding index in the `mapping` dictionary. Then, `word_mapper` pads the resulting list with the index of the null word (`input_size - 1`, the last index in the mapping) if its length is less than `max_length`, or truncates it if it is longer. Finally, the resulting list of indices is returned, representing the input sentence encoded as a sequence of integers with padding. We can use this encoded sequence as input to an embedding layer in our new neural network!

In [142]:
def word_mapper(sentence, word_to_id, input_size, max_length=15):
    """
    Encode a tokenized sentence as a fixed-length list of word ids.

    Args:
        sentence: iterable of word strings.
        word_to_id: dictionary mapping each known word to its integer id.
        input_size: size of the mapping; input_size - 1 is used as the padding id.
        max_length: length of the returned list.

    Returns:
        A list of exactly max_length ids: the sentence is truncated when it is longer
        and right-padded with the padding id when it is shorter.
    """
    pad_id = input_size - 1
    # Words outside the vocabulary get the padding id. They used to get id 1, which
    # belongs to a real vocabulary word and made every unknown word look like it.
    mapped = [word_to_id.get(word, pad_id) for word in sentence]

    # truncate, then pad up to max_length (multiplying by a negative count gives [])
    mapped = mapped[:max_length]
    mapped.extend([pad_id] * (max_length - len(mapped)))

    return mapped

#### Picking an embedding size

It is tempting to pick an embedding size equal to the max length of the sentence. After all, could we not capture the underlying context of a word in a sentence using a vector of the length of the sentence and mark where it falls?

In [143]:
embedding_size = 50

#### Build our dataset like usual...

In [144]:
# with stop words: each sentence becomes a fixed-length list of word ids and each tag
# becomes its index in `tags` (an integer class id for CrossEntropyLoss)
X_data2, y_data2 = [], []

for pattern_sentence, tag in xy_pairs_stop:
    mapped = word_mapper(pattern_sentence, mapping1, input_size2,
                         max_length=max_length_with_stopwords)
    label = tags.index(tag)
    X_data2.append(mapped)
    y_data2.append(label)

# stack into numpy arrays
X_data2, y_data2 = np.array(X_data2), np.array(y_data2)

# ==============================================================================
# with NO stop words: same encoding over the stop-word-free sentences and vocabulary
X_data3, y_data3 = [], []

for pattern_sentence, tag in xy_pairs:
    mapped = word_mapper(pattern_sentence, mapping2, input_size3,
                         max_length=max_length_no_stopwords)
    label = tags.index(tag)
    X_data3.append(mapped)
    y_data3.append(label)

# stack into numpy arrays
X_data3, y_data3 = np.array(X_data3), np.array(y_data3)

#### Training our model

In [145]:
# Both variants use the same fixed random_state so the splits are reproducible and
# the two models are compared on a consistent train/test partition.

# split the data with stop words into 10% test set and 90% train set
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_data2, y_data2, test_size=0.1, random_state=42)

# split the data without stop words into 10% test set and 90% train set
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_data3, y_data3, test_size=0.1, random_state=42)

In [146]:
# Both loaders serve shuffled mini-batches and drop a final partial batch.

# stop words included
training_dataset2 = ChatDataset(X_train2, y_train2)
train_loader2 = DataLoader(
    training_dataset2,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# stop words excluded
training_dataset3 = ChatDataset(X_train3, y_train3)
train_loader3 = DataLoader(
    training_dataset3,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

In [147]:
class EmbeddingNN(nn.Module):
    """Intent classifier: a learned embedding layer followed by a two-layer MLP.

    Every input sentence is a fixed-length sequence of ``max_length`` word ids.
    The ids are embedded, the embeddings are concatenated into one flat vector,
    and that vector is passed through Linear -> ReLU -> Dropout -> Linear.
    The output is raw class scores (logits), suitable for ``nn.CrossEntropyLoss``.

    Args:
        vocab_size: number of rows in the embedding table (ids must be < vocab_size).
        embedding_dim: size of each word vector.
        num_classes: number of output classes.
        max_length: number of word ids per sentence.
        hidden_dim: width of the hidden fully connected layer (default 256).
        dropout_p: dropout probability applied after the hidden layer (default 0.85).
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, max_length,
                 hidden_dim=256, dropout_p=0.85):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.max_length = max_length
        # trainable, randomly initialised embedding table (no pretrained vectors are loaded);
        # sized from the constructor argument rather than a notebook-level global
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        # the first fully connected layer takes the concatenated word vectors
        self.fc1 = nn.Linear(max_length * embedding_dim, hidden_dim)
        # the second fully connected layer produces one score per class
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        # the ReLU activation function
        self.relu = nn.ReLU()
        # zero out hidden activations during training to help prevent overfitting
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes).

        ``x`` holds word ids, either (batch, max_length) or a single sentence of
        shape (max_length,), which is treated as a batch of one.
        """
        # nn.Embedding requires integer (long) indices
        x = x.long()
        # look up a vector for every word id
        out = self.embedding(x)
        # flatten each sentence's word vectors into one row
        out = out.reshape(-1, self.max_length * self.embedding_dim)
        # hidden layer with ReLU activation
        out = self.relu(self.fc1(out))
        # dropout regularization on the hidden activations (active only in train mode)
        out = self.dropout(out)
        # output layer: raw logits, no activation (CrossEntropyLoss applies log-softmax)
        out = self.fc2(out)
        return out

#### Train and evaluate model with stop words *INCLUDED* in input sentence:

In [148]:
# embedding classifier sized for the vocabulary / sentence length that keep stop words
model2 = EmbeddingNN(vocab_size=input_size2,
                     embedding_dim=embedding_size,
                     num_classes=output_size,
                     max_length=max_length_with_stopwords)
# cross-entropy loss over the intent classes, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model2.parameters(), lr=learning_rate)

In [149]:
# train the stop-words-included model on train_loader2 for num_epochs epochs
# (trainer is defined earlier in the notebook; judging by the recorded output it
# logs the loss every 100 epochs and the final loss)
trainer(model2, criterion, optimizer, num_epochs, train_loader2)

Epoch [100/500], Loss: 1.1301
Epoch [200/500], Loss: 1.1499
Epoch [300/500], Loss: 0.4882
Epoch [400/500], Loss: 0.1465
Epoch [500/500], Loss: 0.0002
final loss: 0.0002


In [150]:
# held-out data for the stop-words-included model; no batch_size is given, so the
# loader uses DataLoader's default batch size, in the original order
testing_dataset2 = ChatDataset(X_test2, y_test2)
test_loader2 = DataLoader(testing_dataset2, shuffle=False, num_workers=0, drop_last=True)

In [151]:
# report model2's accuracy on the held-out stop-words-included test set
# (evaluator is defined earlier in the notebook)
evaluator(model2, test_loader2)

Accuracy: 80.67%


#### Train and evaluate model with stop words *EXCLUDED* from input sentence:

In [159]:
# embedding classifier sized for the vocabulary / sentence length without stop words
model3 = EmbeddingNN(vocab_size=input_size3,
                     embedding_dim=embedding_size,
                     num_classes=output_size,
                     max_length=max_length_no_stopwords)
# cross-entropy loss over the intent classes, optimised with Adam
# (these rebind the `criterion` / `optimizer` names used for model2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model3.parameters(), lr=learning_rate)

In [160]:
# train the stop-words-excluded model on train_loader3 for num_epochs epochs
# NOTE(review): the recorded output below shows 1000 epochs, while the model2 run
# above shows 500, so num_epochs was changed between the two runs. Re-run both with
# the same value before comparing their accuracies.
trainer(model3, criterion, optimizer, num_epochs, train_loader3)

Epoch [100/1000], Loss: 0.0209
Epoch [200/1000], Loss: 0.0029
Epoch [300/1000], Loss: 0.1280
Epoch [400/1000], Loss: 0.4169
Epoch [500/1000], Loss: 0.0000
Epoch [600/1000], Loss: 0.0006
Epoch [700/1000], Loss: 0.0170
Epoch [800/1000], Loss: 0.0009
Epoch [900/1000], Loss: 0.0000
Epoch [1000/1000], Loss: 0.1469
final loss: 0.1469


In [161]:
# held-out data for the stop-words-excluded model; no batch_size is given, so the
# loader uses DataLoader's default batch size, in the original order
testing_dataset3 = ChatDataset(X_test3, y_test3)
test_loader3 = DataLoader(testing_dataset3, shuffle=False, num_workers=0, drop_last=True)

In [162]:
# report model3's accuracy on the held-out stop-words-excluded test set
# (evaluator is defined earlier in the notebook)
evaluator(model3, test_loader3)

Accuracy: 84.03%


#### Conclusion

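- Including stop words hurts model accuracy significantly: in the runs above, the embedding model reached 80.67% test accuracy with stop words included versus 84.03% with them excluded.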
- The neural network with an embedding layer was highly accurate, but not more accurate than the simple feed-forward neural net. My guess as to why is that the sentences are simple enough and similar enough that an embedding layer does not add much value.

### Transformers

FIXME

### Let's Chat!

In the code below, the user is prompted for input until they type 'quit' to exit. Any other input is preprocessed, converted to a bag of words (or, when `embedding=True`, to a sequence of word ids), and fed into the model for prediction. If the model predicts an intent with a high probability (above 0.9), a response is chosen at random from that intent's set of predefined responses. If the model predicts with a lower probability, the chatbot responds with an "I do not understand..." message. Thus, the chatbot will (in theory) only respond to questions it can understand!

In [163]:
def chat(model, embedding=False, confidence_threshold=0.9):
    """Run an interactive chat loop on stdin/stdout until the user types 'quit'.

    Each user sentence is preprocessed, encoded, and classified by ``model``. If the
    softmax probability of the predicted intent exceeds ``confidence_threshold``, a
    random response for that intent is printed; otherwise a fallback message is shown.

    Args:
        model: trained intent classifier; it is called on the encoded sentence and
            must return one row of class scores for it.
        embedding: if True, encode the sentence as word ids with ``word_mapper``
            (mapping2 / input_size3 / max_length_no_stopwords); otherwise encode it
            as a bag-of-words vector over ``all_words``.
        confidence_threshold: minimum predicted-class probability required to answer
            (default 0.9).

    Relies on notebook-level names defined in earlier cells: ``preprocess``,
    ``word_mapper``, ``bag_of_words``, ``mapping2``, ``input_size3``,
    ``max_length_no_stopwords``, ``all_words``, ``tags``, ``intents`` and ``cpu``.
    """
    # run on the CPU and set the model to evaluation mode (disables dropout)
    model.to(cpu)
    model.eval()

    bot_name = "Eliot"
    print("Let's chat! (type 'quit' to exit)")
    while True:
        sentence = input("You: ")
        # accept the exit command regardless of case or surrounding whitespace
        if sentence.strip().lower() == "quit":
            break

        sentence = preprocess(sentence)
        if embedding:
            X = word_mapper(sentence,
                            mapping2,
                            input_size3,
                            max_length=max_length_no_stopwords
                           )
            X = np.array(X)
            X = torch.from_numpy(X)
        else:
            X = bag_of_words(sentence, all_words)
            # add a leading batch dimension of size 1
            X = X.reshape(1, X.shape[0])
            X = torch.from_numpy(X)

        # prediction only: no gradients need to be tracked
        with torch.no_grad():
            output = model(X)
            _, predicted = torch.max(output, dim=1)
            probs = torch.softmax(output, dim=1)

        tag = tags[predicted.item()]
        # probability the model assigns to its predicted intent
        prob = probs[0][predicted.item()]

        if prob.item() > confidence_threshold:
            for intent in intents['intents']:
                if tag == intent["tag"]:
                    print(f"{bot_name}: {random.choice(intent['responses'])}")
        else:
            print(f"{bot_name}: I'm sorry, but I do not understand. Could you try being more specific?")

In [164]:
# chat using model1 with bag-of-words input (embedding=False);
# model1 is defined earlier in the notebook
chat(model1, embedding=False)

Let's chat! (type 'quit' to exit)
You: hi
Eliot: Hi there, what can I do for you?
You: can i submit work late?
Eliot: I do not know if there is a penalty for submitting work late. I'm Sorry.
You: what should i do if i suspect my group member of cheating?
Eliot: Cases of dishonesty will be handled according to university policy. Students are responsible for taking reasonable precautions to protect their work, and turning in someone else's code is considered collusion and will result in a failing grade for all involved parties.
You: when is class?
Eliot: Two lectures are delivered in person every week on Tuesdays and Thursdays, from 4:00 PM to 5:20 PM
You: when is the final exam?
Eliot: CS 472 does not have a final exam
You: do i have to go to class?
Eliot: It is highly recommended that you attend lecture. However, it is not required.
You: i feel ill. what should i do?
Eliot: Hi there, how can I help?
You: quit


# TODO