In [81]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import string
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Porter stemmer instance (the preprocessing cell further down creates it again)
stemmer = PorterStemmer()
# Fetch the NLTK resources used by this notebook; each call is a quick no-op
# when the package is already present locally (see the log output below).
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download('wordnet')  # WordNet data (lemmatizer is imported; not used in the cells shown here)
nltk.download('omw-1.4')  # Open Multilingual WordNet data that accompanies 'wordnet'
nltk.download('punkt')  # tokenizer models — presumably what nltk.word_tokenize relies on

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Eliot's Interactive Syllabus Chatbot

Code inspired by tutorial created by [Patrick Loeber](https://www.youtube.com/playlist?list=PLqnslRFeH2UrFW4AUgn-eY37qOAWQpJyg)

### Preparing the text

The code in the cell below defines two functions to preprocess text data and create a bag of words representation for a given sentence.

The preprocess function takes a sentence as input, removes punctuation and stop words, stems each word in the sentence, and returns the preprocessed sentence as a list of words.

The bag_of_words function takes a tokenized sentence and a list of all known words in the vocabulary as input, and creates a bag of words representation for the given sentence. It initializes the bag with zeros for each word in the vocabulary, and updates the bag with 1 for each word in the sentence that exists in the vocabulary. The function returns a numpy array representing the bag of words with 1 for each known word that exists in the sentence, 0 otherwise.

The techniques used in these functions are common in natural language processing tasks.

NOTE: the stop words list provided by NLTK includes words that I don't think are stop words, and it also doesn't include some words that I do think are stop words for the purposes of this project. Thus, I start by modifying the stop words:

In [82]:
# Start from NLTK's English stop word list, then tailor it to this project.
stop_words = set(stopwords.words('english'))

# Question words carry the intent of a query, so they must survive filtering.
# `remove` (not `discard`) is used on purpose: it fails loudly if NLTK's list
# ever stops containing one of these words.
for question_word in ('where', 'when', 'what', 'who'):
    stop_words.remove(question_word)

# Extra words treated as noise for this project.
stop_words.update(('tell', 'know', 'cs', 'cis', 'academic'))

Next, let's create a dictionary of meaningful synonyms. By replacing all synonyms with a single word, we can reduce the number of features in our feature space while increasing the frequency of important words.

In [83]:
# Canonical word -> set of surface forms that collapse into it.
# Every entry must be lowercase: synonymReplacer lowercases each token before
# the membership test, so an entry containing uppercase letters can never match.
synonyms = {
    'disability': {'injury', 'disorder', 'condition', 'impairment', 'impaired',
                   'injured'},
    'zoom': {'remote'},
    'ml': {'ai', 'rnn', 'cnn', 'pca', 'dimensionality', 'regression', 'knn',
           'svm', 'svms', 'gans', 'gan', 'optimization', 'cnns', 'transformers',
           'vaes', 'vae', 'regularization', 'gradient', 'neural', 'network'},
    'test': {'exam', 'test', 'midterm'},
    'dishonesty': {'dishonesty', 'cheating', 'plagiarize', 'copying', 'copied',
                   'collusion', 'plagiarism', 'lying', 'cheat'},
    '315': {'315', 'cs315', 'cis315'},
    'course': {'cs472', 'class', 'course', '472'},
    'cs': {'cis', 'cs'},
    'you': {'ya', 'you'},
    'weekday': {'monday', 'tuesday', 'wednesday',
                'thursday', 'friday', 'saturday', 'sunday'},
    'month': {'january', 'february', 'march', 'april',
              'may', 'june', 'july', 'august',
              'september', 'october', 'november', 'december'},
    'software': {'pytorch', 'tensorflow', 'numpy', 'pandas',
                 'language', 'framework', 'javascript', 'c',
                 'sklearn', 'scikitlearn', 'keras', 'jupyter', 'python'},
    'coding': {'programming', 'code'},
    'start': {'start', 'begin', 'commence'},
    'computer': {'computer', 'pc', 'mac', 'laptop'},
    'employee': {'employee', 'worker', 'staff'},
    'ta': {'ge', 'ta'},
    # NOTE: 'see ya' only matches when a caller passes the whole phrase as a
    # single token; a lone "ya" is mapped to 'you' by the entry above.
    'goodbye': {'adieu', 'goodbye', 'farewell',
                'bye', 'adios', 'arrivederci',
                'auf', 'ciao', 'later', 'peace',
                'sayonara', 'see ya', 'ttyl',
                'wiedersehen'},
    'grad': {'grad', 'graduate'},
    'hi': {'hello', 'hi', 'hey', 'hiya', 'hola', 'greetings', 'yo'},
    'i': {'i', 'myself'},
    'long': {'long', 'lengthy'},
    'prerequisites': {'prereqs', 'prerequisites'},
    'professor': {'professor', 'teacher', 'instructor', 'prof'}
}


def synonymReplacer(tokens):
    """
    Replace every token that has a known synonym group with the group's
    canonical word.

    Matching is case-insensitive (each token is lowercased for the lookup).
    Tokens without a match are left exactly as they were, including their
    original casing. When a token could belong to several groups, the first
    group in the dictionary's insertion order wins.

    Parameters:
    tokens: list of str
        Tokens to normalise. The list is modified in place.

    Returns:
    The same list object that was passed in, after replacement.
    """
    for i, token in enumerate(tokens):
        # compare in lowercase so "Class" and "class" are treated alike
        lowered = token.lower()
        for canonical, variants in synonyms.items():
            if lowered in variants:
                tokens[i] = canonical
                # first matching group wins; no need to inspect the rest
                break
    return tokens

In [84]:
# quick check: "class" and "begin" each belong to a synonym group and should be replaced
synonymReplacer(["when", "does", "class", "begin"])

['when', 'does', 'course', 'start']

In [85]:
# one shared Porter stemmer, reused by every call to preprocess()
stemmer = PorterStemmer()

def preprocess(sentence):
    """
    Turn a raw sentence into the list of normalised tokens used as features.

    Steps, in order: delete every character listed in string.punctuation,
    tokenize with nltk.word_tokenize, collapse synonyms via synonymReplacer,
    drop tokens whose lowercase form is in stop_words, and stem what is left.

    Parameters:
    sentence: str
        The text to preprocess.

    Returns:
    A list with one stemmed string per surviving token.
    """
    # deleting characters through a translation table is equivalent to
    # filtering them out one by one
    without_punctuation = sentence.translate(
        str.maketrans('', '', string.punctuation)
    )
    normalised = synonymReplacer(nltk.word_tokenize(without_punctuation))
    # stop words are filtered first; only the remaining tokens are stemmed
    return [
        stemmer.stem(token)
        for token in normalised
        if token.lower() not in stop_words
    ]



def bag_of_words(tokenized_sentence, words):
    """
    Encode a tokenized sentence as a binary vector over a vocabulary.

    Parameters:
    tokenized_sentence: list of str
        Tokens of the sentence to encode.
    words: list of str
        The vocabulary; position i of the result corresponds to words[i].

    Returns:
    A float32 numpy array of length len(words) holding 1 where the vocabulary
    word occurs in the sentence and 0 elsewhere. Repeated words still count
    as 1.
    """
    return np.array(
        [1.0 if vocab_word in tokenized_sentence else 0.0 for vocab_word in words],
        dtype=np.float32,
    )

#### Preprocessing Example

What exactly does our preprocessing do?

In [86]:
# run the whole preprocessing pipeline on a sample question and display the resulting tokens
example = "Can you tell me who will be leading the lectures for CS 472?"
tokens = preprocess(example)
tokens

['who', 'lead', 'lectur', 'cours']

First, the `preprocess` function removes all punctuation marks from the sentence by dropping every character that appears in the `string.punctuation` constant. Then, the sentence is tokenized into a list of words using the `nltk.word_tokenize` function.

Next, the function replaces known synonyms with a single canonical word (for example, "472" becomes "course"), and then removes stop words, which are common words that do not carry much meaning in the sentence, such as "a", "an", "the", "of", and so on. In this case, the function is using the customized stop word list defined earlier to remove them from the list of tokens.

After that, the function performs stemming on each word in the sentence, which involves converting the words into their root or base form, called their stem. The function uses a stemmer to perform this task.

Finally, the preprocessed words are returned as a list.

In [87]:
# small hand-written vocabulary; the result marks which of these words occur in `tokens`
example_all_words = ['hello', 'tell', 'what', 'ten', 'lead', 'lectur', 'cs', 'who', '472']
bag_of_words(tokens, example_all_words)

array([0., 0., 0., 0., 1., 1., 0., 1., 0.], dtype=float32)

The purpose of the code in the below cell is to read in the data file, tokenize the sentences into individual words, and create a list of (X, y) pairs, where X is a list of tokenized words and y is the associated intent tag. This is a common preprocessing step in Natural Language Processing (NLP) where the goal is to classify user input into one of several predefined categories. By tokenizing the input patterns and creating (X, y) pairs, the data can be transformed into a format that is more suitable for use in machine learning algorithms.

In [88]:
# Load the data file as a Python object. The encoding is pinned to UTF-8 (the
# encoding JSON is specified to use) instead of relying on the platform
# default, which on Windows is typically a legacy code page and would garble
# or reject non-ASCII characters in the patterns and responses.
with open('intents.json', 'r', encoding='utf-8') as file:
    intents = json.load(file)

# empty lists for storing the tokenized words, tags, and (X, y) pairs
all_words = []
tags = []
xy_pairs = []

# loop through each intent in the data file
for intent in intents['intents']:
    # get the tag
    tag = intent['tag']
    # add the tag to our list of tags (duplicates are removed in a later cell)
    tags.append(tag)
    # loop through each pattern (sentence) in the intent
    for pattern in intent['patterns']:
        # preprocess the pattern into a list of stemmed tokens
        words = preprocess(pattern)
        # add the tokens to our list of all words (the raw vocabulary)
        all_words.extend(words)
        # add the (words, tag) pair to our list of (X, y) pairs
        xy_pairs.append((words, tag))

The purpose of the code below is to finalize the vocabulary and the list of intent tags for use in Natural Language Processing (NLP) tasks such as intent classification. The words were already stemmed and filtered for stop words by `preprocess` in the previous cell, so all that remains is to remove duplicate words from the list and sort the result. The code also de-duplicates and sorts the list of intent tags. The resulting processed data can be used as input to machine learning algorithms for tasks such as training a model to classify user input into one of several predefined categories.

In [89]:
# collapse the vocabulary to its unique words, then order it alphabetically
all_words = list(set(all_words))
all_words.sort()

# same treatment for the intent tags, so each tag gets a stable position
tags = list(set(tags))
tags.sort()

### Create training data

The code in the cell below creates the training data by converting each input sentence in the original data to a bag of words representation using the bag_of_words function, and then converting the intent tags to integer labels that can be used for training. The resulting training data is represented as two numpy arrays, X_train and y_train.

In [90]:
# one bag-of-words vector per training pattern, stacked into a 2-D array
X_train = np.array(
    [bag_of_words(pattern_tokens, all_words) for pattern_tokens, _ in xy_pairs]
)

# each tag becomes its position in `tags`, an integer class label usable with
# PyTorch's CrossEntropyLoss
y_train = np.array([tags.index(pattern_tag) for _, pattern_tag in xy_pairs])

### Create model to classify text 

In order to effectively capture the underlying patterns in language data, a model capable of learning complex relationships is required. However, it is important to strike a balance and avoid overfitting by not making the model more complex than necessary. To start with, a simple feed forward neural network was chosen.

The model presented in the cell below uses three fully connected layers, providing the capability to learn complex patterns in the input data. A ReLU activation function is applied after the first and after the second fully connected layer, which introduces non-linearity into the model and improves its ability to model complex relationships. The third fully connected layer maps the learned features to the output classes, and no activation function is used for this layer. 

In [91]:
class NeuralNet(nn.Module):
    """
    Feed-forward classifier: three linear layers with a ReLU after each of
    the first two. The last layer returns raw, un-normalised class scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """
        Parameters:
        input_size: int
            Number of features in one input vector.
        hidden_size: int
            Width of both hidden layers.
        num_classes: int
            Number of output scores (one per class).
        """
        super().__init__()
        # input -> hidden
        self.fc1 = nn.Linear(input_size, hidden_size)
        # hidden -> hidden
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # hidden -> class scores
        self.fc3 = nn.Linear(hidden_size, num_classes)
        # a single ReLU module shared by both hidden layers
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the class scores for the batch `x`; no softmax is applied."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

### Set Hyperparameters

In [92]:
# Hyperparameters
num_epochs = 500  # number of full passes over the training data in the training loop
batch_size = 8  # samples per mini-batch handed to the DataLoader
learning_rate = 0.001  # step size given to the Adam optimizer
input_size = len(X_train[0])  # length of one bag-of-words vector, i.e. the vocabulary size
hidden_size = 16  # width of both hidden layers of NeuralNet
output_size = len(tags)  # one output score per intent tag
print(input_size, output_size)

418 16


### Load the Data

The `ChatDataset` class is a PyTorch dataset object that is designed to be used with PyTorch's DataLoader module to retrieve training data and corresponding labels for my chatbot application. The purpose of this class is to encapsulate the training data and labels as attributes of the object and to define the methods `__getitem__()` and `__len__()` to support indexing and length operations, respectively, on the dataset object. This class is an implementation of the Dataset abstract class in PyTorch and provides a consistent interface for loading training data for use with PyTorch models.

In [93]:
class ChatDataset(Dataset):
    """
    Map-style PyTorch dataset that pairs each feature vector with its label,
    so a DataLoader can batch and shuffle the training data.

    Attributes:
    n_samples: int
        Number of training samples.
    x_data: numpy array, shape (n_samples, n_features)
        Training data.
    y_data: numpy array, shape (n_samples,)
        Integer class label of each sample.
    """

    def __init__(self, X_train, y_train):
        """
        Store the training data; nothing is copied or converted.

        Parameters:
        X_train: numpy array, shape (n_samples, n_features)
            Training data.
        y_train: numpy array, shape (n_samples,)
            Integer class label of each sample.
        """
        self.x_data = X_train
        self.y_data = y_train
        self.n_samples = len(X_train)

    def __getitem__(self, index):
        """
        Fetch one training sample.

        Parameters:
        index: int
            Position of the sample to retrieve.

        Returns:
        A (features, label) tuple for the requested position.
        """
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

    def __len__(self):
        """Return the number of training samples as an integer."""
        return self.n_samples

This code creates a `ChatDataset` object with training data and corresponding labels `X_train` and `y_train`, and then uses a PyTorch DataLoader object to iterate over the dataset during model training. The `DataLoader` is configured to retrieve data in batches of size `batch_size`, to shuffle the data before each epoch of training, to use 0 worker processes for data loading (everything is loaded in the main process), and to drop the final incomplete batch of each epoch (`drop_last=True`).

In [94]:
# wrap the numpy training arrays so the DataLoader can index them
dataset = ChatDataset(X_train, y_train)
# shuffle=True reshuffles the samples every epoch; num_workers=0 loads data in
# the main process; drop_last=True discards the final batch of an epoch when it
# holds fewer than batch_size samples
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0,
                          drop_last=True)

### Train The Model

First, let's see whether the computer has a GPU available for use with PyTorch:

In [95]:
# use the CUDA GPU when PyTorch reports one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Now let's instantiate our model, loss function, and optimizer:

In [96]:
# build the network and move its parameters to the selected device
model = NeuralNet(input_size, hidden_size, output_size).to(device)

# loss and optimizer
# CrossEntropyLoss takes the model's raw scores together with integer class labels
criterion = nn.CrossEntropyLoss()
# Adam updates every model parameter, using the configured learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Now we can train our model:

In [97]:
for epoch in range(num_epochs): # loop over the specified number of epochs
    for (words, labels) in train_loader: # iterate over training data in batches
        words = words.to(device) # move the batch to the selected device (GPU or CPU)
        labels = labels.to(dtype=torch.long).to(device) # cast labels to int64, then move them to the device

        # forward pass
        outputs = model(words) # make predictions for given inputs
        loss = criterion(outputs, labels) # compare predictions to actual labels

        # Backward and optimize
        optimizer.zero_grad() # reset gradients left over from the previous step
        loss.backward() # compute gradients using backpropagation
        optimizer.step() # update the model weights

    # NOTE: `loss` here is the loss of the LAST mini-batch of the epoch, not an
    # average over the epoch
    if (epoch+1) % 100 == 0: # print loss every 100 epochs
        print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print(f'final loss: {loss.item():.4f}') # loss of the very last mini-batch processed

Epoch [100/500], Loss: 0.0001
Epoch [200/500], Loss: 0.0000
Epoch [300/500], Loss: 0.0000
Epoch [400/500], Loss: 0.0000
Epoch [500/500], Loss: 0.0000
final loss: 0.0000


### Let's Chat!

In the below code, the user is prompted for input until they type 'quit' to exit. Assuming the user did not type quit, their input is preprocessed, converted to a bag of words, and fed into the model for prediction. If the model predicts an intent with a high probability, a response is generated based on that intent from a set of predefined responses. If the model predicts with low probability, the chatbot responds with an "I do not understand..." message. Thus, the chatbot will (in theory) only respond to questions it can understand!

In [98]:
# set the model to evaluation mode
model.eval()

bot_name = "Eliot"
# minimum softmax probability the top intent must reach before the bot answers
confidence_threshold = 0.95

print("Let's chat! (type 'quit' to exit)")
while True:
    sentence = input("You: ")
    # ignore surrounding whitespace so " quit " also ends the chat
    if sentence.strip().lower() == "quit":
        break

    sentence = preprocess(sentence)
    X = bag_of_words(sentence, all_words)

    # An all-zero bag means none of the user's words is in the vocabulary. The
    # network's output would then depend on its biases only, so a confident
    # prediction would be meaningless: answer "not understood" right away.
    if not X.any():
        print(f"{bot_name}: I do not understand...")
        continue

    # the model expects a batch dimension: (n_features,) -> (1, n_features)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    # inference only: no gradients are needed, so skip autograd bookkeeping
    with torch.no_grad():
        output = model(X)
        _, predicted = torch.max(output, dim=1)
        probs = torch.softmax(output, dim=1)
        prob = probs[0][predicted.item()]

    tag = tags[predicted.item()]

    if prob.item() > confidence_threshold:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
                # one reply per user message, even if a tag were listed twice
                break
    else:
        print(f"{bot_name}: I do not understand...")

Let's chat! (type 'quit' to exit)
You: does the course have accomadations for people with disabilities?
Eliot: The goal of the class is to be inclusive and accessible to all students, regardless of any disabilities. If a student experiences barriers to full participation or fair evaluation due to a disability, they should inform the instructor so that accommodations can be made. It is also important to inform the instructor early on in the quarter to ensure that appropriate accommodations can be provided. If there are any other obstacles to full participation, students are encouraged to share their concerns with the instructor.
You: where are lectures located?
Eliot: Lectures are located in room 166 in Lawrence hall
You: can you share the zoom link?
Eliot: Live lectures will often be viewable by joining the zoom meeting via the following link: https://uoregon.zoom.us/j/91353372386?pwd=MnBMVUh4VC9lU3A3UlJQUmRLY0RjZz09
You: when are lectures over?
Eliot: Two lectures are delivered in per

# TODO
- add spell checking to preprocessing