In [54]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import string
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from collections import Counter

from torchtext.vocab import GloVe
import torchtext.vocab 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
cpu = torch.device('cpu')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Eliot's Interactive Syllabus Chatbot

Code inspired by tutorial created by [Patrick Loeber](https://www.youtube.com/playlist?list=PLqnslRFeH2UrFW4AUgn-eY37qOAWQpJyg)

### Preparing the text

The code in the cell below defines two functions to preprocess text data and create a bag of words representation for a given sentence.

The preprocess function takes a sentence as input, removes punctuation and stop words, stems each word in the sentence, and returns the preprocessed sentence as a list of words.

The bag_of_words function takes a tokenized sentence and a list of all known words in the vocabulary as input, and creates a bag of words representation for the given sentence. It initializes the bag with zeros for each word in the vocabulary, and updates the bag with 1 for each word in the sentence that exists in the vocabulary. The function returns a numpy array representing the bag of words with 1 for each known word that exists in the sentence, 0 otherwise.

The techniques in used in these functions are commonly used in natural language processing tasks.

NOTE: the stop words list provided by NLTK includes words that I don't think are stop words, and it also doesn't include some words that I do think are stop words for the purposes of this project. Thus, I start by modifying the stop words:

In [55]:
stop_words = set(stopwords.words('english'))
'do' in stop_words

True

In [56]:
stop_words = set(stopwords.words('english'))
stop_words.remove('where')
stop_words.remove('when')
stop_words.remove('what')
stop_words.remove('who')
stop_words.remove('how')
stop_words.remove('more')
stop_words.add('tell')
stop_words.add('know')
stop_words.add('cs')
stop_words.add('cis')
stop_words.add('academic')

Next, let's create a dictionary of meaningful synonyms. By replacing all synonyms with a single word, we can reduce the number of features in our feature space while increasing the frequency of important words.

In [57]:
synonyms = {
    'location': {'room', 'classroom', 'building', 'place', 'site', 'hall'},
    'disability': {'injury', 'disorder', 'condition', 'impairment', 'impaired',
                  'injured', 'ptsd', 'anxiety', 'dyslexia', 'adhd', 'depression',
                  'illness', 'disease', 'autism'},
    'accomadate': {'support', 'supported', 'supporting', 'supports',
             'accomadate', 'accomadates', 'accommodation', 'accomadated' 
             'assistance', 'assist', 'assisting', 'assists'},
    'more': {'added', 'additional', 'extra'},
    'zoom': {'remote'},
    'ml': {'ai', 'rnn', 'cnn', 'PCA', 'dimensionality', 'regression', 'knn', 
           'svm', 'svms', 'gans', 'gan', 'optimization', 'cnns', 'transformers',
          'vaes', 'vae', 'regularization', 'gradient', 'neural', 'network'},
    'test': {'exam', 'test', 'midterm'},
    'dishonesty': {'dishonesty', 'cheating', 'plagiarize', 'copying', 'copied',
                   'collusion', 'plagiarism', 'lying', 'cheat'},
    '315': {'315', 'cs315', 'cis315', 'cs415', '415', '313', 'cs313', 'cis313', 
           'cis415', 'cs314', '314', 'cis314', 'cs212', 'cis212', '212',
           'cs211', 'cis211', '211', 'cs210', 'cis210', '210',
           'cs425', 'cis425', '425', 'cs471', 'cis471', '471', 
           'cs330', 'cis330', '330'},
    'course': ['cs472', 'class', 'course', '472', 'cis472', 'curriculum', 'lecture',
              'lectures'],
    'cs': {'cis', 'cs'},
    'you': {'ya', 'you'},
    'time': {'monday', 'tuesday', 'wednesday', 
             'thursday', 'friday','saturday', 'sunday',
             'tomorrow', 'yesterday', 'today', 'january', 'february', 
             'march', 'april', 'may', 'june', 'july', 'august',
             'september', 'october', 'november', 'december',
             'tonight', 'afternoon', 'tonight', 'morning'},
    'software': {'pytorch', 'tensorflow', 'numpy', 'pandas', 
               'language', 'framework', 'javascript', 'c',
              'sklearn', 'scikitlearn', 'keras', 'jupyter', 'python'},
    'coding': {'programming', 'code', 'program', 'develop', 'developing'},
    'start': ['begin', 'commence'],
    'computer': {'pc', 'mac', 'laptop'},
    'employee': {'employee', 'worker', 'staff'},
    'ta': {'ge', 'steven', 'walton'},
    'good': {'great', 'nice', 'awesome', 'cool'},
    'goodbye': {'adieu', 'farewell', 
                'bye', 'adios', 'arrivederci', 
                'auf', 'ciao', 'later', 'peace', 
                'sayonara', 'see ya', 'ttyl', 
                'wiedersehen'},
    'grad': {'graduate'},
    'hi': {'hello', 'hey', 'hiya', 'hola', 'greetings', 'yo'},
    'i': {'myself'},
    'long': {'lengthy'},
    'prerequisites': {'prereqs', 'prerequisite'},
    'professor': {'teacher', 'instructor', 'prof', 'humphrey', 'shi'}
}

def synonymReplacer(tokens, stemmed=False):
    # loop through each token in the list
    for i, word in enumerate(tokens):
        # convert the word to lowercase
        word = word.lower()
        # loop through each synonym list in the synonyms dictionary
        for syn, syn_list in synonyms.items():
            # check if the current word is in the synonym list
            if stemmed:
                syn_list = [stemmer.stem(w) for w in syn_list]
            if word in syn_list:
                # replace the current token with the synonym
                tokens[i] = syn
                # break out of the loop since we've found a synonym
                break
    # return the modified list of tokens
    return tokens

In [58]:
synonymReplacer(["when", "does", "class", "begin"])

['when', 'does', 'course', 'start']

In [59]:
# initialize the Porter Stemmer
stemmer = PorterStemmer()

def preprocess(sentence):
    """
    This function takes a sentence as input and performs various text preprocessing steps on it,
    including removing punctuation, stop words, and stemming each word in the sentence.
    """
    # remove punctuation from sentence
    sentence = ''.join(
        char for char in sentence if char not in string.punctuation
    )
    # tokenizing the sentence
    tokens = nltk.word_tokenize(sentence)
    # replace synonyms
    tokens = synonymReplacer(tokens)
    # removing stop words
    tokens = [token for token in tokens if token.lower() not in stop_words]
    # stemming each word in the sentence
    stemmed_words = [stemmer.stem(word) for word in tokens]
    stemmed_words = synonymReplacer(stemmed_words, stemmed=True)
    # return the preprocessed sentence as a list of words
    return stemmed_words



def bag_of_words(tokenized_sentence, words):
    """
    Create a bag of words representation for a given tokenized sentence.
    """
    # initialize the bag with zeros for each word in the vocabulary
    bag = np.zeros(len(words), dtype=np.float32)

    # update the bag with 1 for each word in the sentence that exists in the vocabulary
    for idx, w in enumerate(words):
        if w in tokenized_sentence: 
            bag[idx] = 1
    return bag

#### Preprocessing Example

What exactly does our preprocessing do?

In [60]:
example = "Can you tell me who will be leading the lectures for CS 472?"
tokens = preprocess(example)
tokens

['who', 'lead', 'course', 'course']

First, the `preprocess` function removes all punctuation marks from the sentence using the `string.punctuation` module. Then, the sentence is tokenized into a list of words using the nltk.word_tokenize method.

Next, the function removes stop words, which are common words that do not carry much meaning in the sentence, such as "a", "an", "the", "of", and so on. In this case, the function is using a pre-defined list of stop words to remove them from the list of tokens.

After that, the function performs stemming on each word in the sentence, which involves converting the words into their root or base form, called their stem. The function uses a stemmer to perform this task.

Finally, the preprocessed words are returned as a list.

In [9]:
example_all_words = ['hello', 'tell', 'what', 'ten', 'lead', 'lectur', 'cs', 'who', '472']
bag_of_words(tokens, example_all_words)

array([0., 0., 0., 0., 1., 0., 0., 1., 0.], dtype=float32)

The purpose of the code in the below cell is to read in the data file, tokenize the sentences into individual words, and create a list of (X, y) pairs, where X is a list of tokenized words and y is the associated intent tag. This is a common preprocessing step in Natural Language Processing (NLP) where the goal is to classify user input into one of several predefined categories. By tokenizing the input patterns and creating (X, y) pairs, the data can be transformed into a format that is more suitable for use in machine learning algorithms.

In [10]:
# load the data file as a Python object
with open('intents.json', 'r') as file:
    intents = json.load(file)

# empty lists and dictionary for storing the tokenized words, tags, and (X, y) pairs
all_words = []
tags = []
xy_pairs = []
bert_pairs = []
word_counts = {}

# loop through each intent in the data file
for intent in intents['intents']:
    # get the tag
    tag = intent['tag']
    # add the tag to our list of tags
    tags.append(tag)
    # loop through each pattern (sentence) in the intent
    for pattern in intent['patterns']:
        # tokenize the pattern into a list of words
        words = preprocess(pattern)
        # loop through each word in the tokenized pattern
        for word in words:
            # add the word to our list of all words
            all_words.append(word)
            # add the word to our word_counts dictionary and increment its count
            if word not in word_counts:
                word_counts[word] = 1
            else:
                word_counts[word] += 1
        # add the (words, tag) pair to our list of (X, y) pairs
        xy_pairs.append((words, tag))
        bert_pairs.append((pattern, tag))

The purpose of the code is to preprocess a list of words and associated intent tags for use in Natural Language Processing (NLP) tasks such as intent classification. The code first stems and lowercases each word in the list, ignoring any words in a specified ignore list. It then removes duplicate words from the list and sorts the resulting list. The code also sorts the list of intent tags. The resulting processed data can be used as input to machine learning algorithms for tasks such as training a model to classify user input into one of several predefined categories.

In [11]:
# remove duplicate words and sort the list
all_words = sorted(set(all_words))

# sort the list of tags
tags = sorted(set(tags))

### Create training data

The code in the cell below creates the training data by converting each input sentence in the original data to a bag of words representation using the bag_of_words function, and then converting the intent tags to integer labels that can be used for training. The resulting training data is represented as two numpy arrays, X_train and y_train.

In [68]:
X_data1 = []
y_data1 = []

# for each (X, y) pair in the data
for (pattern_sentence, tag) in xy_pairs:
    # create a bag of words for the pattern sentence
    bag = bag_of_words(pattern_sentence, all_words)
    # append the bag of words to the X data
    X_data1.append(bag)
    # convert the tag to a label that can be used with PyTorch CrossEntropyLoss
    label = tags.index(tag)
    # append the label to the y data
    y_data1.append(label)

# convert X and y to numpy arrays
X_data1 = np.array(X_data1)
y_data1 = np.array(y_data1)

Now we can get training and testing data:

In [69]:
# split the data into 10% test set and 90% train set
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_data1, y_data1, test_size=0.1, random_state=42)

### Create model to classify text 

#### Set Hyperparameters

In [70]:
# Hyperparameters 
num_epochs = 500
batch_size = 8
learning_rate = 0.001
input_size = len(X_train[0])
hidden_size = 32
output_size = len(tags)
max_seq_length = 50
print(input_size, output_size)

503 16


In order to effectively capture the underlying patterns in language data, a model capable of learning complex relationships is required. However, it is important to strike a balance and avoid overfitting by not making the model more complex than necessary. To start with, a simple feed forward neural network was chosen.

The model presented in the cell below uses three fully connected layers, providing the capability to learn complex patterns in the input data. The ReLU activation function between the first and second fully connected layers introduces non-linearity into the model, improving its ability to model complex relationships. The third fully connected layer maps the learned features to the output classes, and no activation function is used for this layer. 

In [71]:
class BasicNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BasicNN, self).__init__()
        # the first fully connected layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        # the second fully connected layer
        self.fc2 = nn.Linear(hidden_size, hidden_size) 
        # the third fully connected layer
        self.fc3 = nn.Linear(hidden_size, num_classes)
        # the ReLU activation function
        self.relu = nn.ReLU()
        # zero out certain values to help prevent overfitting
        self.dropout = nn.Dropout(0.25)
    
    def forward(self, x):
        # apply the ReLU activation function to the first fully connected layer
        out = self.relu(self.fc1(x))  # apply relu to the 1st fully connected layer output
        # apply dropout regularization to prevent overfitting on the 2nd fully connected layer
        out = self.dropout(out)
        # apply the ReLU activation function to the second fully connected layer
        out = self.relu(self.fc2(out))  # apply relu to the 2nd fully connected layer output
        # apply dropout regularization to prevent overfitting on the 3rd fully connected layer
        out = self.dropout(out)
        # apply the third fully connected layer without any activation function
        out = self.fc3(out)  # apply the last fully connected layer
        return out  # return the final output of the network

### Load the Data

The `ChatDataset` class is a PyTorch dataset object that is designed to be used with PyTorch's DataLoader module to retrieve training data and corresponding labels for my chatbot application. The purpose of this class is to encapsulate the training data and labels as attributes of the object and to define the methods `__getitem__()` and `__len__()` to support indexing and length operations, respectively, on the dataset object. This class is an implementation of the Dataset abstract class in PyTorch and provides a consistent interface for loading training data for use with PyTorch models.

In [33]:
class ChatDataset(Dataset):
    """
    A PyTorch dataset object that is designed to be used with DataLoader to retrieve
    training data and labels.
    """
    def __init__(self, X_train, y_train):
        """
        Initializes the ChatDataset object.
        """
        self.n_samples = len(X_train)
        self.x_data = X_train
        self.y_data = y_train

    def __getitem__(self, index):
        """
        Returns the specified training sample.
        """
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """
        Returns the number of training samples.
        """
        return self.n_samples

This code creates a `ChatDataset` object with training data and corresponding labels `X_train` and `y_train`, and then uses a PyTorch DataLoader object to iterate over the dataset during model training. The `DataLoader` is configured to retrieve data in batches of size `batch_size`, to shuffle the data before each epoch of training, and to use 0 worker processes for data loading.

In [97]:
training_dataset1 = ChatDataset(X_train1, y_train1)
train_loader1 = DataLoader(dataset=training_dataset1,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0,
                          drop_last=True)

### Train The Model

In [79]:
def trainer(model, criterion, optimizer, num_epochs, train_loader, bert=False):
    # Move the loss function to GPU
    criterion = criterion.to(device)
    model = model.to(device)
    for epoch in range(num_epochs): # loop over the specified number of epochs
        for (words, labels) in train_loader: # iterate over training data in batches
            # forward pass
            if bert==True:
                inputs = tokenizer.batch_encode_plus(inputs, max_length=50, 
                                                     pad_to_max_length=True, return_tensors='pt', truncation=True)
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)
                labels = labels.long()
                labels = labels.to(device)

                output = bert_classifier(input_ids, attention_mask)
            else:
                labels = labels.to(dtype=torch.long).to(device) # load batch onto GPU
                words = words.to(device) # load batch onto GPU
                output = model(words) # make predictions for given inputs

            loss = criterion(output, labels) # compare predictions to actual labels

            # Backward and optimize
            optimizer.zero_grad() # reset gradients
            loss.backward() # compute gradients using backpropagation
            optimizer.step() # update the model weights

        if (epoch+1) % 100 == 0: # print loss every 100 epochs
            print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    print(f'final loss: {loss.item():.4f}') # print the final loss after training is complete
    criterion = criterion.to(cpu)
    model = model.to(cpu)

First, let's see whether the computer has an GPU available for use with PyTorch:

In [80]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

Now let's instantiate our model, loss function, and optimizer:

In [81]:
model = BasicNN(input_size, hidden_size, output_size)
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Now we can train our model:

In [82]:
trainer(model, criterion, optimizer, num_epochs, train_loader)

Epoch [100/500], Loss: 0.3349
Epoch [200/500], Loss: 0.0000
Epoch [300/500], Loss: 0.0000
Epoch [400/500], Loss: 0.0107
Epoch [500/500], Loss: 0.0001
final loss: 0.0001


### Evaluation

In [83]:
testing_dataset = ChatDataset(X_test, y_test)
test_loader = DataLoader(dataset=testing_dataset,
                          shuffle=False,
                          num_workers=0,
                          drop_last=True)

In [84]:
def evaluator(model, test_loader, bert=False):
    # evaluation loop
    model.eval()
    model = model.to(device)
    with torch.no_grad():
        total_correct = 0
        total_samples = 0
        for inputs, targets in test_loader:
            if bert==True:
                inputs = tokenizer.batch_encode_plus(inputs, max_length=50, 
                                                     pad_to_max_length=True, return_tensors='pt', truncation=True)
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)
                targets = targets.long()
                targets = targets.to(device)

                outputs = bert_classifier(input_ids, attention_mask)
            else:
                # move the inputs and targets to the device
                inputs = inputs.to(device)
                targets = targets.to(device)
                # forward pass
                outputs = model(inputs)
                
            _, predictions = torch.max(outputs, 1)

            # update the total number of correct predictions and total number of samples
            total_correct += (predictions == targets).sum().item()
            total_samples += targets.size(0)

        # print the accuracy
        print(f'Accuracy: {total_correct/total_samples*100:.2f}%')

In [85]:
evaluator(model, test_loader)

Accuracy: 90.59%


## Add an embedding layer

In [120]:
mapping = {word: i+1 for i, word in enumerate(all_words)}

In [121]:
def word_mapper(sentence, word_to_id, max_length=15):
    # map each word to its ID
    mapped = [word_to_id.get(word, 1) for word in sentence]

    # pad the sequence with zeros up to max_length
    if len(mapped) < max_length:
        mapped = mapped + [0] * (max_length - len(mapped))
    else:
        mapped = mapped[:max_length]

    return mapped

In [140]:
X_data2 = []
y_data2 = []

# for each (X, y) pair in the data
for (pattern_sentence, tag) in xy_pairs:
    # append sentence where each word is mapped to an id
    mapped = word_mapper(pattern_sentence, mapping)
    X_data2.append(mapped)
    # convert the tag to a label that can be used with PyTorch CrossEntropyLoss
    label = tags.index(tag)
    # append the label to the y data
    y_data2.append(label)

# convert X and y to numpy arrays
X_data2 = np.array(X_data2)
y_data1 = np.array(y_data2)
X_data2[0]

array([488,  88, 125,  97,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0])

In [141]:
# split the data into 10% test set and 90% train set
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_data2, y_data2, test_size=0.1, random_state=42)

In [142]:
training_dataset2 = ChatDataset(X_train2, y_train2)
train_loader2 = DataLoader(dataset=training_dataset2,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0,
                          drop_last=True)

In [151]:
class EmbeddingNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_classes):
        super(EmbeddingNN, self).__init__()
        # load pretrained GloVe embeddings
        self.embedding = nn.Embedding(input_size, embedding_dim=embedding_size)
        # the first fully connected layer
        self.fc1 = nn.Linear(input_size * embedding_size, hidden_size)
        # the second fully connected layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # the third fully connected layer
        self.fc3 = nn.Linear(hidden_size, num_classes)
        # the ReLU activation function
        self.relu = nn.ReLU()
        # zero out certain values to help prevent overfitting
        self.dropout = nn.Dropout(0.25)
    
    def forward(self, x):
        x = x.long()
        # pass input through the embedding layer
        out = self.embedding(x)
        print(out.size())
        # flatten the output to feed it to the first fully connected layer
        out = out.view(out.size(0), -1)
        print(out.size())
        # apply the ReLU activation function to the first fully connected layer
        out = self.relu(self.fc1(out))
        # apply dropout regularization to prevent overfitting on the second fully connected layer
        out = self.dropout(out)
        # apply the ReLU activation function to the second fully connected layer
        out = self.relu(self.fc2(out))
        # apply dropout regularization to prevent overfitting on the third fully connected layer
        out = self.dropout(out)
        # apply the third fully connected layer without any activation function
        out = self.fc3(out)
        return out

In [152]:
embedding_size = 15

In [155]:
model = EmbeddingNN(input_size, embedding_size, hidden_size, output_size)
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [156]:
trainer(model, criterion, optimizer, num_epochs, train_loader2)

torch.Size([8, 15, 15])
torch.Size([8, 225])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x225 and 7545x32)

### Not bad! Now let's use a more sophisticated model!

In [35]:
from transformers import BertModel, BertTokenizer

In [36]:
X_bert = []
y_bert = []

# for each (X, y) pair in the data
for (pattern_sentence, tag) in bert_pairs:
    X_bert.append(pattern_sentence)
    # convert the tag to a label that can be used with PyTorch CrossEntropyLoss
    label = tags.index(tag)
    # append the label to the y data
    y_bert.append(label)

# convert X and y to numpy arrays
X_bert = np.array(X_bert, dtype=object)
y_bert = np.array(y_bert)

In [37]:
# split the data into 10% test set and 90% train set
X_bert_train, X_bert_test, y_bert_train, y_bert_test = train_test_split(X_bert, y_bert, test_size=0.1, random_state=42)

In [38]:
bert_training_dataset = ChatDataset(X_bert_train, y_bert_train)
bert_train_loader = DataLoader(dataset=bert_training_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0,
                          drop_last=True)

In [39]:
bert_testing_dataset = ChatDataset(X_bert_test, y_bert_test)
bert_test_loader = DataLoader(dataset=bert_testing_dataset,
                          shuffle=False,
                          num_workers=0,
                          drop_last=True)

In [43]:
# Step 1: Load the pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
# Step 2: Add a classification layer on top of the pre-trained BERT model
class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        _, pooled_output = out
        pooled_output = self.dropout(pooled_output)
        linear_output = self.linear(pooled_output)
        return self.softmax(linear_output)

In [48]:
bert_classifier = BertClassifier(bert_model, output_size).to(device)
loss_fn = nn.CrossEntropyLoss()
loss_fn.to(device)
optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=learning_rate)

Epoch [100/500], Loss: 2.7496
Epoch [200/500], Loss: 2.6246
Epoch [300/500], Loss: 2.8746
Epoch [400/500], Loss: 2.6246
Epoch [500/500], Loss: 2.7496


In [49]:
bert_path = 'bert.pth'
torch.save(bert_classifier.state_dict(), bert_path)

In [65]:
evaluator(bert_classifier, bert_test_loader, bert=True)

Accuracy: 14.12%


### Let's Chat!

In the below code, the user is prompted for input until they type 'quit' to exit. Assuming the user did not type quit, their input is preprocessed, converted to a bag of words, and fed into the model for prediction. If the model predicts an intent with a high probability, a response is generated based on that intent from a set of predefined responses. If the model predicts with low probability, the chatbot responds with an "I do not understand..." message. Thus, the chatbot will (in theory) only respond to questions it can understand!

In [156]:
# set the model to evaluation mode
model.eval()

bot_name = "Eliot"
print("Let's chat! (type 'quit' to exit)")
while True:
    sentence = input("You: ")
    if sentence.lower() == "quit":
        break

    sentence = preprocess(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)

    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    
    if prob.item() > 0.9:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                print(f"{bot_name}: {random.choice(intent['responses'])}")
    else:
        print(f"{bot_name}: I'm sorry, but I do not understand. Could you try being more specific?")

Let's chat! (type 'quit' to exit)
You: who is the professor for this course?
Eliot: The professor of CS 472 is Humphrey Shi


KeyboardInterrupt: Interrupted by user

# TODO
- add spell checking to preprocessing