In [4]:
from models import *
import torch
import torchtext
from torchtext import data
import numpy as np

import spacy

# --- Configuration -------------------------------------------------------------
# Optimisation settings. In the cells shown here only `batch_size` is referenced
# later (when the iterators are built).
learning_rate, batch_size, epochs = 0.005, 4, 30

# Model dimensions; both are handed to the RNN constructor further down.
embedding_dim, hidden_dim = 300, 75

# Fix PyTorch's global RNG so repeated runs start from the same state.
seed = 0
torch.manual_seed(seed)

def tokenizer(text):
    """Split ``text`` into a list of token strings using spaCy's English pipeline.

    The pipeline is loaded on the first call only and cached on the function
    object; reloading it for every email (as the previous version did) repeats
    an expensive disk load for no benefit.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with the ``.text`` of every token spaCy produces, in order.
    """
    nlp = getattr(tokenizer, "_nlp", None)
    if nlp is None:
        # First call: load once and remember the pipeline for later calls.
        nlp = spacy.load('en')
        tokenizer._nlp = nlp
    return [tok.text for tok in nlp(text)]

def result(prediction):
    """Translate a model output into its human-readable email category.

    Args:
        prediction: A 2-D tensor of per-class scores for a single example
            (one row); the winning class is taken along dimension 1.

    Returns:
        The category name as a string. Indices 0-3 map to "Academics",
        "Alerts", "Personal" and "Professional"; any other index falls back
        to "Promotions and Events", as before.
    """
    labels = ("Academics", "Alerts", "Personal", "Professional")

    # torch.max over dim 1 returns (values, indices); only the index matters.
    _, predicted = torch.max(prediction, 1)

    # Pull the index out as a plain Python int instead of relying on the
    # truthiness of one-element tensor comparisons.
    class_index = predicted.item()

    if 0 <= class_index < len(labels):
        return labels[class_index]
    return "Promotions and Events"
    

In [None]:
# ------------------------------------------------------------------------------------
# Rebuild the text pipeline and vocabulary, then restore the trained RNN.

# Two Field objects: TEXT tokenizes with spaCy, lowercases, and also yields
# sequence lengths; LABELS holds already-numeric class ids (no vocab needed).
TEXT = data.Field(sequential=True, lower=True, tokenize='spacy', include_lengths=True)
LABELS = data.Field(sequential=False, use_vocab=False)

# Load the train, validation, and test TSV files as datasets
train_data, val_data, test_data = data.TabularDataset.splits(
    path='data/', train='train.tsv',
    validation='validation.tsv', test='test.tsv', format='tsv',
    skip_header=True, fields=[('text', TEXT), ('label', LABELS)])

# Batched iterators over the three splits (bucketed by text length).
# NOTE(review): these are not referenced by the cells shown here.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_sizes=(batch_size, batch_size, batch_size),
    sort_key=lambda x: len(x.text), device=None,
    sort_within_batch=True, repeat=False)

# Build the vocabulary (token <-> index) from every sentence in all three
# splits and attach the pretrained fastText "simple" 300-d vectors.
# NOTE(review): presumably the checkpoint below was trained with this exact
# vocabulary; verify before changing which splits are passed here.
TEXT.build_vocab(train_data, val_data, test_data, vectors='fasttext.simple.300d')

vocab = TEXT.vocab

# Load model. map_location keeps the weights on the CPU so a checkpoint that
# was saved from a GPU still loads on a CPU-only machine; the inputs built in
# the classification loop are CPU tensors as well.
rnn = RNN(embedding_dim, vocab, hidden_dim)
rnn.load_state_dict(torch.load('saved_model/model_rnn_dict.pt',
                               map_location=torch.device('cpu')))
rnn.eval()


# ------------------------------------------------------------------------------------
# Interactive loop: read an email from the user, encode it with the training
# vocabulary, run the RNN and print the predicted category.
print("Welcome to the ClutterCutter!\nPlease enter the email you want to classify.")
print("To exit the loop, enter 'exit' as the input")

while True:
    # Get email
    email = input("\nEnter email text here:\n")
    if email.strip() == "exit":
        break

    # Tokenize with spaCy, then lowercase every token. TEXT was created with
    # lower=True, so the vocabulary holds lowercase tokens only; without this
    # step any capitalised word would be looked up as an unknown token.
    tokens = [tok.lower() for tok in tokenizer(email)]

    # Nothing to classify: a zero-length sequence cannot be fed to the model.
    if not tokens:
        print("\nNo text entered; please type an email or 'exit'.")
        continue

    # Convert each string token to its integer index in the vocabulary
    token_ints = [vocab.stoi[tok] for tok in tokens]

    # Column tensor of token ids, shape [sentence_len, 1] (a batch of one)
    token_tensor = torch.LongTensor(token_ints).view(-1, 1)

    # Sequence length, shape [1]. Integer dtype matches what torchtext's
    # include_lengths=True produces for batches.
    lengths = torch.LongTensor([len(token_ints)])

    #------------------------------------- MODEL ------------------------------------------
    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        rnn_output = rnn(token_tensor, lengths)
    rnn_result = result(rnn_output)
    print("\nModel rnn predicts that this email is:", rnn_result)



Welcome to the ClutterCutter!
Please enter the email you want to classify.
To exit the loop, enter 'exit' as the input
