In [9]:
from os import listdir
from os.path import isfile, join

# Reading all of the negative and positive reviews into a tuple list
neg_path = 'aclImdb/train/neg'
pos_path = 'aclImdb/train/pos'
negative_files = [f for f in listdir(neg_path) if isfile(join(neg_path, f))]
postive_files = [f for f in listdir(pos_path) if isfile(join(pos_path, f))]

data = []
for neg_file in negative_files:
    with open(neg_path + '/' + neg_file, 'r') as f:
        review = f.read()
        data.append( (0,review) )
num_neg_reviews = len(data)
print('There are {} negative reviews.'.format(num_neg_reviews))
for pos_file in postive_files:
    with open(pos_path + '/' + pos_file, 'r') as f:
        review = f.read()
        data.append( (1,review) )
print('There are {} positive reviews.'.format(len(data) - num_neg_reviews))
print('Loaded {} examples'.format(str(len(data))))
print()
print(data[0])
print()
print(data[len(data)-1])        

There are 12500 negative reviews.
There are 12500 positive reviews.
Loaded 25000 examples

(0, "Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.")

(1, "Enchanted April is a tone poem, an impressionist painting, a masterpiece of conveying a message with few words. It has been one of my 10 favorite films since it came out. I continue to wait, albeit less patiently, for the film to come out in DVD format. Apparently, I am not alone.<br /><br />If parent company Amazon's listings are correct, there are many people who want this title in DVD format. Many people want to go to Italy with this cast and this script. Many people want to keep a permanent copy of this film in their libraries. The cast is spectacular, the cinematography and direction impeccable. The film is a definite keeper. Many h

In [10]:
import random
# Avoid any ordering bias by randomly shuffling the list
print([target[0] for target in data[0:10]])
random.Random(4).shuffle(data)
print([target[0] for target in data[0:10]])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 1, 1, 0, 0, 1, 1, 1]


In [11]:
# Create a validation set from the training data
train_split = 20000

train_data = data[:train_split]
train_x = [data[1] for data in train_data]
train_y = [data[0] for data in train_data]

val_data = data[train_split:]
val_x = [data[1] for data in val_data]
val_y = [data[0] for data in val_data]

print('{} training samples'.format(len(train_data)))
print([target for target in train_y[0:10]])
print('{} validation samples'.format(len(val_data)))
print([target for target in val_y[0:10]])
print(len(val_y))

20000 training samples
[0, 0, 1, 1, 1, 0, 0, 1, 1, 1]
5000 validation samples
[0, 0, 0, 1, 1, 1, 1, 0, 0, 0]
5000


In [12]:
# Assess performance on the test set
# Reading all of the negative and positive reviews into a tuple list
neg_path = 'aclImdb/test/neg'
pos_path = 'aclImdb/test/pos'
negative_files = [f for f in listdir(neg_path) if isfile(join(neg_path, f))]
postive_files = [f for f in listdir(pos_path) if isfile(join(pos_path, f))]

test_data = []
for neg_file in negative_files:
    with open(neg_path + '/' + neg_file, 'r') as f:
        review = f.read()
        test_data.append( (0,review) )
num_neg_reviews = len(test_data)
print('There are {} negative test reviews.'.format(num_neg_reviews))
for pos_file in postive_files:
    with open(pos_path + '/' + pos_file, 'r') as f:
        review = f.read()
        test_data.append( (1,review) )
print('There are {} positive test reviews.'.format(len(test_data) - num_neg_reviews))
print('Loaded {} examples'.format(str(len(test_data))))
print()
print(test_data[0])
print()
print(test_data[len(test_data)-1])   

There are 12500 negative test reviews.
There are 12500 positive test reviews.
Loaded 25000 examples

(0, "Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook.")

(1, 'I saw this movie on TV and loved it! I am a real disaster film fan, and this one was great. The cast was made of some really interesting people. Connie Selleca is always great. And William Devane is in a league of his own. He can play bo

In [13]:
test_x = [data_t[1] for data_t in test_data]
test_y = [data_t[0] for data_t in test_data]

print('{} test samples'.format(len(test_data)))
print([target for target in test_y[0:10]])

25000 test samples
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [14]:
# Tokenize the reviews
import nltk
import os
import pickle as pkl
import spacy
import string
import time

from collections import Counter
from tqdm import tqdm_notebook

class NGramBuilder(object):
    """
    Class which enables hyperparameter searching over the word tokenization process
    """
    # save index 0 for unk and 1 for pad
    PAD_IDX = 0
    UNK_IDX = 1
    def __init__(self, max_vocab_size, n_gram_size, all_permutations=True):
        """
        all_permutations: if True then that means if you picked an n-gram size of 3, then you would have
        all the 1-gram and 2-gram combos included as well.
        """
        self.tokenizer = spacy.load('en_core_web_sm')
        self.punctuations = string.punctuation
        self.max_vocab_size = max_vocab_size
        self.n_gram_size = n_gram_size
        self.all_permutations = all_permutations
        self.vocabulary_tokens = []
        self.id2token = None
        self.token2id = None
    
    def _lower_case_remove_punc(self, parsed):
        return [token.text.lower() for token in parsed if (token.text not in self.punctuations)]

    def tokenize_dataset(self, dataset, training=False):
        token_dataset = []
        start_time = time.time()
        for sample in tqdm_notebook(self.tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=4)):
            tokens = self._lower_case_remove_punc(sample)
            all_tokens = []
            n_grams = nltk.ngrams(tokens, self.n_gram_size)
            all_tokens = [' '.join(grams) for grams in n_grams]
            token_dataset.append(all_tokens)
            if training:
                self.vocabulary_tokens += all_tokens
        print("--- {} seconds ---".format(time.time() - start_time))
        return token_dataset
    
    def build_vocab(self):
        # Returns:
        # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
        # token2id: dictionary where keys represent tokens and corresponding values represent indices
        token_counter = Counter(self.vocabulary_tokens)
        vocab, count = zip(*token_counter.most_common(self.max_vocab_size))
        self.id2token = list(vocab)
        self.token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
        self.id2token = ['<pad>', '<unk>'] + self.id2token
        self.token2id['<pad>'] = self.PAD_IDX 
        self.token2id['<unk>'] = self.UNK_IDX
        self.vocabulary_tokens = None
    
    def token2index_dataset(self, tokens_data):
        indices_data = []
        for tokens in tokens_data:
            index_list = [self.token2id[token] if token in self.token2id else self.UNK_IDX for token in tokens]
            indices_data.append(index_list)
        return indices_data
    
    def get_indexed_dataset_from_training_text_vector(self, text_vector, overwrite=True):
        data_tokens = None
        training_pickle_path = "train_data_tokens_{}_{}.p".format(self.n_gram_size, self.max_vocab_size)
        id2token_path = "id2token_{}_{}.p".format(self.n_gram_size, self.max_vocab_size)
        token2id_path = "token2id_{}_{}.p".format(self.n_gram_size, self.max_vocab_size)

        train_data_tokens = None
        if os.path.isfile(training_pickle_path) and os.path.isfile(id2token_path) and not overwrite:
            print("Loading existing training token pickle file and vocabulary.")
            train_data_tokens = pkl.load(open(training_pickle_path, "rb"))
            self.id2token = pkl.load(open(id2token_path, "rb"))
            self.token2id = pkl.load(open(token2id_path, "rb"))
        elif not overwrite:
            raise ValueError('File not found for training')
        else:
            train_data_tokens = self.tokenize_dataset(text_vector, training=True)
#             pkl.dump(train_data_tokens, open(training_pickle_path, "wb"))
            self.build_vocab()
#             pkl.dump(self.id2token, open(id2token_path, "wb"))
#             pkl.dump(self.token2id, open(token2id_path, "wb"))

        train_data_indices = self.token2index_dataset(train_data_tokens)
        return train_data_indices
        
    
    def get_indexed_dataset_from_val_and_test_vector(self, val_vector, test_vector, overwrite=True):
        if not self.id2token:
            raise ValueError('Must load training set prior to validationa and test sets')
        data_tokens = None
        
        val_data_tokens = None
        test_data_tokens = None
        val_pickle_path = "val_data_tokens_{}_{}.p".format(self.n_gram_size, self.max_vocab_size)
        test_pickle_path = "test_data_tokens_{}_{}.p".format(self.n_gram_size, self.max_vocab_size)
        
        if os.path.isfile(val_pickle_path) and os.path.isfile(test_pickle_path) and not overwrite:
            print("Loading existing training token pickle file and vocabulary.")
            val_data_tokens = pkl.load(open(val_pickle_path, "rb"))
            test_data_tokens = pkl.load(open(test_pickle_path, "rb"))
        elif not overwrite:
            raise ValueError('File not found for validation or testing')
        else:
            val_data_tokens = self.tokenize_dataset(val_vector)
#             pkl.dump(val_data_tokens, open(val_pickle_path, "wb"))
            test_data_tokens = self.tokenize_dataset(val_vector)
#             pkl.dump(test_data_tokens, open(test_pickle_path, "wb"))
            
        val_data_indices = self.token2index_dataset(val_data_tokens)
        test_data_indices = self.token2index_dataset(test_data_tokens)
        return val_data_indices, test_data_indices


In [None]:
from multiprocessing import Process

def multi_write(vocab_size, n):
    print("Building bag of words for {} vocab size and {} gram(s)".format(vocab_size, n))
    ngram_builder = NGramBuilder(max_vocab_size=vocab_size, n_gram_size=n, all_permutations=True) 
    train_data_indices = ngram_builder.get_indexed_dataset_from_training_text_vector(train_x)
    val_data_indices, test_data_indices = ngram_builder.get_indexed_dataset_from_val_and_test_vector(val_x, test_x)

    training_pickle_path = "train_index_tokens_{}_{}.p".format(n, vocab_size)
    val_pickle_path = "val_index_tokens_{}_{}.p".format(n, vocab_size)
    test_pickle_path = "test_index_tokens_{}_{}.p".format(n, vocab_size)
    print('Writing the pickles')
    pkl.dump(train_data_indices, open(training_pickle_path, "wb"))
    pkl.dump(val_data_indices, open(val_pickle_path, "wb"))
    pkl.dump(test_data_indices, open(test_pickle_path, "wb"))

vocab_sizes = [10000, 20000, 40000, 80000]
n_sizes = [1,2,3]
# vocab_sizes = [100]
# n_sizes = [1]
for n in n_sizes:
    processes = []
    for vocab_size in vocab_sizes:
        p = Process(target=multi_write, args=(vocab_size,n,))
        processes.append(p)
        p.start()

    for one_process in processes:
        one_process.join()

    print("Done!")


Building bag of words for 10000 vocab size and 1 gram(s)
Building bag of words for 20000 vocab size and 1 gram(s)
Building bag of words for 40000 vocab size and 1 gram(s)
Building bag of words for 80000 vocab size and 1 gram(s)


Widget Javascript not detected.  It may not be installed or enabled properly.
Widget Javascript not detected.  It may not be installed or enabled properly.
Widget Javascript not detected.  It may not be installed or enabled properly.
Widget Javascript not detected.  It may not be installed or enabled properly.



--- 207.7871069908142 seconds ---

--- 208.19585919380188 seconds ---

--- 208.42174696922302 seconds ---

--- 208.51249718666077 seconds ---


Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.


Widget Javascript not detected.  It may not be installed or enabled properly.


In [None]:
#print(ngram_builder.vocabulary_tokens[0:100])
print()
print(train_data_indices[2])
print()
print(len(ngram_builder.id2token))
#print(len(ngram_builder.vocabulary_tokens))
print(ngram_builder.id2token[-100:])
print()
print([ngram_builder.token2id[k] for k in sorted(ngram_builder.token2id.keys())[:2]])
print([v for v in list(ngram_builder.token2id.values())[:2]])

print(len(train_data_indices))
print(len(train_y))

print(len(val_data_indices))
print(len(val_y))

In [1]:
MAX_SENTENCE_LENGTH = 200

import numpy as np
import torch
from torch.utils.data import Dataset

class ImdbDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of imdb review tokens 
        @param target_list: list of imdb review targets 

        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def imdb_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

# create pytorch dataloader


# for i, (data, lengths, labels) in enumerate(train_loader):
#    print(data)
#    print(labels)
#    break

In [2]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding (use at least 100 to 500 and keep increasing)
        """
        super(BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,2)
    
    def forward(self, data, length):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0],1).expand_as(out).float()
     
        # return logits
        out = self.linear(out.float())
        return out


In [5]:
class HyperParameterTuner(object):
    """
    This class will help tune hyperparameters
    """
    def __init__(self,model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        
    def get_indices_for_hyperparamters(self, n_gram_size, max_vocab_size):
        print("Retrieving bag of words for {} vocab size and {} gram(s)".format(max_vocab_size, n_gram_size))
        training_pickle_path = "train_index_tokens_{}_{}.p".format(n_gram_size, max_vocab_size)
        val_pickle_path = "val_index_tokens_{}_{}.p".format(n_gram_size, max_vocab_size)
        test_pickle_path = "test_index_tokens_{}_{}.p".format(n_gram_size, max_vocab_size)
        train_data_indices = pkl.load(open(training_pickle_path, "rb"))
        val_data_indices = pkl.load(open(val_pickle_path, "rb"))
        test_data_indices = pkl.load(open(test_pickle_path, "rb"))
        return train_data_indices, val_data_indices, test_data_indices

    # Function for testing the model
    def test_model2(self, loader):
        """
        Help function that tests the model's performance on a dataset
        @param: loader - data loader for the dataset to test against
        """
        correct = 0
        total = 0
        self.model.eval()
        for data, lengths, labels in loader:
            data_batch, length_batch, label_batch = data, lengths, labels
            outputs = F.softmax(self.model(data_batch, length_batch), dim=1)
            predicted = outputs.max(1, keepdim=True)[1]
            total += labels.size(0)
            correct += predicted.eq(labels.view_as(predicted)).sum().item()
        return (100 * correct / total)

    def train_and_validate_model(self, num_epochs, train_loader, val_loader):
        accuracies = []
        losses = []
        for epoch in range(num_epochs):
            running_loss = 0.0
            last_accuracy = 0
            for i, (data, lengths, labels) in enumerate(train_loader):
                self.model.train()
                data_batch, length_batch, label_batch = data, lengths, labels
                optimizer.zero_grad()
                outputs = self.model(data_batch, length_batch)
                loss = criterion(outputs, label_batch)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                # validate every 100 iterations
                if i > 0 and i % 100 == 0:
                    # validate
                    print('loss: ' + str(running_loss))
                    val_acc = self.test_model2(val_loader)
                    last_accuracy = val_acc
                    print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                               epoch+1, num_epochs, i+1, len(train_loader), val_acc))
            losses.append(running_loss)
            accuracies.append(last_accuracy)
        print( (accuracies, losses))
        return accuracies, losses


In [8]:
def run_model(max_vocab_size,n):
    tuning_record = []
#     learning_rates = [0.5, 0.1, 0.05, 0.01, 0.005, 0.001]
    learning_rates = [0.01]
    emb_dim = 100
    model = BagOfWords(max_vocab_size + 2, emb_dim)
    for learning_rate in learning_rates:
        print("learning rate: "+ str(learning_rate))
        # Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss() 
        # criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        hpt = HyperParameterTuner(model, criterion, optimizer)
        train_data_indices, val_data_indices, test_data_indices = hpt.get_indices_for_hyperparamters(n, max_vocab_size)

        BATCH_SIZE = 32
        print(len(train_y))
        print(len(train_data_indices))
        train_dataset = ImdbDataset(train_data_indices, train_y)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_collate_func,
                                                   shuffle=True)

        val_dataset = ImdbDataset(val_data_indices, val_y)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                                   batch_size=BATCH_SIZE,
                                                   collate_fn=imdb_collate_func,
                                                   shuffle=True)

        num_epochs = 20 # number epoch to train
        accuracies, losses = hpt.train_and_validate_model(num_epochs, train_loader, val_loader)
        accuracies_all = (max_vocab_size, n, learning_rate, accuracies, losses)
        tuning_record.append(accuracies_all)
        print(accuracies_all)
    pkl.dump(tuning_record, open("tuning_record_{}_{}.p".format(n, max_vocab_size), "wb"))

run_model(10000,1)
    
# vocab_sizes = [10000, 20000, 40000, 80000]
# n_sizes = [1,2,3]
# for n in n_sizes:
#     processes = []
#     for vocab_size in vocab_sizes:
#         p = Process(target=run_model, args=(vocab_size,n,))
#         processes.append(p)
#         p.start()

#     for one_process in processes:
#         one_process.join()

#     print("Done!")


learning rate: 0.01
Retrieving bag of words for 10000 vocab size and 1 gram(s)


NameError: name 'train_y' is not defined

In [None]:
tuning_record2 = pkl.load(open("tuning_record.p", "rb"))
print(str(tuning_record2))

In [None]:
# Run this so your plots show properly
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
%matplotlib inline
#plt.rcParams['figure.figsize'] = 12, 12

val_predicted_y = []
actual_y = []

for data, lengths, labels in val_loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        print(model(data_batch, length_batch))
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        print(outputs[:,0])
#         val_predicted_y += (outputs.max(1, keepdim=True)[1]).squeeze(1).tolist()
        val_predicted_y += outputs[:,0].squeeze(0).tolist()
        actual_y += labels.tolist()
        print(val_predicted_y)
        print(actual_y)
        break

fpr_log_ct, tpr_log_ct, threshold_log_ct = roc_curve(actual_y, val_predicted_y)
roc_auc_log_ct = auc(fpr_log_ct, tpr_log_ct)
print(roc_auc_log_ct)

plt.title('Comparing ROC Across Different Vocabulary Sizes')
plt.plot(fpr_log_ct, tpr_log_ct, 'b', label = 'AUC for vocab size of 10,000 = %0.4f' % roc_auc_log_ct)
# plt.plot(fpr_nb_ct, tpr_nb_ct, 'b', label = 'AUC for vocab size of 20,000 = %0.4f' % roc_auc_nb_ct, color='purple')
# plt.plot(fpr_log_tfidf, tpr_log_tfidf, 'b', label = 'AUC for vocab size of 40,000 = %0.4f' % roc_auc_log_tfidf, color='green')
# plt.plot(fpr_nb_tfidf, tpr_nb_tfidf, 'b', label = 'AUC for vocab size of 80,000 = %0.4f' % roc_auc_nb_tfidf, color='orange')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Assess performance on the test set
# Reading all of the negative and positive reviews into a tuple list
neg_path = 'aclImdb/test/neg'
pos_path = 'aclImdb/test/pos'
negative_files = [f for f in listdir(neg_path) if isfile(join(neg_path, f))]
postive_files = [f for f in listdir(pos_path) if isfile(join(pos_path, f))]

test_data = []
for neg_file in negative_files:
    with open(neg_path + '/' + neg_file, 'r') as f:
        review = f.read()
        test_data.append( (0,review) )
num_neg_reviews = len(test_data)
print('There are {} negative test reviews.'.format(num_neg_reviews))
for pos_file in postive_files:
    with open(pos_path + '/' + pos_file, 'r') as f:
        review = f.read()
        test_data.append( (1,review) )
print('There are {} positive test reviews.'.format(len(test_data) - num_neg_reviews))
print('Loaded {} examples'.format(str(len(test_data))))
print()
print(test_data[0])
print()
print(test_data[len(test_data)-1])   

In [None]:
# Create a validation set from the training data
test_x = [data_t[1] for data_t in test_data]
test_y = [data_t[0] for data_t in test_data]

print('{} test samples'.format(len(test_data)))
print([target for target in test_y[0:10]])

In [None]:
# test set tokens
print ("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_x)
pkl.dump(test_data_tokens, open("test_data_tokens.p", "wb"))
print ("Test dataset size is {}".format(len(test_data_tokens)))

In [None]:
test_data_tokens = pkl.load(open("test_data_tokens.p", "rb"))

# double checking
print ("Test dataset size is {}".format(len(test_data_tokens)))

test_data_indices = token2index_dataset(test_data_tokens)
test_dataset = ImdbDataset(test_data_indices, test_y)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdb_collate_func,
                                           shuffle=False)

In [None]:
print ("After training for {} epochs".format(num_epochs))
print ("Val Acc {}".format(test_model(val_loader, model)))
print ("Test Acc {}".format(test_model(test_loader, model)))

In [None]:
fpr_log_ct, tpr_log_ct, threshold_log_ct = roc_curve(Y_test, preds_log_ct)
roc_auc_log_ct = auc(fpr_log_ct, tpr_log_ct)

plt.title('ROC For Test Set With Best Model')
plt.plot(fpr_nb_tfidf, tpr_nb_tfidf, 'b', label = 'AUC for tfidf Naive Bayes = %0.4f' % roc_auc_nb_tfidf, color='orange')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()