In [25]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from fastprogress import progress_bar

import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from string import punctuation

import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     /s/chopin/l/grad/fahadktk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Pytorch implementation

In [27]:
class DatasetIMDB(Dataset):
    """
    max_len: Maximum review length (in number of words) to use for padding
    min_len: Minimum review length (in number of words) to keep
    
    """
    def __init__(self, file_path, transform="embd", embedding="default", min_len = 10, max_len = 300, remove_stopWords = True):
        self.transform = transform
        self.max_len = max_len
        self.data = feather.read_dataframe(file_path)
        self.data['review'] = self.data.Text.apply(lambda x: x.replace('<br />',''))
        stop_words = set(stopwords.words('english'))
        if remove_stopWords:
            self.data['review'] = self.data.review.apply(lambda x: ' '.join([c for c in x.split() if c not in stop_words]))
        self.data['review'] = self.data.review.apply(lambda x: ''.join([c for c in x if c not in punctuation]))
        self.data['textLen'] = self.data.review.apply(lambda x: len([c for c in x.split()]))
        self.data = self.data[self.data['textLen'].isin([i for i in range(min_len,max_len)])].reset_index() #pick 
        
        if embedding == "default":
            self.word_to_ix = {}
            for review in self.data['review']:
                for word in review.split():
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = len(self.word_to_ix)
            #print(word_to_ix)
            self.tag_to_ix = {"1": 1, "0": 0} #not useful right now
            #tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
        
        
    def __len__(self):
        return len(self.data)
    
    def pad_data(self, s):
        padded = np.zeros((self.max_len,), dtype=np.int64)
        if len(s) > self.max_len: padded[:] = s[:self.max_len]
        else: padded[:len(s)] = s
        return padded
    
    def __getitem__(self, index):
        review = self.data['review'][index]
        label = self.data['label'][index]
       
        
        if self.transform is not None:
            idxs = [self.word_to_ix[w] for w in review.split()]
            lenReview = min(len(idxs),self.max_len)
            idxs = self.pad_data(idxs)
            
            review = torch.tensor(idxs, dtype=torch.long) #creating tensor here is slow
        label = torch.tensor(label,dtype=torch.long)
            
        return review, label, lenReview

In [28]:
class DatasetIMDBversion2(Dataset):
    """
    max_len: Maximum review length (in number of words) to use for padding
    min_len: Minimum review length (in number of words) to keep
    
    """
    def __init__(self, file_path, transform="embd", embedding="default", min_len = 10, max_len = 300, remove_stopWords = True, device=None, feather=True):
        self.transform = transform
        self.max_len = max_len
        #self.data = feather.read_dataframe(file_path)
        self.data = pd.read_csv(file_path) if feather==False else feather.read_dataframe(file_path)
        self.data['review'] = self.data.review.apply(lambda x: x.replace('<br />',''))
        stop_words = set(stopwords.words('english'))
        if remove_stopWords:
            self.data['review'] = self.data.review.apply(lambda x: ' '.join([c for c in x.split() if c not in stop_words]))
        self.data['review'] = self.data.review.apply(lambda x: ''.join([c for c in x if c not in punctuation]))
        self.data['textLen'] = self.data.review.apply(lambda x: len([c for c in x.split()]))
        self.data = self.data[self.data['textLen'].isin([i for i in range(min_len,max_len)])].reset_index() #pick 
        
        if embedding == "default":
            self.word_to_ix = {}
            for review in self.data['review']:
                for word in review.split():
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = (len(self.word_to_ix)) + 1# avoid starting from 0
            #print(word_to_ix)
            self.tag_to_ix = {"1": 1, "0": 0} #not useful right now
            #tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
        
        self.idx_list = []
        self.lenReview_list = []
        self.label_list = []
        
        print("Generating Data Tensors...")
        for i in progress_bar(range(0,self.data.shape[0])):
            review = self.data['review'][i]
            label = self.data['label'][i] 
            
            if self.transform is not None:
                idxs = [self.word_to_ix[w] for w in review.split()]
                lenReview = min(len(idxs),self.max_len)
                idxs = self.pad_data(idxs)
                
                review = torch.tensor(idxs, dtype=torch.long) #creating tensor here is slow
                self.idx_list.append(review)
            label = torch.tensor(label,dtype=torch.long)
            self.label_list.append(label)
            
            lenReivew = torch.tensor(lenReview, dtype=torch.long)
            self.lenReview_list.append(lenReview)
        
        
        
    def __len__(self):
        return len(self.data)
    
    def pad_data(self, s):
        padded = np.zeros((self.max_len,), dtype=np.int64)
        if len(s) > self.max_len: padded[:] = s[:self.max_len]
        else: padded[:len(s)] = s
        return padded
    
    def __getitem__(self, index):
        return self.idx_list[index], self.label_list[index], self.lenReview_list[index]

In [29]:
batch_size = 64
valid_split = .2
shuffle_data = True
rand_seed= 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
torch.cuda.is_available()

True

In [32]:
#dataset = DatasetIMDB('data/classification/imdb_train.feather',remove_stopWords=False)
#testdataset = DatasetIMDB('data/classification/imdb_test.feather',remove_stopWords=False)
#dataset = DatasetIMDBversion2('data/classification/imdb_train.feather',remove_stopWords=False,device=device)
dataset = DatasetIMDBversion2('IMDB_Dataset_v2.csv', remove_stopWords=False, device=device, feather=False)

Generating Data Tensors...


In [33]:
# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(valid_split * dataset_size))
if shuffle_data:
    np.random.seed(rand_seed)
    np.random.shuffle(indices)
train_indx, val_indx = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indx)
valid_sampler = SubsetRandomSampler(val_indx)
#test_sampler = SubsetRandomSampler([i for i in range(0,2000)]+[j for j in range(15200,17200)])

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                           sampler=train_sampler, drop_last=True)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler, drop_last=True)
#validation_loader = torch.utils.data.DataLoader(testdataset, batch_size=batch_size, #can't use different dataset since the token IDs will change?
#                                          sampler=test_sampler)
#test_loader = torch.utils.data.DataLoader(testdataset, batch_size=batch_size,
#                                          sampler=test_sampler)

In [34]:
len(train_indx),len(val_indx)

(31402, 7850)

In [35]:
# Usage Example:
#num_epochs = 10
#for epoch in range(num_epochs):
#    # Train:   
#    for batch_index, (faces, labels) in enumerate(train_loader):
#        # ...

In [37]:
len(dataset.word_to_ix)

172727

In [38]:
class LSTMiMDB(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMiMDB, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx = 0) #try with padding_idx = 0
        #print (self.word_em)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.lstm = nn.GRU(embedding_dim, hidden_dim,batch_first=True)
        
        #self.dropout = nn.Dropout(0.5)
        
        # The linear layer that maps from hidden state space to target space
        self.target = nn.Linear(hidden_dim, tagset_size)
        
    
    def forward(self, review, lengths):
        #pdb.set_trace()

        self.embs = self.word_embeddings(review)
        self.embspack = pack_padded_sequence(self.embs, lengths,batch_first=True) # unpad enforce_sorted=False
        lstm_out, self.h = self.lstm(self.embspack) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        #lstm_out, lengths = pad_packed_sequence(lstm_out) # pad the sequence to the max length in the batch (not needed here)
        
        #outp = self.target(self.h[0]) #for LSTM
        outp = self.target(self.h) # For GRU #vs self.h[-1]? self.h shape[1,batch_size,num_hidden] and self.h[-1] shape[batch_size,num_hidden] (basically the same thing?)
        
        return outp
        #return F.softmax(outp, dim=-1)
    


In [39]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = np.round(preds)
    correct = (rounded_preds == y)#.float() #convert into float for division 
    acc = 5.0#float(correct).sum() / len(correct)
    return acc

In [40]:
def train(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_auc = 0
    epoch_f1 = 0
    
    model.train()
    
    #for i in progress_bar(range(0,len(iterator))):
    for batch in progress_bar(iterator):
        #batch = iterator[i]
        optimizer.zero_grad()
        
        text, labels, lengths = batch
        lengths_Argsorted = lengths.argsort(descending=True)
        lengths = lengths[lengths_Argsorted]
        text = text[lengths_Argsorted]
        labels = labels[lengths_Argsorted]
        text = text.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)
        
        predictions = model(text,lengths).squeeze()
        
        #print(predictions[:,1])
        #print(predictions.shape,labels.shape)
        
        loss = criterion(predictions, labels)
        
        labels = labels.cpu().detach().numpy()
        softmax = torch.nn.Softmax(dim=1)
        predictions = softmax(predictions).cpu().detach().numpy()
        
        acc = binary_accuracy(predictions[:,-1], labels)
        #print(np.asarray(labels),predictions[:,1])
        
        fpr, tpr, thresholds = metrics.roc_curve(np.asarray(labels), predictions[:,-1])
        auc = metrics.auc(fpr, tpr)
        f1 = metrics.f1_score(labels, np.round(predictions[:,-1]))
      
        #print("Loss and Accuracy: ",loss,acc)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item() #0 for checking speed
        epoch_acc += acc#.item()
        epoch_auc += auc
        epoch_f1 += f1
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_auc/len(iterator), epoch_f1/len(iterator)

In [41]:
def evaluate(model, iterator, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_auc = 0
    epoch_f1 = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in progress_bar(iterator):

            text, labels, lengths = batch
            lengths_Argsorted = lengths.argsort(descending=True)
            lengths = lengths[lengths_Argsorted]
            text = text[lengths_Argsorted]
            labels = labels[lengths_Argsorted]
            text = text.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)
            
            predictions = model(text,lengths).squeeze()
            
            loss = criterion(predictions, labels)
            
            softmax = torch.nn.Softmax(dim=1)
            predictions = softmax(predictions).cpu().detach().numpy()
            labels = labels.cpu().detach().numpy()
            
            acc = binary_accuracy(predictions[:,-1], labels)
            
            fpr, tpr, thresholds = metrics.roc_curve(np.asarray(labels), predictions[:,-1])
            auc = metrics.auc(fpr, tpr)
            f1 = metrics.f1_score(labels, np.round(predictions[:,-1]))

            epoch_loss += loss.item() #0 for checking speed
            epoch_acc += acc#.item()
            epoch_auc += auc
            epoch_f1 += f1
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_auc/len(iterator), epoch_f1/len(iterator)

In [42]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 50

In [43]:
model = LSTMiMDB(EMBEDDING_DIM, HIDDEN_DIM, len(dataset.word_to_ix)+1, len(dataset.tag_to_ix)).to(device)
loss_function = nn.CrossEntropyLoss(reduction='mean')   #nn.NLLLoss()
optimizer = optim.Adam(model.parameters())#, weight_decay = 0.01)   #optim.Adam(model.parameters()) #optim.SGD(model.parameters(), lr=0.001)

#model = model.to(device)
#loss_function = loss_function.to(device)

In [44]:
model

LSTMiMDB(
  (word_embeddings): Embedding(172728, 100, padding_idx=0)
  (lstm): GRU(100, 50, batch_first=True)
  (target): Linear(in_features=50, out_features=2, bias=True)
)

In [45]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [46]:
#train(model,train_loader,optimizer,loss_function)
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc, train_auc, train_f1 = train(model, train_loader, optimizer, loss_function, device)
    valid_loss, valid_acc, valid_auc, valid_f1 = evaluate(model, validation_loader, loss_function, device)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstmIMDB_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f} | Train Auc: {train_auc:.2f} | Train F1-score: {train_f1:.2f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f} |  Val. Auc: {valid_auc:.2f} | Valid F1-score: {valid_f1:.2f}')

Epoch: 01 | Epoch Time: 0m 12s
	Train Loss: 0.627 | Train Acc: 500.00 | Train Auc: 0.69 | Train F1-score: 0.63
	 Val. Loss: 0.654 |  Val. Acc: 500.00 |  Val. Auc: 0.67 | Valid F1-score: 0.63


Epoch: 02 | Epoch Time: 0m 12s
	Train Loss: 0.585 | Train Acc: 500.00 | Train Auc: 0.75 | Train F1-score: 0.68
	 Val. Loss: 0.493 |  Val. Acc: 500.00 |  Val. Auc: 0.85 | Valid F1-score: 0.78


Epoch: 03 | Epoch Time: 0m 12s
	Train Loss: 0.418 | Train Acc: 500.00 | Train Auc: 0.89 | Train F1-score: 0.81
	 Val. Loss: 0.407 |  Val. Acc: 500.00 |  Val. Auc: 0.90 | Valid F1-score: 0.81


Epoch: 04 | Epoch Time: 0m 12s
	Train Loss: 0.287 | Train Acc: 500.00 | Train Auc: 0.95 | Train F1-score: 0.88
	 Val. Loss: 0.349 |  Val. Acc: 500.00 |  Val. Auc: 0.93 | Valid F1-score: 0.86


Epoch: 05 | Epoch Time: 0m 12s
	Train Loss: 0.185 | Train Acc: 500.00 | Train Auc: 0.98 | Train F1-score: 0.93
	 Val. Loss: 0.337 |  Val. Acc: 500.00 |  Val. Auc: 0.94 | Valid F1-score: 0.86


Epoch: 06 | Epoch Time: 0m 12s
	Train Loss: 0.119 | Train Acc: 500.00 | Train Auc: 0.99 | Train F1-score: 0.96
	 Val. Loss: 0.374 |  Val. Acc: 500.00 |  Val. Auc: 0.94 | Valid F1-score: 0.87


Epoch: 07 | Epoch Time: 0m 12s
	Train Loss: 0.072 | Train Acc: 500.00 | Train Auc: 1.00 | Train F1-score: 0.98
	 Val. Loss: 0.433 |  Val. Acc: 500.00 |  Val. Auc: 0.94 | Valid F1-score: 0.87


Epoch: 08 | Epoch Time: 0m 12s
	Train Loss: 0.047 | Train Acc: 500.00 | Train Auc: 1.00 | Train F1-score: 0.99
	 Val. Loss: 0.462 |  Val. Acc: 500.00 |  Val. Auc: 0.94 | Valid F1-score: 0.87


KeyboardInterrupt: 

In [90]:
test_embs = model.embs[:,0,:].clone().data.cpu().numpy()

In [331]:
test_embs.shape

(56, 100)

In [72]:
#np.mean(test_embs,axis=1)

In [188]:
with torch.no_grad():
    for batch in validation_loader:
        text, labels, lengths = batch
        lengths_Argsorted = lengths.argsort(descending=True)
        lengths = lengths[lengths_Argsorted]
        text = text[lengths_Argsorted]
        labels = labels[lengths_Argsorted]
        predictions = model(text.t(),lengths)
        #softmax = torch.sigmoid()
        preds = torch.sigmoid(predictions)
        break

In [332]:
model.embs.shape

torch.Size([56, 300, 100])

In [334]:
model.embs[0,:,:].shape

torch.Size([300, 100])

In [191]:
#preds

In [351]:
text[10]

tensor([29350,    40, 21080,   964,    13,   966,   899,    13,   157,  5434,
        11703, 12574,  7147,   207,   381,   805, 12301,  1540,  9337,  9338,
         1839,   207, 12857,     9,  4375, 29351,    68,  6681,  8158,  4911,
        21080,   132,  6681,  8158, 20177, 15647, 29352,  2896,    55,  9853,
          567,  5522, 21080,    82,   112,  9339, 29353,    12, 10759,  1012,
          132,  6263,   132,     9,   976,  1186,   558,   259,   585,  5380,
        29354,   132,  3704, 13961,   183, 29355, 29356,    19,   813,  1073,
         1494,   259,  2004,    82,  2166,  7885,   132,  5815,   155,    82,
           13,  5447,   907,    15,  1064,     3,   294,  3666,    12,    13,
        10614,  6838,  1281, 21080, 28998,  6681,  8158,    55,  2593,  9064,
           19, 12304,    74,    12,  3980,   183, 24202,  1885,   236,    13,
         5447,    76,    82, 10295,   294,  4679,   132,  9220,   259,  1670,
           82,   294, 11141,  1839,   294,  7476,   132,   921, 

In [352]:
index = 10
final_review = []
for ix in text[index].clone().data.cpu().numpy():
    for key in dataset.word_to_ix:
        if dataset.word_to_ix[key] == ix:
            final_review.append(key)

In [353]:
index = 10
labels[index],preds[0][index]

(tensor(1), tensor([0.1950, 0.8014]))

In [354]:
' '.join(final_review)

'Overshadowed by Braveheart released the same year the two costume dramas beg comparison I admit my bias against Mel Gibson yet I maintain a rational preference for Rob Roy Both Braveheart and Rob Roy compellingly depict Scots history in bloody romantic fashion Braveheart is an epic paean to individual honor and courage and a fine revenge fantasy Its also melodramatic anachronistic and maudlin Note its cornball usage of slow motion filming Its violence is both ugly and glorious It is the latter quality which makes it more appealing to the adolescent mindset While Braveheart surpasses Rob Roy in sheer levels of carnage not to mention its indulgent running time the latter film is ultimately more mature and satisfying Its action is more understated yet more surprising and clever Its sex is less showy yet more erotic Rob Roy also has a better realized romantic interest Its dialog attempts to approximate the poetry of the period Its rotted teeth in the mouths of the actors attempt to approx

In [355]:
test_embs = model.embs[index,:,:].clone().data.cpu().numpy()

In [356]:
np.set_printoptions(suppress=True)
test_embs_mean = np.mean(test_embs,axis=1)
test_embs_mean_args = np.argsort(test_embs_mean)[::-1]

In [357]:
np.asarray(final_review)[test_embs_mean_args]

array(['of', 'of', 'of', 'of', 'of', 'of', 'of', 'its', 'its', 'its', 'a',
       'a', 'a', 'a', 'which', 'revenge', 'approximate', 'approximate',
       'appealing', 'costume', 'Last', 'honor', 'villain', 'bloody',
       'maudlin', 'two', 'interest', 'anachronistic', 'Rob', 'Rob', 'Rob',
       'Rob', 'Rob', 'both', 'their', 'fine', 'it', 'poetry', 'history',
       'less', 'carnage', 'filming', 'find', 'yet', 'yet', 'yet', 'and',
       'and', 'and', 'and', 'and', 'and', 'and', 'and', 'You', 'mention',
       'sheer', 'comparison', 'violence', 'violence', 'slow', 'released',
       'indulgent', 'rotted', 'an', 'former', 'film', 'tone',
       'melodramatic', 'admit', 'more', 'more', 'more', 'more', 'more',
       'more', 'more', 'Scots', 'not', 'may', 'against', 'sex', 'I', 'I',
       'compellingly', 'fantasy', 'realized', 'lust', 'merit', 'usage',
       'Both', 'showy', 'understated', 'bias', 'fashion', 'with', 'with',
       'And', 'time', 'dramas', 'quality', 'action', 'R', 'di

In [342]:
#import shap.explainers.deep.deep_pytorch as shapdpt
import shap

In [343]:
text.shape

torch.Size([64, 300])

In [358]:
nn.LSTM()

Help on class Embedding in module torch.nn.modules.sparse:

class Embedding(torch.nn.modules.module.Module)
 |  A simple lookup table that stores embeddings of a fixed dictionary and size.
 |  
 |  This module is often used to store word embeddings and retrieve them using indices.
 |  The input to the module is a list of indices, and the output is the corresponding
 |  word embeddings.
 |  
 |  Args:
 |      num_embeddings (int): size of the dictionary of embeddings
 |      embedding_dim (int): the size of each embedding vector
 |      padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx`
 |                                       (initialized to zeros) whenever it encounters the index.
 |      max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
 |                                  is renormalized to have norm :attr:`max_norm`.
 |      norm_type (float, optional): The p of the p-norm to comput

In [345]:
background = text[:60]
bglengths = lengths[:60]
test_reviews = text[60:64]
test_lengths = lengths[60:64]

data = [background, bglengths]
data_test = [test_reviews,test_lengths]
#pdb.set_trace()
e = shap.DeepExplainer(model, data)
shap_values = e.shap_values(data_test)

RuntimeError: only Tensors of floating point dtype can require gradients

In [296]:
assert type(data) == list, "Expected a list of model inputs!"

In [311]:
help(shap.DeepExplainer)

Help on class DeepExplainer in module shap.explainers.deep:

class DeepExplainer(shap.explainers.explainer.Explainer)
 |  Meant to approximate SHAP values for deep learning models.
 |  
 |  This is an enhanced version of the DeepLIFT algorithm (Deep SHAP) where, similar to Kernel SHAP, we
 |  approximate the conditional expectations of SHAP values using a selection of background samples.
 |  Lundberg and Lee, NIPS 2017 showed that the per node attribution rules in DeepLIFT (Shrikumar,
 |  Greenside, and Kundaje, arXiv 2017) can be chosen to approximate Shapley values. By integrating
 |  over many backgound samples DeepExplainer estimates approximate SHAP values such that they sum
 |  up to the difference between the expected model output on the passed background samples and the
 |  current model output (f(x) - E[f(x)]).
 |  
 |  Method resolution order:
 |      DeepExplainer
 |      shap.explainers.explainer.Explainer
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __in

In [269]:
with torch.no_grad():
    model(*data)

In [273]:
print(*data)

tensor([[  207,   207,   478,  ...,   916,   207, 24632],
        [  763,   256,    61,  ...,   664,    58,   322],
        [ 7826,  4476,   619,  ...,   266,  1060,   185],
        ...,
        [ 3960,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0]]) tensor([298, 297, 291, 283, 270, 262, 260, 235, 235, 234, 226, 225, 224, 223,
        222, 220, 217, 212, 203, 201, 196, 195, 191, 189, 188, 185, 184, 182,
        174, 174, 172, 160, 158, 157, 156, 155, 152, 144, 143, 143, 137, 137,
        135, 135, 131, 127, 127, 125, 123, 121, 119, 118, 117, 114, 112, 112,
        109, 109, 109,  94])


In [38]:
valid_loss, valid_acc, valid_auc, valid_f1 = evaluate(model, validation_loader, loss_function)

In [41]:
#metrics.precision_recall_curve

In [39]:
valid_loss,valid_acc,valid_auc, valid_f1

(0.3656083770462724, 0.840273711525026, 0.927367345262119, 0.8520957465360808)

In [176]:
for batch in validation_loader:
    text, labels, lengths = batch
            
    predictions = model(text.t(),lengths).squeeze()
            
    loss = loss_function(predictions, labels)
            
    acc = binary_accuracy(predictions[:,1], labels)
    
    break

### Testing the model

In [40]:
testdataset = DatasetIMDB('data/classification/imdb_test.feather',remove_stopWords=False)

In [46]:
testdataset.__len__()

19771

In [68]:
test_sampler = SubsetRandomSampler([i for i in range(0,3000)]+[j for j in range(14200,17200)])

In [69]:
test_loader = torch.utils.data.DataLoader(testdataset, batch_size=batch_size,sampler=test_sampler)

In [101]:
#evaluate(model, test_loader, loss_function) #won't work since tokens are different in this dataset?

In [58]:
df_test['Target'].value_counts()

1    12500
0    12500
Name: Target, dtype: int64

In [263]:
model.word_embeddings

Embedding(110300, 100, padding_idx=0)

In [189]:
batch[0][batch[-1].argsort(descending=True)]

tensor([[ 207,  256, 4476,  ...,    0,    0,    0],
        [ 358,  143,   19,  ...,    0,    0,    0],
        [4777,  908,   55,  ...,    0,    0,    0],
        ...,
        [3466,    9,  167,  ...,    0,    0,    0],
        [ 207, 3797, 3199,  ...,    0,    0,    0],
        [   0,  938,  178,  ...,    0,    0,    0]])

In [190]:
batch[-1].argsort()

tensor([18, 20, 44, 57, 49, 61, 32, 46, 42, 39,  8, 54, 52, 31, 26, 35, 62, 43,
        28, 29, 25, 63, 58, 15, 59, 37, 19,  5, 30, 36, 47, 50,  9,  3, 45, 40,
        12,  1, 48,  4, 14, 38, 13, 55, 53, 51, 22, 17, 34, 56,  7, 21, 33, 60,
        24, 27,  2,  6, 23, 11, 10, 16,  0, 41])

In [192]:
batch[0][41]#tokenize from 0 or 1? if we use 0 padding

tensor([  207,   256,  4476,    12,   261,   115,    76,   111,   120,    91,
           19,     3,    82, 94457,    35,   223,  1612,   108,    19,    13,
         7377,    82,  8306,  2576,     0,  2862,  3457, 11003,   663,    45,
          254,    31,   482,  2064, 20818,    82, 18314, 14401,    19,    13,
           51,    55,    13,  6427,  3427,  1508,  4034, 27020,   805,  2629,
        22162,   805, 94458, 22162,   805, 46960, 18274,   364,   207, 10705,
          730,    12,   538,    13, 94459, 10718,  4159,  1211, 51278,  4236,
          132,    74,   260,  2243, 10358,    55,  1075,   358,   575, 72445,
         2230,   450,   132,     9,  3310, 13515, 55839, 36291,     4,   346,
        94460,  7047,   228,    71,  2118,  5104,   792,   228,  1820,  5348,
        94461,     0, 25081, 11638,    55,    13, 17427, 18500,   227,     9,
          874,   228,  1845,    12,   318,    55, 51278,   132,  2166,  1353,
         1661,     9,   912,    19, 94462, 94463,    19,   115, 

In [303]:
batch[0].t().shape

torch.Size([200, 32])

In [45]:
torch.cuda.device_count()

1

In [230]:
help(F.sigmoid)

Help on function sigmoid in module torch.nn.functional:

sigmoid(input)
    sigmoid(input) -> Tensor
    
    Applies the element-wise function :math:`\text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}`
    
    See :class:`~torch.nn.Sigmoid` for more details.



In [385]:
res = batch[1]

In [386]:
np.asarray(res)

array([0, 0, 1, 0], dtype=int32)

In [376]:
len(train_loader)

146

In [374]:
for i in progress_bar(range(100)):
    ;

In [1]:
#help(nn.LSTM)