In [1]:
import torch.nn as nn
import torch
from torchtext.vocab import GloVe
import torchtext
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import ast
from torch.utils.data import TensorDataset, DataLoader
from time import sleep
from tqdm import tqdm
from tqdm.notebook import tqdm

In [5]:
#Import The data 
data = pd.read_csv(r'~/Deep_learning/deeplearning-badnl-replication/Data/IMDB_BadChar_poisoned_start_processed.csv')

#get reviews and sentiment 
review = data.review
sentiments = data.sentiment
text = [ast.literal_eval(data.review[i]) for i in range(len(review))]
# Get test train val split 
X_train, X_test, Y_train, Y_test = train_test_split(text, sentiments, test_size=0.2)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25) 

In [12]:

#Import GloVe embedding
#embedding = GloVe(name='6B', dim=300)
#Create Vocab
#vocab = embedding.itos
#Get embedding Vectors 
#pretrained_embedding = embedding.vectors
#print(pretrained_embedding.size())
#Import GloVe embedding
embedding = GloVe(name='6B', dim=300)

vocab = torchtext.vocab.vocab(embedding.stoi)

#Reduce glove vector dictionary to those words that occur in your vocabulary
pretrained_embedding = embedding.vectors
print(pretrained_embedding.size())

torch.Size([400000, 300])


In [13]:
#Allign embedding vectors from test train val split with those of GloVe

for i, sentence in tqdm(enumerate(X_train)):
    X_train[i] = [vocab.index(word) if word in vocab else 0 for word in sentence]


for i, sentence in tqdm(enumerate(X_test)):
    X_test[i] = [vocab.index(word) if word in vocab else 0 for word in sentence]

    
for i, sentence in tqdm(enumerate(X_val)):
    X_val[i] = [vocab.index(word) if word in vocab else 0 for word in sentence]




TypeError: __contains__(): incompatible function arguments. The following argument types are supported:
    1. (self: torchtext._torchtext.Vocab, arg0: str) -> bool

Invoked with: <torchtext._torchtext.Vocab object at 0x7f9c30455fb0>, 0

In [None]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len].flatten()
    return features

seq_len = 200  # The length that the sentences will be padded/shortened to

train_sentences = pad_input(X_train, seq_len)
val_sentences = pad_input(X_val, seq_len)
test_sentences = pad_input(X_test, seq_len)

# Converting our labels into numpy arrays
train_labels = np.array(Y_train)
test_labels = np.array(Y_test)
val_labels = np.array(Y_val)

In [7]:
#Make our data tensors of labeled pairs 
train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))
#Batch for training 
batch_size = 400

#Automatic data loader to shuffle and get random batches 
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [8]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [9]:
#Standard sentiment network
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,pretrained_embedding, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding.from_pretrained(pretrained_embedding)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
    #forward pass
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    #Initialise hidden layers
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [10]:
#hyperparameters 
vocab_size = len(vocab) + 1
output_size = 1
embedding_dim = 300
hidden_dim = 256
n_layers = 2

#Create model 
model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, pretrained_embedding)
#send to device (cpu or gpu)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [15]:
# model training 
epochs = 2
counter = 0
print_every = 15
clip = 5
valid_loss_min = np.Inf

model.train()
for i in tqdm(range(epochs)):
    h = model.init_hidden(batch_size)
    
    for inputs, labels in tqdm(train_loader):
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []
            model.eval()
            for inp, lab in tqdm(val_loader):
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1/2... Step: 15... Loss: 0.498042... Val Loss: 0.493695
Validation loss decreased (inf --> 0.493695).  Saving model ...


  0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1/2... Step: 30... Loss: 0.517969... Val Loss: 0.444950
Validation loss decreased (0.493695 --> 0.444950).  Saving model ...


KeyboardInterrupt: 

In [None]:
# Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

model.eval()
for inputs, labels in test_loader:
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())  # Rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))