In [142]:
import pdb
import os
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from fastprogress import progress_bar
from nltk.corpus import stopwords
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from string import punctuation

from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     /s/chopin/l/grad/fahadktk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [200]:
class IMDBDataset(Dataset):
    def __init__(self, file_path, min_len=10, max_len=300, embedding='default', remove_stopWords=True, device=None):
        self.max_len = max_len
        self.min_len = min_len
        self.data = pd.read_csv(file_path)
        #self.data['label'] = self.data['sentiment'].apply(lambda x: 1 if x=='positive' else 0)
        self.data['review'] = self.data.review.apply(lambda x: x.replace('<br />', ''))
        stop_words = set(stopwords.words('english'))
        if remove_stopWords:
            self.data['review'] = self.data.review.apply(lambda x: ' '.join([c for c in x.split() if c not in stop_words]))
        self.data['review'] = self.data.review.apply(lambda x: ''.join([c for c in x if c not in punctuation]))
        self.data['reviewLen'] = self.data.review.apply(lambda x: len([c for c in x.split()]))
        self.data = self.data[self.data['reviewLen'].isin([i for i in range(self.min_len, self.max_len)])].reset_index(drop=True)    
        if embedding=='default':
            self.word_to_ix = {}
            for review in self.data['review']:
                for word in review.split():
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = len(self.word_to_ix)+1 #0 is for paddings
        self.tag_to_ix = {"1":1, "0":0}
    
    def __len__(self):
        return len(self.data)
    
    def pad_data(self, s):
        padded = np.zeros((self.max_len,),dtype=np.int64)
        if len(s) > self.max_len: padded[:] = s[:self.max_len]
        else: padded[:len(s)] = s
        return padded

    def __getitem__(self, index):
        review = self.data['review'][index]
        label = self.data['label'][index]
        idxs = [self.word_to_ix[w] for w in review.split()]
        lenReview = self.data['reviewLen'][index]#min(len(idxs), self.max_len)
        idxs = self.pad_data(idxs)
        review = torch.tensor(idxs, dtype=torch.long)
        #label = self.tag_to_ix[label]
        label = torch.tensor(label, dtype=torch.long)
        return review, label, lenReview

In [201]:
class IMDBDatasetv2(Dataset):
    def __init__(self, file_path, min_len=10, max_len=300, embedding='default', remove_stopWords=True, device=None):
        self.max_len = max_len
        self.min_len = min_len
        self.data = pd.read_csv(file_path)
        #self.data['label'] = self.data['sentiment'].apply(lambda x: "1" if x=='positive' else "0")
        self.data['review'] = self.data.review.apply(lambda x: x.replace('<br />', ''))
        stop_words = set(stopwords.words('english'))
        if remove_stopWords:
            self.data['review'] = self.data.review.apply(lambda x: ' '.join([c for c in x.split() if c not in stop_words]))
        self.data['review'] = self.data.review.apply(lambda x: ''.join([c for c in x if c not in punctuation]))
        self.data['reviewLen'] = self.data.review.apply(lambda x: len([c for c in x.split()]))
        self.data = self.data[self.data['reviewLen'].isin([i for i in range(self.min_len, self.max_len)])].reset_index(drop=True)    
        if embedding=='default':
            self.word_to_ix = {}
            for review in self.data['review']:
                for word in review.split():
                    if word not in self.word_to_ix:
                        self.word_to_ix[word] = len(self.word_to_ix)+1 #0 is for paddings
        self.tag_to_ix = {"1":1, "0":0}
        self.idx_list = []
        self.lenReview_list = []
        self.label_list = []
        print('Generating data tensors...')
        for i in progress_bar(range(0, self.data.shape[0])):
            review = self.data['review'][i]
            label = self.data['label'][i]
            
            idxs = [self.word_to_ix[w] for w in review.split()]
            lenReview = self.data['reviewLen'][i]#min(len(idxs), self.max_len)
            idxs = self.pad_data(idxs)
            
            review = torch.tensor(idxs, dtype=torch.long)
            self.idx_list.append(review)
            #label = self.tag_to_ix[label]
            label = torch.tensor(label, dtype=torch.long)
            self.label_list.append(label)
            lenReview = torch.tensor(lenReview, dtype=torch.long)
            self.lenReview_list.append(lenReview)
            
            
    
    def __len__(self):
        return len(self.data)
    
    def pad_data(self, s):
        padded = np.zeros((self.max_len,),dtype=np.int64)
        if len(s) > self.max_len: padded[:] = s[:self.max_len]
        else: padded[:len(s)] = s
        return padded

    def __getitem__(self, index):
        return self.idx_list[index], self.label_list[index], self.lenReview_list[index]
        

In [202]:
class ModelIMDB(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(ModelIMDB, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.lstm = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.target = nn.Linear(hidden_dim, target_size)
    
    def forward(self, review, lengths):
        #pdb.set_trace()
        self.embs = self.word_embeddings(review)
        self.embspack = pack_padded_sequence(self.embs, lengths, batch_first=True)
        #lstm_out, self.h = self.lstm(self.embspack)
        lstm_out, self.h = self.lstm(self.embspack)
        
        #outp = self.target(self.h[0]) #[hidden state, cell state]
        outp = self.target(self.h)
        return outp

In [212]:
def train(model, iterator, optimizer, criterion, device):
    epoch_loss = 0
    epoch_auc = 0
    epoch_f1 = 0
    model.train()
    count = 0
    for batch in progress_bar(iterator):
        #pdb.set_trace()
        optimizer.zero_grad()
        text,labels,lengths = batch
        lengths_Argsorted = lengths.argsort(descending=True)
        lengths = lengths[lengths_Argsorted]
        labels = labels[lengths_Argsorted]
        text = text.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)
        outputs = model(text, lengths).squeeze()
        loss = criterion(outputs, labels)
        softmax = torch.nn.Softmax(dim=1)
        
        labels = labels.cpu().detach().numpy()
        predictions = softmax(outputs).cpu().detach().numpy()
        auc = metrics.roc_auc_score(labels, predictions[:,-1])
        f1 = metrics.f1_score(labels, np.round(predictions[:,-1]))
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_auc += auc
        epoch_f1 += f1
        count += 1
    return epoch_loss/len(iterator), epoch_auc/len(iterator), epoch_f1/len(iterator)

In [213]:
def evaluate(model, iterator, criterion, device):
    epoch_loss = 0
    epoch_auc = 0
    epoch_f1 = 0
    model.eval()
    with torch.no_grad():
        for batch in progress_bar(iterator):
            #pdb.set_trace()
            text,labels,lengths = batch
            lengths_Argsorted = lengths.argsort(descending=True)
            lengths = lengths[lengths_Argsorted]
            labels = labels[lengths_Argsorted]
            text = text.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)
            outputs = model(text, lengths).squeeze()
            loss = criterion(outputs, labels)
            softmax = torch.nn.Softmax(dim=1)
            
            labels = labels.cpu().detach().numpy()
            predictions = softmax(outputs).cpu().detach().numpy()
            auc = metrics.roc_auc_score(labels, predictions[:,-1])
            f1 = metrics.f1_score(labels, np.round(predictions[:,-1]))

            epoch_loss += loss.item()
            epoch_auc += auc
            epoch_f1 += f1
    return epoch_loss/len(iterator), epoch_auc/len(iterator), epoch_f1/len(iterator)

In [214]:
batch_size = 128
test_split = 0.1
shuffle_data = True
rand_seed = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [215]:
dataset = IMDBDatasetv2('IMDB_Dataset_v2.csv', device=device, remove_stopWords=True)

Generating data tensors...


In [216]:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split_val = int(np.floor(test_split*dataset_size))
if shuffle_data:
    np.random.seed(rand_seed)
    np.random.shuffle(indices)
train_indices, test_indices, valid_indices = np.array(indices[2*split_val:]),np.array(indices[:split_val]),np.array(indices[split_val:2*split_val])

train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
valid_sampler = SubsetRandomSampler(valid_indices)
train_loader = DataLoader(dataset, batch_size = batch_size, sampler = train_sampler, num_workers = 8, drop_last=True)
test_loader = DataLoader(dataset, batch_size = batch_size, sampler = test_sampler, num_workers = 8, drop_last=True)
valid_loader = DataLoader(dataset, batch_size = batch_size, sampler = valid_sampler, num_workers = 8, drop_last=True)

In [217]:
len(indices[2*split_val:])

37249

In [218]:
#dataset

In [221]:
model = ModelIMDB(100, 50, len(dataset.word_to_ix)+1, len(dataset.tag_to_ix)).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(reduction='mean')

In [222]:
model

ModelIMDB(
  (word_embeddings): Embedding(220395, 100, padding_idx=0)
  (lstm): GRU(100, 50, batch_first=True)
  (target): Linear(in_features=50, out_features=2, bias=True)
)

In [220]:
N_EPOCHS = 15
best_valid_loss = np.inf
best_valid_auc = 0
best_valid_f1 = 0
for epoch in range(N_EPOCHS):
    train_loss, train_auc, train_f1 = train(model, train_loader, optimizer, criterion, device)
    valid_loss, valid_auc, valid_f1 = evaluate(model, valid_loader, criterion, device)
    print(train_loss)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_valid_auc = valid_auc
        best_valid_f1 = valid_f1
        print("Best Validation Loss: %.3f, AUC: %.2f, F1:%.2f"%(best_valid_loss, best_valid_auc, best_valid_f1), "\n")
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict':optimizer.state_dict(),
                    'loss':valid_loss
                    },
                   'saved_model')

0.6959861176939764
Best Validation Loss: 0.694, AUC: 0.50, F1:0.39 



0.6942852110797187
Best Validation Loss: 0.693, AUC: 0.51, F1:0.36 



0.6934085893876774


KeyboardInterrupt: 