In [1]:
import os
import re

from tqdm import tqdm
import shutil
import pandas as pd
import numpy as np
from argparse import Namespace
from sklearn.model_selection import train_test_split


import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertConfig, BertTokenizer, AdamW, get_linear_schedule_with_warmup



In [2]:
## Reproducibility: seed NumPy and PyTorch with the same value
seed = 121
torch.random.manual_seed(seed)
np.random.seed(seed)

## Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

## Define Config

In [3]:
## Paths: the labelled training TSV and the folder checkpoints are written to
config = Namespace(
    train_file_path='../input/bag-of-words-meets-bags-of-popcorn-/labeledTrainData.tsv',
    model_dir='models',
)

## Model / training hyper-parameters
model_config = Namespace(**{
    'model_name': 'bert-base-uncased',   # pretrained checkpoint name
    'max_len': 512,                      # token length passed to the tokenizer
    'batch_size': 8,
    'epochs': 3,
})

In [4]:
## Create Folders
## NOTE: destructive - an existing model directory (and everything inside it)
## is deleted first, so every run of this cell starts from an empty folder.
if os.path.exists(config.model_dir):
    shutil.rmtree(config.model_dir)
os.makedirs(config.model_dir, exist_ok=True)

In [5]:
## Read the labelled reviews; the file is tab-separated, hence sep='\t'
train_df = pd.read_csv(config.train_file_path, sep='\t')

## Train / Validation Split

In [6]:
## Hold out 30% of the reviews for validation; the fixed random_state keeps
## the split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['review'].tolist(),
    train_df['sentiment'].tolist(),
    random_state=121,
    test_size=0.3,
)

In [7]:
## Sanity check: reviews and labels must have matching counts within each split
len(X_train), len(y_train), len(X_val), len(y_val)

(17500, 17500, 7500, 7500)

## Dataset

In [20]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Sequence of raw review strings.
        labels: Sequence of integer class labels aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: Fixed token length every review is padded/truncated to.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        if len(reviews) != len(labels):
            raise ValueError(
                f"reviews and labels must have the same length, "
                f"got {len(reviews)} and {len(labels)}"
            )
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Tokenize the review at ``index``.

        Returns:
            dict with keys ``review_text`` (the raw string), ``input_ids`` and
            ``attention_mask`` (flattened tokenizer tensors) and ``label``
            (a ``torch.long`` scalar tensor).
        """
        review, label = self.reviews[index], self.labels[index]

        # `truncation=True` + `padding='max_length'` guarantee a fixed-length
        # encoding; they replace the deprecated `pad_to_max_length=True`, which
        # relied on implicit truncation of over-long reviews.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.reviews)

In [21]:
## One shared (lower-casing) tokenizer feeds both the train and validation datasets
tokenizer = BertTokenizer.from_pretrained(model_config.model_name, do_lower_case=True)
train_dataset, val_dataset = (
    SentimentDataset(texts, targets, tokenizer, model_config.max_len)
    for texts, targets in ((X_train, y_train), (X_val, y_val))
)

In [22]:
class DataLoaderWrapper:
    """Iterable that batches a dataset and yields tensors already on ``device``.

    Args:
        device: Target device passed to ``Tensor.to`` for every batch tensor.
        dataset: Dataset whose items are dicts with ``input_ids``,
            ``attention_mask`` and ``label`` entries.
        batch_size: Number of items per batch.
        shuffle: Whether the underlying ``DataLoader`` shuffles each pass.
        num_workers: Worker processes used by the ``DataLoader`` (default 4).
    """

    def __init__(self, device, dataset, batch_size, shuffle, num_workers=4):
        self.device = device
        self.dataset = dataset
        self.dataset_size = len(dataset)
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.num_workers = num_workers

    def __iter__(self):
        """Yield ``(input_ids, attention_mask, label)`` triples, one per batch."""
        # A fresh DataLoader per pass, so each iteration starts a new epoch.
        dataloader = DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
        )

        # tqdm renders a progress bar over the batches.
        for data in tqdm(dataloader, total=len(dataloader)):
            ids = data['input_ids']
            mask = data['attention_mask']
            label = data['label']
            # Use the wrapper's own device rather than a module-level global.
            yield ids.to(self.device), mask.to(self.device), label.to(self.device)

    def get_number_of_batches(self):
        """Return the number of batches one pass yields.

        The DataLoader keeps the final partial batch, so this is the ceiling
        of ``len(dataset) / batch_size`` (rounding would under-count, e.g.
        17 items with batch size 8 give 3 batches, not 2).
        """
        return -(-len(self.dataset) // self.batch_size)

In [23]:
## Shuffle the training batches; keep validation order fixed
train_dl = DataLoaderWrapper(
    device=device,
    dataset=train_dataset,
    batch_size=model_config.batch_size,
    shuffle=True,
)
val_dl = DataLoaderWrapper(
    device=device,
    dataset=val_dataset,
    batch_size=model_config.batch_size,
    shuffle=False,
)

In [24]:
## Pull a single training batch and show the shape of each tensor in it
data = next(iter(train_dl))

print(
    f"ids : {data[0].shape}",
    f"mask : {data[1].shape}",
    f"label : {data[2].shape}",
    sep="\n",
)

  0%|          | 0/2188 [00:00<?, ?it/s]

ids : torch.Size([8, 512])
mask : torch.Size([8, 512])
label : torch.Size([8])





## Load Model

In [25]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        modelname: Name or path handed to ``BertModel.from_pretrained``.
        n_classes: Number of output classes of the linear head.
    """

    def __init__(self, modelname, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(modelname)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits), one row per input.

        Logits are returned instead of softmax probabilities because
        ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it
        probabilities applies softmax twice and flattens the gradients.
        ``argmax`` over logits gives the same predicted class, and
        ``F.softmax(logits, dim=1)`` recovers probabilities when needed.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Index rather than tuple-unpack: position 1 is the pooled output both
        # when the encoder returns a plain tuple and when it returns a
        # ModelOutput object (unpacking the latter would yield its keys).
        pooled_output = outputs[1]
        logits = self.out(self.drop(pooled_output))
        return logits

In [26]:
def train_fn(model, dataloader, optimizer, loss_fn):
    """Run one training epoch and report its mean loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, label)``.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(model_output, label)`` returning a scalar
            tensor.

    Returns:
        tuple ``(mean_loss_per_batch, accuracy)`` as Python floats.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()

    running_loss = 0.0
    correct = 0
    # Count what was actually iterated instead of trusting an estimated batch
    # count, so the averages stay exact even with a partial final batch.
    batches_seen = 0
    samples_seen = 0

    for input_ids, attention_mask, label in dataloader:
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)

        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pred_labels = torch.argmax(outputs, dim=1)
        correct += torch.sum(pred_labels == label).item()
        batches_seen += 1
        samples_seen += label.size(0)

    if batches_seen == 0:
        raise ValueError("train_fn received a dataloader that yielded no batches")

    epoch_running_loss = running_loss / batches_seen
    accuracy = correct / samples_seen

    return epoch_running_loss, accuracy

    
def eval_fn(model, dataloader, loss_fn):
    """Evaluate ``model`` over ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, label)``.
        loss_fn: Callable ``loss_fn(model_output, label)`` returning a scalar
            tensor.

    Returns:
        tuple ``(mean_loss_per_batch, accuracy)`` as Python floats.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()

    running_loss = 0.0
    correct = 0
    # Count what was actually iterated so the averages are exact.
    batches_seen = 0
    samples_seen = 0

    with torch.no_grad():
        for input_ids, attention_mask, label in dataloader:
            outputs = model(input_ids, attention_mask)

            # Accumulate each batch loss exactly once (it used to be added
            # twice, which doubled the reported validation loss).
            running_loss += loss_fn(outputs, label).item()

            pred_labels = torch.argmax(outputs, dim=1)
            correct += torch.sum(pred_labels == label).item()
            batches_seen += 1
            samples_seen += label.size(0)

    if batches_seen == 0:
        raise ValueError("eval_fn received a dataloader that yielded no batches")

    epoch_running_loss = running_loss / batches_seen
    accuracy = correct / samples_seen

    return epoch_running_loss, accuracy
    
    
def run(model, train_dataloader, val_dataloader, epochs, optimizer, loss_fn,
        verbose=True, model_dir="models"):
    """Train for ``epochs`` epochs, checkpointing whenever validation loss improves.

    Args:
        model: Module to train; its ``state_dict`` is what gets saved.
        train_dataloader: Loader handed to ``train_fn`` each epoch.
        val_dataloader: Loader handed to ``eval_fn`` each epoch.
        epochs: Number of passes over the training data.
        optimizer: Optimizer handed to ``train_fn``.
        loss_fn: Loss callable handed to ``train_fn`` and ``eval_fn``.
        verbose: When true, print one loss/accuracy line per epoch.
        model_dir: Directory checkpoints are written to (created if missing).
            Defaults to ``"models"``, the location used previously.

    Returns:
        None. Checkpoints are written as a side effect.
    """
    print("Training Starts  ")
    # +inf instead of a magic number: the first finite validation loss always
    # counts as an improvement, whatever its magnitude.
    best_val_loss = float("inf")

    # Make sure the target folder exists so torch.save cannot fail on a missing path.
    os.makedirs(model_dir, exist_ok=True)

    for epoch in range(epochs):

        ## Training
        train_loss, train_acc = train_fn(model, train_dataloader, optimizer, loss_fn)

        ## Validation
        val_loss, val_acc = eval_fn(model, val_dataloader, loss_fn)

        if verbose:
            loss_log = f" Train loss: {train_loss:.3f}  Val loss: {val_loss:.3f}"
            acc_log = f" Train accuracy : {train_acc:.3f}  Val accuarcy : {val_acc:.3f}"
            print(f"Epoch : {epoch} "+ loss_log + acc_log)

        # One file per improving epoch, named after the epoch index.
        if best_val_loss > val_loss:
            best_val_loss = val_loss
            model_path = os.path.join(model_dir, f"epoch_{epoch}_best_val_loss_model.model")
            torch.save(model.state_dict(), model_path)


In [None]:
## Build the classifier and move it to the target device BEFORE creating the
## optimizer, as the torch.optim documentation recommends.
model = SentimentClassifier(model_config.model_name,n_classes=2)
model.to(device)

# TODO: no LR scheduler is used yet. If one is added, its step count should be
# (batches per epoch) * epochs, i.e.
# train_dl.get_number_of_batches() * model_config.epochs - not len(train_dataset) * epochs.
optimizer = optim.Adam(model.parameters(), lr= 3e-6)

# NOTE: nn.CrossEntropyLoss applies log-softmax internally, so it expects the
# model's forward pass to produce unnormalized scores (logits).
loss_fn = nn.CrossEntropyLoss()

## Training
run(model,train_dl, val_dl, model_config.epochs, optimizer, loss_fn)

  0%|          | 0/2188 [00:00<?, ?it/s]

Training Starts  


 67%|██████▋   | 1458/2188 [10:48<05:31,  2.20it/s]