In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv
/kaggle/input/bert-base-uncased/config.json
/kaggle/input/bert-base-uncased/pytorch_model.bin
/kaggle/input/bert-base-uncased/vocab.txt


In [9]:
import transformers
import torch

#config.py
class Config:
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 4
    
    EPOCHS = 10
    BERT_PATH = "../input/bert-base-uncased/"
    MODEL_PATH = "model.bin"
    
    TRAINING_FILE = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
    
    TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

In [10]:
#dataset.py
class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = Config.TOKENIZER
        self.max_len = Config.MAX_LEN
        
    def __len__(self):
        return len(self.review)
    
    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())
        
        inputs = self.tokenizer.encode_plus(
            review, 
            None, 
            add_special_tokens=True, 
            max_length=self.max_len,
            padding = "max_length",
            truncation = True
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids =  inputs["token_type_ids"]
        
        
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)
        }

In [11]:
#model.py
import torch.nn as nn

class BERTBaseUncased(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(Config.BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
        
    def forward(self, ids, mask, token_type_ids):
        _, o2 = self.bert(ids, attention_mask = mask, token_type_ids=token_type_ids, return_dict=False)
        bo = self.bert_drop(o2)
        output = self.out(bo)
        
        return output

In [12]:
#engine.py
def loss_fn(outputs, targets):
    targets = targets.type_as(outputs)
    return nn.BCEWithLogitsLoss()(outputs, targets.view(-1,1))

def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    
    for d in data_loader:
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]
        targets = d["targets"]
        
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)
        
        optimizer.zero_grad()
        
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        
        loss = loss_fn(outputs, targets)
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
def eval_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.long)
            
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            
            targets = targets.cpu().detach()
            fin_targets.extend(targets.numpy().tolist())
            
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs.numpy().tolist())
    
    return fin_outputs, fin_targets

In [13]:
from sklearn import model_selection, metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import multiprocessing

def train():
    print("df loading...")
    df = pd.read_csv(Config.TRAINING_FILE, nrows=5000).fillna("none")
    df["sentiment"] = df["sentiment"].apply(lambda label: 1 if label == "positive" else 0)
    
    df_train, df_validation = model_selection.train_test_split(
        df, 
        test_size=0.1,
        random_state=12,
        stratify=df["sentiment"].values
    )
    
    df_train = df_train.reset_index(drop=True)
    df_validation = df_validation.reset_index(drop=True)
    
    
    train_dataset = BERTDataset(
        review = df_train["review"].values,
        target = df_train["sentiment"].values
    )
    
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = Config.TRAIN_BATCH_SIZE,
        num_workers = multiprocessing.cpu_count()
    )
    
    
    validation_dataset = BERTDataset(
        review = df_validation["review"].values,
        target = df_validation["sentiment"].values
    )
    
    validation_data_loader = torch.utils.data.DataLoader(
        validation_dataset,
        batch_size = Config.VALID_BATCH_SIZE,
        num_workers = multiprocessing.cpu_count()
    )
    
    device = torch.device("cuda")
    model = BERTBaseUncased()
    model.to(device)
    
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
        },
        {
            "params": [p for n,p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    
    num_train_steps = int(len(df_train) / (Config.TRAIN_BATCH_SIZE * Config.EPOCHS))
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )
    
    model = nn.DataParallel(model)
    
    best_accuracy = 0
    print("Epochs starting...")
    for epoch in range(Config.EPOCHS):
        print(f"EPOCH: {epoch}", end="\t")
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, targets = eval_fn(validation_data_loader, model, device)
        
        outputs = np.array(outputs) > 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy: {accuracy}")
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), Config.MODEL_PATH)
            best_accuracy = accuracy
    


In [14]:
%%time

train()

df loading...
Epochs starting...
EPOCH: 0	Accuracy: 0.696
EPOCH: 1	Accuracy: 0.696
EPOCH: 2	Accuracy: 0.696
EPOCH: 3	

KeyboardInterrupt: 