In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn

import pandas as pd
import numpy as np



## Loading and reading data

In [2]:
# Read the train and test CSV files of the competition from the Kaggle input directory.
df_train, df_test = map(
    pd.read_csv,
    (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    ),
)

In [3]:
# Display the first rows of the training frame as a quick sanity check of its columns.
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Extracting text and targets

In [4]:
# The `text` column is the model input; the `target` column holds the labels.
src, tgt = df_train['text'], df_train['target']

In [5]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Tokenizing the data with the pretrained BERT tokenizer from HuggingFace

In [6]:
# Load the pretrained tokenizer that belongs to the 'bert-large-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [7]:
# Tokenize all training texts in a single call.
# The pandas Series is converted to a plain list of strings first: that is the
# input type the tokenizer API documents, and it avoids relying on how a
# Series happens to be iterated.
encoded_data = tokenizer.batch_encode_plus(
    src.tolist(),
    padding=True,        # pad every sequence to the longest one in the batch
    truncation=True,     # cut sequences that exceed max_length
    max_length=512,
    return_tensors='pt'  # return PyTorch tensors
)

In [8]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized inputs with optional labels.

    Args:
        encoded_data: Mapping that provides 'input_ids' and 'attention_mask',
            each indexable along its first dimension (one row per sample).
        labels: Optional per-sample labels. May be a pandas Series, a NumPy
            array, a plain sequence or a tensor. Leave as ``None`` for
            unlabeled data (e.g. a test set).

    Raises:
        ValueError: If the number of labels differs from the number of samples.
    """

    def __init__(self, encoded_data, labels=None):
        self.input_ids = encoded_data['input_ids']
        self.attention_mask = encoded_data['attention_mask']

        if labels is None:
            self.labels = None
        elif isinstance(labels, torch.Tensor):
            # Copy without the "use clone().detach()" warning torch.tensor() emits.
            self.labels = labels.detach().clone()
        else:
            # pandas objects are converted through NumPy so that labels are
            # taken in positional order, independent of the Series index
            # (a filtered/re-indexed Series would otherwise break conversion).
            values = labels.to_numpy() if hasattr(labels, 'to_numpy') else labels
            self.labels = torch.tensor(values)

        if self.labels is not None:
            n_samples = len(self.input_ids)
            if self.labels.dim() == 0 or self.labels.shape[0] != n_samples:
                raise ValueError(
                    f"labels must contain one entry per sample: expected {n_samples}, "
                    f"got shape {tuple(self.labels.shape)}"
                )

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict.

        The dict always has 'input_ids' and 'attention_mask'; 'labels' is
        included only when the dataset was built with labels.
        """
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
        }
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item
    
# Wrap the tokenized training texts together with their targets.
dataset = CustomDataset(encoded_data=encoded_data, labels=tgt)

In [9]:
from torch.utils.data import random_split

# 80/20 train/validation split.
# A seeded generator makes the split identical on every "Restart & Run All",
# so validation scores stay comparable between runs.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

print(len(train_dataset), len(val_dataset))

6090 1523


In [10]:
from torch.utils.data import DataLoader

batch_size = 16

# Training batches are reshuffled at the start of every epoch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True
)

# Validation only measures accuracy, which does not depend on sample order,
# so shuffling is unnecessary; a fixed order keeps evaluation deterministic.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False
)

## Define train & validation functions

In [11]:
@torch.no_grad()
def validate_model(model, val_loader, device=None):
    """Compute the classification accuracy of ``model`` over ``val_loader``.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Module callable as ``model(input_ids, attention_mask=...)`` that
            returns an object with a ``logits`` attribute holding one row of
            class scores per sample.
        val_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        float: Fraction of correctly classified samples, or NaN when
        ``val_loader`` yields no samples (accuracy is undefined then).
    """
    model.eval()

    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    correct_samples = 0
    total_samples = 0

    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are deliberately not passed to the model: only the logits are
        # needed here, so the loss computation is skipped.
        logits = model(input_ids, attention_mask=attention_mask).logits

        # Predicted class = index of the largest logit in each row.
        predictions = logits.argmax(dim=1)
        correct_samples += (predictions == labels).sum().item()
        total_samples += labels.shape[0]

    if total_samples == 0:
        return float('nan')
    return correct_samples / total_samples

In [12]:
def train_model(model, optimizer, train_loader, val_loader, num_epochs, device=None):
    """Train ``model`` for ``num_epochs`` epochs and record per-epoch metrics.

    After every epoch the model is evaluated with ``validate_model`` and a
    one-line summary is printed.

    Args:
        model: Module callable as
            ``model(input_ids, attention_mask=..., labels=...)`` that returns an
            object with ``loss`` and ``logits`` attributes.
        optimizer: Optimizer that updates the model's parameters.
        train_loader: Iterable of training batches; each batch is a dict with
            'input_ids', 'attention_mask' and 'labels' tensors.
        val_loader: Iterable of validation batches, passed to ``validate_model``.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        tuple[list[float], list[float], list[float]]: ``(train_losses,
        train_accuracies, val_accuracies)`` with one entry per epoch. The
        training loss is the mean batch loss; training metrics are NaN for an
        epoch in which ``train_loader`` yields nothing.
    """
    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # validate_model leaves the model in eval mode, so training mode has to
        # be re-enabled at the start of every epoch.
        model.train()

        running_loss = 0.0
        num_batches = 0
        correct_samples = 0
        total_samples = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

            # Accuracy bookkeeping needs no gradients; detach before argmax.
            predictions = outputs.logits.detach().argmax(dim=1)
            correct_samples += (predictions == labels).sum().item()
            total_samples += labels.shape[0]

        epoch_loss = running_loss / num_batches if num_batches else float('nan')
        train_accuracy = correct_samples / total_samples if total_samples else float('nan')
        val_accuracy = validate_model(model, val_loader)

        print(f"Epoch: {epoch+1}/{num_epochs} | Train Loss: {epoch_loss:.16f} | Train Accuracy: {train_accuracy:.4f} | Validation Accuracy: {val_accuracy:.4f}")

        # Record the history so the returned lists actually contain the
        # per-epoch metrics (previously they were always empty).
        train_losses.append(epoch_loss)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

    return train_losses, train_accuracies, val_accuracies

## Hyper-parameter tuning using a randomized search

In [13]:
# Explanation:
# Every search iteration draws a random learning rate and weight decay on a log scale,
# i.e. as 10**u with the exponent u sampled from np.random.uniform.
# Initially, set wide exponent bounds for np.random.uniform, approximately (-7, -1).
# Each tuning step trains for about 3-4 epochs; here it is set to 3.
# After a search round is done, select narrower bounds, e.g. (-5, -3), and repeat the process.
# Repeat until you find good learning rate and weight decay values, then use them for your model.

# random_search_iters = 5
# for it in range(random_search_iters):
#     lr = 10**np.random.uniform(-6, -4)
#     weight_decay = 10**np.random.uniform(-6, -2)
    
#     print()
#     print(f"Iteration {it+1}/{random_search_iters} for randomized search.")
#     print(f"Learning rate: {lr} | Weight decay: {weight_decay}")
#     print()
    
#     model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#     model.to(device)

#     optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    
#     train_model(model, optimizer, train_loader, val_loader, 3)

In [14]:
# Fixed hyper-parameters for the final run
# (presumably picked from the randomized search described above; confirm).
lr, weight_decay = 5.9891071551069174e-06, 1.850822698520024e-04

# Load BERT-large with a fresh 2-class classification head and move it to the device.
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2).to(device)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tune for three epochs and keep the metric histories returned by train_model.
train_losses, train_accuracies, val_accuracies = train_model(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=3,
)

Epoch: 1/3 | Train Loss: 0.4695580268860489 | Train Accuracy: 0.7959 | Validation Accuracy: 0.8332
Epoch: 2/3 | Train Loss: 0.3588918076725457 | Train Accuracy: 0.8580 | Validation Accuracy: 0.8385
Epoch: 3/3 | Train Loss: 0.3198460609385660 | Train Accuracy: 0.8744 | Validation Accuracy: 0.8411


## Predict test data & save outputs

In [16]:
# Test texts to classify, and the ids that go into the submission file.
src_test, idxs_test = df_test['text'], df_test['id']

In [17]:
# Tokenize all test texts with the same settings as the training data.
# The pandas Series is converted to a plain list of strings first: that is the
# input type the tokenizer API documents, and it avoids relying on how a
# Series happens to be iterated.
encoded_data_test = tokenizer.batch_encode_plus(
    src_test.tolist(),
    padding=True,        # pad every sequence to the longest one in the batch
    truncation=True,     # cut sequences that exceed max_length
    max_length=512,
    return_tensors='pt'  # return PyTorch tensors
)

In [18]:
# The test set has no targets, so the dataset is built without labels.
test_dataset = CustomDataset(encoded_data=encoded_data_test, labels=None)

In [19]:
# Test batches keep the dataset order (no shuffling), so predictions stay aligned with the test ids.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [20]:
# Inference on the test set.
# Eval mode disables dropout; torch.no_grad() stops autograd from recording the
# forward passes, which would otherwise keep activations of every batch alive
# and waste GPU memory during pure inference.
model.eval()

predicted_indices = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # Predicted class = index of the largest logit; stored as Python ints.
        predicted_indices.extend(logits.argmax(dim=1).tolist())

In [21]:
# Submission table: one row per test id with its predicted class.
output = pd.DataFrame(dict(id=idxs_test, target=predicted_indices))

In [22]:
# Show the first three rows of the submission frame as a sanity check.
output.head(3)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1


In [23]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
output.to_csv("pretrained_bert_5.csv", index=False)