In [1]:
import os
import json
import random
import numpy as np
import pickle as pkl
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [36]:
# Device string used for `model.to(...)` and for moving every batch tensor.
DEVICE = "cuda:0"
# When True, a GradScaler is created and the forward passes run under autocast.
USE_FP16 = True

# Seed shared by the torch, numpy and `random` generators.
RANDOM_SEED = 40

# Directory holding the raw NSMC files, and the file names of the two splits.
DATASET_PATH = "/home/work/.datasets/dataset/nsmc"
TRAIN_DATASET_FN = "ratings_train.txt"
VALID_DATASET_FN = "ratings_test.txt"
# Directory where NsmcDataset stores its tokenized-example pickles and labels.json.
CACHE_PATH = "/home/work/chnaaam/.cache/nsmc"
# Parent directory of the folder written by `model.save_pretrained(...)`.
MODEL_SAVE_PATH = "/home/work/chnaaam/.cache"

# BERT models
# The same name is passed to both AutoTokenizer and AutoModelForSequenceClassification.
# NOTE(review): the active checkpoint is an uncased English BERT while the commented
# alternative is described as trained on a Korean comment corpus — confirm which one
# is intended for this dataset.
MODEL_NAME = "google/bert_uncased_L-12_H-768_A-12" # Base BERT model
# MODEL_NAME = "beomi/kcbert-base" # Train with korean comment corpus

# Parameters
# MAX_SEQ_LENGTH is the length every example is padded/truncated to by the tokenizer.
MAX_SEQ_LENGTH = 256
TRAIN_BATCH_SIZE = 42
VALID_BATCH_SIZE = 42
# Learning rate handed to AdamW.
LEARNING_RATE = 1e-5
# Number of passes over the training set (each followed by a validation pass).
EPOCH = 3

In [3]:
# Seed every random number generator used in this notebook with RANDOM_SEED:
# torch (CPU), torch CUDA (current device and all devices), numpy and `random`.
torch.manual_seed(RANDOM_SEED)

torch.cuda.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# NOTE(review): deterministic=False together with benchmark=True favours speed;
# results may therefore differ between runs even with the seeds above — confirm
# that this trade-off is intended.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [4]:
# Tokenizer and sequence-classification model both come from the checkpoint
# named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the model and place it on the training device in one step.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)

Some weights of the model checkpoint at google/bert_uncased_L-12_H-768_A-12 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

In [5]:
# Define an optimizer: AdamW over all model parameters with the configured learning rate
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Define a loss function
# NOTE(review): the training/validation loop in this notebook reads `outputs.loss`
# from the model and never calls `loss_func` — confirm whether it is still needed.
loss_func = nn.CrossEntropyLoss()

In [6]:
class NsmcDataset(data.Dataset):
    """Tokenized text-classification dataset with on-disk caching.

    The raw file is tab-separated with a header row; column 1 holds the text
    and column 2 an integer label. Tokenized examples are pickled to
    ``<cache_path>/<dataset_fn stem>.cache`` and the label mapping is stored
    in ``<cache_path>/labels.json`` (shared by every split that uses the same
    ``cache_path``).

    Attributes:
        data: list of ``{"inputs": {name: 1-D tensor}, "label": int}`` items.
        labels: list of the distinct integer labels.
        label2idx: mapping label -> index.
        idx2label: mapping index -> label.

    Args:
        dataset_path: directory containing the raw dataset file.
        dataset_fn: file name of the raw dataset inside ``dataset_path``.
        cache_path: directory used for the caches (created if missing).
        tokenizer: callable used to tokenize each text; only needed when the
            tokenized cache does not exist yet.
        max_seq_length: length every example is padded/truncated to.

    NOTE(review): the cache file name depends only on ``dataset_fn``; it does
    not encode the tokenizer or ``max_seq_length``. After changing either,
    delete the ``*.cache`` files, otherwise stale token ids are reused.
    """

    def __init__(
        self,
        dataset_path,
        dataset_fn,
        cache_path,
        tokenizer,
        max_seq_length=256
    ):
        super().__init__()

        dataset_type = dataset_fn.split(".")[0]
        cached_dataset_full_path = os.path.join(cache_path, f"{dataset_type}.cache")
        cached_label_full_path = os.path.join(cache_path, "labels.json")

        # On a fresh machine the cache directory may not exist yet; without it
        # the cache writes below fail after the (slow) tokenization is done.
        if cache_path:
            os.makedirs(cache_path, exist_ok=True)

        if os.path.isfile(cached_dataset_full_path):
            # NOTE: pickle can execute arbitrary code on load — only point
            # `cache_path` at a directory this notebook itself has written.
            with open(cached_dataset_full_path, "rb") as fp:
                self.data = pkl.load(fp)
        else:
            examples = self._read_examples(os.path.join(dataset_path, dataset_fn))
            self.data = self._tokenize(examples, tokenizer, max_seq_length)

            with open(cached_dataset_full_path, "wb") as fp:
                pkl.dump(self.data, fp)

        if os.path.isfile(cached_label_full_path):
            with open(cached_label_full_path, "r") as fp:
                cached_label2idx = json.load(fp)["label2idx"]

            # JSON object keys are always strings; restore the integer labels
            # so a cache hit yields the same types as a fresh build.
            self.label2idx = {int(l): i for l, i in cached_label2idx.items()}
        else:
            # Derive the label set from the examples themselves, so this also
            # works when the examples came from the pickle cache (previously
            # `self.labels` was undefined in that case).
            distinct_labels = sorted({item["label"] for item in self.data})
            self.label2idx = {l: i for i, l in enumerate(distinct_labels)}

            with open(cached_label_full_path, "w") as fp:
                json.dump({
                    "label2idx": self.label2idx
                }, fp)

        self.labels = list(self.label2idx)
        self.idx2label = {i: l for l, i in self.label2idx.items()}

    @staticmethod
    def _read_examples(path):
        """Parse the raw TSV file into a list of {"text", "label"} dicts."""
        examples = []

        with open(path, "r", encoding="utf-8") as fp:
            for idx, line in enumerate(fp):
                # Remove title (header row)
                if idx == 0:
                    continue

                line = line.replace("\n", "")

                if line:
                    fields = line.split("\t")
                    examples.append({
                        "text": fields[1],
                        "label": int(fields[2])
                    })

        return examples

    @staticmethod
    def _tokenize(examples, tokenizer, max_seq_length):
        """Tokenize every example to fixed-length tensors without a batch axis."""
        tokenized = []

        for example in tqdm(examples):
            inputs = tokenizer(
                example["text"],
                padding="max_length",
                truncation=True,
                max_length=max_seq_length,
                return_tensors="pt"
            )
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack examples itself.
            inputs = {k: v.squeeze(dim=0) for k, v in inputs.items()}

            tokenized.append({
                "inputs": inputs,
                "label": example["label"]
            })

        return tokenized

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized inputs and the label as a length-1 LongTensor."""
        return {
            "inputs": self.data[idx]["inputs"],
            "label": torch.LongTensor([self.data[idx]["label"]])
        }

In [7]:
# The two splits differ only in the raw file they read; everything else
# (data directory, cache directory, tokenizer, sequence length) is shared.
# The training split is built first, then the validation split.
train_nsmc_dataset, valid_nsmc_dataset = (
    NsmcDataset(
        dataset_path=DATASET_PATH,
        dataset_fn=split_fn,
        cache_path=CACHE_PATH,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
    )
    for split_fn in (TRAIN_DATASET_FN, VALID_DATASET_FN)
)

In [8]:
# Sanity check: show the shape of the first training example's `input_ids` tensor.
train_nsmc_dataset.data[0]["inputs"]["input_ids"].shape

torch.Size([256])

In [9]:
# Report how many examples each split contains.
print("Train dataset size : ", len(train_nsmc_dataset))
print("Validation dataset size : ", len(valid_nsmc_dataset))

# Show one raw cached item (tokenized inputs plus label).
# NOTE(review): this prints every padded tensor in full, which makes for a very
# long cell output — consider inspecting a slice instead.
print("Sample inputs : ", train_nsmc_dataset.data[0])

Train dataset size :  150000
Validation dataset size :  50000
Sample inputs :  {'inputs': {'input_ids': tensor([  101,  1463, 30006,  1457, 30008, 29996, 30019, 30025,  1012,  1012,
          100,   100,  1459, 30011, 30020, 29997, 30011, 29994, 30019,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,   

In [10]:
# One (dataset, batch size, shuffle) triple per split: only the training
# loader shuffles. Both loaders use pinned memory and load in the main process.
train_data_loader, valid_data_loader = (
    data.DataLoader(
        dataset=split_dataset,
        batch_size=split_batch_size,
        shuffle=split_shuffle,
        pin_memory=True,
        num_workers=0,
    )
    for split_dataset, split_batch_size, split_shuffle in (
        (train_nsmc_dataset, TRAIN_BATCH_SIZE, True),
        (valid_nsmc_dataset, VALID_BATCH_SIZE, False),
    )
)

In [11]:
# Mixed precision: a gradient scaler exists only when USE_FP16 is enabled;
# `None` tells the training loop to run without loss scaling.
scaler = torch.cuda.amp.GradScaler() if USE_FP16 else None

In [34]:
# Fine-tune for EPOCH epochs. Each epoch is one pass over the training loader
# followed by one pass over the validation loader. After the loop,
# `avg_valid_acc` holds the validation accuracy (in percent) of the last epoch.

# Mixed precision is active exactly when a GradScaler was created.
use_amp = scaler is not None

for e in range(EPOCH):

    # Training step
    model.train()

    # Per-batch metrics are kept as plain Python floats. Storing the loss
    # tensors themselves would keep one GPU tensor alive per batch for the
    # whole epoch and make every running-average update a GPU reduction.
    training_losses, valid_losses = [], []
    valid_acc = []

    avg_training_loss, avg_valid_loss = 0.0, 0.0
    avg_valid_acc = 0.0

    training_loss_sum, valid_loss_sum = 0.0, 0.0
    valid_correct, valid_total = 0, 0

    training_progress = tqdm(train_data_loader)
    for batch in training_progress:
        training_progress.set_description(
            f"Training [Epoch : {e+1}|{EPOCH}, Avg traininig loss : {avg_training_loss:.4f}]"
        )

        optimizer.zero_grad()

        inputs, label = batch["inputs"], batch["label"]
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        inputs["labels"] = label.to(DEVICE)

        # With enabled=False autocast is a no-op, so one forward path serves
        # both the fp32 and the mixed-precision configuration.
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(**inputs)

        loss = outputs.loss

        if use_amp:
            # Scale the loss so fp16 gradients do not underflow; the scaler
            # unscales before stepping and then adapts its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        batch_loss = loss.item()
        training_losses.append(batch_loss)
        training_loss_sum += batch_loss
        avg_training_loss = training_loss_sum / len(training_losses)

    # Validation step
    model.eval()

    valid_progress = tqdm(valid_data_loader)

    with torch.no_grad():
        for batch in valid_progress:
            valid_progress.set_description(
                f"Validation [Epoch : {e+1}|{EPOCH}, Avg validation loss : {avg_valid_loss:.4f}] Avg acc : {avg_valid_acc:.2f}"
            )

            inputs, label = batch["inputs"], batch["label"]
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            inputs["labels"] = label.to(DEVICE)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(**inputs)

            batch_loss = outputs.loss.item()
            valid_losses.append(batch_loss)
            valid_loss_sum += batch_loss
            avg_valid_loss = valid_loss_sum / len(valid_losses)

            # Labels arrive as (batch, 1); drop the trailing axis to line up
            # with the (batch,) predictions.
            true_y = inputs["labels"].squeeze(dim=-1)
            # argmax over the logits directly: softmax is monotonic and would
            # not change which class is largest.
            pred_y = torch.argmax(outputs.logits, dim=-1)

            total = true_y.numel()
            correct = (pred_y == true_y).sum().item()

            valid_acc.append(correct / total * 100)

            # Accuracy over all samples seen so far. Averaging the per-batch
            # percentages would over-weight a smaller final batch.
            valid_correct += correct
            valid_total += total
            avg_valid_acc = valid_correct / valid_total * 100

Training [Epoch : 1|3, Avg traininig loss : 0.0000]:   0%|          | 0/3572 [00:00<?, ?it/s]
Validation [Epoch : 1|3, Avg validation loss : 0.5818] Avg acc : 69.54:  19%|█▉        | 225/1191 [00:19<01:23, 11.58it/s]


KeyboardInterrupt: 

In [38]:
# Save the trained model under MODEL_SAVE_PATH.
# The folder name is MODEL_NAME with "/" replaced by "_", followed by the latest
# validation accuracy, so this cell needs `avg_valid_acc` from the training loop
# cell to exist in the kernel.
model.save_pretrained(os.path.join(
    MODEL_SAVE_PATH,
    f"{MODEL_NAME.replace('/', '_')}_acc_{avg_valid_acc:.2f}"
))