In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertModel, AdamW
from tqdm.notebook import tqdm

class NSMCDataset(Dataset):
    """Map-style dataset over a tab-separated file with ``document`` and ``label`` columns.

    Each item is a ``(input_ids, attention_mask, label)`` triple, where the two
    tensors are the tokenizer output for one document padded/truncated to
    ``max_length`` tokens.
    """

    def __init__(self, file_path, max_length=256, pretrained_name="kykim/bert-kor-base"):
        """Load and clean the file, then build the tokenizer.

        Args:
            file_path: Path to a tab-separated file readable by ``pd.read_csv``.
            max_length: Token length every sample is padded/truncated to.
            pretrained_name: Checkpoint name passed to
                ``BertTokenizerFast.from_pretrained``.
        """
        frame = pd.read_csv(file_path, sep='\t')

        # drop rows that contain any missing value
        frame = frame.dropna(axis=0)

        # strip URL-like substrings ("http..." / "www...") from the text;
        # .assign builds a new frame instead of writing into the dropna result
        cleaned = (
            frame['document']
            .replace(r'http\S+', '', regex=True)
            .replace(r'www\S+', '', regex=True)
        )
        frame = frame.assign(document=cleaned)

        # drop rows whose cleaned text duplicates an earlier row
        self.dataset = frame.drop_duplicates(subset=['document'])
        self.max_length = max_length

        # tokenizer
        self.tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)

        # add special tokens; the model's embedding matrix must be resized
        # to len(self.tokenizer) by whoever consumes this tokenizer
        self.tokenizer.add_tokens(["OO", "OOO", "OOOO"], special_tokens=True)

    def __len__(self):
        """Return the number of rows left after cleaning."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
        # Select by column name (not position) so the result does not depend
        # on the column order of the input file.
        document = self.dataset['document'].iloc[idx]
        label = int(self.dataset['label'].iloc[idx])

        inputs = self.tokenizer(
            document,
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
            padding='max_length',
            add_special_tokens=True,
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop it
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, label

In [2]:
# Build the train and test datasets from the two tab-separated files
# (train first, then test, each with its own tokenizer instance).
train_dataset, test_dataset = (
    NSMCDataset(split_path) for split_path in ("train.txt", "test.txt")
)

In [3]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a single linear classification layer."""

    def __init__(self, num_labels=2, pretrained_name="kykim/bert-kor-base"):
        """Load the pretrained encoder and create the classification layer.

        Args:
            num_labels: Number of output classes (size of the logits vector).
            pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name)
        # Read the encoder width from its config instead of hard-coding 768,
        # so a checkpoint with a different hidden size still works.
        self.fc = nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids and masks."""
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)

        # Second element of the encoder output (the pooled output); integer
        # indexing is kept so both tuple and ModelOutput returns are accepted.
        pooled_output = outputs[1]
        return self.fc(pooled_output)

In [5]:
# Prefer the GPU index this notebook was written for, but fall back to the
# first GPU (or the CPU) so the cell still runs on hosts with fewer devices.
PREFERRED_GPU_INDEX = 7
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
model = BertClassifier()

model.to(device)

# The dataset's tokenizer had extra tokens added to it, so the encoder's
# embedding matrix is resized to the tokenizer's current vocabulary size.
model.bert_model.resize_token_embeddings(len(train_dataset.tokenizer))

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(42003, 768)

In [6]:
# Training hyperparameters: number of passes over the training set and
# the number of samples per mini-batch.
epochs, batch_size = 5, 16

In [7]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# eps and weight_decay are pinned to the values the transformers class used by
# default (1e-6 and 0.0), because torch's own defaults differ (1e-8 and 1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not need shuffling; a fixed order keeps runs comparable.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [None]:
# Per-epoch history: summed batch loss and training accuracy (plain Python floats).
losses = []
accuracies = []

for epoch in range(epochs):
    total_loss = 0.0  # running sum of the batch losses in this epoch
    correct = 0       # correctly classified samples so far
    total = 0         # samples seen so far

    model.train()

    for batch_index, (input_ids_batch, attention_masks_batch, y_batch) in enumerate(
        tqdm(train_loader), start=1
    ):
        optimizer.zero_grad()

        y_batch = y_batch.to(device)
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)

        y_pred = model(input_ids=input_ids_batch, attention_mask=attention_masks_batch)
        loss = F.cross_entropy(y_pred, y_batch)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

        # .item() turns the count into a Python int, so the running totals
        # (and the stored accuracies) do not hold on to device tensors.
        predicted = y_pred.argmax(dim=1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

        if batch_index % 500 == 0:
            # NOTE: the loss printed here is the cumulative sum for the epoch so far.
            print(f"epoch #{epoch} {batch_index} Batch Loss:{total_loss} Accuracy:{correct / total}")

    # max(..., 1) avoids a division by zero if the loader yielded no batches
    epoch_accuracy = correct / max(total, 1)
    losses.append(total_loss)
    accuracies.append(epoch_accuracy)
    print("Train Loss:", total_loss, "Accuracy:", epoch_accuracy)
    # Checkpoint per epoch; the file name uses the loop variable `epoch`
    # (the previous `i` was undefined and raised NameError here).
    torch.save(model.state_dict(), f"model_{epoch}_v2.pt")

  0%|          | 0/9137 [00:00<?, ?it/s]



epoch #0 500 Batch Loss:187.48752443119884 Accuracy:0.8335000276565552
epoch #0 1000 Batch Loss:337.8686069510877 Accuracy:0.8538750410079956
epoch #0 1500 Batch Loss:486.34194169938564 Accuracy:0.8603333234786987
epoch #0 2000 Batch Loss:623.1496058255434 Accuracy:0.8673437833786011
epoch #0 2500 Batch Loss:753.1026105880737 Accuracy:0.8728249669075012


In [None]:
# Evaluate the trained model on the held-out test set.
model.eval()

test_correct = 0
test_total = 0

# No gradients are needed for evaluation; disabling them saves memory and time.
with torch.no_grad():
    # Iterate the test loader (the previous name `loader` was never defined).
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))
        predicted = y_pred.argmax(dim=1)
        test_correct += (predicted == y_batch).sum().item()
        test_total += y_batch.size(0)

# max(..., 1) avoids a division by zero if the test loader is empty
print("Accuracy:", test_correct / max(test_total, 1))