In [1]:
# download raw datasets
import requests


f_train = requests.get('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt')
f_test = requests.get('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt')

open('train.txt', 'wb').write(f_train.content)
open('test.txt', 'wb').write(f_test.content)

4893335

In [11]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertModel, AdamW
from tqdm.notebook import tqdm

class NSMCDataset(Dataset):
  
    def __init__(self, file_path):
        self.dataset = pd.read_csv(file_path, sep='\t')
        
        # drop nan row
        self.dataset = self.dataset.dropna(axis = 0)
        # drop duplicate row
        self.dataset.drop_duplicates(subset=['document'], inplace=True)

        # tokenizer
        self.tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, 1:3].values # no ids!
    
        '''
        ["document", "label"]
        '''
        document = row[0]
        label = row[1]

        inputs = self.tokenizer(
            document, 
            return_tensors='pt',
            truncation=True,
            max_length=256,
            pad_to_max_length=True,
            add_special_tokens=True
            )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, label

In [12]:
train_dataset = NSMCDataset("train.txt")
test_dataset = NSMCDataset("test.txt")

In [13]:
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained("kykim/bert-kor-base")
        self.fc = nn.Linear(768, 2)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids = input_ids, attention_mask = attention_mask)

        pooled_output = outputs[1]
        return self.fc(pooled_output)

In [36]:
device = torch.device("cuda:6")
model = BertClassifier().to(device)

In [26]:
epochs = 5
batch_size = 16

In [31]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [None]:
losses = []
accuracies = []

for i in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batch_index = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        y_batch = y_batch.to(device)
        input_ids_batch = input_ids_batch.to(device)
        attention_masks_batch = attention_masks_batch.to(device)
        
        y_pred = model(input_ids=input_ids_batch, attention_mask=attention_masks_batch)
        loss = F.cross_entropy(y_pred, y_batch)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)

        batch_index += 1
        if batch_index % 500 == 0:
            print(f"epoch #{epoch} {batch_index} Batch Loss:{total_loss} Accuracy:{correct.float() / total}")
  
    losses.append(total_loss)
    accuracies.append(correct.float() / total)
    print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)
    torch.save(model.state_dict(), f"model_{i}_base.pt")

  0%|          | 0/9137 [00:00<?, ?it/s]

epoch #0 500 Batch Loss:201.25931961089373 Accuracy:0.8147500157356262
epoch #0 1000 Batch Loss:358.8712701871991 Accuracy:0.843000054359436
epoch #0 1500 Batch Loss:502.3979359790683 Accuracy:0.8557916283607483
epoch #0 2000 Batch Loss:640.5342759676278 Accuracy:0.86265629529953
epoch #0 2500 Batch Loss:781.1512996219099 Accuracy:0.8658499717712402
epoch #0 3000 Batch Loss:918.0773126333952 Accuracy:0.8687708377838135
epoch #0 3500 Batch Loss:1050.5322037525475 Accuracy:0.8716250061988831
epoch #0 4000 Batch Loss:1176.810455687344 Accuracy:0.8745625615119934
epoch #0 4500 Batch Loss:1307.1942459661514 Accuracy:0.8766111135482788
epoch #0 5000 Batch Loss:1438.001855963841 Accuracy:0.8778374791145325
epoch #0 5500 Batch Loss:1558.1402444187552 Accuracy:0.8799772262573242
epoch #0 6000 Batch Loss:1677.0556378886104 Accuracy:0.8815937638282776
epoch #0 6500 Batch Loss:1800.6888582780957 Accuracy:0.8829230666160583
epoch #0 7000 Batch Loss:1921.5296100508422 Accuracy:0.8841160535812378
epo

  0%|          | 0/9137 [00:00<?, ?it/s]

epoch #0 500 Batch Loss:89.2964676618576 Accuracy:0.9288750290870667
epoch #0 1000 Batch Loss:184.36219006683677 Accuracy:0.9263750314712524
epoch #0 1500 Batch Loss:276.2442282056436 Accuracy:0.9266666769981384
epoch #0 2000 Batch Loss:370.8862567646429 Accuracy:0.9265000224113464
epoch #0 2500 Batch Loss:465.47427466511726 Accuracy:0.9265999794006348
epoch #0 3000 Batch Loss:558.4443779923022 Accuracy:0.9262083172798157
epoch #0 3500 Batch Loss:649.8625819347799 Accuracy:0.9265535473823547
epoch #0 4000 Batch Loss:750.7874247487634 Accuracy:0.9259063005447388
epoch #0 4500 Batch Loss:844.999480754137 Accuracy:0.9256805777549744
epoch #0 5000 Batch Loss:944.8569154441357 Accuracy:0.9250999689102173
epoch #0 5500 Batch Loss:1041.481407348998 Accuracy:0.9248067736625671
epoch #0 6000 Batch Loss:1134.40595112741 Accuracy:0.925000011920929
epoch #0 6500 Batch Loss:1230.7109476905316 Accuracy:0.9250673055648804
epoch #0 7000 Batch Loss:1331.0087671298534 Accuracy:0.9245803356170654
epoch #

  0%|          | 0/9137 [00:00<?, ?it/s]

epoch #0 500 Batch Loss:66.14676029281691 Accuracy:0.9515000581741333


In [37]:


test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

model.eval()
model.to(device)

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))
    predicted = torch.max(y_pred, 1)[1]
    test_correct += (predicted == y_batch).sum()
    test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/6145 [00:00<?, ?it/s]

Accuracy: tensor(0.9097, device='cuda:6')
