<a href="https://colab.research.google.com/github/froggagul/nsmc/blob/master/Naver_sentiment_movie_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# download raw datasets
import requests

f_train = requests.get('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt')
f_test = requests.get('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt')

open('train.txt', 'wb').write(f_train.content)
open('test.txt', 'wb').write(f_test.content)

4893335

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 8.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 66.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 78.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 60.2 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertModel, AdamW
from tqdm.notebook import tqdm

# reproducible pytorch
torch.manual_seed(1234)

<torch._C.Generator at 0x7f5908425ef0>

In [5]:
class NSMCDataset(Dataset):
  
    def __init__(self, file_path):
        self.dataset = pd.read_csv(file_path, sep='\t')
        
        # drop nan row
        self.dataset = self.dataset.dropna(axis = 0)
        # drop duplicate row
        self.dataset.drop_duplicates(subset=['document'], inplace=True)
        
        # tokenizer
        self.tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, 1:3].values # no ids!
    
        '''
        ["document", "label"]
        '''
        document = row[0]
        label = row[1]

        inputs = self.tokenizer(
            document, 
            return_tensors='pt',
            truncation=True,
            max_length=256,
            pad_to_max_length=True,
            add_special_tokens=True
            )

        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, label

In [6]:
train_dataset = NSMCDataset("train.txt")
test_dataset = NSMCDataset("test.txt")

Downloading:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

In [7]:
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_model = BertModel.from_pretrained("kykim/bert-kor-base")
        self.fc = nn.Linear(768, 2)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids = input_ids, attention_mask = attention_mask)

        pooled_output = outputs[1]
        return self.fc(pooled_output)

In [8]:
device = torch.device("cuda")

In [9]:
model = BertClassifier()

model.to(device)

model.bert_model.resize_token_embeddings(len(train_dataset.tokenizer))

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(42000, 768, padding_idx=0)

In [10]:
!nvidia-smi

Mon Sep 20 07:35:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    26W /  70W |   1572MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
epochs = 4
batch_size = 16

In [12]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [14]:
losses = []
train_accuracies = []
test_accuracies = []

def test(epoch):
    model.eval()

    test_correct = 0
    test_total = 0

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))
        predicted = torch.max(y_pred, 1)[1]
        test_correct += (predicted == y_batch).sum()
        test_total += len(y_batch)

    print(f"epoch #{epoch} Accuracy: {test_correct.float() / test_total}")
    test_accuracies.append(test_correct.float() / test_total)

def train():
    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        correct = 0
        total = 0
        batch_index = 0

        model.train()

        for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()

            y_batch = y_batch.to(device)
            input_ids_batch = input_ids_batch.to(device)
            attention_masks_batch = attention_masks_batch.to(device)
            
            y_pred = model(input_ids=input_ids_batch, attention_mask=attention_masks_batch)
            loss = F.cross_entropy(y_pred, y_batch)

            loss.backward()

            optimizer.step()

            total_loss += loss.item()

            predicted = torch.max(y_pred, 1)[1]
            correct += (predicted == y_batch).sum()
            total += len(y_batch)

            batch_index += 1
            if batch_index % 500 == 0:
                print(f"epoch #{epoch} {batch_index} Batch Loss:{total_loss} Train Accuracy:{correct.float() / total}")

        losses.append(total_loss)
        train_accuracies.append(correct.float() / total)
        print(f"Train Loss: {total_loss} Train Accuracy:{correct.float() / total}")

        torch.save(model.state_dict(), f"model_{epoch}_v2.pt")

        test(epoch)

train()

  0%|          | 0/9137 [00:00<?, ?it/s]



epoch #1 500 Batch Loss:151.20176910981536 Train Accuracy:0.8727500438690186
epoch #1 1000 Batch Loss:296.8729784414172 Train Accuracy:0.8740000128746033
epoch #1 1500 Batch Loss:449.56022880226374 Train Accuracy:0.8735833168029785
epoch #1 2000 Batch Loss:595.2586596161127 Train Accuracy:0.8746875524520874
epoch #1 2500 Batch Loss:727.4815600886941 Train Accuracy:0.8776249885559082
epoch #1 3000 Batch Loss:859.2814056891948 Train Accuracy:0.879645824432373
epoch #1 3500 Batch Loss:988.7890336103737 Train Accuracy:0.8809642791748047
epoch #1 4000 Batch Loss:1118.5906709786505 Train Accuracy:0.8824531435966492
epoch #1 4500 Batch Loss:1241.1432616449893 Train Accuracy:0.8842777609825134
epoch #1 5000 Batch Loss:1368.8293504249305 Train Accuracy:0.885112464427948
epoch #1 5500 Batch Loss:1487.5401788763702 Train Accuracy:0.8867499828338623
epoch #1 6000 Batch Loss:1611.1113699618727 Train Accuracy:0.887833297252655
epoch #1 6500 Batch Loss:1731.2422971948981 Train Accuracy:0.888932704925

  0%|          | 0/12290 [00:00<?, ?it/s]

epoch #1 Accuracy: 0.9062188267707825


  0%|          | 0/9137 [00:00<?, ?it/s]

epoch #2 500 Batch Loss:85.59225272806361 Train Accuracy:0.9316250681877136
epoch #2 1000 Batch Loss:183.12573789944872 Train Accuracy:0.9265625476837158
epoch #2 1500 Batch Loss:279.41698234668 Train Accuracy:0.9256249666213989
epoch #2 2000 Batch Loss:367.53754793526605 Train Accuracy:0.9272500276565552
epoch #2 2500 Batch Loss:458.57519490038976 Train Accuracy:0.9271999597549438
epoch #2 3000 Batch Loss:552.5485026077367 Train Accuracy:0.9272708296775818
epoch #2 3500 Batch Loss:644.0181527887471 Train Accuracy:0.927017867565155
epoch #2 4000 Batch Loss:736.8330812505446 Train Accuracy:0.9275156855583191
epoch #2 4500 Batch Loss:830.2269207672216 Train Accuracy:0.9270694255828857
epoch #2 5000 Batch Loss:925.09800821729 Train Accuracy:0.9268999695777893
epoch #2 5500 Batch Loss:1021.7655251566321 Train Accuracy:0.9266363382339478
epoch #2 6000 Batch Loss:1114.6718004988506 Train Accuracy:0.9266353845596313
epoch #2 6500 Batch Loss:1212.9503156421706 Train Accuracy:0.9264615178108215

  0%|          | 0/12290 [00:00<?, ?it/s]

epoch #2 Accuracy: 0.9126675724983215


  0%|          | 0/9137 [00:00<?, ?it/s]

epoch #3 500 Batch Loss:61.584049995522946 Train Accuracy:0.9535000324249268
epoch #3 1000 Batch Loss:131.82493426674046 Train Accuracy:0.9510625600814819
epoch #3 1500 Batch Loss:191.51420582504943 Train Accuracy:0.9528332948684692
epoch #3 2000 Batch Loss:256.83355196611956 Train Accuracy:0.9526875615119934
epoch #3 2500 Batch Loss:329.9091882603243 Train Accuracy:0.9513999819755554
epoch #3 3000 Batch Loss:399.0331239486113 Train Accuracy:0.9508749842643738
epoch #3 3500 Batch Loss:468.2405471629463 Train Accuracy:0.9505714178085327
epoch #3 4000 Batch Loss:537.4895233907737 Train Accuracy:0.9500781893730164
epoch #3 4500 Batch Loss:601.9753117426299 Train Accuracy:0.9500972032546997
epoch #3 5000 Batch Loss:669.3049678578973 Train Accuracy:0.9499624967575073
epoch #3 5500 Batch Loss:737.3227812512778 Train Accuracy:0.949988603591919
epoch #3 6000 Batch Loss:807.2506445136387 Train Accuracy:0.9497708082199097
epoch #3 6500 Batch Loss:876.6601524685975 Train Accuracy:0.94966346025466

  0%|          | 0/12290 [00:00<?, ?it/s]

epoch #3 Accuracy: 0.9104095101356506


  0%|          | 0/9137 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [15]:
losses, test_accuracies, train_accuracies

([2345.2733005695045, 1706.86134163104, 1237.497708305018],
 [tensor(0.9062, device='cuda:0'),
  tensor(0.9127, device='cuda:0'),
  tensor(0.9104, device='cuda:0')],
 [tensor(0.8936, device='cuda:0'),
  tensor(0.9266, device='cuda:0'),
  tensor(0.9490, device='cuda:0')])

In [None]:
!nvidia-smi

Mon Sep 20 04:01:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive
