In [1]:
!date "+[%F %R:%S] [INIT] HelloNSMC (using $CONDA_PREFIX)"
import time
t0 = time.time()

[2020-10-22 06:49:50] [INIT] HelloNSMC (using /home/chris/anaconda3/envs/trans2)


In [2]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses urllib3 pandas pytorch-transformers

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
import os
if not os.path.exists("./nsmc/"):
    !git clone https://github.com/e9t/nsmc.git
else:
    print("Dataset already downloaded: nsmc")

Dataset already downloaded: nsmc


In [4]:
!date "+[%F %R:%S] [DONE] Environment & Dataset Installation"
print(f"td={time.time() - t0:.3f}")
t0 = time.time()

[2020-10-22 06:49:51] [DONE] Environment & Dataset Installation
td=0.893


In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import Adam
import torch.nn.functional as F

In [6]:
train_df = pd.read_csv('./nsmc/ratings_train.txt', sep='\t')
test_df = pd.read_csv('./nsmc/ratings_test.txt', sep='\t')
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [7]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

train_df = train_df.sample(frac=0.4, random_state=999)
test_df = test_df.sample(frac=0.4, random_state=999)

In [8]:
class NsmcDataset(Dataset):
    """ Naver Sentiment Movie Corpus Dataset """
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 1]
        label = self.df.iloc[idx, 2]
        return text, label

In [9]:
nsmc_train_dataset = NsmcDataset(train_df)
train_loader = DataLoader(nsmc_train_dataset, batch_size=2, shuffle=True, num_workers=2)

In [10]:
device = torch.device("cuda")
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [11]:
!date "+[%F %R:%S] [DONE] Pretrained Model Loading"
print(f"td={time.time() - t0:.3f}")
t0 = time.time()

[2020-10-22 06:50:05] [DONE] Pretrained Model Loading
td=14.165


In [12]:
optimizer = Adam(model.parameters(), lr=1e-6)

epochs = 2
itr = 1
p_itr = 500
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):

    for text, label in train_loader:
        optimizer.zero_grad()

        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
        sample = torch.tensor(padded_list)
        sample, label = sample.to(device), label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if itr % p_itr == 0:
            !date "+[%F %R:%S] Training..."
            print(f"[Epoch {epoch+1}/{epochs}] Iteration {itr} -> Train Loss: {total_loss/p_itr:.4f}, Accuracy: {total_correct/total_len:.4f}")
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1

  labels = torch.tensor(label)
  pred = torch.argmax(F.softmax(logits), dim=1)


[2020-10-22 06:51:00] Training...
[Epoch 1/2] Iteration 500 -> Train Loss: 0.6982, Accuracy: 0.5060
[2020-10-22 06:51:54] Training...
[Epoch 1/2] Iteration 1000 -> Train Loss: 0.6963, Accuracy: 0.4940
[2020-10-22 06:52:48] Training...
[Epoch 1/2] Iteration 1500 -> Train Loss: 0.6928, Accuracy: 0.5060
[2020-10-22 06:53:42] Training...
[Epoch 1/2] Iteration 2000 -> Train Loss: 0.6909, Accuracy: 0.5490
[2020-10-22 06:54:36] Training...
[Epoch 1/2] Iteration 2500 -> Train Loss: 0.6773, Accuracy: 0.5790
[2020-10-22 06:55:30] Training...
[Epoch 1/2] Iteration 3000 -> Train Loss: 0.6357, Accuracy: 0.6560
[2020-10-22 06:56:23] Training...
[Epoch 1/2] Iteration 3500 -> Train Loss: 0.5870, Accuracy: 0.7040
[2020-10-22 06:57:17] Training...
[Epoch 1/2] Iteration 4000 -> Train Loss: 0.5770, Accuracy: 0.7090
[2020-10-22 06:58:10] Training...
[Epoch 1/2] Iteration 4500 -> Train Loss: 0.5526, Accuracy: 0.7270
[2020-10-22 06:59:04] Training...
[Epoch 1/2] Iteration 5000 -> Train Loss: 0.5482, Accuracy

[2020-10-22 08:03:36] Training...
[Epoch 2/2] Iteration 41000 -> Train Loss: 0.3383, Accuracy: 0.8450
[2020-10-22 08:04:29] Training...
[Epoch 2/2] Iteration 41500 -> Train Loss: 0.3899, Accuracy: 0.8180
[2020-10-22 08:05:23] Training...
[Epoch 2/2] Iteration 42000 -> Train Loss: 0.3927, Accuracy: 0.8290
[2020-10-22 08:06:16] Training...
[Epoch 2/2] Iteration 42500 -> Train Loss: 0.3821, Accuracy: 0.8380
[2020-10-22 08:07:10] Training...
[Epoch 2/2] Iteration 43000 -> Train Loss: 0.3852, Accuracy: 0.8350
[2020-10-22 08:08:03] Training...
[Epoch 2/2] Iteration 43500 -> Train Loss: 0.3946, Accuracy: 0.8200
[2020-10-22 08:08:57] Training...
[Epoch 2/2] Iteration 44000 -> Train Loss: 0.3757, Accuracy: 0.8440
[2020-10-22 08:09:51] Training...
[Epoch 2/2] Iteration 44500 -> Train Loss: 0.3902, Accuracy: 0.8200
[2020-10-22 08:10:45] Training...
[Epoch 2/2] Iteration 45000 -> Train Loss: 0.3725, Accuracy: 0.8330
[2020-10-22 08:11:39] Training...
[Epoch 2/2] Iteration 45500 -> Train Loss: 0.371

In [13]:
!date "+[%F %R:%S] [DONE] Model Fine-tuning"
print(f"td={time.time() - t0:.3f}")
t0 = time.time()

[2020-10-22 08:37:36] [DONE] Model Fine-tuning
td=6450.898


In [14]:
# evaluation
model.eval()

nsmc_eval_dataset = NsmcDataset(test_df)
eval_loader = DataLoader(nsmc_eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in eval_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print(f"Test accuracy: {total_correct / total_len:.4f}")

  labels = torch.tensor(label)
  pred = torch.argmax(F.softmax(logits), dim=1)


Test accuracy: 0.8374


In [15]:
!date "+[%F %R:%S] [EXIT] HelloNSMC (using $CONDA_PREFIX)"
print(f"td={time.time() - t0:.3f}")

[2020-10-22 08:42:03] [EXIT] HelloNSMC (using /home/chris/anaconda3/envs/trans2)
td=266.871
