In [1]:
!pip3 install datasets
!pip3 install transformers

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
import torch

from tqdm import tqdm

# Device used for the model and every batch: CUDA when torch reports it as
# available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Download the Korean sentiment dataset from the Hugging Face Hub.
dataset = load_dataset("sepidmnorozy/Korean_sentiment")

# Materialise the columns as plain Python lists. Depending on the installed
# `datasets` version, indexing a split by column name may return a lazy column
# object rather than a list, which the tokenizer's input validation and
# `torch.tensor` are not guaranteed to accept. `list(...)` is a no-op copy when
# the value is already a list.
# Note: the "test" variables below are built from the 'validation' split.
x_train, y_train = list(dataset['train']['text']), list(dataset['train']['label'])
x_test, y_test = list(dataset['validation']['text']), list(dataset['validation']['label'])

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize each split in one call. `padding = True` pads every example to the
# longest sequence in that call, and `truncation = True` caps the length at the
# model's maximum. The encodings get their own names so that `train_dataset` /
# `test_dataset` only ever refer to TensorDataset objects.
train_encodings = tokenizer(x_train, padding = True, truncation = True, return_tensors = "pt")
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train))

test_encodings = tokenizer(x_test, padding = True, truncation = True, return_tensors = "pt")
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(y_test))


# Each batch is a tuple (input_ids, attention_mask, labels). Only the training
# loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size = 100, shuffle = True, num_workers = 4)
test_loader = DataLoader(test_dataset, batch_size = 100, shuffle = False, num_workers = 4)

# Pretrained multilingual BERT encoder with a 2-class classification head,
# moved to DEVICE.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels = 2).to(DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def train(model, train_loader, optimizer):
    """Run one training epoch and report the epoch's mean loss and accuracy.

    Args:
        model: Model with a `train()` method that, when called as
            `model(input_ids, attention_mask=..., labels=...)`, returns an
            object exposing `.loss` and `.logits`.
        train_loader: Iterable yielding `(input_ids, attention_mask, labels)`
            batches of tensors.
        optimizer: Optimizer whose `zero_grad()` / `step()` are called once
            per batch.

    Returns:
        A `(train_loss, train_acc)` tuple: the sample-weighted mean loss and
        the fraction of correctly classified samples over the epoch. Returns
        `(0.0, 0.0)` when the loader yields no samples.
    """
    loss_sum, num_correct, total_samples = 0.0, 0, 0

    model.train()
    for input_ids, attention_mask, labels in train_loader:
        # Move the batch to the same device as the model.
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight each batch loss by its size so a shorter final batch does not
        # count as much as a full one.
        # Assumes `outputs.loss` is the mean over the batch -- TODO confirm.
        loss_sum += loss.item() * batch_size
        preds = torch.argmax(outputs.logits, dim = 1)
        num_correct += torch.sum(preds == labels).item()
        total_samples += batch_size

    # An empty loader would otherwise divide by zero.
    if total_samples == 0:
        return 0.0, 0.0

    train_loss = loss_sum / total_samples
    train_acc = num_correct / total_samples

    return train_loss, train_acc

In [5]:
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5)
# NOTE(review): `criterion` is not used by `train()`, which takes the loss from
# the model output. Kept in case other cells rely on it.
criterion = torch.nn.CrossEntropyLoss()

num_epochs = 3

for epoch in tqdm(range(num_epochs)):
    train_loss, train_acc = train(model, train_loader, optimizer)
    # `tqdm.write` prints the message without overlapping the progress bar,
    # which a bare `print` inside the loop does.
    tqdm.write(f'Epoch {epoch+1}: train_loss {train_loss}, train_acc {train_acc}')

 33%|███▎      | 1/3 [13:17<26:34, 797.13s/it]

Epoch 1: train_loss 0.47478451935781374, train_acc 0.7657777777777778


 67%|██████▋   | 2/3 [26:32<13:16, 796.16s/it]

Epoch 2: train_loss 0.3480254902193944, train_acc 0.8452777777777778


100%|██████████| 3/3 [39:48<00:00, 796.13s/it]

Epoch 3: train_loss 0.2840343111505111, train_acc 0.8778333333333334





In [11]:
# Classify a single review with the fine-tuned model.
# The variables are named `review_text` / `encoded_review` rather than `input`
# so the builtin `input()` stays usable in the kernel, and so one name is not
# reused for both a string and the tokenizer output.
review_text = "실제 역사를 거의 그대로 다루다보니 보면서 화가 나긴 하지만 영화적으로 흥미롭게 구성을 잘해서 몰입도가 높아요. 몇 번을 봐도 볼 때마다 새롭게 보이는 면이 있구요. 의미와 재미를 다 잡은 작품이에요!"
encoded_review = tokenizer(review_text, padding = True, truncation = True, return_tensors = "pt")

# Switch to evaluation mode and disable gradient tracking for inference.
model.eval()
with torch.no_grad():
    input_ids = encoded_review['input_ids'].to(DEVICE)
    # Pass the attention mask as well, matching how the model is called in
    # `train()`; this also stays correct if padded inputs are used later.
    attention_mask = encoded_review['attention_mask'].to(DEVICE)
    outputs = model(input_ids, attention_mask = attention_mask)
    # Index of the highest-scoring class for each input.
    # NOTE(review): presumably 1 = positive; confirm against the dataset's label mapping.
    preds = torch.argmax(outputs.logits, dim = 1)
    print(preds)


tensor([1], device='cuda:0')
