In [None]:
# !poetry export --without-hashes --dev -f requirements.txt -o requirements-dev.txt
!pip install -U pip
# chikkapy depends on dartsclone
!pip install dartsclone
!pip install -r requirements-dev.txt

In [1]:
import pandas as pd
from io import BytesIO
from urllib.request import urlopen
import gc
import torch
# Reset GPU memory left over from earlier runs in this kernel: collect unreachable
# Python objects first, then ask PyTorch to release its cached CUDA memory.
# https://stackoverflow.com/questions/54374935/how-to-fix-this-strange-error-runtimeerror-cuda-error-out-of-memory
gc.collect()
torch.cuda.empty_cache()

In [70]:
# Download the WRIME TSV (one row per sentence, with writer/reader emotion scores).
# The HTTP response is opened in a `with` block so the connection is always closed,
# even if the download or the parsing fails.
WRIME_URL = "https://github.com/ids-cv/wrime/raw/master/wrime.tsv"
with urlopen(WRIME_URL) as response:
    df = pd.read_csv(BytesIO(response.read()), sep="\t")

# Show one random row as a quick sanity check of the schema.
df.sample()

Unnamed: 0,Sentence,UserID,Datetime,Train/Dev/Test,Writer_Joy,Writer_Sadness,Writer_Anticipation,Writer_Surprise,Writer_Anger,Writer_Fear,...,Reader3_Disgust,Reader3_Trust,Avg. Readers_Joy,Avg. Readers_Sadness,Avg. Readers_Anticipation,Avg. Readers_Surprise,Avg. Readers_Anger,Avg. Readers_Fear,Avg. Readers_Disgust,Avg. Readers_Trust
17254,花粉症の薬はじめました,21,2020/01/16 08:25,train,0,1,2,0,0,0,...,1,0,0,0,1,1,0,0,0,0


In [71]:
# This dataset has several emotion labels per sentence. For the sake of simplicity,
# the multi-label task is reduced to a binary classification on a single emotion.
# To choose it, count for each averaged reader label how many sentences have a
# score above zero. `Joy` is among the most frequent labels (Anticipation is
# slightly more frequent) and is the one used as the target below.
reader_avg_cols = [c for c in df.columns if c.startswith("Avg. Readers_")]

# Vectorised comparison instead of `applymap` + lambda: same counts, no per-element
# Python call, and no dependency on the deprecated `DataFrame.applymap`.
(df[reader_avg_cols] > 0).sum()

Avg. Readers_Joy             13317
Avg. Readers_Sadness         11194
Avg. Readers_Anticipation    13831
Avg. Readers_Surprise        10852
Avg. Readers_Anger            1429
Avg. Readers_Fear             9114
Avg. Readers_Disgust          7521
Avg. Readers_Trust            1918
dtype: int64

In [122]:
import numpy as np

# Binary target: 1 when the averaged reader "Joy" score is positive, else 0.
# Stored as a column vector of shape (n_samples, 1).
joy_is_positive = df["Avg. Readers_Joy"] > 0
targets = joy_is_positive.astype("int64").to_numpy().reshape(-1, 1)

# Plain NumPy views of the columns used for stratification and tokenisation.
user_ids = df["UserID"].to_numpy()
sentences = df["Sentence"].to_numpy()

In [125]:
from sklearn.model_selection import train_test_split

seed = 2021

# Positional row numbers (not index labels): they are used below to index the
# NumPy arrays `user_ids` / `targets` and, later, the `tokens` list, all of which
# are position-based. With the default RangeIndex both are the same values.
record_idx = list(range(len(df)))

# First hold out 20% as the test set, then 20% of the remainder as the validation
# set (64% / 16% / 20% overall). Both splits are stratified by user id.
_train_idx, test_idx = train_test_split(
    record_idx, test_size=0.2, random_state=seed, stratify=user_ids)
train_idx, valid_idx = train_test_split(
    _train_idx, test_size=0.2, random_state=seed, stratify=user_ids[_train_idx])

y_train = targets[train_idx]
y_valid = targets[valid_idx]
y_test = targets[test_idx]

In [126]:
import torch
import os

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# tips to speed up: let cuDNN benchmark and pick its kernels
torch.backends.cudnn.benchmark = True
# https://huggingface.co/cl-tohoku/bert-base-japanese-v2
MODEL_TYPE = "cl-tohoku/bert-base-japanese-v2"
MAX_LENGTH = 128  # every sentence is padded/truncated to this many tokens
LEARNING_RATE = 1e-5
WARM_UP_RATIO = 0.1  # fraction of all training steps used for LR warm-up
BATCH_SIZE = 64  # It's the best deal for GPU(T4). For more batch_size, stronger GPU.
N_EPOCHS = 3  # short run; the alternative value previously noted here was 50
# Leave one core free. `os.cpu_count()` may return None when the count cannot be
# determined, and the result must never be negative.
NUM_WORKERS = max((os.cpu_count() or 1) - 1, 0)

In [127]:
from tqdm.auto import tqdm
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

# Encode every sentence once up front; each encoding is padded/truncated to
# exactly MAX_LENGTH tokens so the batches can be stacked without a collate step.
_encode_options = dict(
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
)
tokens = [tokenizer.encode_plus(text, **_encode_options) for text in tqdm(sentences)]

# Pick the encodings belonging to each split by row position.
train_tokens = list(map(tokens.__getitem__, train_idx))
valid_tokens = list(map(tokens.__getitem__, valid_idx))
test_tokens = list(map(tokens.__getitem__, test_idx))

# Vocabulary size, displayed as the cell output.
len(tokenizer.get_vocab())

  0%|          | 0/43200 [00:00<?, ?it/s]

32768

In [128]:
class WrimeDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenised sentences with their targets.

    Args:
        tokens: sequence of tokenizer encodings; each item must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        targets: sequence of labels, aligned with ``tokens`` by position.
    """

    # Encoding fields copied into every sample, in output order.
    _TOKEN_FIELDS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, tokens, targets):
        self.tokens = tokens
        self.targets = targets

    def __len__(self):
        """Number of samples (one per encoding)."""
        return len(self.tokens)

    def __getitem__(self, index):
        """Return sample ``index`` as a dict of tensors; ``target`` is float."""
        encoding = self.tokens[index]
        sample = {name: torch.tensor(encoding[name]) for name in self._TOKEN_FIELDS}
        sample["target"] = torch.tensor(self.targets[index]).float()
        return sample

In [133]:
# Note: despite the `ds_` prefix these are DataLoaders (batched iterators), not datasets.
# `num_workers=NUM_WORKERS, pin_memory=True` can additionally be passed to each
# loader to parallelise loading and speed up host-to-GPU copies.

# Training loader: reshuffle every epoch so the batches differ between epochs and
# `drop_last=True` does not discard the same trailing samples each time.
ds_train = torch.utils.data.DataLoader(
    WrimeDataset(train_tokens, y_train),
    batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Evaluation loaders: keep the original order (predictions are compared against
# `y_valid` / `y_test` by position) and keep every sample.
ds_valid = torch.utils.data.DataLoader(
    WrimeDataset(valid_tokens, y_valid),
    batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
ds_test = torch.utils.data.DataLoader(
    WrimeDataset(test_tokens, y_test),
    batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

# iter(ds_test).__next__()

In [130]:
from torch import nn
from transformers import (
    BertConfig, AutoModel, AdamW, get_cosine_schedule_with_warmup,
)


class BertClassifier(nn.Module):
    """Pretrained BERT encoder followed by a two-layer classification head.

    Args:
        model_type: Hugging Face model name or path of the pretrained checkpoint.
        tokenizer: tokenizer matching the checkpoint; its ``vocab_size`` is
            written into the model configuration.
        num_classes: number of output logits produced per sample.
    """

    def __init__(self, model_type, tokenizer, num_classes):
        super().__init__()
        # Load the checkpoint's own configuration. `BertConfig(model_type)` would
        # pass the model name as the first positional argument (`vocab_size`) and
        # silently leave every other setting at the library default.
        config = BertConfig.from_pretrained(model_type)
        config.vocab_size = tokenizer.vocab_size
        self.bert = AutoModel.from_pretrained(model_type, config=config)
        self.fc1 = nn.Linear(config.hidden_size, config.hidden_size)
        self.fc2 = nn.Linear(config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw logits (no sigmoid applied), one row per input sample."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used here.
        # https://stackoverflow.com/a/67352953/9489217
        _, h = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)
        # Functional ReLU: avoids constructing a new nn.ReLU module on every call.
        h = nn.functional.relu(h)
        h = self.fc1(h)
        h = nn.functional.relu(h)
        h = self.fc2(h)
        return h


# Binary task, so the head produces a single logit per sentence.
model = BertClassifier(MODEL_TYPE, tokenizer=tokenizer, num_classes=1).to(device)

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch
# times epochs; the first WARM_UP_RATIO share of those steps is warm-up.
num_train_steps = len(ds_train) * N_EPOCHS
num_warmup_steps = int(num_train_steps * WARM_UP_RATIO)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps,
)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [131]:
import numpy as np

loss_fn = nn.BCEWithLogitsLoss()

def train_loop(ds, model, optimizer, scheduler, device):
    """Run one training pass over ``ds``, updating ``model`` after every batch.

    Args:
        ds: iterable of batches (with ``len``); each batch is a mapping holding
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``target``.
        model: module called as ``model(input_ids, attention_mask, token_type_ids)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        ``(learning_rates, losses, model)``: the mean learning rate over the
        optimizer's parameter groups recorded after each scheduler step, the loss
        of each batch, and the same model object that was passed in.
    """
    batch_keys = ("input_ids", "attention_mask", "token_type_ids", "target")
    lr_history = []
    loss_history = []

    model.train()
    optimizer.zero_grad()
    for batch in tqdm(ds, total=len(ds)):
        input_ids, attention_mask, token_type_ids, target = (
            batch[key].to(device) for key in batch_keys
        )

        # Clear gradients from the previous batch before the new backward pass.
        optimizer.zero_grad()

        logits = model(input_ids, attention_mask, token_type_ids)
        batch_loss = loss_fn(logits, target)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        lr_history.append(np.mean([group["lr"] for group in optimizer.param_groups]))
        loss_history.append(batch_loss.item())

    return lr_history, loss_history, model

def test_loop(ds, model, device):
    losses, predicts = [], []
    model.eval()
    for row in tqdm(ds, total=len(ds)):
        input_ids = row["input_ids"].to(device)
        attention_mask = row["attention_mask"].to(device)
        token_type_ids = row["token_type_ids"].to(device)
        target = row["target"].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask, token_type_ids)

        loss = loss_fn(output, target)
        losses.append(loss.item())
        predicts += output.sigmoid().cpu().tolist()
    return predicts, np.array(losses).mean()

In [None]:
from torch.utils.tensorboard import SummaryWriter

# Directory that receives this experiment's TensorBoard event files.
log_dir = "runs/baseline_exp1"
# Writer used by the training cell to log per-epoch scalars.
writer = SummaryWriter(log_dir=log_dir)

In [None]:
# (Re)load the TensorBoard notebook extension and open it on the log directory.
%reload_ext tensorboard
%tensorboard --logdir $log_dir

In [None]:
from tensorboard import notebook
# Show the TensorBoard instances known to this notebook session.
notebook.list()

In [None]:
# notebook.display(port=6006, height=1000)

In [134]:
from sklearn.metrics import accuracy_score

def _to_labels(probabilities, threshold=0.5):
    """Turn the sigmoid probabilities from `test_loop` into flat 0/1 class labels."""
    return (np.asarray(probabilities).ravel() >= threshold).astype(int)


# Train for N_EPOCHS; after every epoch evaluate on the validation and test splits
# and log the metrics to TensorBoard.
for epoch in range(N_EPOCHS):
    print(f"Epoch-{epoch}")
    _, train_losses, _ = train_loop(ds_train, model, optimizer, scheduler, device)
    train_loss = float(np.mean(train_losses))

    # `test_loop` returns probabilities; `accuracy_score` needs class labels, so
    # threshold them first (it rejects a mix of binary and continuous targets).
    y_prob, val_loss = test_loop(ds_valid, model, device)
    y_pred = _to_labels(y_prob)
    val_acc = accuracy_score(y_valid.ravel(), y_pred)

    y_prob, test_loss = test_loop(ds_test, model, device)
    y_pred = _to_labels(y_prob)
    test_acc = accuracy_score(y_test.ravel(), y_pred)

    # Each metric is logged under the split it was actually computed on.
    writer.add_scalar("Loss/train", train_loss, epoch)
    writer.add_scalar("Loss/valid", val_loss, epoch)
    writer.add_scalar("Loss/test", test_loss, epoch)
    writer.add_scalar("Accuracy/valid", val_acc, epoch)
    writer.add_scalar("Accuracy/test", test_acc, epoch)

    print(f"\tvalid:\tloss={val_loss}/score={val_acc}")
    print(f"\ttest:\tloss={test_loss}/score={test_acc}")