## BERT系でベースラインを組む
* 実験管理ちゃんとしよう


In [1]:
import os, gc, random, time, copy
import warnings; warnings.simplefilter("ignore")
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import MeCab
import re
import demoji, mojimoji
import neologdn

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import (
    BertJapaneseTokenizer, BertForSequenceClassification, 
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification, 
    Trainer, TrainingArguments, EvalPrediction, AdamW
)

from tqdm import tqdm
from collections import defaultdict
from colorama import Fore
b_ = Fore.BLUE; y_ = Fore.YELLOW; g_ = Fore.GREEN; sr_ = Fore.RESET
from config import *
from myutils import *


****** SEED fixed : 42 ******




## BERTの実装 --

In [2]:
class HateSpeechDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with a one-hot target.

    Args:
        df: DataFrame containing the text column and the label column. The label
            column name is read from the module-level ``label_name``.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length every encoded sequence is truncated / padded to.
        num_classes: Width of the one-hot target vector.
        text_col: Name of the column holding the text to encode.
    """

    def __init__(self, df, tokenizer, max_length, num_classes, text_col="text"):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.num_classes = num_classes
        self.text = df[text_col].values
        self.target = df[label_name].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and one-hot ``target`` tensors for one row."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
        )

        # One-hot encode the integer class id as a float32 vector.
        onehot_t = torch.zeros(self.num_classes, dtype=torch.float)
        onehot_t[int(self.target[index])] = 1.0

        return {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "target": onehot_t,
        }

In [3]:
class HateSpeechModel(nn.Module):
    """Pretrained encoder followed by dropout, a linear head and a sigmoid.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output units of the classification head.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.model = AutoModel.from_pretrained(
            model_name,
            output_attentions=True,
            output_hidden_states=True,
        )
        self.dropout = nn.Dropout(p=0.2)
        # Size the head from the loaded encoder's config instead of hard-coding
        # 768, so checkpoints with a different hidden width also work.
        self.fc = nn.Linear(self.model.config.hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores of shape ``(batch, num_classes)``.

        The batch axis is always kept: squeezing the output would collapse a
        one-sample batch to ``(num_classes,)``, which no longer matches the
        ``(batch, num_classes)`` targets or the other batches when predictions
        are concatenated.
        """
        out = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=False)
        # out[1] is the second element of the encoder output; presumably the
        # pooled [CLS] representation for BERT-style encoders -- TODO confirm
        # when switching to a different architecture.
        out = self.dropout(out[1])
        return self.sigmoid(self.fc(out))

In [4]:
def prepare_loaders(df, fold, tokenizer, trn_batch_size, val_batch_size, max_length, num_classes, text_col="text"):
    """Build the train / validation DataLoaders for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other rows
    form the training set. The training loader shuffles and drops the last
    incomplete batch; the validation loader keeps the row order.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer, max_length=max_length, num_classes=num_classes, text_col=text_col
    )
    loader_kwargs = dict(num_workers=2, pin_memory=True)

    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = HateSpeechDataset(train_rows, **dataset_kwargs)
    valid_dataset = HateSpeechDataset(valid_rows, **dataset_kwargs)

    train_loader = DataLoader(
        train_dataset, batch_size=trn_batch_size, shuffle=True, drop_last=True, **loader_kwargs
    )
    valid_loader = DataLoader(
        valid_dataset, batch_size=val_batch_size, shuffle=False, **loader_kwargs
    )
    return train_loader, valid_loader

In [5]:
def criterion(outputs, targets):
    """Mean binary cross-entropy between probabilities ``outputs`` and ``targets``."""
    return nn.functional.binary_cross_entropy(outputs, targets)

In [6]:
def fetch_scheduler(scheduler, optimizer, T_max=500, eta_min=1e-7):
    """Return a ``CosineAnnealingLR`` scheduler for ``optimizer``.

    ``"CosineAnnealingLR"`` is the only recognised name. Any other value prints
    a notice and still yields the same cosine-annealing scheduler.
    """
    if scheduler != "CosineAnnealingLR":
        print(f"*** *** NOT implemented *** *** ")
    return lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)

In [7]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch, n_accumulate):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        optimizer: Optimizer stepped every ``n_accumulate`` batches.
        scheduler: LR scheduler stepped together with the optimizer, or ``None``.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask``, ``target``.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.
        n_accumulate: Number of batches whose gradients are summed per update.

    Returns:
        Sample-weighted mean of the (unscaled) batch losses; ``0.0`` if the
        loader yields no batches.
    """
    model.train()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # defined up front so an empty loader does not raise
    n_steps = len(dataloader)

    bar = tqdm(enumerate(dataloader), total=n_steps)
    for step, data in bar:
        input_ids = data["input_ids"].to(device, dtype=torch.long)
        attention_mask = data["attention_mask"].to(device, dtype=torch.long)
        targets = data["target"].to(device, dtype=torch.float)

        batch_size = input_ids.size(0)

        outputs = model(input_ids, attention_mask)

        loss = criterion(outputs, targets)
        # Keep the unscaled value for reporting so the train loss stays
        # comparable with the validation loss for any n_accumulate.
        batch_loss = loss.item()
        # `np.float` was removed in NumPy 1.24; the builtin float is equivalent.
        (loss / float(n_accumulate)).backward()

        # Also update on the final batch so a trailing, incomplete accumulation
        # window is applied instead of leaking its gradients into the next epoch.
        is_last_step = (step + 1) == n_steps
        if (step + 1) % n_accumulate == 0 or is_last_step:
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += batch_loss * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"])

    gc.collect()
    return epoch_loss

In [8]:
@torch.no_grad()
def valid_one_epoch(model, optimizer, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without gradients and return the mean loss.

    ``optimizer`` is only read to display the current learning rate in the
    progress bar; ``epoch`` is only displayed. The returned value is the
    sample-weighted mean of the batch losses.
    """
    model.eval()

    loss_sum, n_seen = 0.0, 0

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, batch in progress:
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)
        labels = batch["target"].to(device, dtype=torch.float)

        n_batch = ids.size(0)
        batch_loss = criterion(model(ids, mask), labels)

        # Weight each batch by its size so a smaller final batch is not over-counted.
        loss_sum += batch_loss.item() * n_batch
        n_seen += n_batch
        epoch_loss = loss_sum / n_seen

        progress.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"])

    gc.collect()
    return epoch_loss

In [9]:
def run_training(model, train_loader, valid_loader, optimizer, scheduler, n_accumulate, device, num_epochs, fold, output_path):
    """Train for ``num_epochs`` epochs, checkpointing whenever validation loss does not get worse.

    Each time the validation loss is less than or equal to the best seen so far,
    the weights are remembered and a checkpoint dict (epoch, model state,
    optimizer state, loss) is written to ``{output_path}model-fold{fold}.pth``.

    Returns:
        Tuple ``(model, history)``: the model with the best weights loaded, and
        a dict of per-epoch ``"Train Loss"`` / ``"Valid Loss"`` lists.
    """
    if torch.cuda.is_available():
        print(f"[INFO] Using GPU : {torch.cuda.get_device_name()}\n")

    started_at = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)
    checkpoint_file = f"{output_path}model-fold{fold}.pth"

    for epoch in range(1, num_epochs+1):
        gc.collect()

        train_epoch_loss = train_one_epoch(
            model, optimizer, scheduler,
            dataloader=train_loader, device=device, epoch=epoch, n_accumulate=n_accumulate,
        )
        valid_epoch_loss = valid_one_epoch(
            model, optimizer,
            dataloader=valid_loader, device=device, epoch=epoch,
        )

        history["Train Loss"].append(train_epoch_loss)
        history["Valid Loss"].append(valid_epoch_loss)

        # Nothing to save unless this epoch matches or beats the best loss.
        if not valid_epoch_loss <= best_epoch_loss:
            continue

        print(f"{b_}Valid Loss Improved : {best_epoch_loss:.6f} ---> {valid_epoch_loss:.6f}")
        best_epoch_loss = valid_epoch_loss
        best_model_wts = copy.deepcopy(model.state_dict())

        # Resuming training later needs more than the model weights, so the
        # optimizer state, epoch and loss are stored alongside them.
        checkpoint = {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": valid_epoch_loss,
        }
        torch.save(checkpoint, checkpoint_file)
        print(f"Model Saved{sr_}"); print()

    time_elapsed = time.time() - started_at
    hours, remainder = divmod(time_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print("Training Complete in {:.0f}h {:.0f}m {:.0f}s".format(hours, minutes, seconds))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    model.load_state_dict(best_model_wts)

    return model, history

## メインの実行セル --

In [10]:
# --- Run identification / output location ---
run_id = "tmp"
output_path = f"./output/{run_id}/"

# --- Data / model settings ---
epochs = 1
folds = 5
model_name = r"cl-tohoku/bert-base-japanese-whole-word-masking"
train_batch_size = 32
valid_batch_size = 64
max_length = 76

# --- Optimisation settings ---
learning_rate = 1e-6
scheduler_name = "CosineAnnealingLR"
min_lr = 1e-7
# Must be a plain int: the previous `500,` (trailing comma) created the tuple (500,).
T_max = 500
weight_decay = 1e-6
max_grad_norm = 1.0
n_accumulate = 1
num_classes = 2
n_fold = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): the values below are not referenced by the cells visible in this
# notebook -- confirm whether they are still needed.
hidden_size = 768
num_hidden_layers = 24
dropout = 0.2

In [11]:
# Read both splits and stack them (train rows first) under a fresh 0..N-1 index
# so the same text cleaning can be applied to both in one pass.
train = pd.read_csv(f"{data_path}train.csv")
test = pd.read_csv(f"{data_path}test.csv")

df = pd.concat([train, test], ignore_index=True)
df.head(2)

Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0.0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0


In [12]:
# Apply clean_text to every raw text and keep the result in a separate column.
df["clean_text"] = df["text"].map(clean_text)

In [13]:
# Split the combined frame back into its train / test parts by row position.
# .copy() makes both parts independent frames: later cells add columns to
# train_df, which must not be a write into a slice of df.
train_df = df.loc[:train.shape[0]-1, :].copy()
test_df = df.loc[train.shape[0]:, :].copy()

In [14]:
# Give every training row the number of the fold in which it is held out.
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

train_df["kfold"] = -1
for fold, (_, val_index) in enumerate(skf.split(X=train_df, y=train_df[label_name])):
    # split() yields positional indices; translate them to index labels so the
    # assignment stays correct even if train_df's index is not 0..N-1.
    train_df.loc[train_df.index[val_index], "kfold"] = fold
train_df["kfold"] = train_df["kfold"].astype(int)

In [15]:
# Load the tokenizer from the same checkpoint as the model (model_name) so the
# two cannot drift apart when the checkpoint is changed in the config cell.
# The MeCab options pass the dictionary directory given by dic_neologd.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    mecab_kwargs={"mecab_dic":None, "mecab_option": f"-d {dic_neologd}"}
)

In [16]:
# Create the run directory together with any missing parent (e.g. ./output);
# os.mkdir alone fails when the parent directory does not exist yet.
os.makedirs(output_path, exist_ok=True)

for fold in range(0, folds):
    print(f"{y_} ====== Fold: {fold} ======{sr_}")

    # Create DataLoader --
    train_loader, valid_loader = prepare_loaders(
        df=train_df,
        tokenizer=tokenizer,
        fold=fold,
        trn_batch_size=train_batch_size,
        val_batch_size=valid_batch_size,
        max_length=max_length,
        num_classes=num_classes,
        text_col="clean_text"
    )

    # Model construct --
    model = HateSpeechModel(model_name=model_name, num_classes=num_classes)
    model.to(device)

    # Define Optimizer and Scheduler --
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # NOTE(review): T_max / min_lr from the config cell are not forwarded here,
    # so fetch_scheduler's own defaults apply.
    scheduler = fetch_scheduler(optimizer=optimizer, scheduler=scheduler_name)

    model, history = run_training(
        model, train_loader, valid_loader, optimizer, scheduler, n_accumulate, device, epochs, fold, output_path
    )

    # Release the fold's model and loaders before the next fold is built.
    del model, history, train_loader, valid_loader
    _ = gc.collect()

    # Only the first fold is trained; remove this break to train every fold.
    break



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:15<00:00,  8.69it/s, Epoch=1, LR=8.56e-7, Train_Loss=0.449]
100%|██████████| 17/17 [00:01<00:00, 14.96it/s, Epoch=1, LR=8.56e-7, Valid_Loss=0.311]


[34mValid Loss Improved : inf ---> 0.311017
Model Saved[39m

Training Complete in 0h 0m 21s
Best Loss: 0.3110


## Validation --

In [17]:
from glob import glob
# Collect the saved checkpoints in sorted order.
model_paths = sorted(glob(f"{output_path}*.pth"))

In [18]:
# Display the checkpoint paths that were found.
model_paths

['./output/tmp/model-fold0.pth']

In [19]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` without gradients and return all outputs.

    Returns:
        NumPy array with the per-batch model outputs concatenated along axis 0.
    """
    model.eval()  # assumes the model has already been moved to `device` by the caller

    batch_preds = []
    for _, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)

        scores = model(ids, mask)
        batch_preds.append(scores.detach().cpu().numpy())

    preds = np.concatenate(batch_preds)
    gc.collect()

    return preds

In [20]:
def inference(model_name, num_classes, model_paths, dataloader, device):
    """Average the predictions of one or more saved checkpoints.

    Args:
        model_name: Checkpoint name used to rebuild ``HateSpeechModel``.
        num_classes: Number of output units of the model head.
        model_paths: A single checkpoint path, or an iterable of paths whose
            predictions are averaged.
        dataloader: Loader passed to ``valid_fn`` for every checkpoint.
        device: Device the model and the checkpoint tensors are placed on.

    Returns:
        NumPy array: element-wise mean of the per-checkpoint predictions.

    Raises:
        ValueError: If ``model_paths`` contains no path.
    """
    # A lone path is treated as a one-element ensemble (what existing callers pass).
    if isinstance(model_paths, (str, os.PathLike)):
        model_paths = [model_paths]
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("inference() needs at least one checkpoint path")

    final_preds = []
    for i, path in enumerate(model_paths):
        model = HateSpeechModel(model_name=model_name, num_classes=num_classes)
        model.to(device)
        # Load the checkpoint of *this* iteration; map_location lets a checkpoint
        # saved on GPU be restored on whatever device is in use now.
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])

        print(f"Getting predictions for model {i+1}")
        final_preds.append(valid_fn(model, dataloader, device))

        # Drop the references before the next checkpoint is loaded.
        del model, checkpoint
        gc.collect()

    return np.mean(np.array(final_preds), axis=0)

In [21]:
from sklearn.metrics import f1_score, accuracy_score

fold_f1 = []
fold_acc = []

for fold in range(0, folds):
    print(f"{y_} ====== Fold: {fold} ======{sr_}")

    # Create DataLoader --
    train_loader, valid_loader = prepare_loaders(
        df=train_df,
        tokenizer=tokenizer,
        fold=fold,
        trn_batch_size=train_batch_size,
        val_batch_size=valid_batch_size,
        max_length=max_length,
        num_classes=num_classes,
        text_col="clean_text"
    )

    # prepare_loaders keeps the row order of the fold's validation rows, so the
    # predictions line up with the rows selected by this mask.
    valid_mask = train_df["kfold"] == fold
    valid = train_df.loc[valid_mask].copy()  # copy: a column is added below
    out = inference(model_name, num_classes, model_paths[fold], valid_loader, device)

    valid["oof"] = np.argmax(out, axis=1)

    fold_f1.append(f1_score(valid[label_name].values, valid["oof"].values))
    fold_acc.append(accuracy_score(valid[label_name].values, valid["oof"].values))

    # Write the out-of-fold predictions straight into train_df. Merging an
    # "oof" column in on every fold would create oof_x / oof_y duplicates from
    # the second fold (or a re-run of this cell) onwards.
    train_df.loc[valid_mask, "oof"] = valid["oof"].values

    # Only the first fold is evaluated; remove this break to score every fold.
    break



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 1


100%|██████████| 17/17 [00:01<00:00, 15.10it/s]


In [22]:
# Persist both frames for later analysis. reset_index(drop=False) turns the
# current index into a regular column before each frame is written as Feather.
train_df.reset_index(drop=False).to_feather(f"{output_path}train_df.feather")
test_df.reset_index(drop=False).to_feather(f"{output_path}test_df.feather")