## BERTでベースラインを組む(test_pred)

In [1]:
import warnings; warnings.simplefilter("ignore")

import gc, os, random
from tqdm import tqdm
import pandas as pd
import numpy as np

from glob import glob

import transformers
from transformers import (
    BertJapaneseTokenizer, BertForSequenceClassification, 
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification, 
    Trainer, TrainingArguments, EvalPrediction, AdamW
)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from config import *
from myutils import *


****** SEED fixed : 42 ******




In [2]:
# Specify the folder path of the run_id whose test predictions we want to create.
# Turning this into a script might make this a non-issue, but when running split
# notebooks it feels wrong if the load location is not uniquely determined.
# => Writing the whole config inside the notebook and dumping it to JSON instead might work.

# Then again, wouldn't it be better to convert the baseline code into a .py module?
# It would also be reusable later...
run_id = "tmp"
# Folder that the later cells read checkpoints / test_df from and write submission.csv to.
output_path = f"./output/{run_id}/"

In [3]:
# --- Run configuration -------------------------------------------------------
# Only a subset of these values is read by the cells below (model_name,
# test_batch_size, max_length, num_classes, folds, device); the rest mirror the
# training setup.
epochs = 1
folds = 5
model_name = r"cl-tohoku/bert-base-japanese-whole-word-masking"
train_batch_size = 32
valid_batch_size = 64
test_batch_size = 64
max_length = 76  # token length every text is padded / truncated to

# --- Optimizer / scheduler ---------------------------------------------------
learning_rate = 1e-6
scheduler_name = "CosineAnnealingLR"
min_lr = 1e-7
# Fixed: the original `T_max = 500,` had a trailing comma, which silently made
# this the tuple (500,) instead of the integer 500.
T_max = 500
weight_decay = 1e-6
max_grad_norm = 1.0
n_accumulate = 1
num_classes = 2
n_fold = 5  # same value as `folds` above; both names are kept for compatibility
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# --- Model dimensions --------------------------------------------------------
hidden_size = 768
# NOTE(review): not referenced by the cells in this notebook; BERT-base
# checkpoints normally have 12 hidden layers, so 24 looks suspicious — confirm.
num_hidden_layers = 24
dropout = 0.2

In [4]:
# Collect every checkpoint saved for this run, in lexicographic order.
model_paths = sorted(glob(f"{output_path}*.pth"))

In [5]:
# Show the checkpoint files that were found for this run.
model_paths

['./output/tmp/model-fold0.pth']

In [6]:
# Load the test DataFrame stored as a feather file in the run's output folder.
test_df = pd.read_feather(f"{output_path}test_df.feather")

In [7]:
# Build the tokenizer from the same checkpoint name as the model (`model_name`)
# so the two cannot drift apart if the config cell is edited.
# `dic_neologd` is not assigned in this notebook's visible cells — presumably it
# is provided by `from config import *`; confirm.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    mecab_kwargs={"mecab_dic": None, "mecab_option": f"-d {dic_neologd}"}
)

In [8]:
class HateSpeechDataset(Dataset):
    """Dataset that tokenizes one text row per item.

    Args:
        df: DataFrame holding the texts (and the label column when training).
        tokenizer: object exposing ``encode_plus``; its result is indexed with
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: length passed to the tokenizer for truncation / padding.
        num_classes: size of the one-hot target vector.
        text_col: name of the column containing the text.
        isTrain: when True each item also carries a one-hot ``"target"``.

    Items are dicts of tensors: ``input_ids`` and ``attention_mask`` (long),
    plus ``target`` (float, one-hot) in training mode.
    """

    def __init__(self, df, tokenizer, max_length, num_classes, text_col="text", isTrain=True):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.num_classes = num_classes
        self.isTrain = isTrain
        self.text = df[text_col].values
        # `label_name` is a module-level name that is not defined in this cell;
        # it is only looked up in training mode. Inference gets a zero placeholder.
        self.target = df[label_name].values if isTrain else np.zeros(df.shape[0])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
        )

        item = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask")
        }
        if not self.isTrain:
            return item

        # Training mode: append the label as a one-hot float vector.
        label = int(self.target[index])
        one_hot = np.zeros(self.num_classes, dtype=np.float32)
        one_hot[label] = 1.0
        item["target"] = torch.tensor(one_hot, dtype=torch.float)
        return item

In [9]:
class HateSpeechModel(nn.Module):
    """Pretrained encoder followed by dropout, a linear head and a sigmoid.

    Args:
        model_name: checkpoint name / path passed to ``AutoModel.from_pretrained``.
        num_classes: number of output units of the linear head.
        dropout: dropout probability applied before the head (default 0.2,
            the value that was previously hard-coded).

    The submodule names (``model``, ``fc``) are unchanged, so state dicts saved
    from the previous definition still load.
    """

    def __init__(self, model_name, num_classes, dropout=0.2):
        super(HateSpeechModel, self).__init__()
        self.model = AutoModel.from_pretrained(
            model_name,
            output_attentions=True,
            output_hidden_states=True,
            )
        self.dropout = nn.Dropout(p=dropout)
        # Take the head's input width from the encoder config instead of a
        # hard-coded 768 so checkpoints with another hidden size also work.
        self.fc = nn.Linear(self.model.config.hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return per-class sigmoid scores for a batch.

        Returns a ``(batch, num_classes)`` tensor, or ``(batch,)`` when the
        head has a single output unit.
        """
        out = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=False)
        # out[1]: for BERT-style encoders this is the pooled output (see the
        # Transformers BertModel docs).
        out = self.dropout(out[1])
        outputs = self.sigmoid(self.fc(out))

        # The previous bare `.squeeze()` also removed the batch axis when a
        # batch held a single sample, so the last batch of a loader could come
        # back with a different rank and break `np.concatenate` downstream.
        # Only the class axis is squeezed now, and only for single-unit heads.
        if self.fc.out_features == 1:
            return outputs.squeeze(-1)
        return outputs

In [10]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run `model` over every batch of `dataloader` and return the predictions.

    Args:
        model: module called as ``model(input_ids, attention_mask)``; it is
            assumed to already live on `device` (it is not moved here).
        dataloader: iterable of dicts with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        device: device the batch tensors are moved to.

    Returns:
        numpy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()

    batch_preds = []
    for batch in tqdm(dataloader, total=len(dataloader)):
        ids = batch["input_ids"].to(device, dtype=torch.long)
        mask = batch["attention_mask"].to(device, dtype=torch.long)

        scores = model(ids, mask)
        batch_preds.append(scores.detach().cpu().numpy())

    stacked = np.concatenate(batch_preds)
    gc.collect()

    return stacked

In [11]:
def inference(model_name, num_classes, model_paths, dataloader, device):
    """Predict with one or more saved checkpoints and average the results.

    Args:
        model_name: checkpoint name used to build each ``HateSpeechModel``.
        num_classes: number of output units of the model head.
        model_paths: a single checkpoint path (str / path-like) or an iterable
            of checkpoint paths. Each file must hold a dict with a
            ``"model_state_dict"`` entry.
        dataloader: loader passed to ``valid_fn``.
        device: device the model runs on.

    Returns:
        numpy array: element-wise mean of the predictions over all checkpoints.

    Raises:
        ValueError: if `model_paths` contains no path.
    """
    # The previous version iterated over `[model_paths]` and loaded
    # `model_paths` instead of the loop variable, so it only worked for a single
    # path string. A single path is still accepted for backward compatibility.
    if isinstance(model_paths, (str, os.PathLike)):
        model_paths = [model_paths]
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths is empty: no checkpoint to run inference with")

    final_preds = []

    for i, path in enumerate(model_paths):
        model = HateSpeechModel(model_name=model_name, num_classes=num_classes)
        model.to(device)
        # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
        checkpoint = torch.load(path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)

        # Release this fold's weights before the next model is built.
        del model, checkpoint
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    return final_preds

In [12]:
# Inference-mode dataset over the cleaned text column (no targets are returned).
test_dataset = HateSpeechDataset(
    df=test_df,
    tokenizer=tokenizer,
    max_length=max_length,
    num_classes=num_classes,
    text_col="clean_text",
    isTrain=False,
)

In [13]:
# Unshuffled loader so predictions stay in the same row order as test_df.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [14]:
# Predict with every checkpoint that was actually found and average the folds.
# The previous loop ran over range(folds) but stopped after the first fold via
# `break`; without the break it would raise IndexError whenever fewer than
# `folds` checkpoints exist (only one was found for this run).
if not model_paths:
    raise FileNotFoundError(f"No .pth checkpoints found under {output_path}")

fold_preds = [
    inference(model_name, num_classes, path, test_loader, device)
    for path in model_paths
]
preds = np.mean(fold_preds, axis=0)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 1


100%|██████████| 51/51 [00:03<00:00, 12.89it/s]


In [17]:
# Inspect the prediction array produced by the inference cell.
preds

array([[0.81462973, 0.18600619],
       [0.72755533, 0.20633605],
       [0.8023047 , 0.19576396],
       ...,
       [0.8148778 , 0.19981457],
       [0.8307979 , 0.18185726],
       [0.82658386, 0.19169123]], dtype=float32)

In [19]:
# Hard label per row = index of the highest score. The column is called "oof"
# even though it holds test predictions; the name is kept because the
# submission merge cell reads it.
test_df["oof"] = preds.argmax(axis=1)

In [22]:
# Display the test frame, which now includes the "oof" prediction column.
test_df

Unnamed: 0,index,id,source,text,label,clean_text,oof
0,5256,001026808,news4vip,上でも言ったけどオタクレベルの知識求めてる訳じゃない\nただ囲碁やります！って人が誰1人プロ...,,上でも言ったけどオタクレベルの知識求めてる訳じゃないただ囲碁やります!って人が誰1人プロ棋士...,0
1,5257,00465ac96,livejupiter,たとえば、黒人なんかは、生物学的欠陥はないのに、文化的要因で、悪循環に陥り、実力をつけられず...,,たとえば黒人なんかは生物学的欠陥はないのに文化的要因で悪循環に陥り実力をつけられずに生きてき...,0
2,5258,004674725,livejupiter,そうなんやろなあ色々と勿体ない感じしたわ\n終わり方と黒幕キャラは好きやったで\n\nちなワ...,,そうなんやろなあ色と勿体ない感じしたわ終わり方と黒幕キャラは好きやったでちなワイはダークナイ...,0
3,5259,00474460f,news4vip,法的というか自治体ごとにバラバラの条例で定めてるだけだからな\n普通の淫行条例だと「青少年に...,,法的というか自治体ごとにバラバラの条例で定めてるだけだからな普通の淫行条例だと青少年に淫らな...,0
4,5260,004a7525c,newsplus,別のジャーナリストの感想として言われてるので客観的な事実とは言えないけど、\n現地は不測の事...,,別のジャーナリストの感想として言われてるので客観的な事実とは言えないけど現地は不測の事態が起...,0
...,...,...,...,...,...,...,...
3218,8474,ffc4647ac,news4vip,１人がいいのか？\nなんで変なのと同棲したのか…\nなにがしたいんだ…,,1人がいいのか?なんで変なのと同棲したのか…なにがしたいんだ…,0
3219,8475,ffc6554ba,newsplus,ロシアもだなあ\n元々北朝鮮はロシアの工作で作られた国だから,,ロシアもだなあ元北朝鮮はロシアの工作で作られた国だから,0
3220,8476,ffd3b29c2,newsplus,クネが国境に拡声器を設置して昼も夜も北の悪口鳴らしてんだとよ\nお互い当たらないように大砲撃...,,クネが国境に拡声器を設置して昼も夜も北の悪口鳴らしてんだとよお互い当たらないように大砲撃ち合...,0
3221,8477,ffd3c69b6,news4vip,当然って言い方が腹立つんだよなあ\r\nその時点で何か男より優位に立ちたいみたいな感じがして...,,当然って言い方が腹立つんだよなあその時点で何か男より優位に立ちたいみたいな感じがしてくるんだ...,0


In [30]:
# Load the sample submission file. `data_path` is not assigned in this
# notebook's visible cells — presumably it comes from `from config import *`; confirm.
submission = pd.read_csv(f"{data_path}sample_submission.csv")

In [31]:
# Replace the placeholder "label" column of the sample submission with the
# predicted labels, matched on "id".
# validate="many_to_one" makes pandas raise if test_df contains duplicate ids,
# which would otherwise silently multiply submission rows.
submission = (
    pd.merge(
        submission,
        test_df.loc[:, ["id", "oof"]],
        how="left",
        on="id",
        validate="many_to_one",
    )
    .drop(["label"], axis=1)
    .rename(columns={"oof": "label"})
)

# A left merge leaves NaN (and a float column) for ids without a prediction;
# fail loudly instead of writing an incomplete submission.
missing_label = submission["label"].isna()
if missing_label.any():
    raise ValueError(
        f"{int(missing_label.sum())} submission ids have no prediction in test_df"
    )
submission["label"] = submission["label"].astype(int)

In [36]:
# Write the submission CSV into the run's output folder, without the DataFrame index.
submission.to_csv(f"{output_path}submission.csv", index=False)