## BERTのベースラインをスクリプトから実行できるようにリファクタリング --

In [34]:
import os
import shutil
from glob import glob

import pandas as pd
import torch

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

from transformers import AutoTokenizer, AdamW

from config import *
from bert_utils import *

In [35]:
# Identifier of this experiment; stored in the settings and used to build the output directory path.
RUN_ID = "bert-baseline"

In [36]:
# Experiment configuration, gathered in a single Series so it can be dumped
# to JSON next to the artifacts written to the run's output directory.
settings = pd.Series(dtype=object)

# project settings --
settings["run_id"] = RUN_ID
settings["output_path"] = f"{output_root}{settings.run_id}/"
settings["num_classes"] = 2

# training settings --
settings["epochs"] = 10
settings["folds"] = 5
settings["train_batch_size"] = 32
settings["valid_batch_size"] = 64
settings["test_batch_size"] = 64

# bert settings --
settings["model_name"] = r"cl-tohoku/bert-base-japanese-whole-word-masking"
settings["max_length"] = 76
settings["hidden_size"] = 768
# NOTE(review): BERT-base checkpoints normally have 12 hidden layers; confirm
# whether this value is actually consumed anywhere before relying on it.
settings["num_hidden_layers"] = 24
settings["dropout"] = 0.2

# optimizer settings --
settings["learning_rate"] = 1e-5
settings["scheduler_name"] = "CosineAnnealingLR"
# NOTE(review): min_lr / T_max are recorded here but are not passed explicitly
# to fetch_scheduler in this notebook; verify where the scheduler reads them.
settings["min_lr"] = 1e-6
settings["T_max"] = 500
settings["weight_decay"] = 1e-5
settings["n_accumulate"] = 1

# Create the run directory, including missing parents; harmless on re-runs.
os.makedirs(settings.output_path, exist_ok=True)

# Snapshot the Python scripts of the working directory next to the results.
# Done in pure Python so it does not depend on a POSIX shell and copy errors
# are raised instead of being silently ignored.
for script_path in glob("./*.py"):
    shutil.copy(script_path, settings.output_path)

# Persist the configuration used for this run.
settings.to_json(f"{settings.output_path}settings.json")

In [37]:
# Read the train and test CSV files and stack them into one frame so that
# text preprocessing can be applied to both in a single pass.
train = pd.read_csv(data_path + "train.csv")
test = pd.read_csv(data_path + "test.csv")

# Number of training rows; used later to split the stacked frame again.
train_shape = len(train)

df = pd.concat([train, test], ignore_index=True)

# The individual frames are no longer needed once stacked.
del train, test
_ = gc.collect()

display(df.head(2))

Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0.0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0


In [38]:
# Apply the project's clean_text helper to every raw text entry.
df["clean_text"] = df["text"].map(clean_text)

In [39]:
# Split the stacked frame back into its train / test parts. Explicit copies
# are taken so that adding columns below modifies independent frames instead
# of slices of `df` (which triggers pandas' SettingWithCopyWarning).
train_df = df.iloc[:train_shape].copy()
test_df = df.iloc[train_shape:].copy()

skf = StratifiedKFold(n_splits=settings.folds, shuffle=True, random_state=SEED)

# Tag every training row with the fold in which it is used for validation.
# skf.split yields positional indices, so they are translated to index
# labels before the .loc assignment.
train_df["kfold"] = -1
for fold, (_, val_index) in enumerate(skf.split(X=train_df, y=train_df[label_name])):
    train_df.loc[train_df.index[val_index], "kfold"] = fold
train_df["kfold"] = train_df["kfold"].astype(int)

In [40]:
# MeCab options forwarded to the tokenizer: no named dictionary, and a "-d"
# option built from the dic_neologd value.
mecab_kwargs = {"mecab_dic": None, "mecab_option": f"-d {dic_neologd}"}

# Load the tokenizer that belongs to the configured pretrained model.
tokenizer = AutoTokenizer.from_pretrained(settings.model_name, mecab_kwargs=mecab_kwargs)

In [41]:
# Train one model per cross-validation fold.
for fold in range(settings.folds):
    print(f"{y_} ====== Fold: {fold} ======{sr_}")

    # Build the train / validation loaders for the current fold --
    train_loader, valid_loader = prepare_loaders(
        df=train_df,
        tokenizer=tokenizer,
        fold=fold,
        text_col="clean_text",
        max_length=settings.max_length,
        num_classes=settings.num_classes,
        trn_batch_size=settings.train_batch_size,
        val_batch_size=settings.valid_batch_size,
    )

    # Start every fold from a freshly constructed model --
    model = HateSpeechModel(
        model_name=settings.model_name,
        num_classes=settings.num_classes,
    )
    model.to(device)

    # Optimizer and learning-rate scheduler for this fold --
    optimizer = AdamW(
        model.parameters(),
        lr=settings.learning_rate,
        weight_decay=settings.weight_decay,
    )
    scheduler = fetch_scheduler(optimizer=optimizer, scheduler=settings.scheduler_name)

    # Arguments are passed positionally, in the order run_training expects.
    model, history = run_training(
        model,
        train_loader,
        valid_loader,
        optimizer,
        scheduler,
        settings.n_accumulate,
        device,
        settings.epochs,
        fold,
        settings.output_path,
    )

    # Drop the per-fold objects before the next fold is set up.
    del model, history, train_loader, valid_loader
    _ = gc.collect()



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  9.08it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.251]
100%|██████████| 17/17 [00:01<00:00, 14.80it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.187]


[34mValid Loss Improved : inf ---> 0.187037
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  9.12it/s, Epoch=2, LR=4.68e-6, Train_Loss=0.14] 
100%|██████████| 17/17 [00:01<00:00, 14.83it/s, Epoch=2, LR=4.68e-6, Valid_Loss=0.12] 


[34mValid Loss Improved : 0.187037 ---> 0.119984
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  9.11it/s, Epoch=3, LR=1.18e-6, Train_Loss=0.0954]
100%|██████████| 17/17 [00:01<00:00, 14.84it/s, Epoch=3, LR=1.18e-6, Valid_Loss=0.114]


[34mValid Loss Improved : 0.119984 ---> 0.114144
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  9.09it/s, Epoch=4, LR=1.56e-7, Train_Loss=0.0764]
100%|██████████| 17/17 [00:01<00:00, 14.81it/s, Epoch=4, LR=1.56e-7, Valid_Loss=0.116]
100%|██████████| 131/131 [00:14<00:00,  9.05it/s, Epoch=5, LR=2.27e-6, Train_Loss=0.076] 
100%|██████████| 17/17 [00:01<00:00, 14.70it/s, Epoch=5, LR=2.27e-6, Valid_Loss=0.116]
100%|██████████| 131/131 [00:14<00:00,  9.01it/s, Epoch=6, LR=6.16e-6, Train_Loss=0.0812]
100%|██████████| 17/17 [00:01<00:00, 14.74it/s, Epoch=6, LR=6.16e-6, Valid_Loss=0.112]


[34mValid Loss Improved : 0.114144 ---> 0.111831
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  9.04it/s, Epoch=7, LR=9.34e-6, Train_Loss=0.0646]
100%|██████████| 17/17 [00:01<00:00, 14.80it/s, Epoch=7, LR=9.34e-6, Valid_Loss=0.138]
100%|██████████| 131/131 [00:14<00:00,  9.02it/s, Epoch=8, LR=9.78e-6, Train_Loss=0.0521]
100%|██████████| 17/17 [00:01<00:00, 14.78it/s, Epoch=8, LR=9.78e-6, Valid_Loss=0.119]
100%|██████████| 131/131 [00:14<00:00,  8.99it/s, Epoch=9, LR=7.19e-6, Train_Loss=0.0203]
100%|██████████| 17/17 [00:01<00:00, 14.70it/s, Epoch=9, LR=7.19e-6, Valid_Loss=0.169]
100%|██████████| 131/131 [00:14<00:00,  8.96it/s, Epoch=10, LR=3.23e-6, Train_Loss=0.0147]
100%|██████████| 17/17 [00:01<00:00, 14.51it/s, Epoch=10, LR=3.23e-6, Valid_Loss=0.168]


Training Complete in 0h 2m 60s
Best Loss: 0.1118


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.98it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.249]
100%|██████████| 17/17 [00:01<00:00, 14.70it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.162]


[34mValid Loss Improved : inf ---> 0.162127
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.99it/s, Epoch=2, LR=4.68e-6, Train_Loss=0.133]
100%|██████████| 17/17 [00:01<00:00, 14.71it/s, Epoch=2, LR=4.68e-6, Valid_Loss=0.13] 


[34mValid Loss Improved : 0.162127 ---> 0.129854
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.96it/s, Epoch=3, LR=1.18e-6, Train_Loss=0.09]  
100%|██████████| 17/17 [00:01<00:00, 14.69it/s, Epoch=3, LR=1.18e-6, Valid_Loss=0.134]
100%|██████████| 131/131 [00:14<00:00,  8.96it/s, Epoch=4, LR=1.56e-7, Train_Loss=0.0752]
100%|██████████| 17/17 [00:01<00:00, 14.62it/s, Epoch=4, LR=1.56e-7, Valid_Loss=0.132]
100%|██████████| 131/131 [00:14<00:00,  8.91it/s, Epoch=5, LR=2.27e-6, Train_Loss=0.0726]
100%|██████████| 17/17 [00:01<00:00, 14.55it/s, Epoch=5, LR=2.27e-6, Valid_Loss=0.137]
100%|██████████| 131/131 [00:14<00:00,  8.90it/s, Epoch=6, LR=6.16e-6, Train_Loss=0.0747]
100%|██████████| 17/17 [00:01<00:00, 14.58it/s, Epoch=6, LR=6.16e-6, Valid_Loss=0.126]


[34mValid Loss Improved : 0.129854 ---> 0.126271
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.97it/s, Epoch=7, LR=9.34e-6, Train_Loss=0.0622]
100%|██████████| 17/17 [00:01<00:00, 14.65it/s, Epoch=7, LR=9.34e-6, Valid_Loss=0.167]
100%|██████████| 131/131 [00:14<00:00,  8.93it/s, Epoch=8, LR=9.78e-6, Train_Loss=0.0407]
100%|██████████| 17/17 [00:01<00:00, 14.59it/s, Epoch=8, LR=9.78e-6, Valid_Loss=0.17] 
100%|██████████| 131/131 [00:14<00:00,  8.93it/s, Epoch=9, LR=7.19e-6, Train_Loss=0.0263]
100%|██████████| 17/17 [00:01<00:00, 14.59it/s, Epoch=9, LR=7.19e-6, Valid_Loss=0.161]
100%|██████████| 131/131 [00:14<00:00,  8.87it/s, Epoch=10, LR=3.23e-6, Train_Loss=0.0121] 
100%|██████████| 17/17 [00:01<00:00, 14.50it/s, Epoch=10, LR=3.23e-6, Valid_Loss=0.177]


Training Complete in 0h 2m 57s
Best Loss: 0.1263


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.87it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.237]
100%|██████████| 17/17 [00:01<00:00, 14.44it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.159]


[34mValid Loss Improved : inf ---> 0.159138
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.90it/s, Epoch=2, LR=4.68e-6, Train_Loss=0.131]
100%|██████████| 17/17 [00:01<00:00, 14.80it/s, Epoch=2, LR=4.68e-6, Valid_Loss=0.143]


[34mValid Loss Improved : 0.159138 ---> 0.143228
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.98it/s, Epoch=3, LR=1.18e-6, Train_Loss=0.0822]
100%|██████████| 17/17 [00:01<00:00, 14.82it/s, Epoch=3, LR=1.18e-6, Valid_Loss=0.131]


[34mValid Loss Improved : 0.143228 ---> 0.130952
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.99it/s, Epoch=4, LR=1.56e-7, Train_Loss=0.0614]
100%|██████████| 17/17 [00:01<00:00, 14.83it/s, Epoch=4, LR=1.56e-7, Valid_Loss=0.133]
100%|██████████| 131/131 [00:14<00:00,  8.93it/s, Epoch=5, LR=2.27e-6, Train_Loss=0.0595]
100%|██████████| 17/17 [00:01<00:00, 14.62it/s, Epoch=5, LR=2.27e-6, Valid_Loss=0.14] 
100%|██████████| 131/131 [00:14<00:00,  8.90it/s, Epoch=6, LR=6.16e-6, Train_Loss=0.0557]
100%|██████████| 17/17 [00:01<00:00, 14.50it/s, Epoch=6, LR=6.16e-6, Valid_Loss=0.142]
100%|██████████| 131/131 [00:14<00:00,  8.89it/s, Epoch=7, LR=9.34e-6, Train_Loss=0.0462]
100%|██████████| 17/17 [00:01<00:00, 14.49it/s, Epoch=7, LR=9.34e-6, Valid_Loss=0.158]
100%|██████████| 131/131 [00:14<00:00,  8.88it/s, Epoch=8, LR=9.78e-6, Train_Loss=0.0326]
100%|██████████| 17/17 [00:01<00:00, 14.45it/s, Epoch=8, LR=9.78e-6, Valid_Loss=0.197]
100%|██████████| 131/131 [00:14<00:00,  8.86it/s, Epoch=9, LR=7.19e-6, Train_Loss=0.0294]
100%|██████████| 17/17 [0

Training Complete in 0h 2m 57s
Best Loss: 0.1310


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.89it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.248]
100%|██████████| 17/17 [00:01<00:00, 14.59it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.181]


[34mValid Loss Improved : inf ---> 0.180813
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.91it/s, Epoch=2, LR=4.68e-6, Train_Loss=0.142]
100%|██████████| 17/17 [00:01<00:00, 14.59it/s, Epoch=2, LR=4.68e-6, Valid_Loss=0.125]


[34mValid Loss Improved : 0.180813 ---> 0.124661
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.90it/s, Epoch=3, LR=1.18e-6, Train_Loss=0.0953]
100%|██████████| 17/17 [00:01<00:00, 14.37it/s, Epoch=3, LR=1.18e-6, Valid_Loss=0.123]


[34mValid Loss Improved : 0.124661 ---> 0.122527
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.91it/s, Epoch=4, LR=1.56e-7, Train_Loss=0.0837]
100%|██████████| 17/17 [00:01<00:00, 14.44it/s, Epoch=4, LR=1.56e-7, Valid_Loss=0.123]
100%|██████████| 131/131 [00:14<00:00,  8.86it/s, Epoch=5, LR=2.27e-6, Train_Loss=0.0795]
100%|██████████| 17/17 [00:01<00:00, 14.44it/s, Epoch=5, LR=2.27e-6, Valid_Loss=0.122]


[34mValid Loss Improved : 0.122527 ---> 0.122290
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.89it/s, Epoch=6, LR=6.16e-6, Train_Loss=0.081] 
100%|██████████| 17/17 [00:01<00:00, 14.54it/s, Epoch=6, LR=6.16e-6, Valid_Loss=0.137]
100%|██████████| 131/131 [00:14<00:00,  8.90it/s, Epoch=7, LR=9.34e-6, Train_Loss=0.0716]
100%|██████████| 17/17 [00:01<00:00, 14.51it/s, Epoch=7, LR=9.34e-6, Valid_Loss=0.125]
100%|██████████| 131/131 [00:14<00:00,  8.86it/s, Epoch=8, LR=9.78e-6, Train_Loss=0.0463]
100%|██████████| 17/17 [00:01<00:00, 14.44it/s, Epoch=8, LR=9.78e-6, Valid_Loss=0.143]
100%|██████████| 131/131 [00:14<00:00,  8.81it/s, Epoch=9, LR=7.19e-6, Train_Loss=0.0231]
100%|██████████| 17/17 [00:01<00:00, 14.46it/s, Epoch=9, LR=7.19e-6, Valid_Loss=0.168]
100%|██████████| 131/131 [00:14<00:00,  8.79it/s, Epoch=10, LR=3.23e-6, Train_Loss=0.0134]
100%|██████████| 17/17 [00:01<00:00, 14.46it/s, Epoch=10, LR=3.23e-6, Valid_Loss=0.187]


Training Complete in 0h 3m 2s
Best Loss: 0.1223


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.84it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.256]
100%|██████████| 17/17 [00:01<00:00, 14.38it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.153]


[34mValid Loss Improved : inf ---> 0.152967
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.85it/s, Epoch=2, LR=4.68e-6, Train_Loss=0.14] 
100%|██████████| 17/17 [00:01<00:00, 14.63it/s, Epoch=2, LR=4.68e-6, Valid_Loss=0.109] 


[34mValid Loss Improved : 0.152967 ---> 0.109253
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.99it/s, Epoch=3, LR=1.18e-6, Train_Loss=0.0936]
100%|██████████| 17/17 [00:01<00:00, 14.51it/s, Epoch=3, LR=1.18e-6, Valid_Loss=0.109] 


[34mValid Loss Improved : 0.109253 ---> 0.108724
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.93it/s, Epoch=4, LR=1.56e-7, Train_Loss=0.0727]
100%|██████████| 17/17 [00:01<00:00, 14.57it/s, Epoch=4, LR=1.56e-7, Valid_Loss=0.112]
100%|██████████| 131/131 [00:14<00:00,  8.99it/s, Epoch=5, LR=2.27e-6, Train_Loss=0.0712]
100%|██████████| 17/17 [00:01<00:00, 14.71it/s, Epoch=5, LR=2.27e-6, Valid_Loss=0.117]
100%|██████████| 131/131 [00:14<00:00,  8.98it/s, Epoch=6, LR=6.16e-6, Train_Loss=0.0668]
100%|██████████| 17/17 [00:01<00:00, 14.71it/s, Epoch=6, LR=6.16e-6, Valid_Loss=0.108] 


[34mValid Loss Improved : 0.108724 ---> 0.107844
Model Saved[39m



100%|██████████| 131/131 [00:14<00:00,  8.96it/s, Epoch=7, LR=9.34e-6, Train_Loss=0.0571]
100%|██████████| 17/17 [00:01<00:00, 14.57it/s, Epoch=7, LR=9.34e-6, Valid_Loss=0.21] 
100%|██████████| 131/131 [00:14<00:00,  8.95it/s, Epoch=8, LR=9.78e-6, Train_Loss=0.0379]
100%|██████████| 17/17 [00:01<00:00, 14.65it/s, Epoch=8, LR=9.78e-6, Valid_Loss=0.142]
100%|██████████| 131/131 [00:14<00:00,  8.94it/s, Epoch=9, LR=7.19e-6, Train_Loss=0.02]  
100%|██████████| 17/17 [00:01<00:00, 14.63it/s, Epoch=9, LR=7.19e-6, Valid_Loss=0.154]
100%|██████████| 131/131 [00:14<00:00,  8.93it/s, Epoch=10, LR=3.23e-6, Train_Loss=0.0142]
100%|██████████| 17/17 [00:01<00:00, 14.42it/s, Epoch=10, LR=3.23e-6, Valid_Loss=0.163]


Training Complete in 0h 3m 2s
Best Loss: 0.1078


In [42]:
# Collect the saved .pth files of this run in lexicographic order.
model_paths = sorted(glob(f"{settings.output_path}*.pth"))
model_paths

['./output/bert-baseline/model-fold0.pth',
 './output/bert-baseline/model-fold1.pth',
 './output/bert-baseline/model-fold2.pth',
 './output/bert-baseline/model-fold3.pth',
 './output/bert-baseline/model-fold4.pth']

In [43]:
# Out-of-fold evaluation: run each fold's saved checkpoint on its validation
# rows, store the predictions in train_df and collect per-fold metrics.
fold_f1 = []
fold_acc = []

for fold in range(settings.folds):
    print(f"{y_} ====== Fold: {fold} ======{sr_}")

    checkpoint_path = model_paths[fold]
    # Prefix for the prediction columns: the part of the checkpoint file name
    # before the first "-". Derived from the base name so it does not depend
    # on how many directories the output path contains.
    model_id = os.path.splitext(os.path.basename(checkpoint_path))[0].split("-")[0]

    # Create DataLoader -- only the validation loader is needed here.
    _, valid_loader = prepare_loaders(
        df=train_df,
        tokenizer=tokenizer,
        fold=fold,
        trn_batch_size=settings.train_batch_size,
        val_batch_size=settings.valid_batch_size,
        max_length=settings.max_length,
        num_classes=settings.num_classes,
        text_col="clean_text"
    )

    # Validation rows of this fold; used read-only for labels and index.
    valid = train_df[train_df.kfold == fold]
    # assumes `out` is an array of shape (len(valid), num_classes) whose rows
    # follow the order of `valid` -- TODO confirm against inference/prepare_loaders
    out = inference(settings.model_name, settings.num_classes, checkpoint_path, valid_loader, device)

    # Write the per-class outputs straight into train_df instead of going
    # through the filtered slice (avoids SettingWithCopyWarning).
    for _class in range(settings.num_classes):
        train_df.loc[valid.index, f"{model_id}_oof_class_{_class}"] = out[:, _class]

    # Predicted label = class with the highest output value.
    valid_preds = np.argmax(out, axis=1)

    fold_f1.append(f1_score(valid[label_name].values, valid_preds))
    fold_acc.append(accuracy_score(valid[label_name].values, valid_preds))

    train_df.loc[valid.index, f"{model_id}_pred"] = valid_preds



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/bert-baseline/model-fold0.pth


100%|██████████| 17/17 [00:01<00:00, 14.77it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/bert-baseline/model-fold1.pth


100%|██████████| 17/17 [00:01<00:00, 14.89it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/bert-baseline/model-fold2.pth


100%|██████████| 17/17 [00:01<00:00, 14.96it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/bert-baseline/model-fold3.pth


100%|██████████| 17/17 [00:01<00:00, 15.01it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/bert-baseline/model-fold4.pth


100%|██████████| 17/17 [00:01<00:00, 14.90it/s]


In [44]:
# Write both frames to the run directory in feather format; the index is
# first moved into a regular column so that it is stored as well.
for frame, file_name in ((train_df, "train_df.feather"), (test_df, "test_df.feather")):
    frame.reset_index().to_feather(f"{settings.output_path}{file_name}")

In [45]:
# Average of the per-fold validation scores.
mean_f1 = np.mean(fold_f1)
mean_acc = np.mean(fold_acc)

# Scores computed once over all stored predictions in train_df.
all_f1 = f1_score(train_df["label"], train_df["model_pred"])
all_acc = accuracy_score(train_df["label"], train_df["model_pred"])

print(f"{g_} mean_valid_metric : f1 = {mean_f1:.4f} ... acc = {mean_acc:.4f}")
print(f"{g_}  all_valid_metric : f1 = {all_f1:.4f} ... acc = {all_acc:.4f} ")

[32m mean_valid_metric : f1 = 0.6054 ... acc = 0.9545
[32m  all_valid_metric : f1 = 0.6063 ... acc = 0.9545 
