## BERTのベースラインをスクリプトから実行できるようにリファクタリング --

In [18]:
import gc
import os
import shutil
from glob import glob

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

from transformers import AutoTokenizer, AdamW

from config import *
from bert_utils import *

In [19]:
# Identifier of this experiment run; it is stored in the settings and used as
# the name of the run's output sub-directory.
RUN_ID = "implement-exp-manage"

In [20]:
# Experiment configuration, gathered in one Series so that the complete set of
# values can be written next to the run's artefacts as JSON.
settings = pd.Series(dtype=object)

# project settings --
settings["run_id"] = RUN_ID
settings["output_path"] = f"{output_root}{settings.run_id}/"
settings["num_classes"] = 2

# training settings --
settings["epochs"] = 1
settings["folds"] = 5
settings["train_batch_size"] = 32
settings["valid_batch_size"] = 64
settings["test_batch_size"] = 64

# bert settings --
settings["model_name"] = r"cl-tohoku/bert-base-japanese-whole-word-masking"
settings["max_length"] = 76
settings["hidden_size"] = 768
# NOTE(review): bert-base checkpoints normally have 12 hidden layers, and this
# value is not passed to the model in the cells below -- confirm 24 is intended.
settings["num_hidden_layers"] = 24
settings["dropout"] = 0.2

# optimizer settings --
settings["learning_rate"] = 1e-5
settings["scheduler_name"] = "CosineAnnealingLR"
settings["min_lr"] = 1e-6
settings["T_max"] = 500
settings["weight_decay"] = 1e-5
settings["n_accumulate"] = 1

# Create the run directory together with any missing parents; this is a no-op
# when the directory already exists, so the cell can be re-run safely.
os.makedirs(settings.output_path, exist_ok=True)

# Snapshot the scripts in the working directory beside the results. Done in
# Python rather than through a shell `cp` so it does not depend on the shell
# or on how the destination path is quoted. The "./*py" pattern is unchanged.
for script_path in glob("./*py"):
    if os.path.isfile(script_path):
        shutil.copy(script_path, settings.output_path)

settings.to_json(f"{settings.output_path}settings.json")

In [21]:
# Read both CSV splits and stack them into one frame so that later text
# processing is applied to train and test rows alike.
train = pd.read_csv(data_path+"train.csv")
test = pd.read_csv(data_path+"test.csv")

# Number of training rows; used later to cut the combined frame apart again.
train_shape = len(train)

df = pd.concat([train, test], ignore_index=True)

# The separate frames are no longer needed once they have been combined.
del train, test
_ = gc.collect()

display(df.head(2))

Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0.0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0


In [22]:
# Store the output of the `clean_text` helper, applied row by row to the raw
# text, in a new column (train and test rows alike).
df["clean_text"] = df["text"].map(clean_text)

In [23]:
# Cut the combined frame back into its train / test parts. Explicit copies are
# taken because new columns are written to these frames below and in later
# cells; writing to a bare slice of `df` is chained assignment.
train_df = df.iloc[:train_shape].copy()
test_df = df.iloc[train_shape:].copy()

skf = StratifiedKFold(n_splits=settings.folds, shuffle=True, random_state=SEED)

# Start from a sentinel so the column is integer-typed from the beginning and
# any row that is missed by the splitter can be detected afterwards.
train_df["kfold"] = -1
kfold_position = train_df.columns.get_loc("kfold")

# `split` yields positional indices, so they are applied with `iloc`.
for fold, (_, val_index) in enumerate(skf.split(X=train_df, y=train_df[label_name])):
    train_df.iloc[val_index, kfold_position] = fold

train_df["kfold"] = train_df["kfold"].astype(int)
assert (train_df["kfold"] >= 0).all(), "some training rows were not assigned to a fold"

In [24]:
# MeCab options forwarded to the tokenizer: no named dictionary, and the
# dictionary directory given explicitly through `-d`.
mecab_settings = {"mecab_dic":None, "mecab_option": f"-d {dic_neologd}"}

tokenizer = AutoTokenizer.from_pretrained(settings.model_name, mecab_kwargs=mecab_settings)

In [25]:
# Line-buffered training log. It is intentionally left open at the end of this
# cell because a later cell appends the validation summary to it.
# `os.path.join` avoids the doubled separator that plain concatenation produced
# (`output_path` already ends with "/").
log = open(os.path.join(settings.output_path, "train.log"), "w", buffering=1)
Write_log(log, "***************** TRAINING ********************")

# Train one model per fold, each from a freshly constructed model.
for fold in range(settings.folds):

    Write_log(log, f"\n================== Fold: {fold} ==================")

    # Create DataLoader --
    train_loader, valid_loader = prepare_loaders(
        df=train_df,
        tokenizer=tokenizer,
        fold=fold,
        trn_batch_size=settings.train_batch_size,
        val_batch_size=settings.valid_batch_size,
        max_length=settings.max_length,
        num_classes=settings.num_classes,
        text_col="clean_text"
    )

    # Model construct --
    model = HateSpeechModel(model_name=settings.model_name, num_classes=settings.num_classes)
    model.to(device)

    # Define Optimizer and Scheduler --
    # NOTE(review): `transformers.AdamW` is deprecated in favour of
    # `torch.optim.AdamW`; the two differ in default `eps`, so switching is a
    # deliberate change to make, not a drop-in rename.
    optimizer = AdamW(model.parameters(), lr=settings.learning_rate, weight_decay=settings.weight_decay)
    scheduler = fetch_scheduler(optimizer=optimizer, scheduler=settings.scheduler_name)

    model, history = run_training(
        model, train_loader, valid_loader, optimizer, scheduler, settings.n_accumulate, device, settings.epochs, fold, settings.output_path, log
    )

    # Drop the references before the next fold builds a new model and loaders.
    del model, history, train_loader, valid_loader
    _ = gc.collect()

***************** TRAINING ********************



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.95it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.245]
100%|██████████| 17/17 [00:01<00:00, 14.65it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.181]


Valid Loss Improved : inf ---> 0.181440
Model Saved

Training Complete in 0h 0m 20s
Best Loss: 0.1814



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.90it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.243]
100%|██████████| 17/17 [00:01<00:00, 14.59it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.184]


Valid Loss Improved : inf ---> 0.184069
Model Saved

Training Complete in 0h 0m 22s
Best Loss: 0.1841



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.88it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.237]
100%|██████████| 17/17 [00:01<00:00, 14.57it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.193]


Valid Loss Improved : inf ---> 0.192670
Model Saved

Training Complete in 0h 0m 22s
Best Loss: 0.1927



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.88it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.234]
100%|██████████| 17/17 [00:01<00:00, 14.77it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.162]


Valid Loss Improved : inf ---> 0.162024
Model Saved

Training Complete in 0h 0m 17s
Best Loss: 0.1620



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU : NVIDIA GeForce RTX 3090



100%|██████████| 131/131 [00:14<00:00,  8.87it/s, Epoch=1, LR=8.42e-6, Train_Loss=0.246]
100%|██████████| 17/17 [00:01<00:00, 14.60it/s, Epoch=1, LR=8.42e-6, Valid_Loss=0.163]


Valid Loss Improved : inf ---> 0.162869
Model Saved

Training Complete in 0h 0m 17s
Best Loss: 0.1629


In [26]:
# Checkpoints written by the training cell, in lexicographic order. Later cells
# index this list by fold number, so the order must match the fold numbering.
model_paths = sorted(glob(f"{settings.output_path}*.pth"))
model_paths

['./output/implement-exp-manage/model-fold0.pth',
 './output/implement-exp-manage/model-fold1.pth',
 './output/implement-exp-manage/model-fold2.pth',
 './output/implement-exp-manage/model-fold3.pth',
 './output/implement-exp-manage/model-fold4.pth']

In [27]:
# Per-fold validation scores, filled in by the loop below.
fold_f1 = []
fold_acc = []

# For every fold: load its checkpoint, predict its held-out rows, and store the
# out-of-fold (OOF) outputs and predicted labels in `train_df`.
for fold in range(settings.folds):
    print(f"{y_} ====== Fold: {fold} ======{sr_}")

    # Column prefix derived from the checkpoint file name
    # ("model-fold0.pth" -> "model"). Taking the basename keeps this independent
    # of how many directories the output path contains.
    model_id = os.path.basename(model_paths[fold]).split(".")[0].split("-")[0]

    # Create DataLoader --
    # Only the validation loader is used here; the training loader is discarded.
    _, valid_loader = prepare_loaders(
        df=train_df,
        tokenizer=tokenizer,
        fold=fold,
        trn_batch_size=settings.train_batch_size,
        val_batch_size=settings.valid_batch_size,
        max_length=settings.max_length,
        num_classes=settings.num_classes,
        text_col="clean_text"
    )

    valid = train_df[train_df.kfold == fold]
    # NOTE(review): assumes the rows of `out` come back in the same order as the
    # rows of `valid` -- confirm against `prepare_loaders` / `inference`.
    out = inference(settings.model_name, settings.num_classes, model_paths[fold], valid_loader, device)

    # Write each class column straight into `train_df`; `valid` is a filtered
    # slice and is treated as read-only.
    oof_index = valid.index
    for _class in range(settings.num_classes):
        train_df.loc[oof_index, f"{model_id}_oof_class_{_class}"] = out[:, _class]

    valid_preds = np.argmax(out, axis=1)

    fold_f1.append(f1_score(valid[label_name].values, valid_preds))
    fold_acc.append(accuracy_score(valid[label_name].values, valid_preds))

    train_df.loc[oof_index, f"{model_id}_pred"] = valid_preds



Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/implement-exp-manage/model-fold0.pth


100%|██████████| 17/17 [00:01<00:00, 14.84it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/implement-exp-manage/model-fold1.pth


100%|██████████| 17/17 [00:01<00:00, 14.88it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/implement-exp-manage/model-fold2.pth


100%|██████████| 17/17 [00:01<00:00, 14.77it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/implement-exp-manage/model-fold3.pth


100%|██████████| 17/17 [00:01<00:00, 14.93it/s]




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model : ./output/implement-exp-manage/model-fold4.pth


100%|██████████| 17/17 [00:01<00:00, 14.80it/s]


In [28]:
# Persist both frames in the run directory. Each frame's index is turned into
# an ordinary column (`drop=False`) before it is written.
feather_targets = {
    f"{settings.output_path}train_df.feather": train_df,
    f"{settings.output_path}test_df.feather": test_df,
}
for feather_path, frame in feather_targets.items():
    frame.reset_index(drop=False).to_feather(feather_path)

In [29]:
Write_log(log, "\n++++++++++++++++++++++++++++++++++++++++\n")

# Average of the per-fold scores.
fold_mean_line = f">> mean_valid_metric : f1 = {np.mean(fold_f1):.4f} ... acc = {np.mean(fold_acc):.4f}"
Write_log(log, fold_mean_line)

# Scores over all out-of-fold predictions at once. The column names `label`
# and `model_pred` are hard-coded here.
oof_f1 = f1_score(train_df.label, train_df.model_pred)
oof_acc = accuracy_score(train_df.label, train_df.model_pred)
Write_log(log, f">>  all_valid_metric : f1 = {oof_f1:.4f} ... acc = {oof_acc:.4f} ")


++++++++++++++++++++++++++++++++++++++++

>> mean_valid_metric : f1 = 0.0000 ... acc = 0.9418
>>  all_valid_metric : f1 = 0.0000 ... acc = 0.9418 


In [30]:
# experiment manage --
mean_valid_metric = np.mean(fold_f1)
all_valid_metric = f1_score(train_df.label, train_df.model_pred)

log_df = pd.DataFrame(settings).T
log_df["all_valid_metric"] = all_valid_metric
log_df["mean_valid_metric"] = mean_valid_metric

Write_exp_management(output_root, log_df)