### コーパスのデータを入力として学習済みモデルから予測結果を出力するnotebook

In [1]:
import pandas as pd

from transformers import AutoTokenizer, T5Tokenizer
from torch.utils.data import DataLoader
from glob import glob

from bert_utils import *
from config import *

import argparse


****** SEED fixed : 42 ******




In [2]:
# The inference settings are restored from the JSON that the training run saved --
# Only the run id has to be supplied here --
parser = argparse.ArgumentParser()
parser.add_argument("--run_id", type=str, default="tmp")
# parse_known_args tolerates unrecognised arguments (e.g. ones passed to the notebook kernel).
args, unknown = parser.parse_known_args()
save_path = f"{input_root}corpus_label_{args.run_id}"

# exist_ok avoids the check-then-create race and also creates missing parent directories.
os.makedirs(save_path, exist_ok=True)

# Load the saved settings and locate the fine-tuned model checkpoints --
output_path = f"{output_root}{args.run_id}/"
settings = pd.read_json(f"{output_path}settings.json", typ="series")
# NOTE(review): checkpoints are globbed under the output_path stored in settings.json,
# not under the local `output_path` built above -- confirm the two always agree.
# Sorted so that position `fold` in the list lines up with the fold suffix of the file name.
model_paths = sorted(glob(f"{settings.output_path}*.pth"))
if not model_paths:
    # Fail here with a clear message instead of an IndexError in the inference loop.
    raise FileNotFoundError(f"No model checkpoints (*.pth) found under {settings.output_path}")

# define tokenizer --
tokenizer = define_tokenizer(settings.model_name)

In [3]:
# Load the corpus files that will be labelled --
# glob returns paths in arbitrary order; sorting keeps the row order of the combined
# frame (and therefore any head-based debug subset) reproducible between runs.
corpus_paths = sorted(glob(f"{input_root}*.feather"))
if not corpus_paths:
    raise FileNotFoundError(f"No corpus files (*.feather) found under {input_root}")

corpus_frames = []
for corpus_path in corpus_paths:
    _df = pd.read_feather(corpus_path)
    # Turn each file's row index into an "id" column ...
    _df = _df.reset_index(drop=False, names="id")
    # ... and prefix it with the file name (without extension) so ids stay unique across files.
    _df["id"] = corpus_path.split("/")[-1].split(".")[0] + "_" + _df["id"].astype(str)
    corpus_frames.append(_df)

# ignore_index gives the combined frame a fresh 0..n-1 index; without it the per-file
# indices repeat, and the duplicated labels break the later feather export.
df = pd.concat(corpus_frames, ignore_index=True)

In [4]:
# Sanity check: (rows, columns) of the combined corpus frame.
df.shape

(18589859, 3)

In [5]:
# debug --
# Number of leading rows to keep for a quick trial run; set to None to label the whole corpus.
debug_max_rows = 100000
if debug_max_rows is not None:
    # .copy() detaches the subset from the full corpus frame, so the prediction columns
    # added later are written to an independent frame rather than to a slice of it.
    df = df.head(debug_max_rows).copy()

In [6]:
# Build the dataset / loader that feed the corpus text to the models --
# isTrain=False is presumably the label-free inference mode -- see bert_utils.HateSpeechDataset.
dataset_options = dict(
    tokenizer=tokenizer,
    max_length=settings.max_length,
    num_classes=settings.num_classes,
    text_col="clean_text",
    isTrain=False,
)
test_dataset = HateSpeechDataset(df, **dataset_options)

# A fixed batch size of 1024 is used here instead of settings.test_batch_size.
# shuffle=False keeps the predictions in the same order as the rows of df.
loader_options = {
    "batch_size": 1024,
    "num_workers": 2,
    "shuffle": False,
    "pin_memory": True,
}
test_loader = DataLoader(test_dataset, **loader_options)

In [7]:
# Run every fold's fine-tuned checkpoint over the corpus and collect the predictions --
# model_id prefixes the prediction columns written afterwards; it is set once, outside the
# loop, so it is defined even when there are no folds to iterate over.
model_id = "model"

n_folds = int(settings.folds)
if len(model_paths) < n_folds:
    # Indexing model_paths[fold] below would otherwise fail with a bare IndexError.
    raise ValueError(
        f"settings.folds is {n_folds} but only {len(model_paths)} checkpoint(s) were found"
    )

preds_list = []
for fold in range(n_folds):
    # One prediction array per fold; the arrays are averaged after the loop.
    preds = inference(
        settings.model_name,
        settings.num_classes,
        settings.model_custom_header,
        model_paths[fold],
        test_loader,
        device,
    )
    preds_list.append(preds)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[33mmax_pooling[39m
Getting predictions for model : /mnt/sdb/NISHIKA_DATA/hate-speech-detection/output/tmp/model-fold0.pth


100%|██████████| 98/98 [01:39<00:00,  1.01s/it]
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[33mmax_pooling[39m
Getting predictions for model : /mnt/sdb/NISHIKA_DATA/hate-speech-detection/output/tmp/model-fold1.pth


100%|██████████| 98/98 [01:39<00:00,  1.02s/it]


In [8]:
# Ensemble: average the per-fold predictions element-wise (axis 0 indexes the folds).
stacked_preds = np.array(preds_list)
final_preds = np.mean(stacked_preds, axis=0)

# Predicted label = class with the highest averaged score.
pred_col = f"{model_id}_pred"
df[pred_col] = np.argmax(final_preds, axis=1)

# Also keep the averaged score of every class in its own column.
for class_idx in range(0, settings.num_classes):
    score_col = f"{model_id}_oof_class_{class_idx}"
    df.loc[:, score_col] = final_preds[:, class_idx]

In [9]:
# Persist the labelled corpus. Feather export expects a default 0..n-1 index, so the index
# is reset on the way out; row identity is carried by the "id" column, not by the index.
df.reset_index(drop=True).to_feather(f"{save_path}/corpus_labeled.feather")

In [10]:
# Display the labelled frame: the original columns plus the prediction and per-class score columns.
df

Unnamed: 0,id,source,clean_text,model_pred,model_oof_class_0,model_oof_class_1
0,news4vip_0,news4vip,医者ああ正常なコミュニケーションが取れてませんねえ・・・医者月一で病院に来てください申請しま...,0,0.950779,0.049221
1,news4vip_1,news4vip,自立支援手帳じゃなくて障害者手帳ですか?療育?,0,0.898060,0.101940
2,news4vip_2,news4vip,なにその全く実用性がない特殊能力,0,0.844515,0.155485
3,news4vip_3,news4vip,本人が発達障害の自覚がないと職場では単純に使えない奴で疎まれるそうならないように周りを誘導す...,0,0.870264,0.129736
4,news4vip_4,news4vip,いや発達はさ環境が支えてくれればしっかり労働出来るじゃんしかも悪気は元無いんだし人間的には嫌...,0,0.840931,0.159069
...,...,...,...,...,...,...
99995,news4vip_99995,news4vip,コミュ障死ね(笑),0,0.890367,0.109633
99996,news4vip_99996,news4vip,は!?コンビニの店員となに話すんだよwwww,0,0.944303,0.055697
99997,news4vip_99997,news4vip,数十秒やん,0,0.940866,0.059134
99998,news4vip_99998,news4vip,パスタ系は2分くらい待つよ,0,0.930972,0.069028
