### コーパスのデータを入力として学習済みモデルから予測結果を出力するnotebook

In [1]:
import pandas as pd
import torch.nn as nn

from transformers import AutoTokenizer, T5Tokenizer
from torch.utils.data import DataLoader
from glob import glob

from bert_utils import *
from config import *

import argparse

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

****** SEED fixed : 42 ******




In [2]:
# 計算時のsettingはtrainで保存したjsonから読み込む --
# run_idだけ指定 --
parser = argparse.ArgumentParser()
parser.add_argument("--run_id", type=str, default="roberta_large_cat4")
parser.add_argument("--df_start_index", type=int, default=0)
parser.add_argument("--df_end_index", type=int, default=1024)
args, unknown = parser.parse_known_args()
save_path = f"{input_root}corpus_label_{args.run_id}"

# コーパス作成時のセッティングを保存 --
if not os.path.exists(f"{save_path}"):
    os.mkdir(f"{save_path}")

corpus_settings = pd.Series()
corpus_settings["used_model"] = args.run_id
corpus_settings["df_start_index"] = args.df_start_index
corpus_settings["df_end_index"] = args.df_end_index
corpus_settings.to_json(save_path+"/corpus_settings.json", indent=4)

# settings, fine-tuningしたモデル -- 
output_path = f"{output_root}{args.run_id}/"
settings = pd.read_json(f"{output_path}settings.json", typ="series")
model_paths = glob(f"{settings.output_path}*.pth"); model_paths.sort()

# define tokenizer --
tokenizer = define_tokenizer(settings.model_name)

In [3]:
# 対象とするデータの読み込み --
corpus_paths = glob(f"{input_root}*.feather")
Debug_print(corpus_paths)

df = []
for corpus_path in corpus_paths:
    _df = pd.read_feather(corpus_path)
    _df = _df.reset_index(drop=False, names="id")
    _df["id"] = corpus_path.split("/")[-1].split(".")[0] + "_" + _df["id"].astype(str)
    df.append(_df)
df = pd.concat(df)

[33m['./input/news4vip.feather', './input/newsplus.feather', './input/livejupiter.feather'][39m


In [4]:
df = df.iloc[args.df_start_index:args.df_end_index, :]

In [5]:
# make test preds --
test_dataset = HateSpeechDataset(
    df, tokenizer=tokenizer, 
    max_length=settings.max_length, num_classes=settings.num_classes, 
    text_col="clean_text", isTrain=False
    )

# batch_size=512でGPU:19GBくらい --
test_loader = DataLoader(test_dataset, batch_size=512, num_workers=2, shuffle=False, pin_memory=True)

In [6]:
preds_list = []
for fold in range(0, settings.folds):
    softmax = nn.Softmax()
    model_id = "model"
    preds = inference(settings.model_name, settings.num_classes, settings.model_custom_header, settings.dropout, model_paths[fold], test_loader, device)
    
    # preds : BERT -> fc, 確率にするためにsoftmaxに通す必要がある --
    preds_list.append(softmax(torch.Tensor(preds)).numpy())

Some weights of the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

[33mconcatenate-4[39m
Getting predictions for model : ./output/roberta_large_cat4/checkpoint-fold0.pth


100%|██████████| 2/2 [00:03<00:00,  1.92s/it]
Some weights of the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a

[33mconcatenate-4[39m
Getting predictions for model : ./output/roberta_large_cat4/checkpoint-fold1.pth


100%|██████████| 2/2 [00:03<00:00,  1.58s/it]
Some weights of the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a

[33mconcatenate-4[39m
Getting predictions for model : ./output/roberta_large_cat4/checkpoint-fold2.pth


100%|██████████| 2/2 [00:03<00:00,  1.58s/it]
Some weights of the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a

[33mconcatenate-4[39m
Getting predictions for model : ./output/roberta_large_cat4/checkpoint-fold3.pth


100%|██████████| 2/2 [00:03<00:00,  1.60s/it]
Some weights of the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at nlp-waseda/roberta-large-japanese-seq512 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a

[33mconcatenate-4[39m
Getting predictions for model : ./output/roberta_large_cat4/checkpoint-fold4.pth


100%|██████████| 2/2 [00:03<00:00,  1.59s/it]


In [7]:
final_preds = np.mean(np.array(preds_list), axis=0)
df[f"{model_id}_pred"] = np.argmax(final_preds, axis=1)
for _class in range(0, settings.num_classes):
    df.loc[:, f"{model_id}_oof_class_{_class}"] = final_preds[:, _class]

In [23]:
df.to_feather(f"{save_path}/corpus_labeled_{args.df_start_index}_to_{args.df_end_index}.feather")