<a href="https://colab.research.google.com/github/githubforkj/signate/blob/main/mufj/03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# roberta-base

## 準備

In [1]:
# ドライブのマウント
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 実装

In [2]:
%%capture
!pip install transformers datasets sentencepiece

In [3]:
!nvidia-smi

Sat Sep 17 02:52:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Import

In [4]:
from pathlib import Path
from multiprocessing import cpu_count
import random
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, f1_score

from datasets import load_dataset, Dataset, DatasetDict

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, RobertaTokenizer, RobertaModel
from transformers import DataCollatorWithPadding
from transformers import EvalPrediction
from transformers import set_seed

import torch
from torch import nn

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

### Config

In [5]:
# 実験番号
EXP_NAME = 'exp03'
# ディレクトリ
INPUT_DIR = Path('/content/drive/MyDrive/signate/mufj/Input')
OUTPUT_DIR = Path('/content/drive/MyDrive/signate/mufj/Output')
# デバッグ用
DEBUG = True
TEXT_COLUMNS = ['goal','country','duration','category1','category2','html_content']
MODEL_NAME = 'roberta-base'
N_SPLIT = 5
TRN_FOLDS = [0,1,2,3,4]
MAX_LEN = 512
SEED = 3090


training_args = TrainingArguments(
    output_dir = str(OUTPUT_DIR/EXP_NAME),
    seed = SEED,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs = 4,
    weight_decay = 0.01,
    fp16 = True,
    gradient_accumulation_steps=1,
    evaluation_strategy='epoch',
    save_total_limit = 1,
    save_strategy = 'epoch',
    metric_for_best_model = 'f1_score',
    load_best_model_at_end = True,
    greater_is_better=True
)

### Cross Validation ⌘

In [6]:
def create_folds(data, num_splits):
    data["kfold"] = -1

    mskf = StratifiedKFold(n_splits = num_splits, shuffle=True, random_state=42)
    data_labels = data['state']

    for f, (t_, v_) in enumerate(mskf.split(data, data_labels)):
        # f:enumerateで振られた番号
        # (t_, v_):二つ目はタプルになっているため、これで通る。
        # (num,(np.array,np.array))
        data.loc[v_, "kfold"] = f
        # v_(2分割した9870個のデータの片方。)の番号でインデックス参照し、"kfold"列の値を0,1,2,3,4(5分割の場合)のどれかに置き換える。
        
    
    return data


data = pd.read_csv(INPUT_DIR / "train.csv")
data = create_folds(data, num_splits=N_SPLIT)
data.to_csv("train_folds.csv", index = False) #セッションでのみ作成される
print("Folds created sucessfully")

Folds created sucessfully


### Preprocess ⌘

In [7]:
# テキストのトークン化&トークンをinput_idに変更
def text_to_input_ids(examples):
    return tokenizer(examples['text'] ,padding=False ,truncation=True, max_length=MAX_LEN)
    # padding:文字数を揃える
    # truncation:特定以下の文字数のトークンを切り捨て

def connect_text(df, text_cols, sep):
    text_df = df[text_cols].fillna('NAN').astype(str)
    # text_colsの空白をNANで埋める
    connected_text = text_df[text_cols[0]].str.cat(text_df[text_cols[1:]], sep=sep)
    # pandasのcat関数を使用して、各列の文字列を横に結合する(連結文字を[SEP]とする。)

    return connected_text

## Augumentation


In [8]:
class RandomMask:
    def __init__(self, tokenizer, prob=0.5, mask_prob=0.15):
        self.mask_token_id = tokenizer.mask_token_id
        self.prob = prob
        self.mask_prob = mask_prob
    
    def __call__(self, examples):
        if random.random() > self.prob:
            # マスキングを行う
            examples['input_ids'] = [self._mask(input_ids) for input_ids in examples['input_ids']]

        return examples
    
    def _mask(self, input_ids: list) -> list:
        length = len(input_ids)
        # ランダムに15%選ぶ
        mask_idx = random.sample(range(1, length), int(length*self.mask_prob)) # random masking except [CLS]
        # mask_idxに入っているidに該当するidのtoken_idをmask_token_idに置き換える
        for i in mask_idx:
            input_ids[i] = self.mask_token_id

        return input_ids

### Trainer

In [11]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels').float()
        _inputs = {k: v for k, v in inputs.items() if k != 'labels'} #to not automatically calculate losses
        print("labels:",labels)
        print("_inputs:",_inputs)

        outputs = model(**_inputs)
        print("outputs:",outputs)
        last_hidden_state = outputs.get('last_hidden_state')
        print("last_hidden_state:",last_hidden_state)

        loss_fn = nn.BCEWithLogitsLoss()
        print("loss_fn:",loss_fn)

        # ここで、last_hidden_stateのデータサイズが合わないとのエラーが出た。
        # labelsとlast_hidden_stateのデータサイズを把握し、合わせればおｋ？
        print("labels.shape:",labels.shape)
        print("last_hidden_state.shape:",last_hidden_state.shape)

        loss = loss_fn(labels.view(-1),last_hidden_state.view(-1,16))
        print("loss:",loss)

        return (loss, outputs) if return_outputs else loss



# 第一引数に予測したラベル、第二引数にその確率
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        # sklearn
        score = f1_score(y_true = y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# 学習結果を出力するための関数？(p: EvalPredictionと引数pにアノテーションをつけている。)
def compute_metrics(p: EvalPrediction):
    # シグモイド関数を呼び出し、
    preds = sigmoid(p.predictions)
    labels = p.label_ids
    score = threshold_search(labels, preds)['f1']
    metrics = {'f1_score': score}

    return metrics
        

### Main

In [12]:
if __name__ == "__main__":
    df = pd.read_csv('train_folds.csv')
    test_df = pd.read_csv(INPUT_DIR / 'test.csv')
    if DEBUG:
        df = df.head(50)
        test_df = test_df.head(50)
        training_args.num_train_epochs = 1

    tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
    # AutoTokenizerは様々なtokenizerのラッパー。MODEL_NAMEに入れるモデルによって中身が変わる。
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer) #for dynamic padding
    # データを集めてミニバッチを作成する。withpaddingを指定することで空いている箇所には0が埋まる
    # transformerのモジュール。

    # 上で定義したRandomMaskクラスを使用する。tokenの15%をマスキングする事前学習の設定。
    train_transform = RandomMask(tokenizer, prob=0.5, mask_prob=0.1) #[MASK] augmentation


    '''
        前処理
    '''
    df['text'] = connect_text(df, TEXT_COLUMNS, tokenizer.sep_token)
    # dfの最右列に'text'列を作成する。
    df['label'] = df['state']
    # dfの最右列に'state'列を作成する
    ds = Dataset.from_pandas(df[['text','label','kfold']])
    # pandasのdataframeをdatasetに。
    ds = ds.map(text_to_input_ids, batched=True, num_proc=cpu_count())
    # map関数は、シーケンスの構成要素全てに対して、ある関数の処理を行わせる
    # input_ids,token_type_ids,attention_mask列が追加される

    # test_dfに関しても上のdfと同様に前処理を行う
    test_df['text'] = connect_text(test_df, TEXT_COLUMNS, tokenizer.sep_token)
    test_ds = Dataset.from_pandas(test_df[['text']])
    test_ds = test_ds.map(text_to_input_ids, batched=True, num_proc=cpu_count())
    # なぜ'text'カラムを削除する？？か分からない
    test_ds = test_ds.remove_columns(['text'])

    '''
        前処理ここまで
    '''


    oof = []
    labels = []
    test_preds = []
    for fold in TRN_FOLDS:
        set_seed(SEED)
        # dictionary形式で持つ
        ds_i = DatasetDict(
            # foldは0,1,2,3,4のどれか
            {
                # 学習用のデータ7832個をfilter関数を用いて割り当てる。
                'train': ds.filter(lambda x: x['kfold']!=fold),
                # 評価用のデータ1959個を同上
                'eval': ds.filter(lambda x: x['kfold']==fold),
            }
        )
        # ds_iから'text','kfold'を除外する
        ds_i = ds_i.remove_columns(['text','kfold'])
        # ds_i['train'].set_transform(train_transform)

        '''
            学習
        '''

        # モデルのインスタンス化
        model = RobertaModel.from_pretrained(MODEL_NAME, num_labels=1)
        # OUTPUTディレクトリに出力する
        training_args.output_dir = str((OUTPUT_DIR / EXP_NAME) / f'fold_{fold}')
        # 学習パラメータ設定
        trainer = CustomTrainer(

            model = model,
            # 出力先
            args = training_args,
            # 学習用データセット
            train_dataset = ds_i["train"],
            # 検証用データセット(評価しなければなくても良い)
            eval_dataset=ds_i["eval"],
            tokenizer = tokenizer,
            data_collator=data_collator,
            # 上で定義した関数。デフォルトだと結果が表示されない。
            compute_metrics=compute_metrics,
        )


        # 学習！
        trainer.train()

        # inference
        # 検証用データセットの推論(trainer.evaluate()でも良いのでは？)
        oof_i = trainer.predict(ds_i['eval'])
        # テスト用データセットでの推論
        test_preds_i = trainer.predict(test_ds).predictions
        # EvalPredictionクラスはpredictionsとlabel_idsが定義されている。
        # 活性化関数sigmoidを通す
        oof.append(sigmoid(oof_i.predictions))
        # label_idsをlabelsリストに格納
        labels.append(oof_i.label_ids)
        # テスト用データセットの推論結果を格納
        test_preds.append(test_preds_i)
    
    # ？？？
    oof = np.vstack(oof).flatten()
    # ？？？
    labels = np.hstack(labels)
    # 上で定義した最適な閾値を探す関数をインスタンス化
    search_result = threshold_search(labels, oof)
    print('OOF Score: ', search_result['f1'], 'Threshold: ', search_result['threshold'])

    # 予測値の平均が閾値よりも高いものを採用している？？
    test_df['label'] = (np.mean(test_preds, axis=0) > search_result['threshold']).astype(int)
    # 結果を出力
    test_df[['id','label']].to_csv((OUTPUT_DIR/EXP_NAME)/ 'sub.csv', header=False, index=False)

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/251c3c36356d3ad6845eb0554fdb9703d632c6cc/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/251c3c36356d3ad6845eb0554fdb9703d632c6cc/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/251c3c36356d3ad6845eb0554fdb9703d632c6cc/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_

    

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/251c3c36356d3ad6845eb0554fdb9703d632c6cc/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/251c3c

labels: tensor([1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1.],
       device='cuda:0')
_inputs: {'input_ids': tensor([[    0,   134,    12,  ...,     1,     1,     1],
        [    0,  1497,   134,  ..., 50118, 41552,     2],
        [    0,  3079, 19089,  ...,    58,  1887,     2],
        ...,
        [    0,  4017,   134,  ...,     1,     1,     1],
        [    0, 33185,    12,  ...,    12,  3245,     2],
        [    0,   134,    12,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}


RuntimeError: ignored

## Make Pipline

In [None]:
# Trainerに必要なパラメーターたち

class CustomTrainer(Trainer):
    # self,モデル,インプット,
    def compute_loss(self, model, inputs, return_outputs=False):
        # インプットの中　
        labels = inputs.get('labels').float()
        _inputs = {k: v for k, v in inputs.items() if k != 'labels'} #to not automatically calculate losses
        print("labels:",labels)
        print("_inputs:",_inputs)

        outputs = model(**_inputs)
        print("outputs:",outputs)
        last_hidden_state = outputs.get('last_hidden_state')
        print("last_hidden_state:",last_hidden_state)

        loss_fn = nn.BCEWithLogitsLoss()
        print("loss_fn:",loss_fn)

        # ここで、last_hidden_stateのデータサイズが合わないとのエラーが出た。
        # labelsとlast_hidden_stateのデータサイズを把握し、合わせればおｋ？
        print("labels.shape:",labels.shape)
        print("last_hidden_state.shape:",last_hidden_state.shape)

        loss = loss_fn(labels.view(-1),last_hidden_state.view(-1,16))
        print("loss:",loss)

        return (loss, outputs) if return_outputs else loss