<a href="https://colab.research.google.com/github/daisukemiyajima/portfolio/blob/main/deberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Fri Sep 16 03:27:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
!pip install transformers datasets sentencepiece
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path is imported in this cell so that it runs on a fresh kernel; without the
# import the Path(...) calls below raise NameError.
from pathlib import Path

# Experiment identifier and the Google Drive folders used for inputs and outputs.
EXP_NAME = 'exp001'
INPUT_DIR = Path('/content/drive/MyDrive/signate/Input')
OUTPUT_DIR = Path('/content/drive/MyDrive/signate/Output')

NameError: ignored

In [None]:
from pathlib import Path
from multiprocessing import cpu_count
import random
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, f1_score

from datasets import load_dataset, Dataset, DatasetDict

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import EvalPrediction
from transformers import EarlyStoppingCallback

import torch
from torch import nn

# --- Experiment configuration -------------------------------------------------
EXP_NAME = 'exp001'  # names the sub-directory of OUTPUT_DIR used by this run
INPUT_DIR = Path('/content/drive/MyDrive/signate/Input')
OUTPUT_DIR = Path('/content/drive/MyDrive/signate/Output')
DEBUG = False  # when True, the training script keeps only the first 500 rows
# Columns that are concatenated (in this order) into the single model input text.
TEXT_COLUMNS = ['goal', 'country', 'duration', 'category1', 'category2', 'html_content']
MODEL_NAME = 'roberta-base'
MAX_LEN = 512  # tokenized sequences are truncated to this many tokens
SEED = 3090  # shared by the train/test split and the Trainer

# --- Trainer configuration ----------------------------------------------------
# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the highest evaluation F1 is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR / EXP_NAME),
    # Seed the Trainer with the notebook's SEED; previously only the data split
    # used it and training silently fell back to the library default seed.
    seed=SEED,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=2,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='f1_score',
    load_best_model_at_end=True,
    greater_is_better=True,
)

def text_to_input_ids(examples):
    """Tokenize the ``'text'`` entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated to ``MAX_LEN`` tokens and are not padded here.
    Returns whatever the tokenizer call returns for ``examples['text']``.
    """
    tokenize_options = dict(padding=False, truncation=True, max_length=MAX_LEN)
    texts = examples['text']
    return tokenizer(texts, **tokenize_options)

class RandomMask:
    """Batch transform that replaces a random subset of token ids with the mask token.

    Args:
        tokenizer: object exposing ``mask_token_id``.
        mask_prob: fraction of each sequence's length to mask, in ``[0, 1]``.

    Raises:
        ValueError: if the tokenizer has no mask token or ``mask_prob`` is
            outside ``[0, 1]``.
    """

    def __init__(self, tokenizer, mask_prob=0.15):
        if tokenizer.mask_token_id is None:
            raise ValueError('tokenizer has no mask token; RandomMask cannot be used')
        if not 0 <= mask_prob <= 1:
            raise ValueError(f'mask_prob must be in [0, 1], got {mask_prob}')
        self.mask_token_id = tokenizer.mask_token_id
        self.mask_prob = mask_prob

    def __call__(self, examples):
        """Mask every sequence in ``examples['input_ids']`` and return the batch."""
        examples['input_ids'] = [self._mask(input_ids) for input_ids in examples['input_ids']]

        return examples

    def _mask(self, input_ids: list) -> list:
        """Return a copy of ``input_ids`` with randomly chosen positions masked.

        Position 0 (the leading special token) is never masked. The number of
        masked positions is ``int(len(input_ids) * mask_prob)``, capped at the
        number of maskable positions so that a high ``mask_prob`` or a very
        short sequence cannot make ``random.sample`` raise.
        """
        length = len(input_ids)
        candidates = range(1, length)  # every position except the first token
        n_mask = min(int(length * self.mask_prob), len(candidates))

        # Work on a copy so the caller's list is left untouched.
        masked = list(input_ids)
        for i in random.sample(candidates, n_mask):
            masked[i] = self.mask_token_id

        return masked

class CustomTrainer(Trainer):
    """Trainer that optimises binary cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute ``BCEWithLogitsLoss`` between the flattened logits and labels.

        Args:
            model: the model being trained; called with every input except ``labels``.
            inputs: batch mapping that contains a ``'labels'`` entry.
            return_outputs: when True, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: accepted only so that ``transformers`` releases
                which pass this keyword to ``compute_loss`` do not fail with a
                ``TypeError``; it is not used because the loss is a plain mean.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get('labels').float()
        # Keep the labels away from the model so it does not compute its own loss.
        _inputs = {k: v for k, v in inputs.items() if k != 'labels'}

        outputs = model(**_inputs)
        logits = outputs.get('logits')

        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(logits.view(-1), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

def compute_metrics(p: EvalPrediction):
    """Compute the binary F1 score of the evaluation predictions.

    ``p.predictions`` holds raw logits (the model is trained with
    ``BCEWithLogitsLoss``), so the decision boundary is 0: ``sigmoid(x) > 0.5``
    is equivalent to ``x > 0``. Thresholding the logits at 0.5 instead would
    demand a probability of roughly 0.62 before predicting the positive class.

    Args:
        p: evaluation output exposing ``predictions`` and ``label_ids``.

    Returns:
        ``{'f1_score': <float>}``.
    """
    # Flatten so the predictions and the labels have the same 1-D shape.
    logits = np.asarray(p.predictions).reshape(-1)
    labels = np.asarray(p.label_ids).reshape(-1)

    preds = (logits > 0).astype(int)
    score = f1_score(labels, preds)
    metrics = {'f1_score': score}

    return metrics

if __name__ == "__main__":
    # Read the labelled training table; in DEBUG mode only the first 500 rows are kept.
    df = pd.read_csv(INPUT_DIR / "train_.csv")
    df = df.head(500) if DEBUG else df

    # Pretrained tokenizer and a sequence classifier with a single output logit.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # pads each batch dynamically
    train_transform = RandomMask(tokenizer, mask_prob=0.2)  # mask-token augmentation

    # Join all text columns into one string per row, separated by the tokenizer's
    # sep token; missing values are written as the literal 'NAN'.
    leading_text = df[TEXT_COLUMNS[0]].fillna('NAN').astype(str)
    remaining_text = df[TEXT_COLUMNS[1:]].fillna('NAN').astype(str)
    df['text'] = leading_text.str.cat(remaining_text, sep=tokenizer.sep_token)
    df['label'] = df['state']

    # Hold out 20% of the rows for evaluation.
    ds = Dataset.from_pandas(df[['text', 'label']]).train_test_split(test_size=0.2, seed=SEED)

    # Tokenize in parallel and drop the raw text; the masking transform is
    # attached to the training split only.
    ds = ds.map(text_to_input_ids, batched=True, num_proc=cpu_count()).remove_columns(['text'])
    ds['train'].set_transform(train_transform)

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer = CustomTrainer(**trainer_kwargs)
    trainer.train()


Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


KeyboardInterrupt: ignored

In [None]:
import pandas as pd

# Build the submission file: predict on the test table and write id/label rows.
# The sample submission has no header row, so the column names are supplied here.
sub_df = pd.read_csv(INPUT_DIR / 'sample_submit.csv', names=['id', 'label'])
test_df = pd.read_csv(INPUT_DIR / 'test_.csv')

# Same text construction as for training: all text columns joined by the sep token.
test_df['text'] = test_df[TEXT_COLUMNS[0]].fillna('NAN').astype(str).str.cat(test_df[TEXT_COLUMNS[1:]].fillna('NAN').astype(str), sep=tokenizer.sep_token)
test_ds = Dataset.from_pandas(test_df[['text']])
test_ds = test_ds.map(text_to_input_ids, batched=True, num_proc=cpu_count())
test_ds = test_ds.remove_columns(['text'])

preds = trainer.predict(test_ds)
# The predictions are raw logits (the model is trained with BCEWithLogitsLoss),
# so the decision boundary is 0, not 0.5: sigmoid(x) > 0.5 <=> x > 0.
# reshape(-1) flattens the per-row single-logit output into one value per row.
sub_df['label'] = (preds.predictions.reshape(-1) > 0).astype(int)

# Make sure the experiment directory exists before writing into it.
submission_dir = OUTPUT_DIR / EXP_NAME
submission_dir.mkdir(parents=True, exist_ok=True)
sub_df.to_csv(submission_dir / 'sub.csv', header=False, index=False)

     

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

AttributeError: ignored

In [None]:
import torch

# Rebuild a usable trainer from a saved checkpoint.
# The checkpoint's rng_state.pth only stores random-number-generator state, so
# loading it into `trainer` replaced the trainer with a plain object that has no
# predict(). The model (and the tokenizer saved alongside it) are loaded from
# the checkpoint directory instead.
# Requires CustomTrainer, training_args and compute_metrics from the training cell.
CHECKPOINT_DIR = '/content/drive/MyDrive/signate/Output/exp001/checkpoint-2772'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR, num_labels=1)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the fine-tuned network from the checkpoint directory.
# rng_state.pth holds random-number-generator state, not model weights, and
# `net` was never defined, so the model is created directly from the checkpoint.
net = AutoModelForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/signate/Output/exp001/checkpoint-2772',
    num_labels=1,
)
load_weights = net.state_dict()  # the checkpoint's parameters, keyed by name

NameError: ignored

In [None]:
from transformers import AutoModelForSequenceClassification

# Restore the fine-tuned model from the checkpoint directory.
# rng_state.pth contains random-number-generator state rather than weights, and
# `model` does not exist on a fresh kernel, so it is created from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/signate/Output/exp001/checkpoint-2772',
    num_labels=1,
)

NameError: ignored