# settings, Download modules

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
torch.cuda.empty_cache()

In [3]:
# Install transformers library.
!pip install -q git+https://github.com/huggingface/transformers.git
# Install helper functions.
!pip install -q git+https://github.com/gmihaila/ml_things.git
# Install matplotlib in proper version
! pip install matplotlib==3.1.3

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting matplotlib==3.1.3
  Using cached matplotlib-3.1.3-cp37-cp37m-manylinux1_x86_64.whl (13.1 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.5.2
    Uninstalling matplotlib-3.5.2:
      Successfully uninstalled matplotlib-3.5.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sourc

In [4]:
# Install wandb
! pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import matplotlib.pyplot as plt

plt.rc('font', family='NanumBarunGothic') 

## load_data.py

In [6]:
import pickle as pickle
import os
import pandas as pd
import torch

class news_dataset(torch.utils.data.Dataset):
    """Dataset 구성을 위한 class."""

    def __init__(self, news_dataset, labels):
        self.news_dataset = news_dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: val[idx].clone().detach() for key, val in self.news_dataset.items()
        }
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def load_data(dataset_dir):
    """csv 파일을 경로에 맡게 불러 옵니다."""
    pd_dataset = pd.read_csv(dataset_dir)
    return pd_dataset


def tokenized_dataset(dataset, tokenizer, max_length):
    """tokenizer에 따라 sentence를 tokenizing 합니다."""
    concat_entity = []
    for e01 in dataset['conversation']:
        temp = ""
        temp = e01
        # 밥을 굶은 아이들 [SEP] 아이들이 밥을 굶고있다... 
        concat_entity.append(temp)

    tokenized_sentences = tokenizer(
        concat_entity,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
        return_token_type_ids=False, #BERT ->  RoBERTa 120, 20 100 밥을 먹었다 [PAD] [PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD] [1 1 1 00000000]
    )
    # Tokenizer : sentence -> token_id , attention mask , token_type_ids => 첫번째 문장과 두번째 문장 표시 00000000 111111111
    return tokenized_sentences

## train.py

In [7]:
! pwd

/content


In [None]:
# 모델을 실험을 해요. 기록 & 저장 -> WandB , MLflow / 실험하기 용이 -> argparse : python train.py -b 32 -lr 5e-5 + 전기세  nohup
# config.json

import os
import random
import argparse

import numpy as np
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import random_split
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
)

from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from transformers.optimization import get_cosine_with_hard_restarts_schedule_with_warmup

import wandb

# ------* Fix Seeds * -----------#
def seed_everything(seed):
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)


def compute_metrics(pred):
    """validation을 위한 metrics function"""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1) #argmax([0.01 0.0001 0.1])

    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
    }


def label_to_num(label):
    dict_label_to_num = {
        '일반':0, '갈취':1, '협박':2, '직장 내 괴롭힘':3, '기타 괴롭힘':4
    }
    num_label = []

    for v in label:
        num_label.append(dict_label_to_num[v])

    return num_label


def train():
    # fix a seed
    seed_everything(seed)

    # load model and tokenizer
    MODEL_NAME = model_name # klue/roberta-large
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data('./drive/MyDrive/Colab_Notebooks/voc_rm/datasets/dktc/data/insert_koen/train.csv')
    # test_dataset = load_data("..data/newszum_test_data.csv")

    train_label = label_to_num(train_dataset["class"].values)
    # test_label = label_to_num(test_dataset["class"].values)

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer, max_len)
    # tokenized_test = tokenized_dataset(test_dataset, tokenizer, max_len)

    # make dataset for pytorch.
    voc_dataset = news_dataset(tokenized_train, train_label)
    # news_test_dataset = news_dataset(tokenized_test, test_label)

    # defining split_size
    dataset_size = len(voc_dataset)
    train_size = int(dataset_size * 0.8)
    validation_size = int(dataset_size * 0.1)
    test_size = dataset_size - train_size - validation_size

    # random split
    train_dataset, valid_dataset, test_dataset = random_split(voc_dataset, [train_size, validation_size, test_size])

    print(f"Training Data Size : {len(train_dataset)}")
    print(f"Validation Data Size : {len(valid_dataset)}")
    print(f"Testing Data Size : {len(test_dataset)}")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device) #a.device ,b => a*b XX

    # setting model hyperparameter
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 5

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, config=model_config
    )

    print(model.config)
    model.parameters
    model.to(device)

    ### callback & optimizer & scheduler 추가
    MyCallback = EarlyStoppingCallback(
        early_stopping_patience=3, early_stopping_threshold=0.001
    )

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=weight_decay,
        amsgrad=False,
    )

    training_args = TrainingArguments(
      output_dir = save_path + '/results',
      save_total_limit = save_limit,
      save_steps = save_step,
      num_train_epochs = epochs,
      learning_rate = lr,
      per_device_train_batch_size = batch_size,
      per_device_eval_batch_size = 2,
      warmup_steps = warmup_steps,
      weight_decay = weight_decay,
      logging_dir = save_path + '/logs',
      logging_steps = 100,
      evaluation_strategy = 'steps',
      gradient_accumulation_steps = 20,
      eval_accumulation_steps = 20,

      eval_steps = 500,
      load_best_model_at_end = True,
    )

    trainer = Trainer(
      model = model,
      args = training_args,
      train_dataset=train_dataset,  # training dataset
      eval_dataset=valid_dataset,  # evaluation dataset
      compute_metrics=compute_metrics,  # define metrics function
      callbacks=[MyCallback],
      optimizers=(
        optimizer,
        get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=len(train_dataset) * epochs,
            ),
        ),
    )

    # train model
    trainer.train()
    model.save_pretrained(save_path + "/best_model")


def main():
    train()


if __name__ == "__main__":
    save_path = './drive/MyDrive/Colab_Notebooks/voc_rm/model/bert'

    model_name = "klue/roberta-large"
    seed = 42
    max_len = 400
    save_limit = 5
    save_step = 500
    epochs = 10
    lr = 5e-3
    batch_size = 4
    per_device_eval_batch_size = 2
    warmup_steps = 300
    weight_decay = 0.01
    logging_steps = 100

    eval_steps = 500
    load_best_model_at_end = True

    # fix a seed
    seed_everything(seed)
    main()

Training Data Size : 6319
Validation Data Size : 789
Testing Data Size : 791
cuda:0


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'class

RobertaConfig {
  "_name_or_path": "klue/roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.20.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}



***** Running training *****
  Num examples = 6319
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 20
  Total optimization steps = 790
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33methicsense[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
500,1.5591,1.558625,0.512041


***** Running Evaluation *****
  Num examples = 789
  Batch size = 2
Saving model checkpoint to ./drive/MyDrive/Colab_Notebooks/voc_rm/model/bert/results/checkpoint-500
Configuration saved in ./drive/MyDrive/Colab_Notebooks/voc_rm/model/bert/results/checkpoint-500/config.json
Model weights saved in ./drive/MyDrive/Colab_Notebooks/voc_rm/model/bert/results/checkpoint-500/pytorch_model.bin


## inference.py

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import pandas as pd
import torch
import torch.nn.functional as F

import numpy as np
import argparse
from tqdm import tqdm


def inference(model, tokenized_sent, device):
    """
    test dataset을 DataLoader로 만들어 준 후,
    batch_size로 나눠 model이 예측 합니다.
    """
    dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
    model.eval()
    output_pred = []
    for i, data in enumerate(tqdm(dataloader)):
        with torch.no_grad():
            outputs = model(
                input_ids=data["input_ids"].to(device),
                attention_mask=data["attention_mask"].to(device),
            )
        # print(outputs[0])
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)

        output_pred.append(result)
    return (np.concatenate(output_pred).tolist(),)


def num_to_label(label):
    """
    숫자로 되어 있던 class를 원본 문자열 라벨로 변환 합니다.
    """
    origin_label = []
    dict_num_to_label = {
        '일반':0, '갈취':1, '협박':2, '직장 내 괴롭힘':3, '기타 괴롭힘':4
    }

    for v in label[0]:
        origin_label.append(dict_num_to_label[v])

    return origin_label


def load_test_dataset(dataset_dir, tokenizer):
    """
    test dataset을 불러온 후,
    tokenizing 합니다.
    """
    test_dataset = load_data(dataset_dir)
    test_label = list(map(int, label_to_num(test_dataset["class"].values)))

    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer, 384)
    return tokenized_test, test_label


def main():
    """
    주어진 dataset csv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # load tokenizer
    Tokenizer_NAME = model
    tokenizer = AutoTokenizer.from_pretrained(Tokenizer_NAME)

    ## load my model
    MODEL_NAME = model_dir  # model dir.

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    # model.parameters
    model.to(device)

    # ## load test datset
    # test_dataset_dir = "..data/newszum_test_data.csv"
    # test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    # news_test_dataset = news_dataset(test_dataset, test_label)

    ## predict answer
    pred_answer = inference(model, test_dataset, device)  # model에서 class 추론
    pred_answer = num_to_label(pred_answer)
    # test_dataset = load_data(test_dataset_dir)

    # ## make csv file with predicted answer
    # #########################################################
    # output = pd.DataFrame(
    #     {
    #         "title": test_dataset["title"],
    #         "cleanBody": test_dataset["cleanBody"],
    #         "category": list(test_dataset["category"].values),
    #         "result": pred_answer,
    #     }
    # )

    # output.to_csv(
    #     "./prediction/submission.csv", index=False
    # )  # 최종적으로 완성된 예측한 라벨 csv 파일 형태로 저장.
    # print("---- Finish! ----")


if __name__ == "__main__":

    # model dir
    model = 'klue/roberta-large'
    model_dir = save_path + '/best_model'
    main()