In [1]:
# The environment variables below are assigned before torch is imported, so
# they are already in place when CUDA is first queried by the prints.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Order GPU devices by PCI bus ID so device indices start from 0 consistently
os.environ["CUDA_VISIBLE_DEVICES"]= "0"  # Expose only GPU 0 to this process
import torch
# Report the active CUDA device index and how many GPUs this process can see.
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Current cuda device: 0
Count of using GPUs: 1


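# Description

Fine-tunes the `snunlp/KR-ELECTRA-discriminator` checkpoint as a binary (`True`/`False`) classifier over (`form`, `pair`) text pairs for the `aspect_category_detection` project, logging the run to Weights & Biases.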


# Modules and Global Variables

In [2]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, 
    TrainingArguments, Trainer,
)

import torch
import wandb

import datasets
import evaluate

import numpy as np
import pandas as pd

import os
import re
import random

from module.args import *

# Print the torch version, CUDA availability and GPU count.
# NOTE(review): helper is presumably provided by the star import from module.args — confirm.
print_torch_info()

torch.__version__: 1.7.1
torch.cuda.is_available(): True
NGPU: 1


In [3]:
### labels

# Class names for this task (ce_labels is not defined in this notebook itself).
labels = ce_labels

# Name -> index and index -> name lookups, later handed to the model config.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))
num_labels = len(labels)

print(label2id)
print(id2label)

{'True': 0, 'False': 1}
{0: 'True', 1: 'False'}


In [4]:
### paths and names

# Identifiers used for W&B and for naming the output directories.
PROJECT_NAME = 'aspect_category_detection'
RUN_ID = 'uncleaned_v23_run_1'

# Dataset version folder, dataset type, and optional augmentation suffix.
DATA_V = 'uncleaned_v23'
DATA_T = 'ce' # ce or pc or pc_binary
AUGMENTATION = False
AUG_NAME = ''

# Falsy -> the collator pads dynamically per batch; set an int to pad every
# batch to that fixed length instead.
MAX_LEN = False

model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

notebook_name = 'acd_train_1.ipynb'

### fixed

# Checkpoint id made filesystem-friendly: '/' and '-' become '_', lower-cased.
model_name = re.sub(r'[/-]', r'_', model_checkpoint).lower()
run_name = f'acd_{model_name}_{RUN_ID}'

ROOT_PATH = './'
SAVE_PATH = os.path.join(ROOT_PATH, 'training_results', RUN_ID)
NOTEBOOK_PATH = os.path.join(ROOT_PATH, notebook_name)

# The augmentation suffix is only appended to the training file name.
augornot = f'_{AUG_NAME}' if AUGMENTATION is True else ''
TRAIN_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_train{augornot}.csv')
EVAL_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_dev.csv')

# Pure-Python equivalent of `mkdir -p`: creates missing parents, is a no-op if
# the directory already exists, and is safe for paths containing spaces.
os.makedirs(SAVE_PATH, exist_ok=True)

In [5]:
# Echo the resolved project/model/data settings and paths so the run is
# self-documenting in the notebook output.
print_paths(PROJECT_NAME, model_checkpoint, DATA_V, SAVE_PATH, NOTEBOOK_PATH, TRAIN_DATA_PATH, EVAL_DATA_PATH)
print()
# Echo the training hyperparameters (batch sizes, learning rate, epochs).
print_args()

PROJECT_NAME: aspect_category_detection

model_checkpoint: snunlp/KR-ELECTRA-discriminator
DATA_V: uncleaned_v23

SAVE_PATH: ./training_results/uncleaned_v23_run_1 exists.
NOTEBOOK_PATH: ./acd_train_1.ipynb exists.

TRAIN_DATA_PATH: ./dataset/uncleaned_v23/ce_train.csv exists.
EVAL_DATA_PATH: ./dataset/uncleaned_v23/ce_dev.csv exists.

per_device_train_batch_size: 24
per_device_eval_batch_size: 24
learning_rate: 2e-05
num_train_epochs: 12


# WandB Configuration

In [6]:
# Weights & Biases is configured through environment variables; the project
# name and notebook path are interpolated from the variables defined above.
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
# NOTE(review): WANDB_LOG_MODEL / WANDB_WATCH are presumably consumed by the
# Trainer's W&B integration (model upload, gradient/parameter logging) —
# confirm against the installed transformers version.
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
# Authenticate this session with W&B.
wandb.login()

env: WANDB_PROJECT=aspect_category_detection
env: WANDB_NOTEBOOK_NAME=./acd_train_1.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Model, Tokenizer, and Collator

In [7]:
# Tokenizer and classification model both come from the same checkpoint; the
# label maps are stored on the model config.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

# A truthy MAX_LEN pads every batch to that fixed length; otherwise the
# collator is built with its default padding behaviour.
data_collator = (
    DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length', max_length=MAX_LEN)
    if MAX_LEN
    else DataCollatorWithPadding(tokenizer=tokenizer)
)

Some weights of the model checkpoint at snunlp/KR-ELECTRA-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-ELECTRA-discriminator and are newly initialized: ['classifier.out_p

In [8]:
# Extend the pretrained tokenizer with symbols that occur in the corpus but are
# missing from its vocabulary, then grow the model's embedding matrix to match.

# Raw splits are read here only to collect the sentences used for vocabulary
# discovery; paths are built from ROOT_PATH like the other data paths.
train_path = os.path.join(ROOT_PATH, 'dataset', DATA_V, 'raw_train.csv')
dev_path = os.path.join(ROOT_PATH, 'dataset', DATA_V, 'raw_dev.csv')
test_path = os.path.join(ROOT_PATH, 'dataset', DATA_V, 'raw_test.csv')
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

print(len(tokenizer))

# Unique sentences across all three splits, as a plain list of strings.
tokenizer_train_data = (
    pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form])
    .drop_duplicates()
    .to_list()
)

# Train a throwaway tokenizer with the same pipeline as the pretrained one.
# NOTE(review): vocab_size=1 appears to leave essentially only the corpus
# alphabet (single characters and their '##' continuation forms) in the new
# vocabulary, judging by the tokens printed below — confirm.
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)

# Tokens known to the corpus-trained tokenizer but not to the pretrained one.
new_tokens = set(new_tokenizer.vocab.keys()) - set(tokenizer.vocab.keys())

# Add in sorted order: set iteration order for strings varies between
# interpreter sessions, which would otherwise give each run different ids for
# the added tokens.
tokenizer.add_tokens(sorted(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))

# The embedding matrix must cover the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

print(len(new_tokens))
print(new_tokens)

30000



3213
30104
104
{'## ü', '##Àá', '‚ùÄ', 'Ï±≥', 'üç∑', '##üíá', 'üíÜ', 'üò∫', 'üí°', '##‚óç', '##…¥', '##·¥°', 'Íàç', 'üï∫', '##·¥Ñ', ' ü', '##·µï', '„â¶', 'Â¶Ü', '##Íàç', 'üíÑ', 'üë†', 'üêÑ', '##ü§°', 'üòØ', '##Ôæû', '##·¥†', 'Èïø', 'ü§°', '##üöó', '##·¥ú', '…¢', ' ú', 'Îúå', '##ü•§', '·¥°', '##üíÜ', '…¥', '·¥ã', 'üîΩ', '##Ïùí', '##‚ïπ', 'üë±', '##üë±', '‚ûï', 'Èü©', '##‚ùî', 'Ïì©', '##üï∏', '·µï', '##üíÑ', '##…¢', '·¥ç', '##·¥õ', 'Ï®ï', 'Ï£±', 'ü•§', '##Îúå', '·¥õ', 'üöì', 'üï∑', '·¥ú', '##·¥ç', '##‚ûï', 'üíÅ', '##Ï®ï', '##Ï´ú', 'Ôæû', 'üçº', ' Ä', '##üï∑', 'Ôæâ', '‚óç', 'üí¨', 'Ïùí', 'Àá', '##·¥ò', 'Ìõì', 'üòü', '‚ïπ', '‚è∞', 'üöó', '“ì', '„Éæ', '‰∏Ω', '##üöì', '##üòü', 'Îø§', 'üçè', '…™', 'üï∏', 'Ï´ú', '·¥†', '##·¥ã', '## Ä', '##üë†', '##…™', 'üíá', 'Ëéé', '·¥Ñ', '##„â¶', '##Ï£±', '·¥ò', '‚ùî'}


In [9]:
# Sanity check: display the label mappings and class count held by the model.
model.config.label2id, model.config.id2label, model.num_labels

({'True': 0, 'False': 1}, {0: 'True', 1: 'False'}, 2)

# Define Metrics

In [10]:
# Load the metric objects once; compute_metrics reuses them on every evaluation.
accuracy_metric, f1_metric = (
    evaluate.load(metric_id) for metric_id in ('accuracy', 'f1')
)

In [11]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and several F1 variants.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions are
            reduced to class ids with an argmax over axis 1.

    Returns:
        dict with keys ``accuracy``, ``f1_true``, ``f1_false``, ``f1_macro``
        and ``f1_micro``. ``f1_true`` / ``f1_false`` are the per-class F1
        scores for class ids 0 and 1 respectively.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # Every metric call below scores the same references/predictions pair.
    shared = {'references': references, 'predictions': predicted_ids}

    accuracy = accuracy_metric.compute(**shared)['accuracy']

    # average=None returns one F1 per listed class id, in the order [0, 1].
    per_class_f1 = f1_metric.compute(average=None, labels=[0,1], **shared)['f1']
    f1_true, f1_false = tuple(per_class_f1)

    f1_macro = f1_metric.compute(average='macro', **shared)['f1']
    f1_micro = f1_metric.compute(average='micro', **shared)['f1']

    return {
        'accuracy': accuracy,
        'f1_true': f1_true,
        'f1_false': f1_false,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
    }

# Load Data

In [12]:
def preprocess_function(examples):
    """Tokenize a batch of text pairs with the notebook-level tokenizer.

    Args:
        examples: Mapping with ``"form"`` and ``"pair"`` entries; they are
            passed to the tokenizer as the first and second text inputs.

    Returns:
        The tokenizer's output for the pairs, with truncation enabled.
    """
    first_texts = examples["form"]
    second_texts = examples["pair"]
    return tokenizer(first_texts, second_texts, truncation=True)

In [13]:
# Each split goes CSV -> pandas -> datasets.Dataset, is shuffled with a fixed
# seed, and is then tokenized in batches.
train_dataset = (
    datasets.Dataset.from_pandas(pd.read_csv(TRAIN_DATA_PATH))
    .shuffle(seed=42)
    .map(preprocess_function, batched=True)
)
eval_dataset = (
    datasets.Dataset.from_pandas(pd.read_csv(EVAL_DATA_PATH))
    .shuffle(seed=42)
    .map(preprocess_function, batched=True)
)

  0%|          | 0/91 [00:00<?, ?ba/s]

  0%|          | 0/70 [00:00<?, ?ba/s]

In [14]:
# Display the number of examples in the training and evaluation splits.
len(train_dataset), len(eval_dataset)

(90600, 69825)

In [15]:
# Spot-check one random example per split: decode its token ids back to text
# and print it next to its label.
# Rows are indexed first (dataset[k][column]) so only that single row is
# loaded; dataset[column][k] would materialise the entire column just to read
# one element.
k = random.randrange(len(train_dataset))
train_example = train_dataset[k]
print(tokenizer.decode(train_example['input_ids']), train_example['labels'])
k = random.randrange(len(eval_dataset))
eval_example = eval_dataset[k]
print(tokenizer.decode(eval_example['input_ids']), eval_example['labels'])

[CLS] ÏïÑÏù¥Îì§Ïù¥ ÏûàÎäî Ïßë ÌïÑÏàòÌíà ` - ` # ÌïòÏù¥ÎßòÎ∞¥Îìú.. [SEP] Ï†úÌíà Ï†ÑÏ≤¥ # Í∞ÄÍ≤© [SEP] 1
[CLS] ÎåÄÎ∞ï Ìå© Î∞úÍ≤¨!! [SEP] Ï†úÌíà Ï†ÑÏ≤¥ # Îã§ÏñëÏÑ± [SEP] 1


# Load Trainer

In [16]:
# Every hyperparameter below is a notebook-level variable that is not defined
# in this notebook's own cells.
args = TrainingArguments(
    # Output location, run label and reporting backend.
    output_dir=run_name, run_name=run_name, report_to=report_to,

    # Training length and batch geometry.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    # Optimiser settings.
    optim=optim,
    learning_rate=learning_rate, weight_decay=weight_decay, adam_epsilon=adam_epsilon,

    # Learning-rate schedule.
    lr_scheduler_type=lr_scheduler_type, warmup_ratio=warmup_ratio,

    # Checkpoint retention and best-model selection.
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,

    # Save / evaluation cadence.
    save_strategy=save_strategy, evaluation_strategy=evaluation_strategy,

    # Logging cadence.
    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,

    # Mixed precision.
    fp16=fp16,
)

In [17]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [18]:
# Assemble the Trainer from the model, arguments, datasets, tokenizer, metric
# function and collator prepared above. No callbacks are registered (early
# stopping is left disabled).
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run Trainer

In [19]:
# Run fine-tuning with the configured Trainer.
trainer.train()
# Close the W&B run once training has finished.
wandb.finish()

The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: pair, id, form. If pair, id, form are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 90600
  Num Epochs = 12
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 45300
  Number of trainable parameters = 109162754
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 True,F1 False,F1 Macro,F1 Micro
1,0.1107,0.090602,0.972216,0.605691,0.985601,0.795646,0.972216
2,0.0872,0.083403,0.975567,0.683723,0.987293,0.835508,0.975567
3,0.0808,0.089044,0.975439,0.665104,0.987252,0.826178,0.975439
4,0.0601,0.109037,0.974551,0.687862,0.986735,0.837298,0.974551
5,0.0519,0.108241,0.973333,0.669507,0.986106,0.827806,0.973333
6,0.0451,0.142912,0.973734,0.689573,0.986287,0.83793,0.973734
7,0.0326,0.152785,0.973906,0.685537,0.986388,0.835963,0.973906
8,0.0278,0.163332,0.974307,0.694482,0.98659,0.840536,0.974307
9,0.0196,0.200012,0.974422,0.689391,0.986662,0.838026,0.974422
10,0.0104,0.215746,0.974708,0.688316,0.986819,0.837568,0.974708


The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: pair, id, form. If pair, id, form are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 69825
  Batch size = 24
Saving model checkpoint to acd_snunlp_kr_electra_discriminator_uncleaned_v23_run_1/checkpoint-3775
Configuration saved in acd_snunlp_kr_electra_discriminator_uncleaned_v23_run_1/checkpoint-3775/config.json
Model weights saved in acd_snunlp_kr_electra_discriminator_uncleaned_v23_run_1/checkpoint-3775/pytorch_model.bin
tokenizer config file saved in acd_snunlp_kr_electra_discriminator_uncleaned_v23_run_1/checkpoint-3775/tokenizer_config.json
Special tokens file saved in acd_snunlp_kr_electra_discriminator_uncleaned_v23_run_1/checkpoint-3775/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argu

0,1
eval/accuracy,‚ñÅ‚ñà‚ñà‚ñÜ‚ñÉ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ
eval/f1_false,‚ñÅ‚ñà‚ñà‚ñÜ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ
eval/f1_macro,‚ñÅ‚ñá‚ñÜ‚ñá‚ñÜ‚ñà‚ñá‚ñà‚ñà‚ñà‚ñá‚ñà
eval/f1_micro,‚ñÅ‚ñà‚ñà‚ñÜ‚ñÉ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ
eval/f1_true,‚ñÅ‚ñá‚ñÜ‚ñá‚ñÜ‚ñà‚ñá‚ñà‚ñà‚ñà‚ñá‚ñà
eval/loss,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÑ‚ñÑ‚ñÖ‚ñá‚ñà‚ñà‚ñà
eval/runtime,‚ñÜ‚ñÑ‚ñÑ‚ñÜ‚ñÇ‚ñÖ‚ñÉ‚ñÅ‚ñà‚ñÖ‚ñÑ‚ñÑ
eval/samples_per_second,‚ñÉ‚ñÖ‚ñÖ‚ñÉ‚ñá‚ñÑ‚ñÖ‚ñà‚ñÅ‚ñÑ‚ñÖ‚ñÖ
eval/steps_per_second,‚ñÉ‚ñÖ‚ñÖ‚ñÉ‚ñá‚ñÑ‚ñÖ‚ñà‚ñÅ‚ñÑ‚ñÖ‚ñÖ
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà

0,1
eval/accuracy,0.97482
eval/f1_false,0.98688
eval/f1_macro,0.83852
eval/f1_micro,0.97482
eval/f1_true,0.69017
eval/loss,0.22346
eval/runtime,246.3183
eval/samples_per_second,283.475
eval/steps_per_second,11.814
train/epoch,12.0


In [20]:
# Files to retain inside each checkpoint directory; every other file in a
# checkpoint directory is deleted.
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    # Skip stray files at the top level of the run directory: os.listdir on a
    # non-directory would raise.
    if not os.path.isdir(ckpt):
        continue
    for item in os.listdir(ckpt):
        item_path = os.path.join(ckpt, item)
        # Only plain files are removed; os.remove raises on a directory, so
        # nested folders are left in place.
        if item not in keep and os.path.isfile(item_path):
            os.remove(item_path)

# Move the pruned run directory into SAVE_PATH.
!mv {run_name} {SAVE_PATH}/