# Description


# Modules and Global Variables

In [4]:
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, 
    DefaultDataCollator, DataCollatorWithPadding, 
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import torch
import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd

import os
import re
import random

import demoji

In [5]:
print(f'torch.__version__: {torch.__version__}')
print(f'torch.cuda.is_available(): {torch.cuda.is_available()}')
NGPU = torch.cuda.device_count()
print(f'NGPU: {NGPU}')
# NGPU = torch.cuda.device_count()
# if NGPU > 1:
#     model = torch.nn.DataParallel(model, device_ids=list(range(NGPU)))

torch.__version__: 1.12.1
torch.cuda.is_available(): True
NGPU: 4


In [6]:
### labels

ce_labels = ['True', 'False']
pc_labels = ['positive', 'negative', 'neutral']
pc_binary_labels = ['True', 'False']

labels = ce_labels

label2id = {k: i for i, k in enumerate(labels)}
id2label = {i: k for i, k in enumerate(labels)}
num_labels = len(labels)

print(label2id)
print(id2label)

{'True': 0, 'False': 1}
{0: 'True', 1: 'False'}


In [7]:
### paths and names

PROJECT_NAME = 'aspect_category_detection'
RUN_ID = 'uncleaned_v1'

DATA_V = 'uncleaned_v1'
DATA_T = 'ce' # ce or pc or pc_binary
AUGMENTATION = False
AUG_NAME = 'balanced'

model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

notebook_name = 'acd_binary_trainer.ipynb'

### fixed

model_name = re.sub(r'[/-]', r'_', model_checkpoint).lower()
run_name = f'{model_name}_{RUN_ID}'

ROOT_PATH = './'
SAVE_PATH = os.path.join(ROOT_PATH, 'training_results', run_name, 'acd')
NOTEBOOK_PATH = os.path.join('./', notebook_name)

augornot = f'_{AUG_NAME}' if AUGMENTATION is True else ''
TRAIN_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_train{augornot}.csv')
EVAL_DATA_PATH = os.path.join(ROOT_PATH, 'dataset', DATA_V, f'{DATA_T}_dev.csv')

!mkdir -p {SAVE_PATH}

In [8]:
if os.path.exists(SAVE_PATH):
    print(f'{SAVE_PATH} exists.')
else:
    print(f'{SAVE_PATH} does not exist.')
if os.path.exists(NOTEBOOK_PATH):
    print(f'{NOTEBOOK_PATH} exists.')
else:
    print(f'{NOTEBOOK_PATH} does not exist.')
if os.path.exists(TRAIN_DATA_PATH):
    print(f'{TRAIN_DATA_PATH} exists.')
else:
    print(f'{TRAIN_DATA_PATH} does not exist.')
if os.path.exists(EVAL_DATA_PATH):
    print(f'{EVAL_DATA_PATH} exists.')
else:
    print(f'{EVAL_DATA_PATH} does not exist.')

./training_results/snunlp_kr_electra_discriminator_uncleaned_v1/acd exists.
./acd_binary_trainer.ipynb exists.
./dataset/uncleaned_v1/ce_train.csv exists.
./dataset/uncleaned_v1/ce_dev.csv exists.


In [9]:
### rest of training args

report_to="wandb"

fp16 = False

num_train_epochs = 10
batch_size = 25
gradient_accumulation_steps = 1

optim = 'adamw_torch' # 'adamw_hf'

learning_rate = 3e-6 # 5e-5
weight_decay = 0.01 # 0
adam_epsilon = 1e-8

lr_scheduler_type = 'cosine'
warmup_ratio = 0

save_total_limit = 1

load_best_model_at_end = True
metric_for_best_model ='f1_macro'

save_strategy = "epoch"
evaluation_strategy = "epoch"

logging_strategy = "steps"
logging_first_step = True 
logging_steps = 500

# WandB Configuration

In [10]:
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
wandb.login()

env: WANDB_PROJECT=aspect_category_detection
env: WANDB_NOTEBOOK_NAME=./acd_binary_trainer.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Model, Tokenizer, and Collator

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, label2id=label2id, id2label=id2label, num_labels=num_labels
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at snunlp/KR-ELECTRA-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-ELECTRA-discriminator and are newly initialized: ['classifier.dense

In [12]:
train_path = './dataset/uncleaned_v1/raw_train.csv'
dev_path = './dataset/uncleaned_v1/raw_dev.csv'
test_path = './dataset/uncleaned_v1/raw_test.csv'
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

In [13]:
### new
entity_property_pair = [
    'Î≥∏Ìíà#Í∞ÄÍ≤©', 'Î≥∏Ìíà#Îã§ÏñëÏÑ±', 'Î≥∏Ìíà#ÎîîÏûêÏù∏', 'Î≥∏Ìíà#Ïù∏ÏßÄÎèÑ', 'Î≥∏Ìíà#ÏùºÎ∞ò', 'Î≥∏Ìíà#Ìé∏ÏùòÏÑ±', 'Î≥∏Ìíà#ÌíàÏßà',
    'Î∏åÎûúÎìú#Í∞ÄÍ≤©', 'Î∏åÎûúÎìú#ÎîîÏûêÏù∏', 'Î∏åÎûúÎìú#Ïù∏ÏßÄÎèÑ', 'Î∏åÎûúÎìú#ÏùºÎ∞ò', 'Î∏åÎûúÎìú#ÌíàÏßà',
    'Ï†úÌíà Ï†ÑÏ≤¥#Í∞ÄÍ≤©', 'Ï†úÌíà Ï†ÑÏ≤¥#Îã§ÏñëÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÎîîÏûêÏù∏', 'Ï†úÌíà Ï†ÑÏ≤¥#Ïù∏ÏßÄÎèÑ', 'Ï†úÌíà Ï†ÑÏ≤¥#ÏùºÎ∞ò', 'Ï†úÌíà Ï†ÑÏ≤¥#Ìé∏ÏùòÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÌíàÏßà',
    'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Í∞ÄÍ≤©', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Îã§ÏñëÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÎîîÏûêÏù∏', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÏùºÎ∞ò', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Ìé∏ÏùòÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÌíàÏßà'
]


more_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

emojis = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))

tokensToAdd = more_tokens + emojis
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

In [14]:
data = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
print(len(data))
data = data.drop_duplicates()
print(len(data.drop_duplicates()))

7920
7915


In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))

tokenizerTrainData = data.sentence_form.to_list()
newTokenizer = tokenizer.train_new_from_iterator(tokenizerTrainData, vocab_size=1)

new_tokens = set(list(newTokenizer.vocab.keys())) - set(tokenizer.vocab.keys())
tokenizer.add_tokens(list(new_tokens) + tokensToAdd)
print(len(newTokenizer))
print(len(tokenizer))

model.resize_token_embeddings(len(tokenizer))

30000



3060
30117


Embedding(30117, 768)

In [17]:
len(new_tokens), new_tokens

(76,
 {'##…¢',
  '##…™',
  '##…¥',
  '## Ä',
  '##Àá',
  '##·¥ç',
  '##·¥ò',
  '##·¥õ',
  '##·¥ú',
  '##·¥†',
  '##·¥°',
  '##·µï',
  '##‚óç',
  '##‚ùî',
  '##‚ûï',
  '##„â¶',
  '##Íàç',
  '##Îúå',
  '##Ïùí',
  '##Ï£±',
  '##Ï®ï',
  '##Ï´ú',
  '##üë†',
  '##üíÑ',
  '##üíÜ',
  '##üíá',
  '##üï∑',
  '##üï∏',
  '##üöó',
  '##ü§°',
  '##ü•§',
  '…¢',
  '…™',
  '…¥',
  ' Ä',
  ' ú',
  'Àá',
  '“ì',
  '·¥ç',
  '·¥ò',
  '·¥õ',
  '·¥ú',
  '·¥†',
  '·¥°',
  '·µï',
  '‚è∞',
  '‚óç',
  '‚ùî',
  '‚ûï',
  '„â¶',
  'Íàç',
  'Îúå',
  'Îø§',
  'Ïì©',
  'Ïùí',
  'Ï£±',
  'Ï®ï',
  'Ï´ú',
  'Ï±≥',
  'üç∑',
  'üçº',
  'üêÑ',
  'üë†',
  'üíÑ',
  'üíÜ',
  'üíá',
  'üí°',
  'üí¨',
  'üï∑',
  'üï∏',
  'üï∫',
  'üòØ',
  'üò∫',
  'üöó',
  'ü§°',
  'ü•§'})

In [18]:
model.config.label2id, model.config.id2label, model.num_labels

({'True': 0, 'False': 1}, {0: 'True', 1: 'False'}, 2)

In [19]:
# entity_property_pair = [
#     'Î≥∏Ìíà#Í∞ÄÍ≤©', 'Î≥∏Ìíà#Îã§ÏñëÏÑ±', 'Î≥∏Ìíà#ÎîîÏûêÏù∏', 'Î≥∏Ìíà#Ïù∏ÏßÄÎèÑ', 'Î≥∏Ìíà#ÏùºÎ∞ò', 'Î≥∏Ìíà#Ìé∏ÏùòÏÑ±', 'Î≥∏Ìíà#ÌíàÏßà',
#     'Î∏åÎûúÎìú#Í∞ÄÍ≤©', 'Î∏åÎûúÎìú#ÎîîÏûêÏù∏', 'Î∏åÎûúÎìú#Ïù∏ÏßÄÎèÑ', 'Î∏åÎûúÎìú#ÏùºÎ∞ò', 'Î∏åÎûúÎìú#ÌíàÏßà',
#     'Ï†úÌíà Ï†ÑÏ≤¥#Í∞ÄÍ≤©', 'Ï†úÌíà Ï†ÑÏ≤¥#Îã§ÏñëÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÎîîÏûêÏù∏', 'Ï†úÌíà Ï†ÑÏ≤¥#Ïù∏ÏßÄÎèÑ', 'Ï†úÌíà Ï†ÑÏ≤¥#ÏùºÎ∞ò', 'Ï†úÌíà Ï†ÑÏ≤¥#Ìé∏ÏùòÏÑ±', 'Ï†úÌíà Ï†ÑÏ≤¥#ÌíàÏßà',
#     'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Í∞ÄÍ≤©', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Îã§ÏñëÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÎîîÏûêÏù∏', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÏùºÎ∞ò', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#Ìé∏ÏùòÏÑ±', 'Ìå®ÌÇ§ÏßÄ/Íµ¨ÏÑ±Ìíà#ÌíàÏßà'
# ]

# polarity_id_to_name = ['positive', 'negative', 'neutral']

# tokenizer_tester = []
# for pair in entity_property_pair:
#     for polarity in polarity_id_to_name:
#         tokenizer_tester.append('#'.join([pair, polarity]))

# for e in tokenizer_tester:
#     print(tokenizer.decode(tokenizer.encode(e)))

# for e in tokenizer_tester:
#     print(tokenizer.encode(e))

# Define Metric

In [20]:
accuracy_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [21]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_metric.compute(references=labels, predictions=predictions)['accuracy']
    f1_true, f1_false = tuple(f1_metric.compute(references=labels, predictions=predictions, average=None, labels=[0,1])['f1'])
    f1_macro = f1_metric.compute(references=labels, predictions=predictions, average='macro')['f1']
    f1_micro = f1_metric.compute(references=labels, predictions=predictions, average='micro')['f1']
    
    return {'accuracy': accuracy, 'f1_true': f1_true, 'f1_false': f1_false, 'f1_macro': f1_macro, 'f1_micro': f1_micro}

# Load Data

In [22]:
def preprocess_function(examples):
    return tokenizer(examples["sentence_form"], examples["entity_property"], truncation=True)

In [24]:
train_dataset = pd.read_csv(TRAIN_DATA_PATH)
eval_dataset = pd.read_csv(EVAL_DATA_PATH)
# train_dataset = pd.concat([train_dataset, eval_dataset])
train_dataset = datasets.Dataset.from_pandas(train_dataset) #.shuffle(seed=42)
eval_dataset = datasets.Dataset.from_pandas(eval_dataset) #.shuffle(seed=42)
train_dataset = train_dataset.map(preprocess_function, batched=False)
eval_dataset = eval_dataset.map(preprocess_function, batched=False)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 75000/75000 [00:19<00:00, 3809.29ex/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 69825/69825 [00:19<00:00, 3608.51ex/s]


In [25]:
len(train_dataset), len(eval_dataset)

(75000, 69825)

In [28]:
k = random.randrange(len(train_dataset))
print(tokenizer.decode(train_dataset['input_ids'][k]), train_dataset['labels'][k])
k = random.randrange(len(eval_dataset))
print(tokenizer.decode(eval_dataset['input_ids'][k]), eval_dataset['labels'][k])

[CLS] < < Î∞îÎ•¥ÏûêÎßàÏûê Î≥¥Îì§Î≥¥Îì§Ìï¥ÏÑú ÏïÑÏπ®Ïóê Î∞úÎûêÏùÑ Îïå Í∏∞Î∂ÑÏù¥ ÌäπÌûà Ï¢ãÏïÑÏöî > > [SEP] ÏÉÅÌíàÌèâ Î¨∏Ïû•Ïùò Î≤îÏ£º Ïú†ÌòïÏùÄ < < Ï†úÌíà Ï†ÑÏ≤¥ # Îã§ÏñëÏÑ± > > Ïù¥Îã§. [SEP] 1
[CLS] < < ÌôïÏã§Ìûà ÏÉùÏÇ∞ÌïòÏûêÎßàÏûê Í∏âÏÜçÎÉâÍ∞ÅÌïú # Ï†úÎåÄÌòàÎ∞∞ÏñëÏï° ÏùÑ ÎÇ¥Í∞Ä ÏõêÌï†Îïå ÏÑûÍ≥† Í≥†ÍπîÏùÑ ÏîåÏõåÏÑú ÏÇ¨Ïö©ÌïòÎäî Í±∞ÎãàÍπå Îçî Ïã†ÏÑ†ÌïòÍ≤å ÌîºÎ∂ÄÏóê Ï†ÑÎã¨Ìï¥Ï£ºÎäî Í≤É Í∞ôÏïÑÏöî ~ > > [SEP] ÏÉÅÌíàÌèâ Î¨∏Ïû•Ïùò Î≤îÏ£º Ïú†ÌòïÏùÄ < < Ìå®ÌÇ§ÏßÄ / Íµ¨ÏÑ±Ìíà # Í∞ÄÍ≤© > > Ïù¥Îã§. [SEP] 1


# Load Trainer

In [29]:
args = TrainingArguments(
    output_dir=run_name,
    run_name=run_name,
    report_to=report_to,

    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    optim=optim,

    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,

    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,

    save_total_limit=save_total_limit,

    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,

    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step, 
    logging_steps=logging_steps,
    
    fp16=fp16,
)

In [23]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # callbacks=[es],
)

# Run Trainer

In [25]:
trainer.train()
wandb.finish()

The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: entity_property, id, sentence_form. If entity_property, id, sentence_form are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 144825
  Num Epochs = 15
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 16980
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

: 

In [None]:
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    for item in os.listdir(ckpt):
        if item not in keep:
            os.remove(os.path.join(ckpt, item))

!mv wandb {run_name} {SAVE_PATH}/