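# Description

Fine-tunes a RoBERTa sequence classifier for aspect sentiment classification: given a sentence and an entity-property (aspect) pair, predict its polarity (`positive`, `negative` or `neutral`). The model is initialised from a checkpoint stored on Google Drive, trained with the Hugging Face `Trainer`, and logged to Weights & Biases.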

# Modules and Global Variables

In [4]:
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, 
    DefaultDataCollator, DataCollatorWithPadding, 
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd
import demoji

import os
import re
import random

In [5]:
### labels

# Label vocabularies for the two task variants ('ce' and 'pc').
pc_labels = ['positive', 'negative', 'neutral']
ce_labels = ['True', 'False']

# Label set used by this run.
labels = pc_labels

# Index <-> name lookups; a label's id is its position in `labels`.
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
### paths and names

# --- values edited per experiment ---
PROJECT_NAME = 'aspect_sentiment_classification'
RUN_ID = 'v5'

DATA_V = 'v9'
DATA_T = 'pc' # ce or pc
AUGMENTATION = False
AUG_NAME = 'balanced'

model_checkpoint = 'klue/roberta-base'

notebook_name = 'asc_encoder_klue_roberta_base_training.ipynb'

### fixed

# Filesystem-safe model identifier: '/' and '-' both become '_'.
model_name = model_checkpoint.replace('/', '_').replace('-', '_')
run_name = '_'.join([model_name, RUN_ID])

ROOT_PATH = '/content/drive/MyDrive/aspect_based_sentiment_analysis'
SAVE_PATH = os.path.join(ROOT_PATH, PROJECT_NAME, model_name)
NOTEBOOK_PATH = os.path.join(SAVE_PATH, notebook_name)

# Only the training file name carries the augmentation suffix.
if AUGMENTATION is True:
    augornot = f'_{AUG_NAME}'
else:
    augornot = ''
TRAIN_DATA_PATH = os.path.join(ROOT_PATH, 'data', DATA_V, f'{DATA_T}_train{augornot}.csv')
EVAL_DATA_PATH = os.path.join(ROOT_PATH, 'data', DATA_V, f'{DATA_T}_dev.csv')

# Report up front which of the expected locations are present.
for checked_path in (SAVE_PATH, NOTEBOOK_PATH, TRAIN_DATA_PATH, EVAL_DATA_PATH):
    status = 'exists' if os.path.exists(checked_path) else 'does not exist'
    print(f'{checked_path} {status}.')

/content/drive/MyDrive/aspect_based_sentiment_analysis/aspect_sentiment_classification/klue_roberta_base exists.
/content/drive/MyDrive/aspect_based_sentiment_analysis/aspect_sentiment_classification/klue_roberta_base/asc_encoder_klue_roberta_base_training.ipynb does not exist.
/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v9/pc_train.csv exists.
/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v9/pc_dev.csv exists.


In [7]:
### rest of training args

# Every name below is forwarded unchanged to TrainingArguments in the
# "Load Trainer" section. Trailing comments are the author's alternative values.

# --- experiment tracking / logging ---
report_to = 'wandb'
logging_strategy = 'steps'
logging_steps = 500
logging_first_step = True

# --- run length, batching, precision ---
num_train_epochs = 20
batch_size = 8
gradient_accumulation_steps = 1
fp16 = False

# --- optimizer ---
optim = 'adamw_torch' # 'adamw_hf'
learning_rate = 3e-6 # 5e-5
weight_decay = 0.01 # 0
adam_epsilon = 1e-8

# --- learning-rate schedule ---
lr_scheduler_type = 'cosine'
warmup_ratio = 0

# --- evaluation and checkpointing ---
evaluation_strategy = 'epoch'
save_strategy = 'epoch'
save_total_limit = 5
load_best_model_at_end = True
metric_for_best_model = 'eval_loss'

# WandB Configuration

In [8]:
# Configure Weights & Biases through environment variables, then log in.
# `{NAME}` is expanded by IPython from the notebook namespace, so this cell
# must run after the "paths and names" cell.
# Keep comments on their own lines: text after a %env value would become part of the value.
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
# NOTE(review): presumably these two control model-artifact upload and
# gradient/parameter watching - confirm against the wandb documentation.
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
wandb.login()



env: WANDB_PROJECT=aspect_sentiment_classification
env: WANDB_NOTEBOOK_NAME=/content/drive/MyDrive/aspect_based_sentiment_analysis/aspect_sentiment_classification/klue_roberta_base/asc_encoder_klue_roberta_base_training.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Load Model, Tokenizer, and Collator

In [9]:
# Weights come from a checkpoint on Drive, not from the hub id in `model_checkpoint`.
# NOTE(review): the directory name suggests an MLM-adapted klue/roberta-base - confirm.
ckpt = '/content/drive/MyDrive/aspect_based_sentiment_analysis/base_model/klue_roberta_base/v2/klue_roberta_base_mlm/checkpoint-19860'

# The tokenizer is loaded from the same checkpoint; the collator pads per batch with it.
tokenizer = AutoTokenizer.from_pretrained(ckpt)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Label settings from the labels cell are passed through to the classifier config.
label_settings = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(ckpt, **label_settings)

Some weights of the model checkpoint at /content/drive/MyDrive/aspect_based_sentiment_analysis/base_model/klue_roberta_base/v2/klue_roberta_base_mlm/checkpoint-19860 were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/aspect_based_sentiment_analy

# Define Metric

In [10]:
# Load both metric objects once; compute_metrics calls them on every evaluation.
accuracy_metric, f1_metric = (
    evaluate.load(metric_id) for metric_id in ('accuracy', 'f1')
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [11]:
def compute_metrics(eval_pred):
    """Compute accuracy plus per-class, macro and micro F1 for one evaluation pass.

    Per-class entries are derived from the notebook-level ``id2label`` mapping, so
    the metric names and class ids follow whichever label set is active instead of
    being fixed to three sentiment classes.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one score
            per class along axis 1 (the argmax over that axis is the predicted
            class id); ``labels`` holds the reference class ids.

    Returns:
        dict: ``accuracy``, one ``f1_<label name>`` entry per class in id order,
        then ``f1_macro`` and ``f1_micro``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    def f1_score_for(**options):
        # All F1 variants share the same references/predictions; only options differ.
        return f1_metric.compute(references=labels, predictions=predictions, **options)['f1']

    # Passing the class ids explicitly yields a score for every class, in id order,
    # even if a class is absent from this evaluation set.
    class_ids = sorted(id2label)
    per_class_f1 = f1_score_for(average=None, labels=class_ids)

    metrics = {
        'accuracy': accuracy_metric.compute(references=labels, predictions=predictions)['accuracy'],
    }
    for class_id, score in zip(class_ids, per_class_f1):
        metrics[f'f1_{id2label[class_id]}'] = score
    metrics['f1_macro'] = f1_score_for(average='macro')
    metrics['f1_micro'] = f1_score_for(average='micro')
    return metrics

# Load Data

In [12]:
def preprocess_function(examples):
    """Tokenize a sentence together with its aspect as a two-segment input.

    Args:
        examples: Mapping with the keys ``"sentence_form"`` (first segment) and
            ``"entity_property"`` (second segment).

    Returns:
        The output of the notebook-level ``tokenizer``, called with truncation
        enabled. Padding is not requested here.
    """
    sentence = examples["sentence_form"]
    aspect = examples["entity_property"]
    return tokenizer(sentence, aspect, truncation=True)

In [13]:
# Read both splits as DataFrames. They get their own names so the *_dataset
# names only ever refer to `datasets.Dataset` objects.
train_df = pd.read_csv(TRAIN_DATA_PATH)
eval_df = pd.read_csv(EVAL_DATA_PATH)

# NOTE(review): with this flag on, the dev split is appended to the training data.
# Every metric computed on eval_dataset, and the load_best_model_at_end selection,
# is then measured on examples the model was trained on. The default keeps the
# notebook's existing behaviour; set it to False for an unbiased validation signal.
INCLUDE_DEV_IN_TRAIN = True
if INCLUDE_DEV_IN_TRAIN:
    # ignore_index avoids duplicate index labels in the combined frame.
    train_df = pd.concat([train_df, eval_df], ignore_index=True)

# preserve_index=False keeps the pandas index out of the dataset; otherwise it
# is carried along as an extra `__index_level_0__` column.
train_dataset = datasets.Dataset.from_pandas(train_df, preserve_index=False).shuffle(seed=42)
eval_dataset = datasets.Dataset.from_pandas(eval_df, preserve_index=False).shuffle(seed=42)

# Tokenize in batches; the tokenizer accepts lists of sentences and aspects directly.
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

  0%|          | 0/6198 [00:00<?, ?ex/s]

  0%|          | 0/3002 [00:00<?, ?ex/s]

In [14]:
# Sanity check: number of rows in each split.
train_dataset.num_rows, eval_dataset.num_rows

(6198, 3002)

In [15]:
# Spot-check the preprocessing by decoding one random training example back to text.
k = random.randrange(len(train_dataset))
# Index the row first: train_dataset['input_ids'][k] would read the whole
# column just to look at a single entry.
tokenizer.decode(train_dataset[k]['input_ids'])

'[CLS] - 차이오 홈페이지와 오픈마켓 ( 11번가, 지마켓, 옥션 ) 에서 구입이 가능하고 다른 제품과 비교하여 이 제품만이 가지고 있는 의미있는 특이점인 완벽커버가능 ~ ~ [SEP] 제품 전체 # 품질 [SEP]'

# Load Trainer

In [16]:
# Gather the settings from the configuration cells into one mapping, grouped by
# concern, and hand them to TrainingArguments in a single call.
training_config = dict(
    # run identity and reporting (checkpoints are written under `run_name`)
    output_dir=run_name,
    run_name=run_name,
    report_to=report_to,
    # run length, batching, precision
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=fp16,
    # optimizer
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,
    # learning-rate schedule
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # checkpointing and evaluation
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,
    # logging
    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,
)

args = TrainingArguments(**training_config)

In [None]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [17]:
# Wire the model, hyperparameters, data, collator and metric function together.
# Every argument is passed by keyword so the call does not rely on parameter order.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # callbacks=[es],
)

# Run Trainer

In [18]:
# Always close the W&B run. Without this, a failed or interrupted training leaves
# the run open in the kernel.
try:
    trainer.train()
except BaseException:
    # Covers manual interrupts too: mark the run as failed, then surface the error.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: entity_property, sentence_form, id, __index_level_0__. If entity_property, sentence_form, id, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6198
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15500
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Positive,F1 Negative,F1 Neutral,F1 Macro,F1 Micro
1,0.2344,0.097219,0.977348,0.990641,0.630137,0.0,0.540259,0.977348
2,0.1285,0.088459,0.980013,0.991343,0.745763,0.0,0.579035,0.980013
3,0.1299,0.104559,0.98068,0.990166,0.857143,0.105263,0.650857,0.98068
4,0.1073,0.068435,0.986009,0.993869,0.793103,0.513514,0.766829,0.986009
5,0.0879,0.056873,0.98934,0.994888,0.901961,0.658824,0.851891,0.98934
6,0.0757,0.055402,0.99034,0.995568,0.888889,0.690476,0.858311,0.99034
7,0.0669,0.049784,0.992005,0.995907,0.923077,0.772727,0.897237,0.992005
8,0.05,0.038866,0.992672,0.996582,0.923077,0.8,0.906553,0.992672
9,0.0478,0.033726,0.99467,0.997607,0.923077,0.862745,0.92781,0.99467
10,0.0314,0.024718,0.99567,0.997949,0.923077,0.9,0.940342,0.99567


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, sentence_form, entity_property. If id, sentence_form, entity_property are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3002
  Batch size = 8
Saving model checkpoint to klue_roberta_base_v3/checkpoint-775
Configuration saved in klue_roberta_base_v3/checkpoint-775/config.json
Model weights saved in klue_roberta_base_v3/checkpoint-775/pytorch_model.bin
tokenizer config file saved in klue_roberta_base_v3/checkpoint-775/tokenizer_config.json
Special tokens file saved in klue_roberta_base_v3/checkpoint-775/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, sentence_form, entity_property. If id, sentence_form, entity_pro

0,1
eval/accuracy,▁▂▂▄▅▆▆▆▇▇▇▇████████
eval/f1_macro,▁▂▃▅▆▆▇▇▇███████████
eval/f1_micro,▁▂▂▄▅▆▆▆▇▇▇▇████████
eval/f1_negative,▁▄▆▅▇▇██████████████
eval/f1_neutral,▁▁▂▅▆▆▇▇▇███████████
eval/f1_positive,▁▂▁▄▅▅▆▆▇▇▇▇████████
eval/loss,▇▇█▅▄▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁
eval/runtime,▆▆▁▆▆▁▅▆▆▆▅▆▁▅█▁▅▅▆▅
eval/samples_per_second,▃▃█▃▃█▃▃▃▃▃▃█▃▁█▃▃▃▃
eval/steps_per_second,▃▃█▃▃█▃▃▃▃▃▃█▃▁█▃▃▃▃

0,1
eval/accuracy,0.99734
eval/f1_macro,0.95809
eval/f1_micro,0.99734
eval/f1_negative,0.92308
eval/f1_neutral,0.95238
eval/f1_positive,0.9988
eval/loss,0.01213
eval/runtime,11.223
eval/samples_per_second,267.486
eval/steps_per_second,33.503


In [19]:
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    for item in os.listdir(ckpt):
        if item not in keep:
            os.remove(os.path.join(ckpt, item))

!cp -r wandb {run_name} {SAVE_PATH}/