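# Description

Fine-tunes `snunlp/KR-ELECTRA-discriminator` as a binary (`True`/`False`) sequence-pair classifier for aspect category detection: each input pairs a sentence with one `entity#property` category. The tokenizer is first extended with tokens found in the NIKL ABSA 2022 competition sentences, training runs through the Hugging Face `Trainer` with Weights & Biases logging, and the pruned checkpoints are copied to Google Drive at the end.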


# Modules and Global Variables

In [4]:
import os
import random
import re

import datasets
import demoji
import evaluate
import numpy as np
import pandas as pd
import wandb
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification,
    DefaultDataCollator, DataCollatorWithPadding,
    TrainingArguments, Trainer,
    set_seed,
)
from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

In [5]:
### labels

# Candidate label sets, one per dataset flavour ('ce', 'pc', 'pc_binary').
ce_labels = ['True', 'False']
pc_labels = ['positive', 'negative', 'neutral']
pc_binary_labels = ['True', 'False']

# Label set used by this run; switch this line to train on another flavour.
labels = ce_labels

# Index <-> name lookups handed to the model config.
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print(label2id)
print(id2label)

{'True': 0, 'False': 1}
{0: 'True', 1: 'False'}


In [6]:
### paths and names

PROJECT_NAME = 'aspect_category_detection'
RUN_ID = 'v1'

DATA_V = 'v10'
DATA_T = 'ce' # ce or pc or pc_binary
AUGMENTATION = False
AUG_NAME = 'balanced'

model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

notebook_name = 'acd_snunlp_kr_electra_discriminator_v1.ipynb'

### fixed

# Filesystem-friendly model name: '/' and '-' become '_', then lower-cased.
model_name = re.sub(r'[/-]', r'_', model_checkpoint).lower()
run_name = f'{model_name}_{RUN_ID}'

ROOT_PATH = '/content/drive/MyDrive/aspect_based_sentiment_analysis'
SAVE_PATH = os.path.join(ROOT_PATH, PROJECT_NAME, model_name)
NOTEBOOK_PATH = os.path.join(SAVE_PATH, notebook_name)

# The training file name carries an '_<AUG_NAME>' suffix only when augmentation is on.
augornot = f'_{AUG_NAME}' if AUGMENTATION is True else ''
TRAIN_DATA_PATH = os.path.join(ROOT_PATH, 'data', DATA_V, f'{DATA_T}_train{augornot}.csv')
EVAL_DATA_PATH = os.path.join(ROOT_PATH, 'data', DATA_V, f'{DATA_T}_dev.csv')

# Report up front whether each required location is present, so a missing
# Drive mount or data file is noticed before any expensive cell runs.
for path_to_check in (SAVE_PATH, NOTEBOOK_PATH, TRAIN_DATA_PATH, EVAL_DATA_PATH):
    status = 'exists' if os.path.exists(path_to_check) else 'does not exist'
    print(f'{path_to_check} {status}.')

/content/drive/MyDrive/aspect_based_sentiment_analysis/aspect_category_detection/snunlp_kr_electra_discriminator exists.
/content/drive/MyDrive/aspect_based_sentiment_analysis/aspect_category_detection/snunlp_kr_electra_discriminator/acd_snunlp_kr_electra_discriminator_v1.ipynb exists.
/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v10/ce_train.csv exists.
/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v10/ce_dev.csv exists.


In [7]:
### rest of training args

# Experiment tracking backend.
report_to = 'wandb'

# Mixed precision is switched off for this run.
fp16 = False

# Schedule length and batching.
num_train_epochs = 10
batch_size = 8
gradient_accumulation_steps = 1

# Optimizer selection and its hyperparameters.
optim = 'adamw_torch'  # 'adamw_hf'
learning_rate = 3e-6  # 5e-5
weight_decay = 0.01  # 0
adam_epsilon = 1e-8

# Learning-rate schedule.
lr_scheduler_type = 'cosine'
warmup_ratio = 0

# Checkpointing: save and evaluate once per epoch, and reload the checkpoint
# with the best `metric_for_best_model` when training ends.
save_strategy = 'epoch'
evaluation_strategy = 'epoch'
save_total_limit = 10
load_best_model_at_end = True
metric_for_best_model = 'eval_loss'

# Training-loss logging cadence.
logging_strategy = 'steps'
logging_first_step = True
logging_steps = 500

# WandB Configuration

In [8]:
# Configure the Weights & Biases integration through environment variables;
# `{NAME}` is expanded by IPython from the notebook's Python variables.
# Project and notebook name under which the run is recorded:
%env WANDB_PROJECT={PROJECT_NAME}
%env WANDB_NOTEBOOK_NAME={NOTEBOOK_PATH}
# NOTE(review): per the transformers W&B integration docs these two settings
# turn on model-artifact upload and gradient/parameter logging - confirm for
# the installed version.
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
# Authenticate this session with W&B.
wandb.login()

env: WANDB_PROJECT=aspect_category_detection
env: WANDB_NOTEBOOK_NAME=/content/drive/MyDrive/aspect_based_sentiment_analysis/aspect_category_detection/snunlp_kr_electra_discriminator/acd_snunlp_kr_electra_discriminator_v1.ipynb
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Load Model, Tokenizer, and Collator

In [9]:
# Seed Python, NumPy and torch before the model is built: the classification
# head is not part of the pretrained checkpoint and is freshly initialised, so
# without a seed every kernel session starts from different weights.
set_seed(42)

# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sequence classifier on top of the pretrained encoder, with the label
# mappings stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, label2id=label2id, id2label=id2label, num_labels=num_labels
)

# Pads each batch dynamically at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading:   0%|          | 0.00/56.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/468 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/214k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at snunlp/KR-ELECTRA-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-ELECTRA-discriminator and are newly initialized: ['classifier.out_p

In [10]:
# Raw competition splits (JSON Lines), resolved under ROOT_PATH instead of
# repeating the absolute Drive prefix three times.
RAW_DATA_DIR = os.path.join(ROOT_PATH, 'data', 'NIKL_ABSA_2022_COMPETITION')
train_json = os.path.join(RAW_DATA_DIR, 'nikluge-sa-2022-train.jsonl')
dev_json = os.path.join(RAW_DATA_DIR, 'nikluge-sa-2022-dev.jsonl')
test_json = os.path.join(RAW_DATA_DIR, 'nikluge-sa-2022-test.jsonl')

# One record per line, hence lines=True.
train = pd.read_json(train_json, lines=True)
dev = pd.read_json(dev_json, lines=True)
test = pd.read_json(test_json, lines=True)

In [11]:
### new
# The 25 `entity#property` aspect categories used as the second segment of
# every model input.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]

# NOTE(review): these look like anonymisation placeholders found in the corpus - confirm.
more_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# Every sentence from the three splits as one Series, so the emoji scan runs
# over a single joined string.
all_sentences = pd.concat(
    [train.sentence_form, dev.sentence_form, test.sentence_form],
    ignore_index=True,
    verify_integrity=True,
)

# Unique emojis occurring anywhere in the corpus.
emojis = list(set(demoji.findall(' '.join(all_sentences.to_list())).keys()))

# Candidate extra vocabulary; the only use visible in this notebook is a
# commented-out line in the tokenizer-extension cell.
tokensToAdd = more_tokens + emojis

# The aspect names as a Series named like the sentence column, so they can be
# appended to the tokenizer training text.
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

In [12]:
# Pool the sentences of all three splits into a single-column frame that
# serves as tokenizer training text.
data = pd.concat(
    [split.sentence_form for split in (train, dev, test)],
    ignore_index=True,
    verify_integrity=True,
).to_frame()

def preprocess(sent):
    """Normalise one sentence before it is used as tokenizer training text.

    Steps, in order: trim surrounding whitespace, collapse each run of
    whitespace into a single space, then delete every '#' character.
    Because '#' is removed last, a '#' that had spaces on both sides leaves
    two adjacent spaces behind.

    Args:
        sent: the raw sentence string.

    Returns:
        The cleaned sentence string.
    """
    # Emoji-to-description replacement is intentionally disabled:
    # demoji.replace_with_desc(string=sent, sep=" ")
    collapsed = re.sub(r'\s+', ' ', sent.strip())
    return collapsed.replace('#', '')

# Clean every sentence, then append the aspect-category names so their
# characters are also seen when the tokenizer alphabet is derived.
data['sentence_form'] = data['sentence_form'].apply(preprocess)
data = pd.concat([data['sentence_form'], ep_labels], ignore_index=True, verify_integrity=True).to_frame()

# Row count before and after removing duplicate rows; the de-duplicated
# frame is computed once and reused for the second count.
print(len(data))
data = data.drop_duplicates()
print(len(data))

7947
7940


In [13]:
# Start again from the pristine pretrained tokenizer so re-running this cell
# always extends the same base vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))

# Train a throwaway tokenizer on the corpus with the smallest possible vocab
# target; what comes back is used only as a source of tokens that the
# pretrained vocabulary lacks.
tokenizerTrainData = data.sentence_form.to_list()
newTokenizer = tokenizer.train_new_from_iterator(tokenizerTrainData, vocab_size=1)

# Alternative that also adds the placeholder/emoji tokens:
# new_tokens = set(list(newTokenizer.vocab.keys()) + tokensToAdd) - set(tokenizer.vocab.keys())
new_tokens = set(newTokenizer.get_vocab()) - set(tokenizer.get_vocab())

# Sort before adding: set iteration order for strings changes between Python
# processes, which would give the new tokens different ids on every run.
tokenizer.add_tokens(sorted(new_tokens))
print(len(newTokenizer))
print(len(tokenizer))

# `tokenizer` now names a new object, so rebuild the collator on it rather
# than leaving it attached to the tokenizer instance created earlier.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Grow the embedding matrix to cover the added tokens.
model.resize_token_embeddings(len(tokenizer))

30000
3060
30076


Embedding(30076, 768)

In [14]:
# Sanity check: show the label mappings and label count the model carries.
model.config.label2id, model.config.id2label, model.num_labels

({'True': 0, 'False': 1}, {0: 'True', 1: 'False'}, 2)

In [15]:
# Round-trip every `entity#property#polarity` string through the extended
# tokenizer to eyeball how it is split. `entity_property_pair` is the list
# defined earlier in the notebook, not a second copy.
polarity_id_to_name = ['positive', 'negative', 'neutral']

tokenizer_tester = [
    '#'.join([pair, polarity])
    for pair in entity_property_pair
    for polarity in polarity_id_to_name
]

for e in tokenizer_tester:
    print(tokenizer.decode(tokenizer.encode(e)))

[CLS] 본품 # 가격 # positive [SEP]
[CLS] 본품 # 가격 # negative [SEP]
[CLS] 본품 # 가격 # neutral [SEP]
[CLS] 본품 # 다양성 # positive [SEP]
[CLS] 본품 # 다양성 # negative [SEP]
[CLS] 본품 # 다양성 # neutral [SEP]
[CLS] 본품 # 디자인 # positive [SEP]
[CLS] 본품 # 디자인 # negative [SEP]
[CLS] 본품 # 디자인 # neutral [SEP]
[CLS] 본품 # 인지도 # positive [SEP]
[CLS] 본품 # 인지도 # negative [SEP]
[CLS] 본품 # 인지도 # neutral [SEP]
[CLS] 본품 # 일반 # positive [SEP]
[CLS] 본품 # 일반 # negative [SEP]
[CLS] 본품 # 일반 # neutral [SEP]
[CLS] 본품 # 편의성 # positive [SEP]
[CLS] 본품 # 편의성 # negative [SEP]
[CLS] 본품 # 편의성 # neutral [SEP]
[CLS] 본품 # 품질 # positive [SEP]
[CLS] 본품 # 품질 # negative [SEP]
[CLS] 본품 # 품질 # neutral [SEP]
[CLS] 브랜드 # 가격 # positive [SEP]
[CLS] 브랜드 # 가격 # negative [SEP]
[CLS] 브랜드 # 가격 # neutral [SEP]
[CLS] 브랜드 # 디자인 # positive [SEP]
[CLS] 브랜드 # 디자인 # negative [SEP]
[CLS] 브랜드 # 디자인 # neutral [SEP]
[CLS] 브랜드 # 인지도 # positive [SEP]
[CLS] 브랜드 # 인지도 # negative [SEP]
[CLS] 브랜드 # 인지도 # neutral [SEP]
[CLS] 브랜드 # 일반 # positive [SEP]
[CLS] 브랜드 # 일반 # nega

# Define Metric

In [16]:
# Load the two `evaluate` metrics used by compute_metrics (accuracy first, then F1).
accuracy_metric, f1_metric = (evaluate.load(metric_id) for metric_id in ('accuracy', 'f1'))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy and several F1 variants for one evaluation pass.

    Args:
        eval_pred: a `(predictions, labels)` pair. The predictions are
            reduced with an arg-max over axis 1, so they are assumed to be
            per-class scores of shape (n_examples, n_classes) - TODO confirm.

    Returns:
        dict with keys 'accuracy', 'f1_true', 'f1_false', 'f1_macro' and
        'f1_micro'. 'f1_true' / 'f1_false' are the per-class F1 scores for
        label ids 0 and 1 respectively.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    def f1(**options):
        # Every F1 variant shares the same references and predictions.
        return f1_metric.compute(references=references, predictions=predicted, **options)['f1']

    accuracy = accuracy_metric.compute(references=references, predictions=predicted)['accuracy']
    # average=None yields one score per entry of `labels`, in that order.
    f1_true, f1_false = tuple(f1(average=None, labels=[0, 1]))
    f1_macro = f1(average='macro')
    f1_micro = f1(average='micro')

    return {
        'accuracy': accuracy,
        'f1_true': f1_true,
        'f1_false': f1_false,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
    }

# Load Data

In [18]:
def preprocess_function(examples):
    """Tokenize a sentence together with its aspect category as a text pair.

    Args:
        examples: a mapping providing the "sentence_form" and
            "entity_property" fields.

    Returns:
        Whatever the notebook-level `tokenizer` returns for the pair, with
        truncation enabled.
    """
    sentences = examples["sentence_form"]
    aspects = examples["entity_property"]
    return tokenizer(sentences, aspects, truncation=True)

In [19]:
# Read the prepared CSV splits. The DataFrame and Dataset stages get their
# own names so neither name changes type halfway through the cell.
train_df = pd.read_csv(TRAIN_DATA_PATH)
eval_df = pd.read_csv(EVAL_DATA_PATH)
# train_df = pd.concat([train_df, eval_df])

# Convert to `datasets.Dataset` and shuffle with a fixed seed.
train_dataset = datasets.Dataset.from_pandas(train_df).shuffle(seed=42)
eval_dataset = datasets.Dataset.from_pandas(eval_df).shuffle(seed=42)

# Tokenize in batches: the tokenizer accepts lists of sentences and aspects,
# so each call handles many rows instead of one, with the same encodings
# (no padding is applied at this stage).
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

  0%|          | 0/75000 [00:00<?, ?ex/s]

  0%|          | 0/69825 [00:00<?, ?ex/s]

In [20]:
# Number of examples in the tokenized training and evaluation sets.
len(train_dataset), len(eval_dataset)

(75000, 69825)

In [21]:
# Spot-check one random training example: decoded input text and its label.
# Index the row first; asking for the full 'input_ids' column and then taking
# element k would load every example just to look at one.
k = random.randrange(len(train_dataset))
sample = train_dataset[k]
tokenizer.decode(sample['input_ids']), sample['labels']

('[CLS] 가방에 보냉파트가 있어 이유식이나 물, 음료 시원하게 ~ [SEP] 패키지 / 구성품 # 디자인 [SEP]', 1)

# Load Trainer

In [22]:
# Assemble the Trainer configuration from the hyperparameter variables
# defined earlier in the notebook.
args = TrainingArguments(
    # Run identity and experiment tracking.
    output_dir=run_name,
    run_name=run_name,
    report_to=report_to,

    # Schedule length, batching and precision.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=fp16,

    # Optimizer and learning-rate schedule.
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,

    # Evaluation, checkpointing and best-model selection.
    evaluation_strategy=evaluation_strategy,
    save_strategy=save_strategy,
    save_total_limit=save_total_limit,
    load_best_model_at_end=load_best_model_at_end,
    metric_for_best_model=metric_for_best_model,

    # Training-loss logging.
    logging_strategy=logging_strategy,
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,
)

In [23]:
# es = EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)

In [24]:
# Wire the model, data, tokenizer, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # callbacks=[es],
)

# Run Trainer

In [25]:
# Run fine-tuning with the configuration assembled in `args`.
trainer.train()
# Close the active W&B run once training has returned.
wandb.finish()

The following columns in the training set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: entity_property, id, sentence_form. If entity_property, id, sentence_form are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 75000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 93750
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mdotsnangles[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 True,F1 False,F1 Macro,F1 Micro
1,0.1153,0.127637,0.970068,0.618056,0.984424,0.80124,0.970068
2,0.0997,0.096749,0.975625,0.688962,0.987315,0.838139,0.975625
3,0.0871,0.104082,0.976126,0.696633,0.987574,0.842104,0.976126
4,0.0744,0.113014,0.976112,0.707368,0.987548,0.847458,0.976112
5,0.0744,0.122972,0.975696,0.706351,0.987324,0.846837,0.975696
6,0.048,0.143467,0.976241,0.710623,0.987612,0.849117,0.976241
7,0.0468,0.150474,0.975782,0.70576,0.987371,0.846565,0.975782
8,0.0452,0.157362,0.975424,0.704342,0.987179,0.845761,0.975424
9,0.0329,0.160836,0.975539,0.706327,0.987238,0.846783,0.975539
10,0.0321,0.161451,0.975682,0.708948,0.987311,0.848129,0.975682


The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been ignored: entity_property, id, sentence_form. If entity_property, id, sentence_form are not expected by `ElectraForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 69825
  Batch size = 8
Saving model checkpoint to snunlp_kr_electra_discriminator_v1/checkpoint-9375
Configuration saved in snunlp_kr_electra_discriminator_v1/checkpoint-9375/config.json
Model weights saved in snunlp_kr_electra_discriminator_v1/checkpoint-9375/pytorch_model.bin
tokenizer config file saved in snunlp_kr_electra_discriminator_v1/checkpoint-9375/tokenizer_config.json
Special tokens file saved in snunlp_kr_electra_discriminator_v1/checkpoint-9375/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `ElectraForSequenceClassification.forward` and have been i

0,1
eval/accuracy,▁▇██▇█▇▇▇▇
eval/f1_false,▁▇██▇█▇▇▇▇
eval/f1_macro,▁▆▇███████
eval/f1_micro,▁▇██▇█▇▇▇▇
eval/f1_true,▁▆▇███████
eval/loss,▄▁▂▃▄▆▇███
eval/runtime,▇▂▁▄██▅▁▄▆
eval/samples_per_second,▂▇█▅▁▁▄█▅▃
eval/steps_per_second,▂▇█▅▁▁▄█▅▃
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.97568
eval/f1_false,0.98731
eval/f1_macro,0.84813
eval/f1_micro,0.97568
eval/f1_true,0.70895
eval/loss,0.16145
eval/runtime,241.8241
eval/samples_per_second,288.743
eval/steps_per_second,36.096
train/epoch,10.0


In [26]:
keep = [
    'added_tokens.json',
    'config.json',
    'pytorch_model.bin',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

ckpts = os.listdir(run_name)
for ckpt in ckpts:
    ckpt = os.path.join(run_name, ckpt)
    for item in os.listdir(ckpt):
        if item not in keep:
            os.remove(os.path.join(ckpt, item))

!cp -r wandb {run_name} {SAVE_PATH}/

# Model Test

In [27]:
# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# asc_model = model
# asc_tokenizer = tokenizer

In [28]:
# # 이 정도 크기면 포스가 느껴진다..	본품#일반	positive
# # 토미진 샀으니 당분간 이제 청바지 생각은 안녕.	제품 전체#일반	positive

# # 지금까지로선 분침 넘어가는 건 상당히 부드러운데 시간이 바뀔 때는 좀 소리가 난다.	제품 전체#일반	neutral
# # 그것만 아니면 책상 위에 놓고 보고 싶다.	제품 전체#일반	neutral
# # 면이 부드러운 것도 게중 있는 한데 그런 건 또 품절이다.	제품 전체#일반	neutral
# # 나의 경우에는 미스 식스티가 가장 낫지만 얘는 날이 갈수록 심각하게 비싸지고 있어서...	본품#일반	neutral
# # 소니의 무게가 배터리 포함 117g이라 망설였는데 140g도 요즘 나오는 스마트폰 ...	제품 전체#편의성	neutral
# # 고냥이 동영상도 찍어봤는데 HD화질이 아니어서 아쉽긴 하지만 촬영중 줌도 되고, 음...	제품 전체#품질	neutral
# # 전에 카메라도 지겹게 캐논이라 7년을 들고 있었는데 또 캐논이라니..	브랜드#일반	neutral
# # 어차피 진이 다 거기서 거기지 뭐..	제품 전체#일반	neutral

# # 리바이스도 결코 싸지 않지만 입어보면 왜 이렇게 불편한지..	제품 전체#일반	negative
# # 내 체형에는 잘 맞지 않는 거 같다.	제품 전체#일반	negative
# # 스판이 안 들어간 것은 너무 뻣뻣하고, 스판이 들어간 건 좀 더 낫지만 뒷태를 비롯...	제품 전체#디자인	negative

# form = '토미진 샀으니 당분간 이제 청바지 생각은 안녕.'
# pair = '제품 전체#일반'
# sentiments = ['positive', 'negative', 'neutral']
# asc_pair = []
# for sentiment in sentiments:
#     asc_pair.append('#'.join([pair, sentiment]))

# positive = asc_tokenizer(form, asc_pair[0], truncation=True, return_tensors="pt")
# positive = {k:v.to(device) for k,v in positive.items()}
# negative = asc_tokenizer(form, asc_pair[1], truncation=True, return_tensors="pt")
# negative = {k:v.to(device) for k,v in negative.items()}
# neutral = asc_tokenizer(form, asc_pair[2], truncation=True, return_tensors="pt")
# neutral = {k:v.to(device) for k,v in neutral.items()}

# with torch.no_grad():
#     positive_outputs = asc_model(**positive)
#     negative_outputs = asc_model(**negative)
#     neutral_outputs = asc_model(**neutral)

# pc_predictions = torch.tensor([positive_outputs['logits'][0][0], negative_outputs['logits'][0][0], neutral_outputs['logits'][0][0]]).argmax(-1)
# pc_result = polarity_id_to_name[pc_predictions]

# # sentence['annotation'].append([pair, pc_result])

In [29]:
# positive_outputs['logits'], negative_outputs['logits'], neutral_outputs['logits']

In [30]:
# positive_outputs['logits'].argmax(-1), negative_outputs['logits'].argmax(-1), neutral_outputs['logits'].argmax(-1)

In [31]:
# pc_result