In [1]:
!python --version

Python 3.7.15


In [None]:
!pip install transformers
!pip install datasets

import nltk

nltk.download("punkt")

# emoBERTa

In [None]:
import os
import re
import warnings
from google.colab import drive

import torch
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
)
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import accuracy_score, f1_score

drive.mount("/content/drive")
warnings.filterwarnings("ignore")

In [4]:
# 상수 선언
CWD = "/content/drive/MyDrive/DACON"


def join_path(*args):
    return os.path.join(CWD, *args)


DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TRAIN_CSV = join_path("data", "train.csv")
TEST_CSV = join_path("data", "test.csv")
MODEL = "tae898/emoberta-large"
BATCH_SIZE = 32
EPOCHS = 4
MAX_LENGTH = 128

TRAIN_ARGS = TrainingArguments(
    output_dir=join_path("emoberta"),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=3e-6,
    warmup_steps=500,
    weight_decay=0.01,
    dataloader_num_workers=0,
    save_total_limit=1,
    run_name="[DACON]emoberta",
)

# Tokenizers

In [5]:
# 데이터 전처리 tokenizers 정의
twt_tokenizer = nltk.tokenize.TweetTokenizer(
    preserve_case=False, strip_handles=True, reduce_len=True
)
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(MODEL, truncation=True)


def shorten_repeated_words(tokens):
    for i, token in enumerate(tokens):
        if "-" in token:
            token = token.split("-")
            token = "-".join(dict.fromkeys(token))
            tokens[i] = token
    return tokens


def decode_tokens(tokens):
    sentence = " ".join(tokens)
    marks = re.findall(r"\s\W\s*", sentence)
    for mark in marks:
        if mark.strip() in ["'", "’"]:
            sentence = sentence.replace(mark, mark.strip())
        else:
            sentence = sentence.replace(mark, mark.lstrip())
    return sentence


def twt_tokenize(sentence):
    twt_tokens = twt_tokenizer.tokenize(sentence)
    twt_tokens = shorten_repeated_words(twt_tokens)
    twt_sentence = decode_tokens(twt_tokens)
    return twt_sentence

In [6]:
# 전처리 테스트
sent1 = "And-and-and-and-and the cars?"
sent2 = "Awwwwwww?! Cheol-su!"
sent3 = "I didn't. I didn't."
print(f"{sent1}\n>>> {twt_tokenize(sent1)}\n")
print(f"{sent2}\n>>> {twt_tokenize(sent2)}\n")
print(f"{sent3}\n>>> {twt_tokenize(sent3)}\n")

And-and-and-and-and the cars?
>>> and the cars?

Awwwwwww?! Cheol-su!
>>> awww?! cheol-su!

I didn't. I didn't.
>>> i didn't. i didn't.



In [7]:
class LabelEncoder(object):
    def __init__(self):
        self._targets = [
            "neutral",
            "joy",
            "surprise",
            "anger",
            "sadness",
            "disgust",
            "fear",
        ]
        self.target_size = len(self._targets)

    def encode(self, labels):
        labels = [self._targets.index(lb) for lb in labels]
        return labels

    def decode(self, labels):
        labels = [self._targets[lb] for lb in labels]
        return labels

# Dataset

In [8]:
# 데이터 전처리
train_csv = pd.read_csv(TRAIN_CSV)
train_csv["Utterance"] = train_csv["Utterance"].map(twt_tokenize)
train_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also i was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,you must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,that i did. that i did.,Chandler,0,neutral
3,TRAIN_0003,so let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,my duties? all right.,Chandler,0,surprise


In [9]:
# 레이블 인코딩
label_encoder = LabelEncoder()
label_size = label_encoder.target_size
train_csv["Target"] = label_encoder.encode(train_csv["Target"])

# Train-set, Eval-set 나누기
train_id = len(train_csv) // 8
dialogue_id = train_csv.loc[train_id, "Dialogue_ID"]
df_train, df_eval = train_csv[: train_id + 1], train_csv[train_id:]

# 데이터 컬럼  선택
df_train = df_train.loc[:, ["Utterance", "Target"]].rename(columns={"Target": "label"})
df_eval = df_eval.loc[:, ["Utterance", "Target"]].rename(columns={"Target": "label"})
df_train.head()

Unnamed: 0,Utterance,label
0,also i was the point person on my company’s tr...,0
1,you must’ve had your hands full.,0
2,that i did. that i did.,0
3,so let’s talk a little bit about your duties.,0
4,my duties? all right.,2


In [10]:
def roberta_tokenize(data):
    return roberta_tokenizer(
        data["Utterance"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )


# 데이터 토큰화
train_set = Dataset.from_pandas(df_train.reset_index(drop=True))
eval_set = Dataset.from_pandas(df_eval.reset_index(drop=True))

train_set = train_set.map(roberta_tokenize, batched=True, batch_size=len(train_set))
eval_set = eval_set.map(roberta_tokenize, batched=True, batch_size=len(eval_set))

train_set.set_format("torch", columns=["input_ids", "attention_mask", "label"])
eval_set.set_format("torch", columns=["input_ids", "attention_mask", "label"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Model

In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"f1-macro": f1, "accuracy": acc}


torch.cuda.empty_cache()

# 모델 정의
model = RobertaForSequenceClassification.from_pretrained(MODEL, num_labels=label_size)

trainer = Trainer(
    model=model,
    args=TRAIN_ARGS,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=eval_set,
)

In [None]:
# 모델 학습
trainer.train()

In [13]:
# 모델 평가
model_eval = trainer.evaluate()
print(f"Accuracy: {model_eval['eval_accuracy']:.5f}")
print(f"F1-macro: {model_eval['eval_f1-macro']:.5f}")

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Utterance. If Utterance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8741
  Batch size = 32


Accuracy: 0.77657
F1-macro: 0.65906


In [14]:
# 모델 저장
trainer.save_model(join_path("emoberta"))
torch.cuda.empty_cache()

Saving model checkpoint to /content/drive/MyDrive/DACON/emoberta
Configuration saved in /content/drive/MyDrive/DACON/emoberta/config.json
Model weights saved in /content/drive/MyDrive/DACON/emoberta/pytorch_model.bin


# Prediction

In [15]:
# Test 데이터
test_csv = pd.read_csv(join_path("data", "test.csv"))
test_csv["Utterance"] = test_csv["Utterance"].map(twt_tokenize)
test_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID
0,TEST_0000,why do all the coffee cups have figures below?,Mark,0
1,TEST_0001,"oh. it's so monica can follow. of this way, if...",Rachell,0
2,TEST_0002,you know what?,Rachell,0
3,TEST_0003,"come on, lydia, you can do it.",Joeyy,1
4,TEST_0004,to push!,Joeyy,1


In [16]:
def roberta_tokenize(data):
    return roberta_tokenizer(
        data["Utterance"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )


df_test = test_csv.loc[:, "Utterance"].to_frame()
test_set = Dataset.from_pandas(df_test.reset_index(drop=True))
test_set = test_set.map(roberta_tokenize, batched=True, batch_size=len(test_set))
test_set.set_format("torch", columns=["input_ids", "attention_mask"])

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
def predict(model, test_set):
    model.to(DEVICE)
    model.eval()

    test_predict = []
    for data in tqdm(test_set):
        input_id = data["input_ids"].unsqueeze(0).to(DEVICE)
        mask = data["attention_mask"].unsqueeze(0).to(DEVICE)
        output = model(input_id, mask)
        y_pred = output.logits
        test_predict += y_pred.argmax(1).detach().cpu().numpy().tolist()
    return test_predict

In [18]:
# 레이블 예측
preds = predict(model, test_set)
preds = label_encoder.decode(preds)

100%|██████████| 2610/2610 [01:25<00:00, 30.65it/s]


In [19]:
test_csv["Target"] = preds
submit = test_csv.loc[:, ["ID", "Target"]]
submit.head()

Unnamed: 0,ID,Target
0,TEST_0000,surprise
1,TEST_0001,neutral
2,TEST_0002,neutral
3,TEST_0003,neutral
4,TEST_0004,joy


In [20]:
# 예측 값 저장
submit.to_csv(join_path("emoberta", "submit.csv"), index=False)