In [None]:
# Install the Hugging Face libraries used by the rest of the notebook.
# NOTE(review): versions are unpinned; consider pinning them for reproducible runs.
!pip install transformers
!pip install datasets

import nltk
# Fetch the NLTK "punkt" resource.
# NOTE(review): only TweetTokenizer is used below — confirm this download is still needed.
nltk.download("punkt")

In [None]:
# Print the interpreter version of the runtime for reference.
!python --version

Python 3.7.15


In [None]:
import os 
import re
import warnings
from google.colab import drive

import torch
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
from nltk.tokenize import TweetTokenizer
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split 


# Mount Google Drive; CWD (data and model output) lives under /content/drive.
drive.mount("/content/drive")
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider a narrower filter.
warnings.filterwarnings("ignore")

In [None]:
# Constants
CWD = "/content/drive/MyDrive/DACON"


def join_path(*parts):
    """Return the path obtained by joining ``parts`` onto the project root ``CWD``.

    With no arguments the project root itself is returned.
    """
    project_path = os.path.join(CWD, *parts)
    return project_path

# Runtime device: GPU when PyTorch can see one, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset locations under CWD.
TRAIN_CSV = join_path("data", "train.csv")
TEST_CSV = join_path("data", "test.csv")

# Pretrained checkpoint and the sub-directory (under CWD) used for outputs.
MODEL = "siebert/sentiment-roberta-large-english"
MODEL_DIR = "siebert"

# Hyperparameters.
BATCH_SIZE = 32
EPOCHS = 20
MAX_LENGTH = 128

# Trainer configuration: evaluate once per epoch, write no intermediate checkpoints.
TRAIN_ARGS = TrainingArguments(
    run_name=f"[DACON]{MODEL_DIR}",
    output_dir=join_path(MODEL_DIR),
    # schedule / optimisation
    num_train_epochs=EPOCHS,
    learning_rate=5e-6,
    warmup_steps=500,
    weight_decay=0.01,
    # batching / data loading
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    dataloader_num_workers=0,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="no",
    save_total_limit=1,
)

In [None]:
# Tweet-oriented word tokenizer used to normalize utterances before RoBERTa
# sees them. The directly imported TweetTokenizer is used so this cell does not
# depend on the `import nltk` that lives in the install cell.
twt_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
# Sub-word tokenizer matching the pretrained checkpoint.
# NOTE(review): truncation is also requested at call time in roberta_tokenize;
# confirm the load-time `truncation=True` is still needed.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    truncation=True,
)

def shorten_repeated_words(tokens):
    """Collapse stuttered hyphenated tokens, e.g. ``"no-no-no"`` -> ``"no"``.

    Only *consecutive* identical parts are merged, so compounds that repeat a
    word non-adjacently ("day-to-day", "one-on-one") are left intact. Empty
    parts are always kept, so dash-only tokens ("-", "--") and leading or
    trailing dashes survive unchanged.

    Args:
        tokens: list of token strings; it is modified in place.

    Returns:
        The same list object, with repeated parts collapsed.
    """
    for i, token in enumerate(tokens):
        if "-" not in token:
            continue
        parts = token.split("-")
        kept = [
            part
            for pos, part in enumerate(parts)
            # Drop a part only when it is a non-empty repeat of its left neighbour.
            if pos == 0 or part == "" or part != parts[pos - 1]
        ]
        tokens[i] = "-".join(kept)
    return tokens

def decode_tokens(tokens):
    """Join tokens back into a sentence and tidy spacing around punctuation.

    Tokens are joined with single spaces. Every whitespace-led non-word mark
    found in that joined text is then replaced throughout the sentence:
    apostrophes (straight or curly) lose the whitespace on both sides, any
    other mark loses only its leading whitespace.
    """
    text = " ".join(tokens)
    # The matches are collected once, on the freshly joined text.
    for found in re.findall(r"\s\W\s*", text):
        is_apostrophe = found.strip() in ("'", "’")
        replacement = found.strip() if is_apostrophe else found.lstrip()
        text = text.replace(found, replacement)
    return text

def twt_tokenize(sentence):
    """Normalize one utterance and return it as a single string.

    The sentence is split with the tweet tokenizer, hyphenated repeats are
    shortened, and the tokens are joined back together.
    """
    return decode_tokens(shorten_repeated_words(twt_tokenizer.tokenize(sentence)))

# 모델, 데이터 준비

In [None]:
# Inspect the data: load the training CSV and normalize every utterance
train_csv = pd.read_csv(TRAIN_CSV).assign(
    Utterance=lambda frame: frame["Utterance"].map(twt_tokenize)
)
train_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also i was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,you must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,that i did. that i did.,Chandler,0,neutral
3,TRAIN_0003,so let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,my duties? all right.,Chandler,0,surprise


In [None]:
# Label encoding: map the string targets to integer class ids
label_encoder = LabelEncoder().fit(train_csv["Target"])
label_size = len(label_encoder.classes_)

train_csv["Target"] = label_encoder.transform(train_csv["Target"])
# Keep only the model inputs; the encoded target is stored under the name "label".
train_csv = train_csv.loc[:, ["Utterance", "Target"]].rename(columns={"Target": "label"})

# A fixed seed puts the same rows in train/eval on every run, so scores are
# comparable between runs. Stratifying keeps the class proportions equal in
# both splits, which matters for the macro-F1 metric.
df_train, df_eval = train_test_split(
    train_csv,
    test_size=0.1,
    random_state=42,
    stratify=train_csv["label"],
)
df_train.head()

Unnamed: 0,Utterance,label
5867,"oh and keep in mind, now, i was carrying tripl...",4
8078,do you wanna?,4
4104,closer than here?,4
8316,do i rach?,5
4115,"yeah, they're not so bad.",4


In [None]:
def roberta_tokenize(data):
    """Encode the ``Utterance`` field of a batch with the RoBERTa tokenizer.

    Sequences are truncated to ``MAX_LENGTH`` and padded within the batch.
    """
    encoding_options = dict(max_length=MAX_LENGTH, padding=True, truncation=True)
    return roberta_tokenizer(data["Utterance"], **encoding_options)


def _to_torch_dataset(frame):
    """Turn a (Utterance, label) DataFrame into a tokenized, torch-formatted Dataset."""
    dataset = Dataset.from_pandas(frame.reset_index(drop=True))
    # A single batch spans the whole split, so every row is padded together.
    dataset = dataset.map(roberta_tokenize, batched=True, batch_size=len(dataset))
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return dataset


train_set = _to_torch_dataset(df_train)
eval_set = _to_torch_dataset(df_eval)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# 모델 학습

In [None]:
# Model training
def compute_metrics(pred):
    """Compute macro-F1 and accuracy for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the prediction).

    Returns:
        dict with keys ``"f1"`` (macro-averaged) and ``"accuracy"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "f1": f1_score(y_true, y_hat, average="macro"),
        "accuracy": accuracy_score(y_true, y_hat),
    }

# Model: load the pretrained checkpoint with a head sized for `label_size`
# classes; ignore_mismatched_sizes allows the differently shaped head to load.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    ignore_mismatched_sizes=True,
    num_labels=label_size,
)

# Trainer wiring: configuration, data splits and the metric callback.
trainer = Trainer(
    args=TRAIN_ARGS,
    model=model,
    train_dataset=train_set,
    eval_dataset=eval_set,
    compute_metrics=compute_metrics,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([7, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Release cached GPU memory, then fine-tune with the settings in TRAIN_ARGS.
torch.cuda.empty_cache()
trainer.train()

In [None]:
# Model evaluation on the held-out split
model_eval = trainer.evaluate()
print(f"Accuracy: {model_eval['eval_accuracy']:.5f}")
print(f"F1-macro: {model_eval['eval_f1']:.5f}")

In [None]:
# Save the model into the MODEL_DIR folder under CWD
trainer.save_model(join_path(MODEL_DIR))
# Release cached GPU memory now that training is finished.
torch.cuda.empty_cache()

# 예측

In [None]:
# Load the test set from the TEST_CSV constant and normalize its utterances
# with the same twt_tokenize step that was applied to the training data.
test_csv = pd.read_csv(TEST_CSV)
test_csv["Utterance"] = test_csv["Utterance"].map(twt_tokenize)
test_csv.head()

In [None]:
# Reuse roberta_tokenize from the training section so train and test text are
# encoded by one definition; redefining it here would shadow the original.
# Tensor conversion is handled by set_format("torch") below.
df_test = test_csv[["Utterance"]]

test_set = Dataset.from_pandas(df_test.reset_index(drop=True))
# One batch spans the whole test set, so all rows are padded together.
test_set = test_set.map(roberta_tokenize, batched=True, batch_size=len(test_set))
test_set.set_format("torch", columns=["input_ids", "attention_mask"])

In [None]:
def inference(model, test_set):
    """Predict a class index for every example in ``test_set``.

    Args:
        model: classification model; it is moved to ``DEVICE`` and put in
            eval mode. Its output must expose ``.logits``.
        test_set: iterable of dicts, each holding the ``input_ids`` and
            ``attention_mask`` tensors of one example.

    Returns:
        list of predicted class indices, in input order.
    """
    model.to(DEVICE)
    model.eval()

    test_predict = []
    # Inference only: without no_grad every forward pass would record an
    # autograd graph, wasting GPU memory and time.
    with torch.no_grad():
        for data in tqdm(test_set):
            # Add the batch dimension expected by the model (one example per step).
            input_id = data["input_ids"].unsqueeze(0).to(DEVICE)
            mask = data["attention_mask"].unsqueeze(0).to(DEVICE)
            output = model(input_id, mask)
            test_predict.extend(output.logits.argmax(1).cpu().tolist())
    return test_predict

In [None]:
# Predict class ids for the test set, then map them back to the label strings.
predicted_ids = inference(model, test_set)
preds = label_encoder.inverse_transform(predicted_ids)

In [None]:
# Attach the predictions as a new "Target" column (test_csv is modified in place).
test_csv["Target"] = preds
# Keep only the two columns that go into the submission file.
submit = test_csv.loc[:, ["ID", "Target"]]
submit.head()

In [None]:
# Create the output directory first so the write works even when the
# model-saving cell (which otherwise creates it) has not been run.
submit_path = join_path(MODEL_DIR, "submit.csv")
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
submit.to_csv(submit_path, index=False)