**1. 준비 (GPU T4로 세팅)**

In [1]:
!nvidia-smi -L || echo "No GPU detected (ok for quick tests)"
!pip -q install "transformers" "datasets==3.0.1" "torch==2.3.0+cu118" "torchvision==0.18.0+cu118" "torchaudio==2.3.0+cu118" "sentencepiece" \
                 "pandas" "numpy" "matplotlib" "tqdm" "evaluate" -f https://download.pytorch.org/whl/torch_stable.html

GPU 0: Tesla T4 (UUID: GPU-6e7d2e85-665e-5851-79b5-a5f11957763f)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m839.6/839.6 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.9/417.9 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m102.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m58.1 MB/s

**2. Github Link 추가하여 데이터 불러오기, 데이터 파일 명만 추가하여 파일 구분**

In [2]:
import pandas as pd

# GitHub raw URL 목록
urls = [
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/refs/heads/main/LCK_data/1.R1%7E2.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/refs/heads/main/LCK_data/2.RTM.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/refs/heads/main/LCK_data/3.R3%7E5.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/refs/heads/main/LCK_data/4.PLAY-IN.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/refs/heads/main/LCK_data/5.PLAYOFF.csv"
]

dfs = []

for url in urls:
    # 파일 이름만 추출 (예: Round1_Bahrain.csv → Round1_Bahrain)
    filename = url.split("/")[-1].replace(".csv", "")

    # CSV 읽기
    df = pd.read_csv(url)

    dfs.append(df)

# 모든 데이터 합치기
df_all = pd.concat(dfs, ignore_index=True)

In [3]:
df_all.head()

Unnamed: 0,time_text,timestamp,author,message,amount,format,date
0,-1:54:31,1749014450918319,UCpTHr9Rn5_w9BEMheofv82w,KT 파이팅!!!,,R1~2,250604
1,-1:42:23,1749015179089550,UCuSabTEvRG-f7w6Pa80RyLg,쇼메이커는 무조건 1찍이다,,R1~2,250604
2,-1:32:52,1749015750551203,UCNInwRrJCCrVFBz-kkxVqfA,블루가어디임?,,R1~2,250604
3,-1:25:43,1749016179287640,UCC4cqWCL8LGCsQLmB8afheA,T,,R1~2,250604
4,-1:25:25,1749016197284369,UC13UxJwYop0g_YWB-wA08Uw,ㄷㄱㄷㄱ 아무나 이겨랑,,R1~2,250604


**3. 데이터 컬럼 선택 및 클리닝**

클리닝
- 디시인사이드의 경우 "dc official App" 삭제
- 해시태그, 멘션, URL, 공백 삭제
- "ㅋㅋㅋ"나 "!!!" 등의 표현 수는 남겨두기

**추후논의하여 클리닝 수정 가능

In [5]:
# 마이크로초 단위 timestamp를 datetime 객체로 변환
# unit='us'를 사용하여 마이크로초 단위임을 지정합니다.
df_all['datetime_utc'] = pd.to_datetime(df_all['timestamp'], unit='us', errors='coerce')

# 초 이하 단위 제거
df_all['datetime_utc'] = df_all['datetime_utc'].dt.floor('s')

# UTC 시간을 KST (UTC+9)로 변환
# 먼저 UTC 타임존 정보를 추가하고, 그 다음 KST로 변환합니다.
df_all['datetime_kst'] = df_all['datetime_utc'].dt.tz_localize('UTC').dt.tz_convert('Asia/Seoul')

# 초 이하 단위 제거
df_all['datetime_kst'] = df_all['datetime_kst'].dt.floor('s')

In [6]:
#결과 확인
df_all.head()

Unnamed: 0,time_text,timestamp,author,message,amount,format,date,datetime_utc,datetime_kst
0,-1:54:31,1749014450918319,UCpTHr9Rn5_w9BEMheofv82w,KT 파이팅!!!,,R1~2,250604,2025-06-04 05:20:50,2025-06-04 14:20:50+09:00
1,-1:42:23,1749015179089550,UCuSabTEvRG-f7w6Pa80RyLg,쇼메이커는 무조건 1찍이다,,R1~2,250604,2025-06-04 05:32:59,2025-06-04 14:32:59+09:00
2,-1:32:52,1749015750551203,UCNInwRrJCCrVFBz-kkxVqfA,블루가어디임?,,R1~2,250604,2025-06-04 05:42:30,2025-06-04 14:42:30+09:00
3,-1:25:43,1749016179287640,UCC4cqWCL8LGCsQLmB8afheA,T,,R1~2,250604,2025-06-04 05:49:39,2025-06-04 14:49:39+09:00
4,-1:25:25,1749016197284369,UC13UxJwYop0g_YWB-wA08Uw,ㄷㄱㄷㄱ 아무나 이겨랑,,R1~2,250604,2025-06-04 05:49:57,2025-06-04 14:49:57+09:00


In [9]:
import pandas as pd
import numpy as np
import re

# ---------------------------------------------------------------------
# 데이터 클리닝

def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        return ""

    # 1) URL 제거
    s = re.sub(r"http\S+|www\.\S+", " ", s)

    # 2) 멘션/해시태그 제거
    s = re.sub(r"[@#]\w+", " ", s)

    # 3) 반복 문자 축약 (ㅋㅋㅋㅋ -> ㅋㅋ, 아아아아 -> 아아, !!!!! -> !!)
    s = re.sub(r"([ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z0-9!?.])\1{2,}", r"\1\1", s)

    #4) 이모지/특수문자 과도한 것 정리 (한글/영문/숫자/기본문장부호만 유지)
    s = re.sub(r"[^\w\s가-힣ㄱ-ㅎㅏ-ㅣ!?.]", " ", s, flags=re.UNICODE)

    # 생략 5) 팀명/드라이버 약칭 정규화 (원하면 계속 추가)
    # team_map = {
    #    "T1": "티원",
    #    "GEN": "젠지",
    #    "VER": "페르스타펜",
    #    "HAM": "해밀턴",
    #}
    #for short_name, full_name in team_map.items():
    #    s = re.sub(fr"\b{short_name}\b", full_name, s, flags=re.IGNORECASE)

    # 6) 공백 정리
    s = re.sub(r"\s+", " ", s).strip()
    return s

####클리닝은 논의 후 추가

# ---------------------------------------------------------------------
print("Before cleaning, rows:", len(df_all))
# 데이터프레임 컬럼 선택 (df_all에서 필요한 컬럼 선택)
df = df_all[["datetime_kst", "author", "message", "format"]].copy()

# 채팅 제한 봇 데이터 삭제 (유저id: UCSvjQBDgYDB5TGVmCZObcwA)
df = df[df['author'] != 'UCSvjQBDgYDB5TGVmCZObcwA'].copy()

# 클리닝
df["clean_text"] = df["message"].map(normalize_text)


# 빈 텍스트/공백만 남은 행 제거
df["clean_text"] = df["clean_text"].fillna("").str.strip()
df = df[df["clean_text"].astype(bool)].reset_index(drop=True)

print("After cleaning, rows:", len(df))


Before cleaning, rows: 1520294
After cleaning, rows: 1504522


In [13]:
#데이터 확인
df.sample(5)

Unnamed: 0,datetime_kst,author,message,format,clean_text
1296465,2025-09-21 18:49:14+09:00,UCi6IER5oX2ZVcffjB3iAUeA,좀 스킨 써라 니껀데,PLAYOFF,좀 스킨 써라 니껀데
979845,2025-08-15 22:11:21+09:00,UC98ZWC1cjhab5tsyBh9OV8A,ㄷㅅㅎ,R3~5,ㄷㅅㅎ
764877,2025-06-13 20:54:04+09:00,UCU_xlaMYNhSXUqCd7Lx96tA,실버스크랩스 틀어!!!,RTM,실버스크랩스 틀어!!
905306,2025-08-22 18:19:02+09:00,UC3_Nu7D8kGHeoqmUW8IphyA,슼럼슼렇지,R3~5,슼럼슼렇지
1441518,2025-09-10 18:06:49+09:00,UCuW1KKgc9p1fJVyu-NbSydg,제이카는 이해하겠는데 뭔 유나라까지 못하면 어쩌자는거지,PLAYOFF,제이카는 이해하겠는데 뭔 유나라까지 못하면 어쩌자는거지


**4. 파인튜닝을 위한 데이터셋 불러오기**

사용데이터: KOTE, KMRE
- KOTE 한국어 온라인 댓글 감정 라벨링: https://github.com/searle-j/KOTE
- KMRE 한국어 영화 리뷰 댓글 감정 라벨링: https://github.com/passing2961/KMRE

In [10]:
from datasets import Dataset, DatasetDict
import requests, io, pickle
from datasets import load_dataset

kote = load_dataset("searle-j/kote")
print(kote)

def load_kmre_from_raw(file_url: str):
    resp = requests.get(file_url)
    resp.raise_for_status()
    buf = io.BytesIO(resp.content)
    data = pickle.load(buf)

    # dict 리스트로 변경
    records = []
    for row in data:
        # row가 ["text", "label"] 꼴이라고 가정
        # 혹시 길이가 다르면 건너뛰도록 방어
        if isinstance(row, (list, tuple)) and len(row) >= 2:
            text, label = row[0], row[1]
            records.append({"text": text, "label": label})
        elif isinstance(row, dict):
            # 혹시 이미 dict면 그대로
            records.append(row)

    return Dataset.from_list(records)

base_url = "https://raw.githubusercontent.com/passing2961/KMRE/master/data/"

kmre_train = load_kmre_from_raw(base_url + "kmre_train")
kmre_dev   = load_kmre_from_raw(base_url + "kmre_dev")
kmre_test  = load_kmre_from_raw(base_url + "kmre_test")

kmre = DatasetDict({
    "train": kmre_train,
    "validation": kmre_dev,
    "test": kmre_test,
})

print(kmre)
print(kmre["train"][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


kote.py: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/326 [00:00<?, ?B/s]

The repository for searle-j/kote contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/searle-j/kote.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/6.74M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/858k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 119995
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 29999
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 49997
    })
})
{'text': '너~~~무 재미없어서 중간에 나왔습니다;;', 'label': 2}


**5. KoBERT 모델 불러오기**

In [14]:
import transformers

# =========================
# 0. transformers.utils 에서 모자란 것들 먼저 채워넣기
# =========================

# (1) quanto 관련
if not hasattr(transformers.utils, "is_quanto_available"):
    def is_quanto_available():
        return False
    transformers.utils.is_quanto_available = is_quanto_available

# (2) accelerate 관련
if not hasattr(transformers.utils, "is_accelerate_available"):
    def is_accelerate_available():
        return False
    transformers.utils.is_accelerate_available = is_accelerate_available

# (3) cached_property 가 필요한 경우가 있어서 넣어둔다
try:
    from functools import cached_property as _cached_property
except ImportError:
    # 파이썬 3.7 같은 경우 fallback
    def _cached_property(func):
        return property(func)

if not hasattr(transformers.utils, "cached_property"):
    transformers.utils.cached_property = _cached_property

# (4) 일부 버전에서 feature_extraction_utils 가 부르는 util
if not hasattr(transformers.utils, "add_model_info_to_auto_map"):
    def add_model_info_to_auto_map(config, **kwargs):
        # 우리 목적에는 noop 해도 된다
        return
    transformers.utils.add_model_info_to_auto_map = add_model_info_to_auto_map

# =========================
# 1. 이제 안전하게 monologg/kobert 불러오기
# =========================
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_BASE = "monologg/kobert"

tok = AutoTokenizer.from_pretrained(
    MODEL_BASE,
    trust_remote_code=True,   # 이걸 써야 monologg 쪽 커스텀 토크나이저를 쓴다
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_BASE,
    trust_remote_code=True,
    num_labels=2,
)

print("loaded!", model.__class__)

tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

tokenization_kobert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/monologg/kobert:
- tokenization_kobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loaded! <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>


**6. KoBERT Model Tokenizing & Training(Finetuning)**

In [18]:
from datasets import DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np
import evaluate

# 1) KMRE 라벨 매핑 (CPU 버전과 동일)
all_labels = (
    list(set(kmre["train"]["label"]))
    + list(set(kmre["validation"]["label"]))
    + (list(set(kmre["test"]["label"])) if "test" in kmre else [])
)
unique_labels = sorted(list(set(all_labels)))
print("KMRE 전체 라벨:", unique_labels)

label2id = {lbl: i for i, lbl in enumerate(unique_labels)}
id2label = {i: str(lbl) for lbl, i in label2id.items()}
num_labels = len(unique_labels)
print("num_labels:", num_labels)

def encode_label(ex):
    ex["labels"] = label2id[ex["label"]]
    return ex

kmre_num = DatasetDict({
    "train": kmre["train"].map(encode_label, remove_columns=["label"]),
    "validation": kmre["validation"].map(encode_label, remove_columns=["label"]),
})
if "test" in kmre:
    kmre_num["test"] = kmre["test"].map(encode_label, remove_columns=["label"])

# 2) 토크나이저 / 모델
MODEL_NAME = "skt/kobert-base-v1"
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id={str(k): v for k, v in label2id.items()},
)

# 3) 토크나이즈
def tokenize_fn(batch):
    return tok(batch["text"], truncation=True, max_length=128)

tokenized_kmre = kmre_num.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

# 4) token_type_ids 제거하는 collator (이게 GPU에서도 안전하게 해줌)
class MyCollator(DataCollatorWithPadding):
    def __call__(self, features):
        batch = super().__call__(features)
        if "token_type_ids" in batch:
            batch.pop("token_type_ids")
        return batch

collator = MyCollator(tokenizer=tok)
metric_f1 = evaluate.load("f1")

# 5) GPU용 TrainingArguments (no_cuda 빼기)
training_args = TrainingArguments(
    output_dir="./results_kmre_kobert_gpu",
    eval_strategy="epoch", # evaluation_strategy -> eval_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",
    # no_cuda=False  # 기본값이 False라 안 써도 됨 → GPU 사용
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "f1": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "accuracy": (preds == labels).mean().item(),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_kmre["train"],
    eval_dataset=tokenized_kmre["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    tokenizer=tok,
)

print("🚀 Training on GPU...")
trainer.train()
print("✅ Training finished on GPU")

KMRE 전체 라벨: [0, 1, 2, 3, 4, 5]
num_labels: 6


Map:   0%|          | 0/119995 [00:00<?, ? examples/s]

Map:   0%|          | 0/29999 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/119995 [00:00<?, ? examples/s]

Map:   0%|          | 0/29999 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 3, 'bos_token_id': 2}.


🚀 Training on GPU...


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,1.6104,1.608508,0.192789,0.356479
2,1.5805,1.578363,0.240459,0.374746
3,1.5414,1.553745,0.250058,0.382646


✅ Training finished on GPU


**7. 감정분석**

In [21]:
import torch, numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import glob, os

# 1. 학습 결과 디렉토리
TRAINING_OUTPUT_DIR = "./results_kmre_kobert_gpu" # <-- 디렉토리 이름 수정
BASE = "monologg/kobert"     # 학습 때 썼던 거랑 맞춰

device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. 가장 최신 체크포인트 찾기
checkpoints = [d for d in glob.glob(f"{TRAINING_OUTPUT_DIR}/checkpoint-*") if os.path.isdir(d)]
if not checkpoints:
    raise FileNotFoundError(f"No checkpoint directories found in {TRAINING_OUTPUT_DIR}")
latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[-1]))
print("Loading model from:", latest_checkpoint)

# 3. 토크나이저/모델 로드
tok = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint).eval().to(device)

# label 이름 가져오기 (KMRE면 anger/disgust/... 일 거야)
id2label = model.config.id2label if model.config.id2label else None

def batched(lst, n=64):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

texts = df["clean_text"].tolist()

all_probs = []
all_preds = []
all_labels = []

for chunk in tqdm(batched(texts, 64), total=(len(texts)//64 + 1)):
    enc = tok(
        chunk,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits
        probs = logits.softmax(dim=-1).cpu().numpy()
    all_probs.append(probs)

    pred_ids = probs.argmax(axis=1)
    all_preds.extend(pred_ids)

    if id2label:
        # id2label이 {'0': 'anger', ...} 이런 식일 수도 있음
        lbls = []
        for i in pred_ids:
            key = str(i)
            lbls.append(id2label[key] if key in id2label else id2label.get(i, str(i)))
        all_labels.extend(lbls)
    else:
        all_labels.extend([str(i) for i in pred_ids])

all_probs = np.vstack(all_probs)

# 4. df에 결과 붙이기
df["emo_pred_id"] = all_preds
df["emo_pred_label"] = all_labels
df["emo_pred_conf"] = all_probs.max(axis=1)

# 5. 저장
output_path = "/content/LCK_emotion_results_kobert.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")
print("저장 완료 →", output_path)

Loading model from: ./results_kmre_kobert_gpu/checkpoint-22500


  0%|          | 0/23509 [00:00<?, ?it/s]

저장 완료 → /content/LCK_emotion_results_kobert.csv
