**1. 준비 (GPU T4로 세팅)**

In [None]:
!nvidia-smi -L || echo "No GPU detected (ok for quick tests)"
!pip -q install "transformers" "datasets==3.0.1" "torch==2.3.0+cu118" "torchvision==0.18.0+cu118" "torchaudio==2.3.0+cu118" "sentencepiece" \
                 "pandas" "numpy" "matplotlib" "tqdm" "evaluate" -f https://download.pytorch.org/whl/torch_stable.html

GPU 0: Tesla T4 (UUID: GPU-2dd876a1-f59e-1038-5330-47b87336cf24)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25h

**2. Github Link 추가하여 데이터 불러오기, 데이터 파일 명만 추가하여 파일 구분**

In [None]:
import pandas as pd

# Raw GitHub URLs of the per-round CSV exports (append new rounds here)
urls = [
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round1_Bahrain.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round3_Australia.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round4_Japan.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round7_Italy.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round10_Spain.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round12_UnitedKingdom.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round13_Hungary.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round15_Netherlands.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round17_Singapore.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round22_USA.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round23_Qatar.csv",
    "https://raw.githubusercontent.com/hyunseo-adastra/SocialData_SportsCommunity/main/F1_forKCElectra/Round24_AbuDhabi.csv"
]


def _read_round_csv(url):
    """Read one CSV and tag every row with the file stem it came from.

    The stem is the last path segment without ".csv",
    e.g. ".../Round1_Bahrain.csv" -> "Round1_Bahrain".
    """
    stem = url.rsplit("/", 1)[-1].replace(".csv", "")
    frame = pd.read_csv(url)
    frame["source_file"] = stem
    return frame


# One frame per round, in the order listed above
dfs = [_read_round_csv(url) for url in urls]

# Stack all rounds into a single frame with a fresh 0..n-1 index
df_all = pd.concat(dfs, ignore_index=True)

**3. 데이터 컬럼 선택 및 클리닝**

클리닝
- 디시인사이드의 경우 "dc official App" 삭제
- 해시태그, 멘션, URL, 공백 삭제
- "ㅋㅋㅋ"나 "!!!" 등의 표현 수는 남겨두기

**추후 논의하여 클리닝 수정 가능**

In [None]:
import pandas as pd
import numpy as np
import re

# Candidate column names used to auto-detect the relevant columns.
# Each list is passed to pick_col() as its candidate list.

# Body / message text
TEXT_CAND = [
    "text","content","message","body","comment","chat",
    "post_content","post_text","desc","description"
]
# Post title / headline
TITLE_CAND = ["title","post_title","subject","headline"]
# Timestamp of the post
TIME_CAND = [
    "timestamp","time","created_at","createdAt","date","datetime",
    "post_timestamp","post_time","post_date","published_at","created"
]
# Optional grouping key (team, driver, author, ...)
GROUP_CAND = ["team","team_name","driver","player","user","author","username"]

def pick_col(cols, cand_list):
    """Return the first column label that matches one of the candidate names.

    Matching runs in two passes:
      1. exact, case-sensitive match, tried in candidate order;
      2. case-insensitive substring match, tried in column order.

    Non-string column labels (for example the integer labels pandas assigns
    to a header-less CSV) are skipped in pass 2 instead of raising
    ``AttributeError``.

    Args:
        cols: Iterable of column labels, e.g. ``df.columns``.
        cand_list: Candidate column names, most preferred first.

    Returns:
        The matching label taken from ``cols``, or ``None`` when nothing matches.
    """
    # Materialize once so a one-shot iterable survives both passes.
    cols = list(cols)
    cols_set = set(cols)

    # Pass 1: exact match
    for cand in cand_list:
        if cand in cols_set:
            return cand

    # Pass 2: case-insensitive containment; lower-case the candidates only once
    lowered_cands = [cand.lower() for cand in cand_list]
    for col in cols:
        if not isinstance(col, str):
            continue
        col_lower = col.lower()
        if any(cand in col_lower for cand in lowered_cands):
            return col
    return None

def pick_text_columns(cols):
    """Detect the body-text and title columns so both can be combined later.

    Returns:
        A ``(text_col, title_col)`` tuple; either entry is ``None`` when no
        column matches the corresponding candidate list.
    """
    return pick_col(cols, TEXT_CAND), pick_col(cols, TITLE_CAND)

# ---------------------------------------------------------------------
# 데이터 클리닝
# "dc official app" 변형 전부 제거 (따옴표/하이픈/대소문자/앞뒤공백 포함)
# Matches a whole line that is nothing but a "dc official app" signature:
# optional leading dashes, optional surrounding quotes, any letter case, and
# optional whitespace between the letters.
_DC_OFFICIAL_PAT = re.compile(
    r"""
    ^\s*        # 라인 시작 공백
    [\-\–\—\u2013\u2014]*\s*   # 하이픈 류와 공백(선택)
    ["']?\s*    # 따옴표(선택)
    d\s*c\s*o\s*f\s*f\s*i\s*c\s*i\s*a\s*l\s*a\s*p\s*p # dc official app(문자 사이 공백 허용 대비)
    \s*["']?    # 따옴표(선택)
    \s*$        # 라인 끝
    """,
    re.IGNORECASE | re.VERBOSE
)


def remove_dc_official_app_lines(s: str) -> str:
    """Drop every line that consists solely of a '- dc official App' signature.

    The text is split on runs of CR/LF, signature-only lines are discarded,
    and the remaining lines are re-joined with a single newline and trimmed.
    A signature that shares a line with other text is left in place.
    """
    surviving = (
        line
        for line in re.split(r"[\r\n]+", s)
        if _DC_OFFICIAL_PAT.match(line) is None
    )
    return "\n".join(surviving).strip()

def normalize_text(s: str) -> str:
    """Clean one raw post/chat string before sentiment analysis.

    Steps, in order:
      1. drop '- dc official App' signature lines;
      2. replace URLs (``http...`` / ``www....``) with a space;
      3. replace @mentions and #hashtags with a space;
      4. collapse every whitespace run to one space and trim the ends.

    Repeated characters (laughter, exclamation marks, ...) are kept as they are.
    Non-string input (e.g. NaN) yields an empty string.
    """
    if not isinstance(s, str):
        return ""

    cleaned = remove_dc_official_app_lines(s)

    # URLs first, then mentions/hashtags; each hit becomes a single space
    for noise_pattern in (r"http\S+|www\.\S+", r"[@#]\w+"):
        cleaned = re.sub(noise_pattern, " ", cleaned)

    # Optional steps, currently disabled pending discussion:
    #   - squeeze runs of 3+ identical characters down to 2:
    #       s = re.sub(r"([ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z0-9!?.])\1{2,}", r"\1\1", s)
    #   - strip emoji / unusual symbols (keep Korean, word chars, basic punctuation):
    #       s = re.sub(r"[^\w\s가-힣ㄱ-ㅎㅏ-ㅣ!?.]", " ", s, flags=re.UNICODE)
    #   - expand team/driver abbreviations (e.g. "VER", "HAM") through a lookup table

    return re.sub(r"\s+", " ", cleaned).strip()

####클리닝은 논의 후 추가

# ---------------------------------------------------------------------
# Work on a copy so the merged raw frame (df_all) stays untouched.
df = df_all.copy()

# Auto-detect the text / title / time / group columns from the column labels
TEXT_COL, TITLE_COL = pick_text_columns(df.columns)
TIME_COL = pick_col(df.columns, TIME_CAND)
GROUP_COL = pick_col(df.columns, GROUP_CAND)

print(f"Detected -> TEXT_COL={TEXT_COL}, TITLE_COL={TITLE_COL}, TIME_COL={TIME_COL}, GROUP_COL={GROUP_COL}")

# A title alone is enough to continue; with neither text nor title there is nothing to analyse.
# The message points at df_all, the frame this cell actually copies from.
if TEXT_COL is None and TITLE_COL is None:
    raise ValueError("텍스트(채팅/본문/내용) 컬럼을 못 찾았어요. 필요 시 df_all = df_all.rename(columns={'실제컬럼명':'text'}) 후 재실행하세요.")

# Build the raw text to clean: "<title> <body>" when both columns exist,
# otherwise whichever one is available.
# NaN has to be replaced BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to fill and
# "nan" ends up inside the cleaned text.
if TEXT_COL is not None and TITLE_COL is not None:
    df["_raw_text_for_clean"] = (
        df[TITLE_COL].fillna("").astype(str) + " " + df[TEXT_COL].fillna("").astype(str)
    )
elif TEXT_COL is not None:
    df["_raw_text_for_clean"] = df[TEXT_COL].fillna("").astype(str)
else:
    df["_raw_text_for_clean"] = df[TITLE_COL].fillna("").astype(str)

# Apply the cleaning pipeline row by row
df["clean_text"] = df["_raw_text_for_clean"].map(normalize_text)

# Parse the timestamp column (when one was detected) into df["ts"].
# Values that cannot be parsed become NaT.
if TIME_COL is None:
    df["ts"] = pd.NaT
elif not np.issubdtype(df[TIME_COL].dtype, np.number):
    # Non-numeric column: let pandas infer the format
    df["ts"] = pd.to_datetime(df[TIME_COL], errors="coerce")
else:
    # Numeric column: guess from the median magnitude whether it holds
    # epoch milliseconds, epoch seconds, or something else entirely
    raw_times = df[TIME_COL]
    median_val = raw_times.dropna().astype(float).median() if raw_times.notna().any() else None
    if median_val and median_val > 1e12:
        epoch_unit = "ms"
    elif median_val and median_val > 1e9:
        epoch_unit = "s"
    else:
        epoch_unit = None

    if epoch_unit is None:
        # Plain numbers: stringify and hand over to the generic parser
        df["ts"] = pd.to_datetime(raw_times.astype(str), errors="coerce")
    else:
        # Epoch values are read as UTC and then made timezone-naive
        df["ts"] = pd.to_datetime(
            raw_times, unit=epoch_unit, errors="coerce", utc=True
        ).dt.tz_localize(None)

# Discard rows whose cleaned text is empty or whitespace-only
df["clean_text"] = df["clean_text"].fillna("").str.strip()
has_text = df["clean_text"].astype(bool)
df = df.loc[has_text].reset_index(drop=True)

# Convenience columns derived from the file name,
# e.g. "Round22_USA" -> round_num=22, gp_name="USA"
if "source_file" in df.columns:
    round_parts = df["source_file"].str.extract(r"Round\s*([0-9]+)[\s_]+(.+)", expand=True)
    round_digits, gp_raw = round_parts[0], round_parts[1]
    df["round_num"] = pd.to_numeric(round_digits, errors="coerce")
    df["gp_name"] = gp_raw.fillna("").str.replace("_", " ").str.strip()

# Report the remaining row count and preview the first ten cleaned rows
print("After cleaning, rows:", len(df))
display_cols = ["clean_text", "ts"]
display_cols += [
    optional_col
    for optional_col in ("source_file", "round_num", "gp_name")
    if optional_col in df.columns
]
if GROUP_COL is not None:
    display_cols.append(GROUP_COL)
print(df[display_cols].head(10))

Detected -> TEXT_COL=post_content, TITLE_COL=post_title, TIME_COL=post_timestamp, GROUP_COL=None
After cleaning, rows: 43134
                              clean_text                  ts     source_file  \
0                     와 존나설레 ㅅㅂ 가보자고ㅜㅜㅜㅡ 2024-03-03 00:04:04  Round1_Bahrain   
1                      스포한다 물탕이가 르끌레 꽂는다 2024-03-03 00:04:06  Round1_Bahrain   
2        라마단 <<< 고마우면 개추 ㅋㅋㅋ 덕분에 편하게본다 ㅆ 2024-03-03 00:04:09  Round1_Bahrain   
3              와 씹 진짜 저 초록색 흉물 좆같네 ㅅㅂㅋㅋㅋ 2024-03-03 00:04:10  Round1_Bahrain   
4                   개ㅈ핀 도착 ㅋㅋㅋㅋㅋㅋㅋㅋㅋ nan 2024-03-03 00:04:07  Round1_Bahrain   
5                  ㅋㅋㅋ 알핀이 마지막ㅋㅋㅋㅋ ㅋㅋㅋㅋㅋ 2024-03-03 00:04:11  Round1_Bahrain   
6  It's lights out and away we go!!! nan 2024-03-03 00:04:14  Round1_Bahrain   
7                          드가좤ㅋㅋㅋㅋㅋㅋ nan 2024-03-03 00:04:15  Round1_Bahrain   
8                          츠노다 가자 ㅋㅋ nan 2024-03-03 00:04:18  Round1_Bahrain   
9                          스타트!!!!!! nan 2024-03-03 00:04:17  Round1_Bahrai

**4. 파인튜닝을 위한 데이터셋 불러오기**

사용데이터: KOTE, KMRE
- KOTE 한국어 온라인 댓글 감정 라벨링: https://github.com/searle-j/KOTE
- KMRE 한국어 영화 리뷰 댓글 감정 라벨링: https://github.com/passing2961/KMRE

In [None]:
from datasets import Dataset, DatasetDict
import requests, io, pickle
from datasets import load_dataset

# Load the KOTE dataset via `datasets.load_dataset` and print its split/feature summary
kote = load_dataset("searle-j/kote")
print(kote)

def load_kmre_from_raw(file_url: str, timeout: float = 30.0):
    """Download one pickled KMRE split and convert it to a ``datasets.Dataset``.

    Args:
        file_url: URL of the pickled split file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``Dataset`` built from the rows of the pickle. Sequence rows are
        mapped to ``{"text": row[0], "label": row[1]}``; dict rows are kept
        as they are; rows of any other shape are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(file_url, timeout=timeout)
    resp.raise_for_status()

    # SECURITY: unpickling runs arbitrary code embedded in the payload.
    # Only point this function at a repository you trust.
    data = pickle.loads(resp.content)

    records = []
    for row in data:
        if isinstance(row, dict):
            # Already a record
            records.append(row)
        elif isinstance(row, (list, tuple)) and len(row) >= 2:
            # Assumed to be [text, label, ...]; extra fields are ignored
            records.append({"text": row[0], "label": row[1]})
        # Any other shape is skipped (best-effort load)

    return Dataset.from_list(records)

# Location of the pickled KMRE splits
base_url = "https://raw.githubusercontent.com/passing2961/KMRE/master/data/"

# Fetch the three splits in train / dev / test order
kmre_train, kmre_dev, kmre_test = (
    load_kmre_from_raw(base_url + split_file)
    for split_file in ("kmre_train", "kmre_dev", "kmre_test")
)

# The dev split is exposed under the conventional "validation" key
kmre = DatasetDict(
    {"train": kmre_train, "validation": kmre_dev, "test": kmre_test}
)

print(kmre)
print(kmre["train"][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 119995
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 29999
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 49997
    })
})
{'text': '너~~~무 재미없어서 중간에 나왔습니다;;', 'label': 2}


**5. KoBERT 모델 불러오기**

In [None]:
import transformers

# =========================
# 0. transformers.utils 에서 모자란 것들 먼저 채워넣기
# =========================

def _add_util_if_missing(name, value):
    """Attach ``value`` to ``transformers.utils`` as ``name`` unless it already exists."""
    if not hasattr(transformers.utils, name):
        setattr(transformers.utils, name, value)


# (1) quanto support probe: report "not installed"
def is_quanto_available():
    return False


# (2) accelerate support probe: report "not installed"
def is_accelerate_available():
    return False


# (3) cached_property is looked up on transformers.utils by some code paths
try:
    from functools import cached_property as _cached_property
except ImportError:
    # Old interpreters without functools.cached_property: plain property
    def _cached_property(func):
        return property(func)


# (4) helper that some versions of feature_extraction_utils call; a no-op is enough here
def add_model_info_to_auto_map(config, **kwargs):
    return


_add_util_if_missing("is_quanto_available", is_quanto_available)
_add_util_if_missing("is_accelerate_available", is_accelerate_available)
_add_util_if_missing("cached_property", _cached_property)
_add_util_if_missing("add_model_info_to_auto_map", add_model_info_to_auto_map)

# =========================
# 1. 이제 안전하게 monologg/kobert 불러오기
# =========================
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_BASE = "monologg/kobert"

# trust_remote_code is needed so the custom tokenizer shipped with this repo is used
_remote_code_opts = {"trust_remote_code": True}

tok = AutoTokenizer.from_pretrained(MODEL_BASE, **_remote_code_opts)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_BASE, num_labels=2, **_remote_code_opts
)

print("loaded!", model.__class__)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loaded! <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>


**6. KoBERT Model Tokenizing & Training(Finetuning)**

In [None]:
from datasets import DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np
import evaluate

# 1) KMRE label mapping: collect the distinct labels of every available split
label_splits = ["train", "validation"] + (["test"] if "test" in kmre else [])
all_labels = [lbl for split in label_splits for lbl in set(kmre[split]["label"])]
unique_labels = sorted(set(all_labels))
print("KMRE 전체 라벨:", unique_labels)

# Sorted position of a label becomes its class id
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = {class_id: str(lbl) for lbl, class_id in label2id.items()}
num_labels = len(unique_labels)
print("num_labels:", num_labels)

def encode_label(ex):
    """Add a ``labels`` entry to ``ex`` holding the class id of ``ex["label"]``.

    The id is looked up in the module-level ``label2id`` mapping. ``ex`` is
    modified in place and the same object is returned.

    Raises:
        KeyError: If ``ex["label"]`` is not a key of ``label2id``.
    """
    ex["labels"] = label2id[ex["label"]]
    return ex

# Re-encode every split: add the integer "labels" column, drop the original "label"
kmre_num = DatasetDict(
    {
        split: kmre[split].map(encode_label, remove_columns=["label"])
        for split in ("train", "validation")
    }
)
# The test split is optional
if "test" in kmre:
    kmre_num["test"] = kmre["test"].map(encode_label, remove_columns=["label"])

# 2) Tokenizer and classification model
MODEL_NAME = "skt/kobert-base-v1"
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

# The label2id passed to the model config uses string keys
label2id_for_config = {str(lbl): class_id for lbl, class_id in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id_for_config,
)

# 3) Tokenization
def tokenize_fn(batch):
    """Tokenize a batch of texts, truncating each one to at most 128 tokens."""
    texts = batch["text"]
    return tok(texts, max_length=128, truncation=True)


# Tokenize all splits in batches; the raw "text" column is dropped afterwards
tokenized_kmre = kmre_num.map(tokenize_fn, batched=True, remove_columns=["text"])

# 4) Padding collator that strips token_type_ids from every batch
class MyCollator(DataCollatorWithPadding):
    """``DataCollatorWithPadding`` variant whose batches never carry ``token_type_ids``."""

    def __call__(self, features):
        padded = super().__call__(features)
        # Remove the key when present; a batch without it passes through unchanged
        padded.pop("token_type_ids", None)
        return padded

import inspect

collator = MyCollator(tokenizer=tok)
metric_f1 = evaluate.load("f1")

# 5) TrainingArguments for GPU training (no_cuda defaults to False, so the GPU is used).
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name is deprecated. transformers is installed unpinned,
# so choose whichever keyword the installed version actually accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results_kmre_kobert_gpu",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",
    # Evaluate once per epoch, matching save_strategy
    **{_eval_strategy_kwarg: "epoch"},
)

def compute_metrics(eval_pred):
    """Compute macro-F1 and accuracy from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over axis 1 of the logits.

    Returns:
        ``{"f1": <macro F1>, "accuracy": <fraction of correct predictions>}``
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    f1_result = metric_f1.compute(predictions=preds, references=labels, average="macro")
    accuracy = (preds == labels).mean().item()
    return {"f1": f1_result["f1"], "accuracy": accuracy}

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and emit a deprecation warning for `tokenizer`; older releases only know
# `tokenizer`. Pick the keyword supported by the installed version.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_kmre["train"],
    eval_dataset=tokenized_kmre["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tok},
)

print("🚀 Training on GPU...")
trainer.train()
print("✅ Training finished on GPU")

KMRE 전체 라벨: [0, 1, 2, 3, 4, 5]
num_labels: 6


Map:   0%|          | 0/119995 [00:00<?, ? examples/s]

Map:   0%|          | 0/29999 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/119995 [00:00<?, ? examples/s]

Map:   0%|          | 0/29999 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

  trainer = Trainer(


🚀 Training on GPU...


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


**7. 감정분석**

In [None]:
import torch, numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import glob, os

# 1. Training output directory: must equal `output_dir` of the TrainingArguments
#    used for fine-tuning, otherwise no checkpoint is found.
TRAINING_OUTPUT_DIR = "./results_kmre_kobert_gpu"
# Tokenizer source: the same name the tokenizer was loaded from for fine-tuning,
# so inference inputs are encoded exactly like the training inputs.
BASE = "skt/kobert-base-v1"
BATCH_SIZE = 64

device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Locate the most recent checkpoint (highest step number after "checkpoint-")
checkpoints = [d for d in glob.glob(f"{TRAINING_OUTPUT_DIR}/checkpoint-*") if os.path.isdir(d)]
if not checkpoints:
    raise FileNotFoundError(f"No checkpoint directories found in {TRAINING_OUTPUT_DIR}")
latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[-1]))
print("Loading model from:", latest_checkpoint)

# 3. Load tokenizer and fine-tuned model
tok = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint).eval().to(device)

# Class-id -> label-name mapping stored in the model config (None when absent)
id2label = model.config.id2label if model.config.id2label else None


def batched(lst, n=64):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    for i in range(0, len(lst), n):
        yield lst[i:i+n]


texts = df["clean_text"].tolist()

all_probs = []
all_preds = []
all_labels = []

# Exact batch count (ceil division) so the progress bar ends at 100%
n_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE

for chunk in tqdm(batched(texts, BATCH_SIZE), total=n_batches):
    enc = tok(
        chunk,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )
    # The training collator removes token_type_ids, so the model never saw
    # them during fine-tuning; drop them here too to keep inputs identical.
    enc.pop("token_type_ids", None)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits
        probs = logits.softmax(dim=-1).cpu().numpy()
    all_probs.append(probs)

    # Plain Python ints keep the dict lookups and the DataFrame column simple
    pred_ids = probs.argmax(axis=1).tolist()
    all_preds.extend(pred_ids)

    if id2label:
        # Keys may be strings ('0') or ints (0) depending on how the config was
        # loaded: try the string form first, then the int, then fall back to
        # the id rendered as text.
        all_labels.extend(
            id2label.get(str(i), id2label.get(i, str(i))) for i in pred_ids
        )
    else:
        all_labels.extend(str(i) for i in pred_ids)

# np.vstack rejects an empty list; an empty (0, num_labels) array keeps the
# downstream per-row max working when there is no text to score.
all_probs = np.vstack(all_probs) if all_probs else np.empty((0, model.config.num_labels))

# 4. Attach the predictions to df: class id, label name, and the probability of the predicted class
prediction_columns = {
    "emo_pred_id": all_preds,
    "emo_pred_label": all_labels,
    "emo_pred_conf": all_probs.max(axis=1),
}
for column_name, column_values in prediction_columns.items():
    df[column_name] = column_values

# 5. Save as CSV (UTF-8 with BOM)
output_path = "/content/F1_emotion_results_kobert.csv"
df.to_csv(output_path, encoding="utf-8-sig", index=False)
print("저장 완료 →", output_path)