In [8]:
import torch
import transformers
import datasets
import pandas as pd
import numpy as np
import sklearn

print("Torch Version:", torch.__version__)
print("Transformers Version:", transformers.__version__)
print("Datasets Version:", datasets.__version__)
print("Pandas Version:", pd.__version__)
print("Numpy Version:", np.__version__)
print("Scikit-learn Version:", sklearn.__version__)

print("CUDA Available:", torch.cuda.is_available())  # GPU 사용 가능 여부
print("CUDA Version:", torch.version.cuda)  # PyTorch에서 인식하는 CUDA 버전
print("CuDNN Available:", torch.backends.cudnn.is_available())  # CuDNN 사용 여부
print("CuDNN Version:", torch.backends.cudnn.version())  # CuDNN 버전

Torch Version: 2.6.0+cu126
Transformers Version: 4.49.0
Datasets Version: 3.3.1
Pandas Version: 2.2.3
Numpy Version: 1.26.3
Scikit-learn Version: 1.6.1
CUDA Available: True
CUDA Version: 12.6
CuDNN Available: True
CuDNN Version: 90501


In [11]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# CSV 데이터 로드
file_path = "train.csv"
df = pd.read_csv(file_path)

# 텍스트 정제 (불필요한 개행문자, 공백 제거)
df["text"] = df["text"].astype(str).str.replace(r'\n', ' ', regex=True).str.strip()

# 레이블 인코딩 (클래스가 숫자형이면 필요 없음)
label_encoder = LabelEncoder()
df["class"] = label_encoder.fit_transform(df["class"])

# Train/Test 데이터 분할
train_texts, test_texts, train_labels, test_labels = train_test_split(df["text"], df["class"], test_size=0.2, random_state=42)

# ELECTRA 모델 지정 (한국어 지원 모델 선택 가능)
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 토크나이징 함수 정의
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Hugging Face Dataset 형식 변환
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

# 토크나이징 적용
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 3960/3960 [00:00<00:00, 15899.91 examples/s]
Map: 100%|██████████| 990/990 [00:00<00:00, 15883.26 examples/s]


In [25]:
# ELECTRA 모델 로드 (분류 문제를 위한 모델)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(set(df["class"])))

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir="./electra_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Trainer 객체 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# 학습 시작
trainer.train()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4856,0.811021
2,0.2194,0.556302
3,0.1294,0.749307
4,0.2777,0.639332
5,0.0043,0.687807
6,0.0018,0.736099
7,0.0082,0.879045
8,0.0009,0.857468
9,0.0009,0.84811
10,0.0009,0.847543


TrainOutput(global_step=4950, training_loss=0.16789742789974418, metrics={'train_runtime': 311.8689, 'train_samples_per_second': 126.976, 'train_steps_per_second': 15.872, 'total_flos': 2604869610393600.0, 'train_loss': 0.16789742789974418, 'epoch': 10.0})

In [26]:
# 모델 저장
model.save_pretrained("./electra_trained")
tokenizer.save_pretrained("./electra_trained")

('./electra_trained\\tokenizer_config.json',
 './electra_trained\\special_tokens_map.json',
 './electra_trained\\vocab.txt',
 './electra_trained\\added_tokens.json',
 './electra_trained\\tokenizer.json')

In [27]:
from sklearn.metrics import f1_score

# ✅ 테스트 데이터 로드
test_file_path = "test.csv"
test_df = pd.read_csv(test_file_path)

# 텍스트 정제 (불필요한 개행 문자 제거)
test_df["text"] = test_df["text"].astype(str).str.replace(r'\n', ' ', regex=True).str.strip()

In [28]:
# ✅ 저장된 ELECTRA 모델과 토크나이저 로드
MODEL_PATH = "./electra_trained"  # 학습된 모델이 저장된 경로
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# ✅ 🚀 GPU 사용 설정 (CUDA 사용)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
model.eval()

cuda


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [29]:
# ✅ 토큰화 함수 정의
def tokenize_function(texts):
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

# ✅ 데이터 토큰화 및 입력 데이터 준비
tokenized_data = tokenize_function(test_df["text"].tolist())

# 입력 데이터를 GPU로 이동
tokenized_data = {key: val.to(device) for key, val in tokenized_data.items()}

# ✅ 모델 추론 (예측 수행)
with torch.no_grad():
    outputs = model(**tokenized_data)
    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

# ✅ 예측 결과 저장
test_df["predicted_class"] = predictions

In [30]:
# ✅ F1-score 계산 (실제 라벨이 존재할 경우)
if "class" in test_df.columns:
    true_labels = test_df["class"].values
    f1 = f1_score(true_labels, predictions, average="weighted")
    print(f"✅ F1-score: {f1:.4f}")
else:
    print("⚠️ 'class' 컬럼이 없어서 F1-score를 계산할 수 없습니다.")

✅ F1-score: 0.8584
