In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# 1. 데이터 로드
# 데이터는 'text'와 'label' 컬럼을 포함해야 합니다.
# text: 뉴스 본문, label: 진짜(1)/가짜(0) 레이블
data = pd.read_csv("C:/py/fakeNEWs_project/News_Dataset.csv")

# 데이터 컬럼 확인
print(data.head()) # text and label 열이 존재해야 함

# 2. Train/Test 데이터 분리
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

                           Title  \
0     갈루치 전 미 핵대사가 본 (새해 한반도 기류)   
1  집권 마무리­정권 재창출 '큰짐'-(문민 1기) 결산   
2               정부서 직접 주택정보 제공한다   
3                     (제야의 종)타종식   
4                            野鼓賦   

                                             Content  Label  
0  한국,미국,제네바,국무부,갈루치,클린턴,일본,남북대화,남북한,워싱턴,4자,조지타운대...      1  
1  김대통령,미국,부총리,반수석,경북,권부총리,당서열,위원장,신한국당,5인,유장관,외교...      1  
2  부동산,건교부,지자체,매물정보,유상열,연계망,소비자,주택가격,pc,교통부,실수요자,...      1  
3  삼천동,타종식,각게인사들,각게,경제계,96제야,春川,참석자,崔珏圭,종각,지역,도민,...      1  
4  정치의식,大權病,대권병,野鼓賦,민주화,대선열병,정치인들,대선병,정치인,大選病,국가안...      1  


In [9]:
# 3. Hugging Face Datasets로 변환
train_dataset = Dataset.from_dict({"text": train_data, "label": train_data})
val_dataset = Dataset.from_dict({"text": test_data, "label": test_data})

# 4. 토크나이저 및 모델 초기화
model_name = "beomi/KcELECTRA-base-v2022"  # Hugging Face에서 제공하는 KR-BERT 모델
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.

In [10]:
# 5. 데이터 전처리
def preprocess_data(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# 모델이 필요로 하지 않는 열 제거
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

train_dataset.set_format("torch")
val_dataset.set_format("torch")


Map: 100%|██████████| 3/3 [00:00<00:00, 157.95 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 498.77 examples/s]


In [11]:
# 6. 학습 파라미터 설정
training_args = TrainingArguments(
    output_dir="./results",          # 출력 디렉토리
    evaluation_strategy="epoch",    # 평가 주기
    learning_rate=2e-5,             # 학습률
    per_device_train_batch_size=16, # 학습 배치 크기
    per_device_eval_batch_size=64,  # 평가 배치 크기
    num_train_epochs=3,             # 학습 에폭 수
    weight_decay=0.01,              # 가중치 감쇠
    logging_dir="./logs",           # 로그 디렉토리
    save_total_limit=2,             # 저장할 체크포인트 개수
)



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`