# 1. 구글 DRIVE 연결

In [None]:
# prompt: write the code that mounts Google Drive in Colab

# Mount Google Drive at /content/drive; later cells read the CSV and write checkpoints under this path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install kobert-transformers transformers torch torchvision scikit-learn tqdm --upgrade

Collecting kobert-transformers
  Downloading kobert_transformers-0.6.0-py3-none-any.whl.metadata (7.3 kB)
Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading kobert_transformers-0.6.0-py3-none-any.whl (12 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, kobert-transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
Successfully installed kobert-transformers-0.6.0 transformers-4.47.1


# 2. 데이터 셋 클래스 정의

In [None]:
import os
import torch
import pandas as pd
from kobert_transformers import get_kobert_model, get_tokenizer
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AdamW, get_scheduler
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Step 1: load the KoBERT model and tokenizer
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
kobert_model = get_kobert_model().to(device)
tokenizer = get_tokenizer()

# 데이터셋 클래스 정의
class TitleContentDataset(Dataset):
    """Map-style dataset that tokenizes one (title, content, label) triple per access.

    Args:
        titles: indexable collection of title strings.
        contents: indexable collection of content strings, aligned with ``titles``.
        labels: indexable collection of integer class labels, aligned with ``titles``.
        tokenizer: callable tokenizer invoked with Hugging Face style keyword arguments.
        max_len: ``max_length`` passed to the tokenizer for both title and content.
        stride: ``stride`` passed to the tokenizer for the content only.

    Each item is a dict with ``title_input_ids`` / ``title_attention_mask`` (leading
    dimension squeezed away), ``content_input_ids`` / ``content_attention_mask`` (kept
    exactly as the tokenizer returned them) and ``label`` (a ``torch.long`` tensor).
    """

    def __init__(self, titles, contents, labels, tokenizer, max_len=512, stride=256):
        self.titles, self.contents, self.labels = titles, contents, labels
        self.tokenizer = tokenizer
        self.max_len, self.stride = max_len, stride

    def __len__(self):
        # The dataset length is driven by the titles collection.
        return len(self.titles)

    def _tokenize(self, text, **extra):
        """Run the tokenizer with the padding/truncation settings shared by title and content."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            **extra,
        )

    def __getitem__(self, idx):
        title_enc = self._tokenize(self.titles[idx])
        # NOTE(review): how many rows the tokenizer returns when return_overflowing_tokens=True
        # depends on the tokenizer implementation; default batching needs the same row count for
        # every sample -- TODO confirm for the KoBERT tokenizer.
        content_enc = self._tokenize(
            self.contents[idx], stride=self.stride, return_overflowing_tokens=True
        )

        sample = {}
        sample['title_input_ids'] = title_enc['input_ids'].squeeze(0)
        sample['title_attention_mask'] = title_enc['attention_mask'].squeeze(0)
        sample['content_input_ids'] = content_enc['input_ids']
        sample['content_attention_mask'] = content_enc['attention_mask']
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# DataLoader 생성 함수
def create_dataloader(titles, contents, labels, tokenizer, batch_size, shuffle=False,
                      max_len=512, stride=256, num_workers=8):
    """Wrap the given samples in a ``TitleContentDataset`` and return a ``DataLoader`` over it.

    Args:
        titles, contents, labels: aligned collections forwarded to ``TitleContentDataset``.
        tokenizer: tokenizer forwarded to ``TitleContentDataset``.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples every epoch.
        max_len: tokenizer ``max_length`` forwarded to the dataset (previously fixed at 512).
        stride: tokenizer ``stride`` forwarded to the dataset (previously fixed at 256).
        num_workers: number of loader worker processes (previously fixed at 8); lower it on
            machines with fewer CPU cores.

    Returns:
        torch.utils.data.DataLoader over the constructed dataset.
    """
    dataset = TitleContentDataset(titles, contents, labels, tokenizer, max_len=max_len, stride=stride)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

tokenizer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

# 3. 모델 정의

In [None]:
# 모델 정의
class KoBERTComparisonClassifier(nn.Module):
    """Classifier over the concatenated [CLS] vectors of a title and its content.

    Args:
        kobert_model: encoder module; called with ``input_ids`` / ``attention_mask`` keywords
            and expected to return an object exposing ``last_hidden_state``.
        hidden_size: width of the encoder's hidden states.
        num_classes: number of output logits.
    """

    def __init__(self, kobert_model, hidden_size=768, num_classes=2):
        super().__init__()
        self.bert = kobert_model
        # Two encoder vectors (title + content) are concatenated, hence hidden_size * 2.
        head_layers = [
            nn.Linear(hidden_size * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def _first_token_state(self, input_ids, attention_mask):
        """Encode a 2-D batch and return the hidden state at sequence position 0."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.last_hidden_state[:, 0, :]

    def forward(self, title_input_ids, title_attention_mask, content_input_ids, content_attention_mask):
        """Return logits for each sample.

        The title tensors are 2-D; the content tensors are unpacked as
        (batch_size, num_chunks, seq_len), flattened for the encoder, and the per-chunk
        vectors are averaged over the chunk dimension.
        """
        title_vec = self._first_token_state(title_input_ids, title_attention_mask)

        batch_size, num_chunks, seq_len = content_input_ids.size()
        chunk_vecs = self._first_token_state(
            content_input_ids.view(-1, seq_len),
            content_attention_mask.view(-1, seq_len),
        )
        content_vec = chunk_vecs.view(batch_size, num_chunks, -1).mean(dim=1)

        return self.classifier(torch.cat((title_vec, content_vec), dim=1))


# 4. 모델 저장, 로드, 학습, 평가 함수 정의

In [None]:
# 모델 저장 및 로드 함수
def save_model(model, path):
    """Write ``model.state_dict()`` to ``path``, creating missing parent directories first.

    Without the directory creation, ``torch.save`` fails when the target folder does not
    exist yet, which would discard a finished training run.

    Args:
        model: module whose ``state_dict()`` is serialized.
        path: destination file path.
    """
    target_dir = os.path.dirname(path)
    if target_dir:  # a bare file name has no directory component to create
        os.makedirs(target_dir, exist_ok=True)
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

def load_model(model, path, map_location="cpu"):
    """Load a state dict from ``path`` into ``model`` in place.

    Args:
        model: module whose parameters are overwritten via ``load_state_dict``.
        path: checkpoint file containing a state dict.
        map_location: device the checkpoint tensors are deserialized onto. The default
            ``"cpu"`` lets a checkpoint saved on a GPU be loaded on a CPU-only machine;
            ``load_state_dict`` then copies the values into the model's own parameters.
    """
    # weights_only=True restricts unpickling to tensors and plain containers, which is all
    # a state dict holds.
    state_dict = torch.load(path, map_location=map_location, weights_only=True)
    model.load_state_dict(state_dict)
    print(f"Model loaded from {path}")

# 학습 및 평가 함수
def train_epoch(model, dataloader, optimizer, scaler, scheduler=None, device="cuda"):
    """Run one mixed-precision training pass over ``dataloader``.

    Args:
        model: called as ``model(title_input_ids, title_attention_mask,
            content_input_ids, content_attention_mask)`` and expected to return logits.
        dataloader: yields dict batches with the four input keys above plus ``'label'``.
        optimizer: optimizer stepped once per batch through ``scaler``.
        scaler: gradient scaler providing ``scale`` / ``step`` / ``update``.
        scheduler: optional LR scheduler, stepped once per batch when given.
        device: device string or ``torch.device`` the batch tensors are moved to.

    Returns:
        Mean of the per-batch cross-entropy losses.
    """
    # Accept "cuda", "cuda:0" or a torch.device; autocast only takes the bare device type.
    device = torch.device(device)
    criterion = nn.CrossEntropyLoss()  # stateless, so build it once instead of per batch

    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader, desc="Training", mininterval=2.0):
        title_input_ids = batch['title_input_ids'].to(device)
        title_attention_mask = batch['title_attention_mask'].to(device)
        content_input_ids = batch['content_input_ids'].to(device)
        content_attention_mask = batch['content_attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients before the backward pass so leftovers (e.g. from an interrupted
        # run of this cell) never leak into the first update.
        optimizer.zero_grad()

        with torch.amp.autocast(device.type):
            logits = model(title_input_ids, title_attention_mask, content_input_ids, content_attention_mask)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        if scheduler:
            scheduler.step()

        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

def evaluate_model(model, dataloader, device="cuda"):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: called positionally with the title ids/mask and content ids/mask tensors.
        dataloader: iterable of dict batches holding those four tensors plus ``'label'``;
            must support ``len()``.
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch cross-entropy losses and the
        fraction of samples whose arg-max prediction equals the label.
    """
    input_keys = (
        'title_input_ids',
        'title_attention_mask',
        'content_input_ids',
        'content_attention_mask',
    )
    criterion = nn.CrossEntropyLoss()

    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0
    with torch.no_grad():
        for batch in dataloader:
            model_inputs = [batch[key].to(device) for key in input_keys]
            targets = batch['label'].to(device)

            logits = model(*model_inputs)
            loss_sum += criterion(logits, targets).item()

            hits = torch.argmax(logits, dim=1) == targets
            n_correct += hits.sum().item()
            n_seen += targets.size(0)

    return loss_sum / len(dataloader), n_correct / n_seen


# 4. 데이터셋 클래스를 통한 데이터 준비

In [None]:
# Step 4: data preparation
# Read the CSV from Drive and pull the three columns used by the model into plain lists.
file_path = "/content/drive/MyDrive/p_project/Balanced_data.csv"
data = pd.read_csv(file_path)

titles, contents, labels = (
    data[column].tolist() for column in ("newsTitle", "newsContent", "useType")
)

# First split: 90% train, 10% held out.
(train_titles, temp_titles,
 train_contents, temp_contents,
 train_labels, temp_labels) = train_test_split(
    titles, contents, labels, test_size=0.1, random_state=42
)

# Second split: the held-out 10% is halved into validation and test.
(val_titles, test_titles,
 val_contents, test_contents,
 val_labels, test_labels) = train_test_split(
    temp_titles, temp_contents, temp_labels, test_size=0.5, random_state=42
)

# Only the training loader shuffles; validation and test keep their order.
train_loader = create_dataloader(
    train_titles, train_contents, train_labels, tokenizer, batch_size=64, shuffle=True
)
val_loader = create_dataloader(
    val_titles, val_contents, val_labels, tokenizer, batch_size=64
)
test_loader = create_dataloader(
    test_titles, test_contents, test_labels, tokenizer, batch_size=64
)

# 5. 신경망 학습(kobert 동결)

In [None]:
# Step 5: train only the dense classification head (KoBERT frozen)

# Number of epochs for the frozen-backbone phase.
DENSE_EPOCHS = 20

# Build the classifier around the already loaded KoBERT backbone.
model = KoBERTComparisonClassifier(kobert_model).to(device)

# Freeze the backbone first so it is unambiguous that only the head is trained.
for param in model.bert.parameters():
    param.requires_grad = False

# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay are set
# to the values the transformers implementation used by default so the update rule is unchanged.
dense_optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
scaler = torch.amp.GradScaler()

for epoch in range(DENSE_EPOCHS):
    train_loss = train_epoch(model, train_loader, dense_optimizer, scaler)
    val_loss, val_accuracy = evaluate_model(model, val_loader)
    print(f"Epoch {epoch + 1}: Train Loss {train_loss:.4f}, Val Loss {val_loss:.4f}, Val Accuracy {val_accuracy:.4f}")

# Checkpoint consumed by the fine-tuning step.
save_model(model, "/content/drive/MyDrive/new_models/dense_trained_model.pth")

Training: 100%|██████████| 839/839 [07:24<00:00,  1.89it/s]


Epoch 1: Train Loss 0.6297, Val Loss 0.5459, Val Accuracy 0.7247


Training: 100%|██████████| 839/839 [07:22<00:00,  1.89it/s]


Epoch 2: Train Loss 0.5249, Val Loss 0.4455, Val Accuracy 0.7981


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 3: Train Loss 0.4645, Val Loss 0.3970, Val Accuracy 0.8212


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 4: Train Loss 0.4247, Val Loss 0.3506, Val Accuracy 0.8519


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 5: Train Loss 0.3965, Val Loss 0.3250, Val Accuracy 0.8644


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 6: Train Loss 0.3757, Val Loss 0.3106, Val Accuracy 0.8684


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 7: Train Loss 0.3601, Val Loss 0.2899, Val Accuracy 0.8799


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 8: Train Loss 0.3477, Val Loss 0.2742, Val Accuracy 0.8887


Training: 100%|██████████| 839/839 [07:21<00:00,  1.90it/s]


Epoch 9: Train Loss 0.3374, Val Loss 0.2697, Val Accuracy 0.8876


Training: 100%|██████████| 839/839 [07:23<00:00,  1.89it/s]


Epoch 10: Train Loss 0.3296, Val Loss 0.2576, Val Accuracy 0.8932


Training: 100%|██████████| 839/839 [07:22<00:00,  1.89it/s]


Epoch 11: Train Loss 0.3209, Val Loss 0.2526, Val Accuracy 0.8960


Training: 100%|██████████| 839/839 [07:23<00:00,  1.89it/s]


Epoch 12: Train Loss 0.3152, Val Loss 0.2458, Val Accuracy 0.8996


Training: 100%|██████████| 839/839 [07:23<00:00,  1.89it/s]


Epoch 13: Train Loss 0.3097, Val Loss 0.2384, Val Accuracy 0.9025


Training: 100%|██████████| 839/839 [07:23<00:00,  1.89it/s]


Epoch 14: Train Loss 0.3059, Val Loss 0.2412, Val Accuracy 0.9004


Training: 100%|██████████| 839/839 [07:22<00:00,  1.90it/s]


Epoch 15: Train Loss 0.2996, Val Loss 0.2362, Val Accuracy 0.9019


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 16: Train Loss 0.2980, Val Loss 0.2271, Val Accuracy 0.9063


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 17: Train Loss 0.2937, Val Loss 0.2261, Val Accuracy 0.9077


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 18: Train Loss 0.2894, Val Loss 0.2216, Val Accuracy 0.9100


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 19: Train Loss 0.2862, Val Loss 0.2157, Val Accuracy 0.9104


Training: 100%|██████████| 839/839 [07:20<00:00,  1.90it/s]


Epoch 20: Train Loss 0.2833, Val Loss 0.2154, Val Accuracy 0.9126
Model saved to /content/drive/MyDrive/new_models/dense_trained_model.pth


# 6. fine_tuning(kobert 동결 해제) + 신경망 학습

In [None]:
# Step 6: fine-tuning (KoBERT unfrozen)

# Number of fine-tuning epochs; it also sizes the learning-rate schedule below.
FINE_TUNE_EPOCHS = 3

# 1. Load a fresh KoBERT backbone.
kobert_model = get_kobert_model().to(device)

# 2. Rebuild the classifier and restore the weights saved after the dense-head phase.
model = KoBERTComparisonClassifier(kobert_model).to(device)
load_model(model, "/content/drive/MyDrive/new_models/dense_trained_model.pth")

# 3. Make every backbone parameter trainable.
for param in model.bert.parameters():
    param.requires_grad = True

# 4. Optimizer over all parameters plus a linear schedule with 10% warm-up.
#    torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay are set
#    to the values the transformers implementation used by default.
# NOTE(review): the recorded output of this cell shows 3355 steps per epoch while the dense-head
# phase shows 839, so train_loader was presumably rebuilt with a smaller batch size between the
# two runs -- confirm and make that explicit before re-running.
fine_tune_optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * FINE_TUNE_EPOCHS
scheduler = get_scheduler("linear", optimizer=fine_tune_optimizer,
                           num_warmup_steps=int(0.1 * total_steps),
                           num_training_steps=total_steps)

# 5. Fresh gradient scaler for mixed-precision training. torch.amp.GradScaler is the
#    non-deprecated spelling of torch.cuda.amp.GradScaler.
scaler = torch.amp.GradScaler()

for epoch in range(FINE_TUNE_EPOCHS):
    train_loss = train_epoch(model, train_loader, fine_tune_optimizer, scaler, scheduler)
    val_loss, val_accuracy = evaluate_model(model, val_loader)
    print(f"Epoch {epoch + 1}: Train Loss {train_loss:.4f}, Val Loss {val_loss:.4f}, Val Accuracy {val_accuracy:.4f}")

save_model(model, "/content/drive/MyDrive/models/fine_tuned_model.pth")

  model.load_state_dict(torch.load(path))
  scaler = torch.cuda.amp.GradScaler()  # 정의되지 않은 경우 NameError 발생


Model loaded from /content/drive/MyDrive/new_models/dense_trained_model.pth


Training: 100%|██████████| 3355/3355 [22:58<00:00,  2.43it/s]


Epoch 1: Train Loss 0.1878, Val Loss 0.0986, Val Accuracy 0.9654


Training: 100%|██████████| 3355/3355 [22:55<00:00,  2.44it/s]


Epoch 2: Train Loss 0.1031, Val Loss 0.0806, Val Accuracy 0.9722


Training: 100%|██████████| 3355/3355 [22:55<00:00,  2.44it/s]


Epoch 3: Train Loss 0.0474, Val Loss 0.0664, Val Accuracy 0.9783
Model saved to /content/drive/MyDrive/models/fine_tuned_model.pth


# 7. 테스트

In [None]:
kobert_model = get_kobert_model().to(device)  # fresh KoBERT backbone for the evaluation model

model = KoBERTComparisonClassifier(kobert_model).to(device)  # rebuild the classifier architecture

# Restore the fine-tuned weights saved at the end of step 6
load_model(model, "/content/drive/MyDrive/models/fine_tuned_model.pth")

# Step 7: evaluate on the held-out test split
test_loss, test_accuracy = evaluate_model(model, test_loader)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

  model.load_state_dict(torch.load(path))


Model loaded from /content/drive/MyDrive/models/fine_tuned_model.pth
Test Loss: 0.0674, Test Accuracy: 0.9775


# 8. 독립적으로 돌릴 수 있는 코드 (예시 데이터)

In [None]:
import torch
from kobert_transformers import get_kobert_model, get_tokenizer
import torch.nn as nn
import torch.nn.functional as F

# 모델 정의
class KoBERTComparisonClassifier(nn.Module):
    """Title/content comparison classifier built on a shared encoder.

    Args:
        kobert_model: encoder module; called with ``input_ids`` / ``attention_mask`` keywords
            and expected to return an object exposing ``last_hidden_state``.
        hidden_size: width of the encoder's hidden states.
        num_classes: number of output logits.
    """

    def __init__(self, kobert_model, hidden_size=768, num_classes=2):
        super().__init__()
        self.bert = kobert_model
        # The head consumes the title vector and the content vector side by side.
        joint_width = hidden_size * 2
        self.classifier = nn.Sequential(
            nn.Linear(joint_width, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def _encode_cls(self, ids, mask):
        """Return the position-0 hidden state for every row of a 2-D batch."""
        return self.bert(input_ids=ids, attention_mask=mask).last_hidden_state[:, 0, :]

    def forward(self, title_input_ids, title_attention_mask, content_input_ids, content_attention_mask):
        """Return logits; content tensors are unpacked as (batch, chunks, seq_len)."""
        title_repr = self._encode_cls(title_input_ids, title_attention_mask)

        batch_size, num_chunks, seq_len = content_input_ids.size()
        flat_ids = content_input_ids.view(-1, seq_len)
        flat_mask = content_attention_mask.view(-1, seq_len)
        per_chunk = self._encode_cls(flat_ids, flat_mask)
        # Average the chunk vectors so every sample yields one content vector.
        content_repr = per_chunk.view(batch_size, num_chunks, -1).mean(dim=1)

        joint = torch.cat((title_repr, content_repr), dim=1)
        return self.classifier(joint)

# 모델 로드 함수
def load_model(model, path):
    """Load a state dict from ``path`` onto the CPU and copy it into ``model`` in place.

    Args:
        model: module whose parameters are overwritten via ``load_state_dict``.
        path: checkpoint file containing a state dict.
    """
    # weights_only=True restricts unpickling to tensors and plain containers, which is all
    # a state dict holds.
    state_dict = torch.load(path, map_location=torch.device('cpu'), weights_only=True)
    model.load_state_dict(state_dict)
    print(f"Model loaded from {path}")

# 데이터 전처리 함수
def preprocess_input(title, content, tokenizer, max_len=512, stride=256):
    """Tokenize one title/content pair into the tensors the classifier consumes.

    Args:
        title: title text.
        content: content text.
        tokenizer: callable tokenizer invoked with Hugging Face style keyword arguments.
        max_len: ``max_length`` used for both tokenizer calls.
        stride: ``stride`` used for the content call only.

    Returns:
        Dict with ``title_input_ids`` / ``title_attention_mask`` (leading dimension squeezed
        away) and ``content_input_ids`` / ``content_attention_mask`` (kept as returned by
        the tokenizer).
    """
    shared_kwargs = dict(
        max_length=max_len, padding="max_length", truncation=True, return_tensors="pt"
    )
    title_enc = tokenizer(title, **shared_kwargs)
    # Only the content call asks the tokenizer for overflowing tokens and a stride.
    content_enc = tokenizer(
        content, stride=stride, return_overflowing_tokens=True, **shared_kwargs
    )

    features = {}
    features['title_input_ids'] = title_enc['input_ids'].squeeze(0)
    features['title_attention_mask'] = title_enc['attention_mask'].squeeze(0)
    features['content_input_ids'] = content_enc['input_ids']
    features['content_attention_mask'] = content_enc['attention_mask']
    return features

# 예측 함수
def predict(title, content, model, tokenizer, device="cpu"):
    """Classify a single title/content pair.

    Args:
        title: title text.
        content: content text.
        model: classifier called positionally with title ids/mask and content ids/mask.
        tokenizer: tokenizer forwarded to ``preprocess_input``.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        ``(pred_class, probabilities)``: the arg-max class index as a Python int and the
        1-D tensor of softmax probabilities.
    """
    model.eval()
    inputs = preprocess_input(title, content, tokenizer)
    input_order = (
        'title_input_ids',
        'title_attention_mask',
        'content_input_ids',
        'content_attention_mask',
    )
    with torch.no_grad():
        # Add a leading batch dimension of size 1 to every tensor.
        batched = [inputs[key].unsqueeze(0).to(device) for key in input_order]
        logits = model(*batched)
        probabilities = F.softmax(logits, dim=1).squeeze(0)  # class probabilities
        pred_class = torch.argmax(probabilities).item()
    return pred_class, probabilities

# 실행 코드
if __name__ == "__main__":
    # Path of the fine-tuned checkpoint on the mounted Drive.
    model_path = "/content/drive/MyDrive/models/fine_tuned_model.pth"

    # Load the KoBERT backbone and tokenizer, then restore the fine-tuned weights on CPU.
    kobert_model = get_kobert_model()
    tokenizer = get_tokenizer()
    model = KoBERTComparisonClassifier(kobert_model)
    load_model(model, model_path)
    model = model.to("cpu")

    # Example input
    example_title = "이것은 낚시성 제목입니다!"
    example_content = "내용이 제목과 전혀 맞지 않는 과장된 설명입니다."

    # Prediction
    pred_class, probabilities = predict(example_title, example_content, model, tokenizer)
    # NOTE(review): treating index 0 as the clickbait class is taken from the print logic
    # below; verify it against the encoding of the `useType` training labels.
    fishing_prob, non_fishing_prob = (probabilities[i].item() * 100 for i in (0, 1))

    print(f"Prediction: {'낚시성' if pred_class == 0 else '비낚시성'}")
    print(f"낚시성 확률: {fishing_prob:.2f}%")
    print(f"비낚시성 확률: {non_fishing_prob:.2f}%")


  model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))


Model loaded from /content/drive/MyDrive/models/fine_tuned_model.pth
Prediction: 비낚시성
낚시성 확률: 3.64%
비낚시성 확률: 96.36%
