# 📌 1️⃣ 라이브러리 불러오기

In [1]:
# ✅ 1️⃣ 라이브러리 로드
import sys
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F

# ✅ 2️⃣ EarlyStopping 클래스 직접 정의
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss and the
    model. Whenever the loss improves by at least ``delta`` over the best
    value seen so far, the model's ``state_dict`` is saved to ``path``. After
    ``patience`` consecutive calls without such an improvement,
    ``early_stop`` is set to ``True``; the caller is responsible for checking
    that flag and leaving its training loop.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is raised.
        verbose: When true, print a line on every checkpoint save and on
            every non-improving call.
        delta: Minimum increase of the score (``-val_loss``) that counts as
            an improvement.
        path: File the best ``state_dict`` is written to. Defaults to
            ``'checkpoint.pt'`` in the current working directory.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0              # consecutive calls without improvement
        self.best_score = None        # best (-val_loss) seen; None until first save
        self.early_stop = False       # set once patience is exhausted
        self.val_loss_min = np.inf    # loss stored with the last checkpoint
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state.

        Args:
            val_loss: Validation loss of the epoch just finished.
            model: Model whose ``state_dict`` is saved when the loss improved.
        """
        score = -val_loss

        # NaN compares False against everything, so without this explicit
        # check a diverged epoch would be treated as an improvement and
        # overwrite the best checkpoint.
        if np.isnan(score):
            improved = False
        elif self.best_score is None:
            improved = True
        else:
            improved = score >= self.best_score + self.delta

        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Write ``model.state_dict()`` to ``self.path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

# 📌 2️⃣ 데이터 로드

In [2]:
# Load the training and test CSV files from ./data.
train_path, test_path = "./data/train.csv", "./data/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# 📌 3️⃣ TF-IDF 벡터화 (최적화 적용)

In [None]:
# TF-IDF features over the raw URL strings: unigrams and bigrams, at most 1000
# features, sublinear (log-scaled) term frequency, and no stop-word removal
# (URLs are full of special characters rather than natural-language words).
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000, sublinear_tf=True, stop_words=None)

# Fit the vocabulary on the training URLs only, then reuse it for the test URLs.
train_tfidf = vectorizer.fit_transform(train_df["URL"])
test_tfidf = vectorizer.transform(test_df["URL"])


def _dense_float32(tfidf_matrix):
    """Cast ``tfidf_matrix`` to float32, densify it and wrap it in a float32 tensor."""
    return torch.tensor(tfidf_matrix.astype(np.float32).toarray(), dtype=torch.float32)


# NOTE: toarray() materialises the full dense matrix, so the memory saving of
# the sparse representation ends here.
X = _dense_float32(train_tfidf)
X_test = _dense_float32(test_tfidf)
y = torch.tensor(train_df["label"].values, dtype=torch.float32)

: 


# 📌 5️⃣ 데이터셋 분리 (Train 80% / Validation 20%)

In [None]:
# Hold out 20% of the labelled data for validation. The split is stratified on
# the labels and uses a fixed seed so it is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

# 📌 6️⃣ PyTorch Dataset 정의

In [None]:
# ✅ 6️⃣ PyTorch Dataset 정의
class URLDataset(Dataset):
    """Map-style dataset over a feature container and optional labels.

    Args:
        X: Indexable container of features; must support ``len()``.
        y: Optional indexable container of labels. When omitted, items are
            returned without a label (e.g. for inference).
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``X[idx]`` alone, or ``(X[idx], y[idx])`` when labels exist."""
        features = self.X[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

# 📌 7️⃣ DataLoader 정의

In [None]:
# DataLoader setup.
batch_size = 128  # reduced from the earlier value of 256
train_dataset = URLDataset(X_train, y_train)
val_dataset = URLDataset(X_val, y_val)
test_dataset = URLDataset(X_test)

# Settings shared by all three loaders. Pinned host memory only speeds up
# host-to-GPU copies, and requesting it on a machine without CUDA makes
# DataLoader emit a warning, so it is enabled only when CUDA is available.
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)

# Only the training data is shuffled; validation and test keep their order so
# predictions line up with the original rows.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# 📌 8️⃣ MLP 기반 URL 분류 모델 (구조 개선)

In [None]:
# ✅ 8️⃣ 더 깊은 MLP 기반 URL 분류 모델
class DeepURLClassifier(nn.Module):
    """Five-layer MLP that outputs a probability per input row.

    Four hidden stages (1024, 512, 256 and 128 units) each apply
    Linear -> BatchNorm1d -> SiLU -> Dropout(0.4). A final Linear(128, 1)
    followed by a sigmoid produces the output.

    Args:
        input_dim: Number of input features per row.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Register fc1/bn1 ... fc4/bn4 in the same order as a hand-written
        # definition would, so parameter names and initialisation order match.
        widths = (input_dim, 1024, 512, 256, 128)
        for stage, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(n_in, n_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(n_out))
        self.fc5 = nn.Linear(128, 1)  # output layer

        self.dropout = nn.Dropout(0.4)
        self.swish = nn.SiLU()  # SiLU is the Swish activation

    def forward(self, x):
        """Return sigmoid probabilities with a trailing dimension of size 1.

        ``x`` is expected to be a batch of rows with ``input_dim`` features
        each.
        """
        hidden = x
        for stage in range(1, 5):
            linear = getattr(self, f"fc{stage}")
            norm = getattr(self, f"bn{stage}")
            hidden = self.dropout(self.swish(norm(linear(hidden))))
        # The sigmoid is applied here, so the output is already a probability.
        return torch.sigmoid(self.fc5(hidden))

# 📌 1️⃣1️⃣ 모델 초기화

In [None]:
# Build the classifier; its input width is the number of feature columns in
# the training matrix (X_train is assumed to be 2-D: rows x features).
_, input_dim = X_train.shape
model = DeepURLClassifier(input_dim)

# 📌 1️⃣2️⃣ Optimizer & Loss 설정

In [None]:
# Loss and optimizer: binary cross-entropy on the model's sigmoid outputs,
# minimised with Adam at a learning rate of 1e-3.
# NOTE(review): the optimizer is created before a later cell moves the model
# to its device; PyTorch's docs recommend moving the model first - confirm
# this ordering is intentional.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 📌 1️⃣3️⃣ 학습 함수 정의

In [None]:
# Device selection: use the first CUDA GPU when one is available, otherwise
# fall back to the CPU (which is what a machine without CUDA ends up using).
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"사용 중인 디바이스: {device}")

# Move the model's parameters and buffers to the selected device.
model = model.to(device)


# ✅ 1️⃣2️⃣ 학습 함수 정의 (ROC-AUC 출력 추가)
def train_model(model, train_loader, val_loader, epochs=10):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch makes one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``, then prints the mean training loss,
    the mean validation loss and the validation ROC-AUC. Training stops early
    once the validation loss has failed to improve for 3 consecutive epochs.
    When at least one checkpoint was saved, the best weights are loaded back
    into ``model`` before returning.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Network to train; modified in place.
        train_loader: Iterable of ``(features, labels)`` training batches.
        val_loader: Iterable of ``(features, labels)`` validation batches.
        epochs: Maximum number of epochs to run.
    """
    early_stopping = EarlyStopping(patience=3, verbose=True)

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        train_loss = 0.0
        print(f"Epoch {epoch+1} 시작...")

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            # reshape(-1) instead of a bare squeeze(): squeeze() would collapse
            # a batch of one to a 0-d tensor that no longer matches the
            # 1-D label tensor.
            outputs = model(X_batch).reshape(-1)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean of the per-batch losses.
        avg_train_loss = train_loss / len(train_loader)
        print(f"✅ Epoch {epoch+1} 완료! Train Loss: {avg_train_loss:.4f}")

        # ---- Validation pass ----
        model.eval()
        val_loss = 0.0
        y_true, y_pred = [], []  # labels and predicted probabilities for ROC-AUC

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch).reshape(-1)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

                y_true.extend(y_batch.cpu().numpy())
                y_pred.extend(outputs.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        roc_auc = roc_auc_score(y_true, y_pred)

        print(f"📊 Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f} | ROC-AUC: {roc_auc:.4f}")

        # Saves a checkpoint on improvement and raises the stop flag once
        # patience is exhausted.
        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("✅ 조기 종료 (Early Stopping)")
            break

    # The epochs after the best one only made the validation loss worse, so
    # leave the model holding the checkpointed best weights. Skipped when no
    # checkpoint was written (e.g. epochs == 0).
    if early_stopping.best_score is not None:
        checkpoint_path = getattr(early_stopping, "path", "checkpoint.pt")
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))


# Run training.
train_model(model, train_loader, val_loader, epochs=10)

사용 중인 디바이스: cuda:0
Epoch 1 시작...
✅ Epoch 1 완료! Train Loss: 0.2590
📊 Epoch 1 - Validation Loss: 0.2493 | ROC-AUC: 0.9066
Validation loss decreased (inf --> 0.249335).  Saving model ...
Epoch 2 시작...
✅ Epoch 2 완료! Train Loss: 0.2505
📊 Epoch 2 - Validation Loss: 0.2467 | ROC-AUC: 0.9079
Validation loss decreased (0.249335 --> 0.246676).  Saving model ...
Epoch 3 시작...
✅ Epoch 3 완료! Train Loss: 0.2484
📊 Epoch 3 - Validation Loss: 0.2460 | ROC-AUC: 0.9083
Validation loss decreased (0.246676 --> 0.245996).  Saving model ...
Epoch 4 시작...
✅ Epoch 4 완료! Train Loss: 0.2474
📊 Epoch 4 - Validation Loss: 0.2453 | ROC-AUC: 0.9086
Validation loss decreased (0.245996 --> 0.245335).  Saving model ...
Epoch 5 시작...
✅ Epoch 5 완료! Train Loss: 0.2466
📊 Epoch 5 - Validation Loss: 0.2451 | ROC-AUC: 0.9087
Validation loss decreased (0.245335 --> 0.245062).  Saving model ...
Epoch 6 시작...
✅ Epoch 6 완료! Train Loss: 0.2461
📊 Epoch 6 - Validation Loss: 0.2450 | ROC-AUC: 0.9086
Validation loss decreased (0.245062

In [None]:
# Predict on the test set and write the submission file.
model.eval()
y_test_preds = []

with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        # reshape(-1) keeps the result 1-D even for a final batch of one;
        # a bare squeeze() would give a 0-d array that extend() cannot iterate.
        outputs = model(X_batch).reshape(-1)
        y_test_preds.extend(outputs.cpu().numpy())

# One predicted probability per test row, keyed by the test set's ID column.
submission = pd.DataFrame({"ID": test_df["ID"], "probability": y_test_preds})
submission.to_csv("submission_6th.csv", index=False)
print("✅ 최종 예측 완료. 제출 파일 생성됨!")

✅ 최종 예측 완료. 제출 파일 생성됨!
