Python ver : 3.8.5

torch : 2.4.1

pandas : 1.3.5

numpy : 1.24.3

In [21]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

In [22]:
# Load the CSV dataset: every column except "label" becomes a node feature,
# and the "label" column becomes the per-node class target.
df = pd.read_csv("dataset_example.csv")
feature_matrix = df.drop(columns=["label"]).to_numpy()
features = torch.tensor(feature_matrix, dtype=torch.float)
labels = torch.tensor(df["label"].to_numpy(), dtype=torch.long)

# edge_index is a placeholder fully connected graph for this example
# (GCN layers need an edge_index to run).
def generate_dummy_edges(num_nodes):
    """Return the edge index of a fully connected graph without self-loops.

    Args:
        num_nodes: Number of nodes in the graph.

    Returns:
        A ``torch.long`` tensor of shape ``(2, num_nodes * (num_nodes - 1))``.
        Each column is one ordered pair ``(i, j)`` with ``i != j``; pairs are
        ordered by ``i`` first, then by ``j``. Fewer than two nodes yields an
        empty tensor of shape ``(2, 0)``.
    """
    if num_nodes < 2:
        # No pair with i != j exists; keep the (2, 0) shape callers expect.
        return torch.empty((2, 0), dtype=torch.long)

    node_ids = torch.arange(num_nodes, dtype=torch.long)
    # All (i, j) combinations in row-major order: i varies slowest, j fastest.
    row = node_ids.repeat_interleave(num_nodes)
    col = node_ids.repeat(num_nodes)
    # Drop the diagonal (self-loops) while preserving the ordering.
    keep = row != col
    return torch.stack([row[keep], col[keep]])

# One node per row of the feature matrix.
edge_index = generate_dummy_edges(features.size(0))

In [23]:
# GCN model definition
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network returning per-node outputs.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the first convolution.
        out_channels: Output size of the second convolution.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Apply both convolutions to ``data.x`` over ``data.edge_index``.

        ReLU and dropout (active only while the module is in training mode)
        sit between the two layers. The second layer's output is returned
        as-is, with no softmax applied.
        """
        hidden = self.conv1(data.x, data.edge_index)
        hidden = F.relu(hidden)
        hidden = F.dropout(hidden, training=self.training)
        return self.conv2(hidden, data.edge_index)

In [24]:
# Build the single graph object shared by every run.
data = Data(x=features, edge_index=edge_index, y=labels)

# Evaluation protocol: N_REPEATS x N_SPLITS-fold stratified CV
# (10 x 5 = 50 runs). Named constants keep the loop bounds and the progress
# messages in sync instead of repeating the literals 10 / 5 / 50.
N_REPEATS = 10
N_SPLITS = 5
N_EPOCHS = 10
total_runs = N_REPEATS * N_SPLITS

# scikit-learn operates on array-likes; convert the label tensor once.
labels_np = labels.cpu().numpy()

# NOTE(review): the CV splits are seeded, but no torch seed is set in the code
# shown here, so weight initialisation and dropout differ between executions.
total_aucs = []
num_iterations = 0

for repeat in range(N_REPEATS):
    print(f"\n Repeat {repeat+1}/{N_REPEATS}")
    # Split into N_SPLITS folds; each fold serves once as the test set.
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=repeat)
    # (train+val, test) index pairs that preserve the class ratio.
    splits = list(skf.split(np.zeros(len(labels_np)), labels_np))

    for fold, (train_val_idx, test_idx) in enumerate(splits):
        print(f"\nFold {fold+1}/{N_SPLITS}")
        print(f"- Test indices (sample): {test_idx[:5].tolist()} ... 총 {len(test_idx)}개")
        print(f"- Train+Val indices (sample): {train_val_idx[:5].tolist()} ... 총 {len(train_val_idx)}개")

        # Split the train+val portion (80% of all nodes) again:
        # 80% -> train, 20% -> val, stratified by label.
        train_idx, val_idx = train_test_split(
            train_val_idx,
            test_size=0.2,
            stratify=labels_np[train_val_idx],
            random_state=42,
        )

        print(f"  - Train size: {len(train_idx)} ({len(train_idx)/len(labels):.2%})")
        print(f"  - Val size:   {len(val_idx)} ({len(val_idx)/len(labels):.2%})")
        print(f"  - Test size:  {len(test_idx)} ({len(test_idx)/len(labels):.2%})")

        # Boolean node masks for the three subsets.
        train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

        train_mask[train_idx] = True
        val_mask[val_idx] = True
        test_mask[test_idx] = True

        data.train_mask = train_mask
        # NOTE(review): val_mask is stored but never read in the code shown
        # here (no early stopping or model selection uses it).
        data.val_mask = val_mask
        data.test_mask = test_mask

        # Fresh model and optimizer for every run.
        model = GCN(in_channels=data.num_features, hidden_channels=32, out_channels=2)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

        # Full-batch training for N_EPOCHS epochs; loss on training nodes only.
        model.train()
        for epoch in range(N_EPOCHS):
            optimizer.zero_grad()
            out = model(data)
            loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
            loss.backward()
            optimizer.step()

        # AUC on the held-out test nodes, scored with the class-1 probability.
        # NOTE(review): the captured output shows AUC ~= 0.5 on every fold.
        # With the fully connected dummy graph each node aggregates over the
        # same neighbourhood, so the per-node outputs are presumably (nearly)
        # identical - confirm against GCNConv's default normalisation.
        model.eval()
        with torch.no_grad():
            logits = model(data)
            probs = F.softmax(logits[data.test_mask], dim=1)[:, 1].cpu().numpy()
            true = data.y[data.test_mask].cpu().numpy()
            auc = roc_auc_score(true, probs)
            total_aucs.append(auc)
            num_iterations += 1
            print(f"Iteration {num_iterations}/{total_runs} | AUC = {auc:.4f}")

# Report the final result.
mean_auc = np.mean(total_aucs)
print(f"\n Final classification performance reported as mean AUC over {total_runs} iterations: {mean_auc:.4f}")


 Repeat 1/10

Fold 1/5
- Test indices (sample): [10, 19, 21, 37, 43] ... 총 102개
- Train+Val indices (sample): [0, 1, 2, 3, 4] ... 총 404개
  - Train size: 323 (63.83%)
  - Val size:   81 (16.01%)
  - Test size:  102 (20.16%)
Iteration 1/50 | AUC = 0.5000

Fold 2/5
- Test indices (sample): [2, 4, 5, 8, 14] ... 총 101개
- Train+Val indices (sample): [0, 1, 3, 6, 7] ... 총 405개
  - Train size: 324 (64.03%)
  - Val size:   81 (16.01%)
  - Test size:  101 (19.96%)
Iteration 2/50 | AUC = 0.4927

Fold 3/5
- Test indices (sample): [7, 20, 24, 29, 35] ... 총 101개
- Train+Val indices (sample): [0, 1, 2, 3, 4] ... 총 405개
  - Train size: 324 (64.03%)
  - Val size:   81 (16.01%)
  - Test size:  101 (19.96%)
Iteration 3/50 | AUC = 0.5000

Fold 4/5
- Test indices (sample): [1, 3, 9, 11, 18] ... 총 101개
- Train+Val indices (sample): [0, 2, 4, 5, 6] ... 총 405개
  - Train size: 324 (64.03%)
  - Val size:   81 (16.01%)
  - Test size:  101 (19.96%)
Iteration 4/50 | AUC = 0.5000

Fold 5/5
- Test indices (sample):