In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

# --- (11일차 추가) 데이터 분리 도구 ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- 1. (Day 10 revision) Dataset class ---
# Receives a DataFrame and a scaler directly instead of a CSV path.
class JeonseDataset(Dataset):
    """Map-style dataset of (scaled feature vector, risk label) pairs.

    Every column of the given DataFrame except 'risk_label' is treated as a
    feature column.
    """

    def __init__(self, dataframe, scaler, is_train=True):
        """Scale the feature columns once and cache features and labels.

        Args:
            dataframe: Table holding the feature columns plus 'risk_label'.
            scaler: Object exposing ``transform``. It is only applied here,
                never fitted, so any fitting must be done by the caller.
            is_train: Accepted for interface compatibility; not used here.
        """
        self.scaler = scaler

        # Only transform() is called: fitting is the caller's responsibility.
        feature_names = dataframe.columns.drop('risk_label')
        self.features = self.scaler.transform(dataframe[feature_names])
        self.labels = dataframe['risk_label'].values

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as float32 tensors; the label is viewed as shape (1,)."""
        feature_tensor = torch.tensor(self.features[idx], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
        return feature_tensor, label_tensor.view(1)

# --- 2. (Day 5) Model (MLP) class (unchanged) ---
class SimpleMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of input features of the first linear layer.
            hidden_size: Width of the hidden layer.
            output_size: Number of outputs of the second linear layer.
        """
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output of the network for ``x``."""
        hidden = self.relu(self.layer1(x))
        return self.sigmoid(self.layer2(hidden))

# --- 3. (Day 11 core) Data preparation (split and scaling) ---

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 3-1. Load the raw data
csv_file_path = 'dummy_data.csv'
full_df = pd.read_csv(csv_file_path)

# 3-2. Train / validation split (80% train, 20% validation)
train_df, val_df = train_test_split(full_df, test_size=0.2, random_state=42)

print(f"총 데이터: {len(full_df)}개")
print(f"훈련 데이터: {len(train_df)}개, 검증 데이터: {len(val_df)}개")

# 3-3. Scaler preparation (prevents data leakage)
# The scaler is fitted on train_df ONLY, so no validation statistics leak
# into the mean / standard deviation used for scaling.
feature_cols = train_df.columns.drop('risk_label')
scaler = StandardScaler()
scaler.fit(train_df[feature_cols])

# 3-4. Dataset and DataLoader preparation
# Both datasets receive the SAME scaler fitted on the training split.
train_dataset = JeonseDataset(train_df, scaler)
val_dataset = JeonseDataset(val_df, scaler)

# Hyperparameters
# input_dim is derived from the data instead of being hard-coded, so the
# model always matches the number of feature columns in the CSV.
input_dim = len(feature_cols)
hidden_dim = 8
output_dim = 1
learning_rate = 0.001
batch_size = 16
num_epochs = 50

# DataLoaders: shuffle the training data; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model, loss function, optimizer
model = SimpleMLP(input_dim, hidden_dim, output_dim).to(device)
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

print("--- 훈련/검증 분리 후 학습 시작 ---")

# --- 4. (Day 11 core) Separate training / validation loops ---
for epoch in range(num_epochs):

    # === Training loop ===
    model.train()
    train_loss = 0.0
    train_correct = 0

    for features_batch, labels_batch in train_loader:
        features_batch = features_batch.to(device)
        labels_batch = labels_batch.to(device)

        prediction = model(features_batch)
        loss = loss_fn(prediction, labels_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is the mean over this batch; weight it by the batch's sample
        # count so a smaller final batch does not skew the epoch average.
        train_loss += loss.item() * labels_batch.size(0)
        train_correct += (prediction.round() == labels_batch).sum().item()

    # === Validation loop ===
    model.eval()
    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():  # no gradients needed: no backward() / step() here
        for features_batch, labels_batch in val_loader:
            features_batch = features_batch.to(device)
            labels_batch = labels_batch.to(device)

            prediction = model(features_batch)
            loss = loss_fn(prediction, labels_batch)

            # Same sample-count weighting as in the training loop.
            val_loss += loss.item() * labels_batch.size(0)
            val_correct += (prediction.round() == labels_batch).sum().item()

    # --- End of epoch: per-sample averages for both splits ---
    avg_train_loss = train_loss / len(train_dataset)
    avg_train_acc = train_correct / len(train_dataset)

    avg_val_loss = val_loss / len(val_dataset)
    avg_val_acc = val_correct / len(val_dataset)

    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch+1:3d}/{num_epochs}]")
        print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {avg_train_acc * 100:.2f}%")
        print(f"  Valid Loss: {avg_val_loss:.4f} | Valid Acc: {avg_val_acc * 100:.2f}%")

print("--- 학습 완료 ---")

총 데이터: 100개
훈련 데이터: 80개, 검증 데이터: 20개
--- 훈련/검증 분리 후 학습 시작 ---
Epoch [  5/50]
  Train Loss: 0.6998 | Train Acc: 55.00%
  Valid Loss: 0.7013 | Valid Acc: 50.00%
Epoch [ 10/50]
  Train Loss: 0.6964 | Train Acc: 55.00%
  Valid Loss: 0.6887 | Valid Acc: 55.00%
Epoch [ 15/50]
  Train Loss: 0.6929 | Train Acc: 56.25%
  Valid Loss: 0.6781 | Valid Acc: 55.00%
Epoch [ 20/50]
  Train Loss: 0.6899 | Train Acc: 56.25%
  Valid Loss: 0.6678 | Valid Acc: 55.00%
Epoch [ 25/50]
  Train Loss: 0.6871 | Train Acc: 56.25%
  Valid Loss: 0.6586 | Valid Acc: 55.00%
Epoch [ 30/50]
  Train Loss: 0.6845 | Train Acc: 56.25%
  Valid Loss: 0.6503 | Valid Acc: 55.00%
Epoch [ 35/50]
  Train Loss: 0.6821 | Train Acc: 56.25%
  Valid Loss: 0.6418 | Valid Acc: 55.00%
Epoch [ 40/50]
  Train Loss: 0.6798 | Train Acc: 57.50%
  Valid Loss: 0.6352 | Valid Acc: 55.00%
Epoch [ 45/50]
  Train Loss: 0.6778 | Train Acc: 57.50%
  Valid Loss: 0.6283 | Valid Acc: 55.00%
Epoch [ 50/50]
  Train Loss: 0.6759 | Train Acc: 56.25%
  Valid L