# 데이터 로드 및 전처리

In [1]:
import pandas as pd
# from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('data/clinical_mastitis_cows.csv')

changing_ids = ['cow10'] # 클래스 변한 개체 드랍

# 사용할 컬럼
cols = ['Cow_ID', 'Day', 'IUFL', 'EUFL', 'IUFR', 'EUFR',
        'IURL', 'EURL', 'IURR', 'EURR', 'Temperature', 'class1']

# 변화한 개체 및 홀스타인 종 제거할지말지?
df_filtered = df[
    (~df['Cow_ID'].isin(changing_ids)) &
    (df['Breed'] != 'hostlene')
][cols].copy()

# 평균과 범위 계산 대상 컬럼
value_cols = ['IUFL', 'EUFL', 'IUFR', 'EUFR', 
              'IURL', 'EURL', 'IURR', 'EURR', 'Temperature']

# (1) 평균 계산
mean_df = df_filtered.groupby('Cow_ID')[value_cols].mean()
mean_df.columns = [f"{col}_mean" for col in mean_df.columns]

# (2) 범위 (최대 - 최소) 계산
range_df = (
    df_filtered.groupby('Cow_ID')[value_cols].max()
    - df_filtered.groupby('Cow_ID')[value_cols].min()
)
range_df.columns = [f"{col}_range" for col in range_df.columns]

# (3) class1은 마지막 날짜 기준 (사실 필요없간 함 모든 개체 6일치분 클래스 동일)
class_df = (
    df_filtered.sort_values(['Cow_ID', 'Day'])
               .groupby('Cow_ID')['class1']
               .last()
)

# (4) 병합
final_df = pd.concat([mean_df, range_df], axis=1)
final_df['class1'] = class_df

# 정렬 및 인덱스 정리
final_df = final_df.reset_index()
final_df['Cow_ID_num'] = final_df['Cow_ID'].str.extract('(\d+)').astype(int)
final_df = final_df.sort_values('Cow_ID_num').drop(columns='Cow_ID_num')
final_df = final_df.set_index('Cow_ID')

# 결과 확인
print(f"최종 변수 개수: {final_df.shape[1]}")
final_df.head()

최종 변수 개수: 19


Unnamed: 0_level_0,IUFL_mean,EUFL_mean,IUFR_mean,EUFR_mean,IURL_mean,EURL_mean,IURR_mean,EURR_mean,Temperature_mean,IUFL_range,EUFL_range,IUFR_range,EUFR_range,IURL_range,EURL_range,IURR_range,EURR_range,Temperature_range,class1
Cow_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
cow1,151.833333,182.166667,152.0,183.833333,151.166667,182.5,152.0,183.333333,41.166667,5,6,5,9,5,6,5,7,3,0
cow2,237.833333,275.5,233.5,276.166667,234.333333,279.5,238.666667,277.333333,40.833333,15,11,10,13,15,17,20,10,3,1
cow3,238.833333,276.5,234.5,277.166667,235.333333,280.5,239.666667,278.333333,42.166667,15,11,10,13,15,17,20,10,6,1
cow4,183.333333,211.5,185.5,215.833333,183.833333,211.333333,185.666667,213.666667,41.166667,7,6,8,8,6,5,10,5,2,0
cow5,154.333333,184.833333,154.333333,188.166667,156.833333,183.833333,154.0,184.666667,42.666667,8,9,9,9,10,7,10,8,5,1


# Grid Search - 파라미터 튜닝

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score,
    f1_score, accuracy_score, fbeta_score, roc_auc_score
)
import pandas as pd
import numpy as np

# 시드 고정 함수
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    import random
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# AutoEncoder 정의
class AutoEncoder(nn.Module):
    def __init__(self, input_dim=18, latent_dim=4, dropout_rate=0.3):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, latent_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

# 재구성 오차 계산
def reconstruction_error(x, model, batch_size=128):
    model.eval()
    errors = []
    with torch.no_grad():
        for i in range(0, len(x), batch_size):
            x_batch = x[i:i+batch_size]
            recon = model(x_batch)
            err = torch.mean((x_batch - recon) ** 2, dim=1)
            errors.append(err)
    return torch.cat(errors)

# 결과 저장 리스트
all_results = []

# 실험 조합 반복
for latent_dim in [4, 5, 6, 7, 8]:
    for dropout_rate in [0.2, 0.3, 0.4, 0.5]:
        for lr in [1e-2, 1e-3, 1e-4]:
            for patience in [1000, 5000, 10000]:
                precisions, recalls, f1s, f2s, accs, specs, aucs = [], [], [], [], [], [], []
                print(f"\n=== latent_dim={latent_dim}, dropout={dropout_rate}, lr={lr}, patience={patience} ===")

                for seed in [1, 10, 100, 1000, 10000]:
                    set_seed(seed)

                    # 데이터 분할
                    normal_df = final_df[final_df['class1'] == 0].drop(columns='class1')
                    abnormal_df = final_df[final_df['class1'] == 1].drop(columns='class1')

                    train_normal, remaining_normal = train_test_split(normal_df, train_size=400, random_state=seed)
                    val_normal = remaining_normal.sample(n=100, random_state=seed)
                    test_normal = remaining_normal.drop(val_normal.index).sample(n=100, random_state=seed)

                    val_abnormal = abnormal_df.sample(n=100, random_state=seed)
                    test_abnormal = abnormal_df.drop(val_abnormal.index).sample(n=100, random_state=seed)

                    # 텐서 변환
                    X_train = torch.tensor(train_normal.values, dtype=torch.float32)
                    X_val_normal = torch.tensor(val_normal.values, dtype=torch.float32)
                    X_val_abnormal = torch.tensor(val_abnormal.values, dtype=torch.float32)
                    X_test_normal = torch.tensor(test_normal.values, dtype=torch.float32)
                    X_test_abnormal = torch.tensor(test_abnormal.values, dtype=torch.float32)

                    # 모델 정의
                    model = AutoEncoder(input_dim=18, latent_dim=latent_dim, dropout_rate=dropout_rate)
                    criterion = nn.MSELoss()
                    optimizer = optim.Adam(model.parameters(), lr=lr)

                    best_loss = float('inf')
                    trigger = 0

                    # 학습
                    for epoch in range(100000):
                        model.train()
                        optimizer.zero_grad()
                        output = model(X_train)
                        loss = criterion(output, X_train)
                        loss.backward()
                        optimizer.step()

                        # 검증
                        model.eval()
                        with torch.no_grad():
                            val_output = model(X_val_normal)
                            val_loss = criterion(val_output, X_val_normal)

                        if val_loss < best_loss:
                            best_loss = val_loss
                            best_model_state = model.state_dict()
                            trigger = 0
                        else:
                            trigger += 1
                            if trigger >= patience:
                                break

                    model.load_state_dict(best_model_state)

                    # 임계값 설정 (AUC 기준)
                    recon_val_normal = reconstruction_error(X_val_normal, model)
                    recon_val_abnormal = reconstruction_error(X_val_abnormal, model)
                    recon_val_all = torch.cat([recon_val_normal, recon_val_abnormal])
                    y_val_true = torch.cat([
                        torch.zeros_like(recon_val_normal),
                        torch.ones_like(recon_val_abnormal)
                    ])

                    thresholds = torch.linspace(recon_val_all.min(), recon_val_all.max(), steps=200)
                    best_auc, best_th = 0, None
                    for t in thresholds:
                        y_pred = (recon_val_all >= t).int()
                        auc = roc_auc_score(y_val_true.numpy(), y_pred.numpy())
                        if auc > best_auc:
                            best_auc, best_th = auc, t

                    # 테스트
                    recon_test_normal = reconstruction_error(X_test_normal, model)
                    recon_test_abnormal = reconstruction_error(X_test_abnormal, model)
                    recon_test_all = torch.cat([recon_test_normal, recon_test_abnormal])
                    y_test_true = torch.cat([
                        torch.zeros_like(recon_test_normal),
                        torch.ones_like(recon_test_abnormal)
                    ])
                    y_test_pred = (recon_test_all >= best_th).int()

                    # 평가지표
                    cm = confusion_matrix(y_test_true, y_test_pred, labels=[0, 1])
                    tn, fp, fn, tp = cm.ravel()
                    specificity = tn / (tn + fp)
                    auc = roc_auc_score(y_test_true.numpy(), y_test_pred.numpy())

                    precision = precision_score(y_test_true, y_test_pred, pos_label=1)
                    recall = recall_score(y_test_true, y_test_pred, pos_label=1)
                    f1 = f1_score(y_test_true, y_test_pred, pos_label=1)
                    acc = accuracy_score(y_test_true, y_test_pred)
                    f2 = fbeta_score(y_test_true, y_test_pred, beta=2, pos_label=1)

                    print(f"\n[Seed {seed}] Confusion Matrix\n{cm}")
                    print(f"Recall: {recall:.4f}, Specificity: {specificity:.4f}, AUC: {auc:.4f}")

                    precisions.append(precision)
                    recalls.append(recall)
                    f1s.append(f1)
                    f2s.append(f2)
                    accs.append(acc)
                    specs.append(specificity)
                    aucs.append(auc)

                # 평균 결과 저장
                all_results.append({
                    'latent_dim': latent_dim,
                    'dropout': dropout_rate,
                    'lr': lr,
                    'patience': patience,
                    'precision': np.mean(precisions),
                    'recall': np.mean(recalls),
                    'f1': np.mean(f1s),
                    'f2': np.mean(f2s),
                    'accuracy': np.mean(accs),
                    'specificity': np.mean(specs),
                    'auc': np.mean(aucs)
                })

# 최종 결과 출력
df_results = pd.DataFrame(all_results)
print("\n📊 실험 결과 요약:")
print(df_results)


=== latent_dim=4, dropout=0.2, lr=0.01, patience=1000 ===

[Seed 1] Confusion Matrix
[[95  5]
 [15 85]]
Recall: 0.8500, Specificity: 0.9500, AUC: 0.9000

[Seed 10] Confusion Matrix
[[96  4]
 [ 6 94]]
Recall: 0.9400, Specificity: 0.9600, AUC: 0.9500

[Seed 100] Confusion Matrix
[[96  4]
 [10 90]]
Recall: 0.9000, Specificity: 0.9600, AUC: 0.9300

[Seed 1000] Confusion Matrix
[[97  3]
 [17 83]]
Recall: 0.8300, Specificity: 0.9700, AUC: 0.9000

[Seed 10000] Confusion Matrix
[[97  3]
 [10 90]]
Recall: 0.9000, Specificity: 0.9700, AUC: 0.9350

=== latent_dim=4, dropout=0.2, lr=0.01, patience=5000 ===

[Seed 1] Confusion Matrix
[[90 10]
 [14 86]]
Recall: 0.8600, Specificity: 0.9000, AUC: 0.8800

[Seed 10] Confusion Matrix
[[93  7]
 [ 6 94]]
Recall: 0.9400, Specificity: 0.9300, AUC: 0.9350

[Seed 100] Confusion Matrix
[[95  5]
 [16 84]]
Recall: 0.8400, Specificity: 0.9500, AUC: 0.8950

[Seed 1000] Confusion Matrix
[[97  3]
 [17 83]]
Recall: 0.8300, Specificity: 0.9700, AUC: 0.9000

[Seed 1000

# 선택된 단일 모델만

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score,
    f1_score, accuracy_score, fbeta_score, roc_auc_score
)
import pandas as pd
import numpy as np

# 시드 고정 함수
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    import random
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# AutoEncoder 정의
class AutoEncoder(nn.Module):
    def __init__(self, input_dim=18, latent_dim=7, dropout_rate=0.5):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, latent_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, input_dim)
        )
    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

# 재구성 오차 계산
def reconstruction_error(x, model, batch_size=128):
    model.eval()
    errors = []
    with torch.no_grad():
        for i in range(0, len(x), batch_size):
            x_batch = x[i:i+batch_size]
            recon = model(x_batch)
            err = torch.mean((x_batch - recon) ** 2, dim=1)
            errors.append(err)
    return torch.cat(errors)

# 결과 저장 리스트
all_results = []

# 고정 설정
latent_dim = 7
dropout_rate = 0.5
lr = 1e-3
patience = 10000

for seed in [1, 10, 100, 1000, 10000]:
    set_seed(seed)

    # 데이터 분할
    normal_df = final_df[final_df['class1'] == 0].drop(columns='class1')
    abnormal_df = final_df[final_df['class1'] == 1].drop(columns='class1')

    train_normal, remaining_normal = train_test_split(normal_df, train_size=400, random_state=seed)
    val_normal = remaining_normal.sample(n=100, random_state=seed)
    test_normal = remaining_normal.drop(val_normal.index).sample(n=100, random_state=seed)

    val_abnormal = abnormal_df.sample(n=100, random_state=seed)
    test_abnormal = abnormal_df.drop(val_abnormal.index).sample(n=100, random_state=seed)

    # 텐서 변환
    X_train = torch.tensor(train_normal.values, dtype=torch.float32)
    X_val_normal = torch.tensor(val_normal.values, dtype=torch.float32)
    X_val_abnormal = torch.tensor(val_abnormal.values, dtype=torch.float32)
    X_test_normal = torch.tensor(test_normal.values, dtype=torch.float32)
    X_test_abnormal = torch.tensor(test_abnormal.values, dtype=torch.float32)

    # 모델 정의
    model = AutoEncoder(input_dim=18, latent_dim=latent_dim, dropout_rate=dropout_rate)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_loss = float('inf')
    trigger = 0

    # 학습
    for epoch in range(100000):
        model.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = criterion(output, X_train)
        loss.backward()
        optimizer.step()

        # 검증
        model.eval()
        with torch.no_grad():
            val_output = model(X_val_normal)
            val_loss = criterion(val_output, X_val_normal)

        if val_loss < best_loss:
            best_loss = val_loss
            best_model_state = model.state_dict()
            trigger = 0
        else:
            trigger += 1
            if trigger >= patience:
                break

    model.load_state_dict(best_model_state)

    # 임계값 설정 (AUC 기준)
    recon_val_normal = reconstruction_error(X_val_normal, model)
    recon_val_abnormal = reconstruction_error(X_val_abnormal, model)
    recon_val_all = torch.cat([recon_val_normal, recon_val_abnormal])
    y_val_true = torch.cat([
        torch.zeros_like(recon_val_normal),
        torch.ones_like(recon_val_abnormal)
    ])

    thresholds = torch.linspace(recon_val_all.min(), recon_val_all.max(), steps=200)
    best_auc, best_th = 0, None
    for t in thresholds:
        y_pred = (recon_val_all >= t).int()
        auc = roc_auc_score(y_val_true.numpy(), y_pred.numpy())
        if auc > best_auc:
            best_auc, best_th = auc, t

    # 테스트
    recon_test_normal = reconstruction_error(X_test_normal, model)
    recon_test_abnormal = reconstruction_error(X_test_abnormal, model)
    recon_test_all = torch.cat([recon_test_normal, recon_test_abnormal])
    y_test_true = torch.cat([
        torch.zeros_like(recon_test_normal),
        torch.ones_like(recon_test_abnormal)
    ])
    y_test_pred = (recon_test_all >= best_th).int()

    # 평가지표
    cm = confusion_matrix(y_test_true, y_test_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)
    auc = roc_auc_score(y_test_true.numpy(), y_test_pred.numpy())

    precision = precision_score(y_test_true, y_test_pred, pos_label=1)
    recall = recall_score(y_test_true, y_test_pred, pos_label=1)
    f1 = f1_score(y_test_true, y_test_pred, pos_label=1)
    acc = accuracy_score(y_test_true, y_test_pred)
    f2 = fbeta_score(y_test_true, y_test_pred, beta=2, pos_label=1)

    print(f"\n[Seed {seed}] Confusion Matrix\n{cm}")
    print(f"Recall: {recall:.4f}, Specificity: {specificity:.4f}, AUC: {auc:.4f}")

    all_results.append({
        'seed': seed,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'f2': f2,
        'accuracy': acc,
        'specificity': specificity,
        'auc': auc
    })

# 결과 요약
df_results = pd.DataFrame(all_results)
print("\n📊 단일 실험 결과 요약:")
print(df_results)


[Seed 1] Confusion Matrix
[[97  3]
 [ 9 91]]
Recall: 0.9100, Specificity: 0.9700, AUC: 0.9400

[Seed 10] Confusion Matrix
[[99  1]
 [ 8 92]]
Recall: 0.9200, Specificity: 0.9900, AUC: 0.9550

[Seed 100] Confusion Matrix
[[99  1]
 [11 89]]
Recall: 0.8900, Specificity: 0.9900, AUC: 0.9400

[Seed 1000] Confusion Matrix
[[97  3]
 [ 6 94]]
Recall: 0.9400, Specificity: 0.9700, AUC: 0.9550

[Seed 10000] Confusion Matrix
[[98  2]
 [10 90]]
Recall: 0.9000, Specificity: 0.9800, AUC: 0.9400

📊 단일 실험 결과 요약:
    seed  precision  recall        f1        f2  accuracy  specificity    auc
0      1   0.968085    0.91  0.938144  0.921053     0.940         0.97  0.940
1     10   0.989247    0.92  0.953368  0.933063     0.955         0.99  0.955
2    100   0.988889    0.89  0.936842  0.908163     0.940         0.99  0.940
3   1000   0.969072    0.94  0.954315  0.945674     0.955         0.97  0.955
4  10000   0.978261    0.90  0.937500  0.914634     0.940         0.98  0.940
