### 라이브러리 임포트

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

### CSV 파일 불러오기

In [4]:
file_path = r"C:\Users\MinGi\Desktop\year3-sem1\인공지능개론\dataset\diabetes.csv"
df = pd.read_csv(file_path)

print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


### 레이블 개수 확인

In [5]:
df['BMI'].value_counts()

BMI
32.0    13
31.6    12
31.2    12
0.0     11
32.4    10
        ..
36.7     1
41.8     1
42.6     1
42.8     1
46.3     1
Name: count, Length: 248, dtype: int64

### 결측치 확인

In [6]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### 입력/정답 나누기 및 전처리

In [8]:
x = df.drop('Outcome', axis=1)
x = df.drop('BMI', axis=1)
y = df['BMI']

scaler = StandardScaler()
x = scaler.fit_transform(x)

In [19]:
Y = pd.get_dummies(y).values

### 훈련/테스트 나누기

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.25,random_state=0)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(576, 8) (576,)
(192, 8) (192,)


### TensorDataset으로 매핑

In [13]:
X_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

### 회귀 모델 정의

In [14]:
class RegressionModel(nn.Module):
    def __init__(self):
        super(RegressionModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RegressionModel().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


### 반복 학습

In [28]:
model.train()
for epoch in range(20):
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 40.9207
Epoch 2, Loss: 40.7877
Epoch 3, Loss: 40.5049
Epoch 4, Loss: 40.2533
Epoch 5, Loss: 39.9228
Epoch 6, Loss: 39.7730
Epoch 7, Loss: 39.4960
Epoch 8, Loss: 39.2114
Epoch 9, Loss: 39.1051
Epoch 10, Loss: 38.8172
Epoch 11, Loss: 38.7508
Epoch 12, Loss: 38.5465
Epoch 13, Loss: 38.2715
Epoch 14, Loss: 38.1016
Epoch 15, Loss: 37.9451
Epoch 16, Loss: 37.7266
Epoch 17, Loss: 37.5425
Epoch 18, Loss: 37.3533
Epoch 19, Loss: 37.2486
Epoch 20, Loss: 37.0883


### 모델 평가

In [29]:
model.eval()
preds, actuals = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu().numpy()
        preds.extend(outputs)
        actuals.extend(y_batch.numpy())

mse = mean_squared_error(actuals, preds)
print(f"Test MSE: {mse:.4f}")

Test MSE: 43.7410


In [24]:
plt.figure(figsize=(10,7))
plt.plot(range(N_EPOCHS), model_history.history['loss'], label='train loss')
plt.plot(range(N_EPOCHS), model_history.history['val_loss'], label='test loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

NameError: name 'N_EPOCHS' is not defined

<Figure size 1000x700 with 0 Axes>

In [22]:
model.evaluate(x_test, y_test)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0040 - mse: 0.0040 


[0.00404388178139925, 0.00404388178139925]

In [27]:
# 기준 모델: 평균값만 예측
baseline = np.mean(y_test)
mse_baseline = mean_squared_error(y_test, [baseline] * len(y_test))

print(f"Baseline MSE: {mse_baseline:.2f}")


Baseline MSE: 57.98
