In [2]:
import pandas as pd

df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
cols_with_invalid_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

In [4]:
for col in cols_with_invalid_zeros:
    print(f"{col} 컬럼에서 0의 개수: {(df[col] == 0).sum()}")

Glucose 컬럼에서 0의 개수: 5
BloodPressure 컬럼에서 0의 개수: 35
SkinThickness 컬럼에서 0의 개수: 227
Insulin 컬럼에서 0의 개수: 374
BMI 컬럼에서 0의 개수: 11


In [5]:
import numpy as np

for col in cols_with_invalid_zeros:
    df[col] = df[col].replace(0, np.nan)  # 0을 NaN으로 바꿈
    median = df[col].median()             # 중앙값 구함
    df[col] = df[col].fillna(median)      # NaN을 중앙값으로 대체

In [6]:
from sklearn.preprocessing import StandardScaler

# target은 제외하고 feature만
X = df.drop('Outcome', axis=1)

# 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.866045,-0.03199,0.670643,-0.181541,0.166619,0.468492,1.425995
1,-0.844885,-1.205066,-0.528319,-0.012301,-0.181541,-0.8522,-0.365061,-0.190672
2,1.23388,2.016662,-0.693761,-0.012301,-0.181541,-1.3325,0.604397,-0.105584
3,-0.844885,-1.073567,-0.528319,-0.695245,-0.540642,-0.633881,-0.920763,-1.041549
4,-1.141852,0.504422,-2.679076,0.670643,0.316566,1.549303,5.484909,-0.020496


In [9]:
# 1. 결측치 처리
# 2. 정규화
# 3. 분할

X = df.drop('Outcome', axis=1)
y_class = df['Outcome']
y_reg = df['Glucose']

# 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 분류용 분할
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_scaled, y_class, test_size=0.2, random_state=42)

# 회귀용 분할
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_scaled, y_reg, test_size=0.2, random_state=42)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 1. 텐서로 변환
X_train_tensor = torch.tensor(X_train_c, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_c.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_c, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_c.values, dtype=torch.float32).view(-1, 1)

# 2. DataLoader 생성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 3. 모델 정의
class MLPClassifier(nn.Module):
    def __init__(self):
        super(MLPClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # 이진 분류용
        )
    
    def forward(self, x):
        return self.model(x)

# 4. 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLPClassifier().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105a04fd0>>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ai_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [11]:
# 5. 학습 루프
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

Epoch 1/20, Loss: 0.6954
Epoch 2/20, Loss: 0.6204
Epoch 3/20, Loss: 0.5544
Epoch 4/20, Loss: 0.4907
Epoch 5/20, Loss: 0.4810
Epoch 6/20, Loss: 0.4615
Epoch 7/20, Loss: 0.4708
Epoch 8/20, Loss: 0.4460
Epoch 9/20, Loss: 0.4352
Epoch 10/20, Loss: 0.4370
Epoch 11/20, Loss: 0.4225
Epoch 12/20, Loss: 0.4149
Epoch 13/20, Loss: 0.4180
Epoch 14/20, Loss: 0.4154
Epoch 15/20, Loss: 0.4095
Epoch 16/20, Loss: 0.4038
Epoch 17/20, Loss: 0.4065
Epoch 18/20, Loss: 0.3971
Epoch 19/20, Loss: 0.4080
Epoch 20/20, Loss: 0.3961


In [12]:
# 6. 평가
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch)
        predicted = (outputs.cpu().numpy() > 0.5).astype(int)
        total += y_batch.size(0)
        correct += (predicted == y_batch.numpy()).sum()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 75.97%


In [13]:
# 회귀용 텐서 변환
X_train_tensor = torch.tensor(X_train_r, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_r.values, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test_r, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_r.values, dtype=torch.float32).view(-1, 1)

# DataLoader 구성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [14]:
class MLPRegressor(nn.Module):
    def __init__(self):
        super(MLPRegressor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # 활성화 함수 없음
        )
    
    def forward(self, x):
        return self.model(x)

# 장비 설정 및 학습 준비
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLPRegressor().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
# 학습 루프
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

Epoch 1/20, Loss: 15553.9887
Epoch 2/20, Loss: 15352.6904
Epoch 3/20, Loss: 15204.5962
Epoch 4/20, Loss: 14469.6544
Epoch 5/20, Loss: 13816.5209
Epoch 6/20, Loss: 12414.0424
Epoch 7/20, Loss: 10497.8493
Epoch 8/20, Loss: 8437.0992
Epoch 9/20, Loss: 6331.6344
Epoch 10/20, Loss: 4272.0687
Epoch 11/20, Loss: 2702.1924
Epoch 12/20, Loss: 1776.6237
Epoch 13/20, Loss: 1196.3055
Epoch 14/20, Loss: 964.2556
Epoch 15/20, Loss: 824.1014
Epoch 16/20, Loss: 786.0971
Epoch 17/20, Loss: 706.5546
Epoch 18/20, Loss: 650.2680
Epoch 19/20, Loss: 591.0960
Epoch 20/20, Loss: 561.7494


In [16]:
from sklearn.metrics import mean_squared_error
import numpy as np

# 평가
model.eval()
predictions = []
actuals = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu().numpy()
        predictions.extend(outputs)
        actuals.extend(y_batch.numpy())

# MSE 계산
mse = mean_squared_error(actuals, predictions)
rmse = np.sqrt(mse)
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 24.25


In [17]:
#CNN 분류
# 분류용 데이터
X_train_tensor = torch.tensor(X_train_c, dtype=torch.float32).unsqueeze(1)  # (batch, 1, features)
y_train_tensor = torch.tensor(y_train_c.values, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test_c, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test_c.values, dtype=torch.float32).view(-1, 1)

# Dataloader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [18]:
class CNNClassifier(nn.Module):
    def __init__(self):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(32 * 8, 64)
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # flatten
        x = torch.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNClassifier().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
# 학습
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader):.4f}")

Epoch 1, Loss: 0.6198
Epoch 2, Loss: 0.5412
Epoch 3, Loss: 0.4784
Epoch 4, Loss: 0.4506
Epoch 5, Loss: 0.4372
Epoch 6, Loss: 0.4500
Epoch 7, Loss: 0.4163
Epoch 8, Loss: 0.4380
Epoch 9, Loss: 0.4156
Epoch 10, Loss: 0.4392
Epoch 11, Loss: 0.4104
Epoch 12, Loss: 0.3985
Epoch 13, Loss: 0.3958
Epoch 14, Loss: 0.3964
Epoch 15, Loss: 0.3989
Epoch 16, Loss: 0.3954
Epoch 17, Loss: 0.3803
Epoch 18, Loss: 0.3861
Epoch 19, Loss: 0.3825
Epoch 20, Loss: 0.3902


In [20]:
# 평가
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu().numpy()
        predicted = (outputs > 0.5).astype(int)
        correct += (predicted == y_batch.numpy()).sum()
        total += y_batch.size(0)

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 74.68%


In [21]:
# 텐서 변환 + CNN용 형태 (unsqueeze)
X_train_tensor = torch.tensor(X_train_r, dtype=torch.float32).unsqueeze(1)  # (batch, 1, features)
y_train_tensor = torch.tensor(y_train_r.values, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test_r, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test_r.values, dtype=torch.float32).view(-1, 1)

# DataLoader 생성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [22]:
class CNNRegressor(nn.Module):
    def __init__(self):
        super(CNNRegressor, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(32 * 8, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)  # 활성화 함수 없음 (회귀)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNRegressor().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [23]:
# 학습
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader):.4f}")

Epoch 1, Loss: 15477.7448
Epoch 2, Loss: 13876.1903
Epoch 3, Loss: 8537.9873
Epoch 4, Loss: 2602.8234
Epoch 5, Loss: 1565.0324
Epoch 6, Loss: 1233.8521
Epoch 7, Loss: 1086.4831
Epoch 8, Loss: 1023.5634
Epoch 9, Loss: 933.7635
Epoch 10, Loss: 860.8677
Epoch 11, Loss: 780.2200
Epoch 12, Loss: 718.4413
Epoch 13, Loss: 662.6441
Epoch 14, Loss: 643.8965
Epoch 15, Loss: 536.3581
Epoch 16, Loss: 500.8459
Epoch 17, Loss: 448.9327
Epoch 18, Loss: 392.8699
Epoch 19, Loss: 359.0199
Epoch 20, Loss: 313.8256


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

model.eval()
predictions = []
actuals = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        outputs = model(X_batch).cpu().numpy()
        predictions.extend(outputs)
        actuals.extend(y_batch.numpy())

mse = mean_squared_error(actuals, predictions)
rmse = np.sqrt(mse)
print(f"Test RMSE: {rmse:.2f}")