In [3]:
#데이터 불러오기
import pandas as pd

# CSV 파일 불러오기
df = pd.read_csv('diabetes.csv')

# 상위 5개 샘플 미리보기
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# 컬럼 이름 확인
print("📌 컬럼 목록:")
print(df.columns)

# 각 컬럼별 데이터 타입 + 결측치 유무 확인
print("\n📌 요약 정보:")
print(df.info())

# 기본 통계 보기
print("\n📌 기본 통계:")
print(df.describe())

# 0이 몇 개 있는지 확인 (결측치처럼 간주할지 판단하기 위해)
print("\n📌 0값 개수 확인:")
print((df == 0).sum())

📌 컬럼 목록:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

📌 요약 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

📌 기본 통계:
       Pregnancies     Glucose  BloodPressure

In [5]:
print(df.describe())


       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [6]:
print((df == 0).sum())

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64


In [7]:
import numpy as np

# 1. 0을 결측치로 간주할 컬럼들 (의학적으로 0이 불가능)
cols_with_invalid_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# 2. 해당 컬럼들의 0값을 NaN으로 바꾸기
df[cols_with_invalid_zeros] = df[cols_with_invalid_zeros].replace(0, np.nan)

# 3. NaN을 평균값으로 대체하기
df.fillna(df.mean(numeric_only=True), inplace=True)

# 4. 확인: 결측치가 잘 처리되었는지 확인
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [8]:
from sklearn.preprocessing import StandardScaler

# 1. 특성과 타겟 분리
X = df.drop('Outcome', axis=1)  # 입력 데이터
y = df['Outcome']               # 타겟값 (분류 목적)

# 2. 정규화 (평균 0, 표준편차 1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 3. 결과 확인
print(pd.DataFrame(X_scaled).describe())

                  0             1             2             3             4  \
count  7.680000e+02  7.680000e+02  7.680000e+02  7.680000e+02  7.680000e+02   
mean  -6.476301e-17 -3.561966e-16  6.915764e-16  7.956598e-16 -3.330669e-16   
std    1.000652e+00  1.000652e+00  1.000652e+00  1.000652e+00  1.000652e+00   
min   -1.141852e+00 -2.554131e+00 -4.004245e+00 -2.521670e+00 -1.665945e+00   
25%   -8.448851e-01 -7.212214e-01 -6.953060e-01 -4.727737e-01 -4.007289e-01   
50%   -2.509521e-01 -1.540881e-01 -1.675912e-02  8.087936e-16 -3.345079e-16   
75%    6.399473e-01  6.103090e-01  6.282695e-01  3.240194e-01 -3.345079e-16   
max    3.906578e+00  2.541850e+00  4.102655e+00  7.950467e+00  8.126238e+00   

                  5             6             7  
count  7.680000e+02  7.680000e+02  7.680000e+02  
mean   3.515706e-16  2.451743e-16  1.931325e-16  
std    1.000652e+00  1.000652e+00  1.000652e+00  
min   -2.075119e+00 -1.189553e+00 -1.041549e+00  
25%   -7.215397e-01 -6.889685e-01 -7.8

In [9]:
from sklearn.model_selection import train_test_split

# X: 입력 데이터 (정규화 완료된)
# y: 타겟값 (Outcome - 0 or 1)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,           # 입력값
    y,                  # 정답값
    test_size=0.2,      # 테스트 데이터 비율 (20%)
    random_state=42     # 랜덤 시드 (재현 가능성 확보)
)

# 분할 결과 확인
print("학습 데이터 크기:", X_train.shape)
print("테스트 데이터 크기:", X_test.shape)

학습 데이터 크기: (614, 8)
테스트 데이터 크기: (154, 8)


In [10]:
import torch
import torch.nn as nn

class DiabetesClassifier(nn.Module):
    def __init__(self):
        super(DiabetesClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()  # 확률 출력
        )

    def forward(self, x):
        return self.model(x)

In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

# 1. 넘파이 → 텐서 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# 2. 데이터로더 구성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 3. 모델 생성
model = DiabetesClassifier()
criterion = nn.BCELoss()  # 이진 분류용 손실 함수
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. 학습 루프
epochs = 50
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss / len(train_loader):.4f}")

Epoch 1/50, Loss: 0.6658
Epoch 2/50, Loss: 0.6348
Epoch 3/50, Loss: 0.5935
Epoch 4/50, Loss: 0.5613
Epoch 5/50, Loss: 0.5292
Epoch 6/50, Loss: 0.4960
Epoch 7/50, Loss: 0.4747
Epoch 8/50, Loss: 0.4735
Epoch 9/50, Loss: 0.4632
Epoch 10/50, Loss: 0.4405
Epoch 11/50, Loss: 0.4510
Epoch 12/50, Loss: 0.4834
Epoch 13/50, Loss: 0.4348
Epoch 14/50, Loss: 0.4485
Epoch 15/50, Loss: 0.4370
Epoch 16/50, Loss: 0.4455
Epoch 17/50, Loss: 0.4199
Epoch 18/50, Loss: 0.4402
Epoch 19/50, Loss: 0.4152
Epoch 20/50, Loss: 0.4182
Epoch 21/50, Loss: 0.4223
Epoch 22/50, Loss: 0.4186
Epoch 23/50, Loss: 0.4117
Epoch 24/50, Loss: 0.4060
Epoch 25/50, Loss: 0.4170
Epoch 26/50, Loss: 0.4084
Epoch 27/50, Loss: 0.4065
Epoch 28/50, Loss: 0.4066
Epoch 29/50, Loss: 0.4056
Epoch 30/50, Loss: 0.4148
Epoch 31/50, Loss: 0.3994
Epoch 32/50, Loss: 0.4042
Epoch 33/50, Loss: 0.4031
Epoch 34/50, Loss: 0.3954
Epoch 35/50, Loss: 0.4199
Epoch 36/50, Loss: 0.3972
Epoch 37/50, Loss: 0.4127
Epoch 38/50, Loss: 0.3889
Epoch 39/50, Loss: 0.

In [12]:
from sklearn.metrics import accuracy_score

# 1. 평가 모드 설정
model.eval()

# 2. 예측 및 정답 저장 리스트
all_preds = []
all_labels = []

# 3. 테스트셋에서 예측 수행
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        predicted = (outputs > 0.5).float()  # 시그모이드 결과가 0.5 이상이면 1로 분류
        all_preds.extend(predicted.numpy())
        all_labels.extend(y_batch.numpy())

# 4. 정확도 계산
acc = accuracy_score(all_labels, all_preds)
print(f"✅ Test Accuracy: {acc * 100:.2f}%")

✅ Test Accuracy: 75.32%


In [13]:
#데이터 불러오기
import pandas as pd

# CSV 파일 불러오기
df = pd.read_csv('heart (1).csv')

# 상위 5개 샘플 미리보기
df.head()
	# 1.	df.head(), df.info() → 데이터 구조 확인
	# 2.	df.describe() → 이상치, 값의 범위 확인
	# 3.	df.isnull().sum() 또는 (df == 0).sum() → 결측치 탐색
	# 4.	있을 때만 → fillna() 또는 평균/중앙값으로 대체
	# 5.	필요할 때만 → StandardScaler 또는 MinMaxScaler 사용

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [14]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [16]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [17]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64