In [40]:
import pandas as pd

df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [41]:
# 결측치 확인

df.isna().sum()
df = df.dropna()
print(df.isna().sum())

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [43]:
# 입력, 라벨 분리
X = df.drop(columns=['stroke','gender','ever_married']).values
Y = df['stroke'].values
print(X)

[[9046 67.0 0 ... 228.69 36.6 'formerly smoked']
 [31112 80.0 0 ... 105.92 32.5 'never smoked']
 [60182 49.0 0 ... 171.23 34.4 'smokes']
 ...
 [19723 35.0 0 ... 82.99 30.6 'never smoked']
 [37544 51.0 0 ... 166.29 25.6 'formerly smoked']
 [44679 44.0 0 ... 85.28 26.2 'Unknown']]


In [44]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

encoder = LabelEncoder()
df['ever_married'] = encoder.fit_transform(X)
# 정규화
scaler = StandardScaler()
X = scaler.fit_transform(X)


ValueError: y should be a 1d array, got an array of shape (4909, 9) instead.

In [None]:
from sklearn.model_selection import train_test_split

# 훈련 셋과 테스트 셋 분할
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42)
print(X_train.shape)

(3927, 11)


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# TensorDataset으로 래핑
X_train_tensor = torch.tensor(X_train, dtype=torch.float32) 
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1,1) # 2차원 형태로 변경 -> (n,1), 위에서 y_train.shape이 (n,)의 형태
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1,1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
import torch.nn as nn
import torch.optim as optim

# 분류 모델 정의
class ClassificationModel(nn.Module):
    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.fc1 = nn.Linear(11,64)
        self.fc2 = nn.Linear(64,32)
        self.fc3 = nn.Linear(32,1)
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# GPU 사용, 없으면 cpu 사용
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ClassificationModel().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# 학습 루프
model.train()
for epoch in range(50):
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

NameError: name 'train_loader' is not defined

In [None]:
from sklearn.metrics import  accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import torch

# 평가
model.eval()

all_preds = []
all_probs = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # 예측
        outputs = model(X_batch)
        probs = torch.sigmoid(outputs)  # 확률로 변환
        preds = (probs > 0.5).float()   # 0.5 기준으로 이진 분류

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())  # ROC AUC용
        all_labels.extend(y_batch.cpu().numpy())


# 평가 지표 출력
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
conf_matrix = confusion_matrix(all_labels, all_preds)
roc_auc = roc_auc_score(all_labels, all_probs)

print(f"Accuracy     : {accuracy:.4f}")
print(f"Precision    : {precision:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")