# ИКМ

##### Датасет
[heart.xls](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

### Описание задачи

Предсказать заболевание сердца по медицинским показателям

Целевая переменная `HeartDisease`
- **0** - отсутствие заболевания
- **1** - наличие заболевания

### Загрузка и анализ данных

In [159]:
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score

data = pd.read_csv('heart.xls')
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [160]:
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


#### Заполнение пропусков

In [161]:
data.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [162]:
for column in ['Cholesterol', 'RestingBP']:
    data[column].replace(0, data[column].median(), inplace=True) # Замена нулей, будут только мешать

In [163]:
cat_cols = data.select_dtypes(include=['object']).columns
cat_cols

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')

In [164]:
data = pd.get_dummies(data, drop_first=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   RestingBP          918 non-null    int64  
 2   Cholesterol        918 non-null    int64  
 3   FastingBS          918 non-null    int64  
 4   MaxHR              918 non-null    int64  
 5   Oldpeak            918 non-null    float64
 6   HeartDisease       918 non-null    int64  
 7   Sex_M              918 non-null    bool   
 8   ChestPainType_ATA  918 non-null    bool   
 9   ChestPainType_NAP  918 non-null    bool   
 10  ChestPainType_TA   918 non-null    bool   
 11  RestingECG_Normal  918 non-null    bool   
 12  RestingECG_ST      918 non-null    bool   
 13  ExerciseAngina_Y   918 non-null    bool   
 14  ST_Slope_Flat      918 non-null    bool   
 15  ST_Slope_Up        918 non-null    bool   
dtypes: bool(9), float64(1), in

### Разбиение данных

In [165]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = data.drop('HeartDisease', axis=1)
y = data['HeartDisease']

seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=seed)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Параметры
- Для классических моделей используется стандартная функция потерь, соответствующая задаче (логистическая регрессия — логарифмическая функция потерь, ансамбли — критерии разделения).
- Для нейронной сети выбрана функция потерь BCE (Binary Cross Entropy), так как задача — бинарная классификация.
- Для всех моделей использовано масштабирование признаков.
- Для оценки качества использованы метрики classification_report и ROC-AUC, которые позволяют оценить точность, полноту, F1-меру и качество ранжирования.
## **Итог**
- Все выбранные подходы показали высокое качество на тестовой выборке. Классические модели обеспечивают интерпретируемость, а нейронная сеть — гибкость и возможность выявления сложных паттернов. Выбор моделей и параметров обоснован спецификой задачи и структурой данных.

### Классические модели

In [166]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, random_state=seed)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

print(classification_report(y_test, lr_pred))
roc_auc_score(y_test, lr_pred)

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       112
           1       0.92      0.84      0.88       164

    accuracy                           0.86       276
   macro avg       0.86      0.87      0.86       276
weighted avg       0.87      0.86      0.86       276



0.8671602787456446

### Ансамбли

In [186]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    random_state=seed
)

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(classification_report(y_test, rf_pred))
roc_auc_score(y_test, rf_pred)

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       112
           1       0.90      0.88      0.89       164

    accuracy                           0.87       276
   macro avg       0.86      0.87      0.87       276
weighted avg       0.87      0.87      0.87       276



0.8661803135888502

In [182]:
from sklearn.ensemble import GradientBoostingClassifier

gbr = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=.1,
    max_depth=3,
    random_state=seed
)

gbr.fit(X_train, y_train)
gbr_pred = gbr.predict(X_test)
print(classification_report(y_test, gbr_pred))
roc_auc_score(y_test, gbr_pred)

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       112
           1       0.89      0.85      0.87       164

    accuracy                           0.85       276
   macro avg       0.85      0.85      0.85       276
weighted avg       0.85      0.85      0.85       276



0.8509364111498258

### Нейронная сеть

In [169]:

from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [170]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1).to(device)

In [171]:
train = TensorDataset(X_train_tensor, y_train_tensor)
test = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train, batch_size=32, shuffle=True)
test_loader = DataLoader(test, batch_size=32, shuffle=False)

In [172]:
X_train_tensor.device

device(type='cuda', index=0)

In [173]:
class HeartNet(nn.Module):
    def __init__(self, shape):
        super(HeartNet, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(shape, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


In [236]:
model = HeartNet(X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [237]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 0.0928
Epoch [20/100], Loss: 0.0108
Epoch [30/100], Loss: 0.1462
Epoch [40/100], Loss: 0.1160
Epoch [50/100], Loss: 0.6620
Epoch [60/100], Loss: 0.0660
Epoch [70/100], Loss: 0.0232
Epoch [80/100], Loss: 0.0784
Epoch [90/100], Loss: 0.2504
Epoch [100/100], Loss: 0.0347


In [238]:
model.eval()
with torch.no_grad():
    y_pred_prob = model(X_test_tensor)
    y_pred = (y_pred_prob >= 0.5).float()

print(classification_report(y_test_tensor.cpu(), y_pred.cpu()))
roc_auc_score(y_test_tensor.cpu(), y_pred.cpu())


              precision    recall  f1-score   support

         0.0       0.85      0.89      0.87       112
         1.0       0.92      0.89      0.91       164

    accuracy                           0.89       276
   macro avg       0.89      0.89      0.89       276
weighted avg       0.89      0.89      0.89       276



0.8915505226480835