In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
heart_attack_file_path = 'data.csv'
ha_data = pd.read_csv(heart_attack_file_path)

In [3]:
ha_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [4]:
ha_data.shape

(294, 14)

## Очистка данных

In [5]:
ha_data = ha_data.replace("?", pd.NA)
for clmn in ha_data.columns:
    if ha_data[clmn].dtype == 'object':
        ha_data[clmn] = pd.to_numeric(ha_data[clmn])

In [6]:
ha_data.isna().sum()

age             0
sex             0
cp              0
trestbps        1
chol           23
fbs             8
restecg         1
thalach         1
exang           1
oldpeak         0
slope         190
ca            291
thal          266
num             0
dtype: int64

In [7]:
ha_data['trestbps'] = ha_data['trestbps'].fillna(ha_data['trestbps'].mean()).round(0)
ha_data['chol'] = ha_data['chol'].fillna(ha_data['chol'].mean()).round(0)
ha_data['fbs'] = ha_data['fbs'].fillna(1)
ha_data['restecg'] = ha_data['restecg'].fillna(0)
ha_data['thalach'] = ha_data['thalach'].fillna(ha_data['thalach'].mean()).round(0)
ha_data['exang'] = ha_data['exang'].fillna(0)
ha_data['slope'] = ha_data['slope'].fillna(2)
ha_data['thal'] = ha_data['thal'].fillna(ha_data['thal'].mean()).round(0)
ha_data = ha_data.drop(columns=['ca'])

ha_data.shape

(294, 13)

In [8]:
ha_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,thal,num
0,28,1,2,130.0,132.0,0.0,2.0,185.0,0.0,0.0,2.0,6.0,0
1,29,1,2,120.0,243.0,0.0,0.0,160.0,0.0,0.0,2.0,6.0,0
2,29,1,2,140.0,251.0,0.0,0.0,170.0,0.0,0.0,2.0,6.0,0
3,30,0,1,170.0,237.0,0.0,1.0,170.0,0.0,0.0,2.0,6.0,0
4,31,0,2,100.0,219.0,0.0,1.0,150.0,0.0,0.0,2.0,6.0,0


## Тренировка и предсказание

In [9]:
y = ha_data["num       "]
X = ha_data.drop("num       " ,axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

In [12]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7865168539325843
Confusion matrix:
 [[48 13]
 [ 6 22]]
Classification report:
               precision    recall  f1-score   support

           0       0.89      0.79      0.83        61
           1       0.63      0.79      0.70        28

    accuracy                           0.79        89
   macro avg       0.76      0.79      0.77        89
weighted avg       0.81      0.79      0.79        89



## Предсказание на 10 случайных разбиениях

In [13]:
ss = ShuffleSplit(n_splits=10, train_size=0.7, random_state=42)
accuracies = []
for train_index, test_index in ss.split(y):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_test)
    
    accuracies.append(accuracy_score(y_test, y_pred))

for i, accuracy in enumerate(accuracies):
    print("{:4} | {}".format(i, accuracy))
print("mean |", np.mean(accuracies))

   0 | 0.7865168539325843
   1 | 0.7752808988764045
   2 | 0.7640449438202247
   3 | 0.797752808988764
   4 | 0.7415730337078652
   5 | 0.7303370786516854
   6 | 0.7865168539325843
   7 | 0.797752808988764
   8 | 0.7191011235955056
   9 | 0.6853932584269663
mean | 0.7584269662921349


## Эксперимент на других данных

In [14]:
heart_attack_file_path = 'heart.csv'
ha_data = pd.read_csv(heart_attack_file_path)
ha_data.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [15]:
y = ha_data["target"]
X = ha_data.drop("target" ,axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [16]:
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

In [17]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9707792207792207
Confusion matrix:
 [[159   0]
 [  9 140]]
Classification report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       159
           1       1.00      0.94      0.97       149

    accuracy                           0.97       308
   macro avg       0.97      0.97      0.97       308
weighted avg       0.97      0.97      0.97       308



In [18]:
accuracies = []
for train_index, test_index in ss.split(y):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_test)
    
    accuracies.append(accuracy_score(y_test, y_pred))

for i, accuracy in enumerate(accuracies):
    print("{:4} | {}".format(i, accuracy))
print("mean |", np.mean(accuracies))

   0 | 0.9707792207792207
   1 | 0.9707792207792207
   2 | 0.948051948051948
   3 | 0.987012987012987
   4 | 1.0
   5 | 0.9805194805194806
   6 | 0.9805194805194806
   7 | 0.987012987012987
   8 | 0.9902597402597403
   9 | 0.9805194805194806
mean | 0.9795454545454547
