# Полносвязная нейронная сеть для задачи классификации сердечно-сосудистых заболеваний

Описание данных см. в лабораторной работе №1.

In [1]:
import numpy as np              # Массивы (матрицы, векторы, линейная алгебра)
import matplotlib.pyplot as plt # Научная графика
%matplotlib inline 
    # Говорим jupyter'у, чтобы весь графический вывод был в браузере, а не в отдельном окне
import pandas as pd             # Таблицы и временные ряды (dataframe, series)
import seaborn as sns           # Еще больше красивой графики для визуализации данных
import sklearn                  # Алгоритмы машинного обучения

In [2]:
# Load the pre-filtered cardio dataset; the trailing expression makes the
# notebook display its (rows, columns) shape.
data_raw = pd.read_csv(filepath_or_buffer='./Datasets/cardio_filtered.csv')
data_raw.shape

(69791, 12)

In [3]:

# Preview the first rows of the raw table.
data_raw.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,female,168,62.0,110,80,1,1,0,0,1,0
1,20228,male,156,85.0,140,90,3,1,0,0,1,1
2,18857,male,165,64.0,130,70,3,1,0,0,0,1
3,17623,female,169,82.0,150,100,1,1,0,0,1,1
4,17474,male,156,56.0,100,60,1,1,0,0,0,0


Преобразуем признак gender в бинарную переменную (male → 0, female → 1)

In [4]:
# Encode the string-valued gender column as integers: male -> 0, female -> 1.
# An explicit mapping followed by an integer cast avoids relying on the
# implicit object -> int downcast performed by `replace` (deprecated in recent
# pandas) and fails loudly if an unexpected label is present.
gender_codes = {'male': 0, 'female': 1}
# Skip the conversion when the cell is re-run on an already encoded column.
if not pd.api.types.is_numeric_dtype(data_raw['gender']):
    data_raw['gender'] = data_raw['gender'].map(gender_codes).astype('int64')

Нормализуем данные (min-max масштабирование каждого столбца в диапазон [0, 1])

In [5]:
# Min-max scale every column: (x - min) / (max - min).
# NOTE(review): the statistics are taken over the full table, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# scaling — confirm this is acceptable for the reported scores.
col_min = data_raw.min()
col_range = data_raw.max() - col_min
data_norm = (data_raw - col_min) / col_range
data_norm

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0.588076,1.0,0.579487,0.273684,0.277778,0.388889,0.0,0.0,0.0,0.0,1.0,0.0
1,0.730159,0.0,0.517949,0.394737,0.444444,0.444444,1.0,0.0,0.0,0.0,1.0,1.0
2,0.624003,0.0,0.564103,0.284211,0.388889,0.333333,1.0,0.0,0.0,0.0,0.0,1.0
3,0.528455,1.0,0.584615,0.378947,0.500000,0.500000,0.0,0.0,0.0,0.0,1.0,1.0
4,0.516918,0.0,0.517949,0.242105,0.222222,0.277778,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
69786,0.653659,1.0,0.579487,0.347368,0.333333,0.388889,0.0,0.0,1.0,0.0,1.0,0.0
69787,0.913899,0.0,0.528205,0.610526,0.444444,0.444444,0.5,0.5,0.0,0.0,1.0,1.0
69788,0.640186,1.0,0.656410,0.500000,0.666667,0.444444,1.0,0.0,0.0,1.0,0.0,1.0
69789,0.900736,0.0,0.553846,0.326316,0.416667,0.388889,0.0,0.5,0.0,0.0,0.0,1.0


Уменьшим размерность с помощью PCA. Опытным путём было получено оптимальное число главных компонент — 9.

In [6]:
# Separate the target column from the feature matrix.
target_column = 'cardio'
Y = data_norm[target_column]
X = data_norm.drop(columns=[target_column])

In [7]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
n = 9
# Fit PCA on the feature matrix and project the same rows onto n components.
PCA_res = PCA(n_components=n).fit(X).transform(X)
# Assemble components 0..n-1 plus the target ('res') in one vectorized step
# instead of building a Python list for every row.
PCA_DF = pd.DataFrame(
    np.column_stack([PCA_res, Y.to_numpy()]),
    columns=[*range(n), 'res'],
)

In [8]:
# Display the table of PCA components together with the 'res' target column.
PCA_DF

Unnamed: 0,0,1,2,3,4,5,6,7,8,res
0,0.601911,-0.152266,-0.224084,-0.290362,0.004846,0.022913,0.081739,-0.096533,-0.039899,0.0
1,-0.394281,-0.262888,0.576150,0.002540,-0.549257,-0.040111,0.052132,0.049417,0.024835,1.0
2,-0.419367,0.732828,0.641861,0.048522,-0.544988,-0.035211,0.176690,-0.065259,-0.044315,1.0
3,0.608881,-0.153059,-0.212097,-0.288447,-0.002257,0.027439,0.107059,0.177859,-0.027597,1.0
4,-0.374612,0.811849,-0.194171,0.060496,0.024233,-0.006599,0.158160,-0.166946,-0.028641,0.0
...,...,...,...,...,...,...,...,...,...,...
69786,0.885171,-0.178212,-0.166527,0.521693,0.050008,-0.473409,-0.035026,-0.035255,0.011758,0.0
69787,-0.382600,-0.242303,0.473698,-0.042436,0.121231,-0.021466,-0.211788,0.128817,0.226560,1.0
69788,0.663273,0.740324,0.753430,0.216389,-0.581921,0.870164,0.083504,0.242049,0.035623,1.0
69789,-0.389458,0.791842,0.138884,-0.000048,0.391385,-0.006591,-0.253832,0.018880,-0.016697,1.0


Натренируем полносвязную нейронную сеть

Разбиваем данные на обучающую и тестовую выборки

In [9]:
from sklearn.model_selection import train_test_split

# Features are the PCA components; the label is the 'res' column.
Y = PCA_DF['res']
X = PCA_DF.drop(columns=['res'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=7
)
# Number of test rows (the column count is discarded).
N_test, _ = X_test.shape

Обучаем классификатор

In [12]:
from sklearn.neural_network import MLPClassifier

# Three hidden layers (100, 50, 10 units) with ReLU activations, trained with
# SGD; alpha is the L2 regularization strength.
mlp_settings = {
    'hidden_layer_sizes': (100, 50, 10),
    'solver': 'sgd',
    'activation': 'relu',
    'random_state': 42,
    'alpha': 0.01,
}
mlp_model = MLPClassifier(**mlp_settings)
mlp_model.fit(X_train, Y_train)

MLPClassifier(alpha=0.01, hidden_layer_sizes=(100, 50, 10), random_state=42,
              solver='sgd')

In [13]:
# Predicted class labels for the training and the test split.
y_train_pred, y_test_pred = (
    mlp_model.predict(features) for features in (X_train, X_test)
)

Доля ошибок классификации (1 − accuracy) на обучающей и тестовой выборках

In [14]:
# Misclassification rate (share of wrong predictions) on train and test.
train_error = np.mean(Y_train != y_train_pred)
test_error = np.mean(Y_test != y_test_pred)
print(train_error, test_error)

0.2632434397656066 0.26610802361931224


Train score, test score

In [15]:
# Accuracy of the fitted model on each split (complement of the error rate).
train_accuracy = mlp_model.score(X_train, Y_train)
test_accuracy = mlp_model.score(X_test, Y_test)
print(train_accuracy, test_accuracy)

0.7367565602343934 0.7338919763806877


Попытаемся подобрать лучшее значение коэффициента L2-регуляризации alpha (это параметр регуляризации, а не скорость обучения)

In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate alpha values: 21 points log-spaced between 1e-3 and 1e2.
params = {'alpha': np.logspace(-3, 2, 21)}
# Same architecture as the model trained earlier, with alpha left to the search.
base_mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50, 10),
    solver='sgd',
    activation='relu',
    random_state=42,
)
# Grid search over alpha; n_jobs=-1 runs the fits in parallel.
clf = GridSearchCV(base_mlp, params, n_jobs=-1)

clf.fit(X_train, Y_train)

In [22]:
# Score of the grid-search result on the training and the test split.
tr_score = clf.score(X_train, Y_train)
test_scr = clf.score(X_test, Y_test)
tr_score, test_scr

(0.737034581577878, 0.7324157693643626)

In [23]:
# Report the parameter combination selected by the grid search.
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'alpha': 0.1}


Метрики качества:

In [24]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned model on the held-out test set.
y_true = Y_test
y_pred = clf.predict(X_test)
print('Results on the test set:')
print(classification_report(y_true, y_pred))

Results on the test set:
              precision    recall  f1-score   support

         0.0       0.72      0.76      0.74     11596
         1.0       0.75      0.70      0.72     11436

    accuracy                           0.73     23032
   macro avg       0.73      0.73      0.73     23032
weighted avg       0.73      0.73      0.73     23032



# Вывод:

 нейронная сеть не показала каких-либо улучшений в сравнении с классическими методами машинного обучения. Ни одна из про