# Проверка знаний

Для выполнения практических заданий нам понадобится новый датасет.

Датасет болезней сердца содержит информацию о пациентах и переменную предсказания target — наличие у пациента болезни сердца.

Датасет содержит следующие признаки:

* ``age`` — возраст
* ``sex`` — пол (1 - мужчина, 0 - женщина)
* ``cp`` — тип боли в груди (4 значения)
* ``trestbps`` — артериальное давление в покое
* ``chol`` — холестерин сыворотки в мг/дл
* ``fbs`` — уровень сахара в крови натощак > 120 мг/дл
* ``restecg`` — результаты электрокардиографии в покое (значения 0,1,2)
* ``thalach`` — максимальная достигнутая частота сердечных сокращений
* ``exang`` — стенокардия, вызванная физической нагрузкой
* ``oldpeak`` — депрессия ST, вызванная физической нагрузкой, по сравнению с состоянием покоя
* ``slope`` — наклон пикового сегмента ST при нагрузке
* ``ca`` — количество крупных сосудов (0-3), окрашенных при флюороскопии
* ``thal`` — дефект, где 3 = нормальный; 6 = фиксированный дефект; 7 = обратимый дефект

In [163]:
import pandas as pd 
import plotly.express as px 
import category_encoders as ce
from sklearn import preprocessing
import plotly.figure_factory as ff

In [164]:
# Load the heart-disease dataset from the local CSV file into a DataFrame.
heart_data = pd.read_csv('data/heart.csv')
# Bare expression: the notebook renders the frame as the cell output.
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [165]:
# Print the column list with non-null counts and dtypes.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [166]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
heart_data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


### Задание 8.1

Создайте новый бинарный признак old, равный 1, если возраст пациента больше 60 лет, и 0 в противном случае.

In [167]:
# Binary flag: 1 when the patient is older than 60, otherwise 0.
# A vectorized comparison replaces the per-element lambda; the explicit
# int64 cast keeps the same integer dtype the original column had.
heart_data['old'] = (heart_data['age'] > 60).astype('int64')
# Number of patients older than 60 (rendered as the cell output).
heart_data['old'].sum()

79

### Задание 8.2

Создайте новый признак trestbps_mean, который будет обозначать среднюю норму давления для возраста и пола пациента. trestbps — систолическое артериальное давление в состоянии покоя.

Информацию о среднем давлении для возраста и пола возьмите из этой таблицы. В таблице систолическое давление написано первым, перед дробной чертой.

In [168]:
def get_age_index(age):
    """Return the index of the age bracket that ``age`` falls into.

    Brackets (upper bound inclusive): 0 -> up to 20, 1 -> 21-30, 2 -> 31-40,
    3 -> 41-50, 4 -> 51-60, 5 -> 61 and older.

    Fractional ages are assigned to the first bracket whose upper bound they
    do not exceed, so 20.5 maps to bracket 1 instead of matching no bracket.

    Args:
        age: Patient age in years (int or float).

    Returns:
        int: Bracket index in the range 0-5.

    Raises:
        ValueError: If ``age`` is NaN.
    """
    # NaN is the only value not equal to itself. It fails every comparison
    # below and would otherwise be silently placed in the oldest bracket.
    if age != age:
        raise ValueError('age must not be NaN')

    upper_bounds = (20, 30, 40, 50, 60)
    for age_index, upper_bound in enumerate(upper_bounds):
        if age <= upper_bound:
            return age_index
    # Older than every listed bound: the open-ended last bracket.
    return len(upper_bounds)

def get_trestbps(age, sex):
    """Look up the reference systolic pressure for a given age and sex.

    Args:
        age: Patient age in years; converted to a bracket index with
            ``get_age_index``.
        sex: 1 for male, 0 for female (the keys of the lookup table).

    Returns:
        int: Reference systolic pressure for the age bracket and sex.

    Raises:
        KeyError: If ``sex`` is not one of the table keys.
    """
    # One value per age bracket, ordered from the youngest to the oldest.
    norms_by_sex = {
        1: (123, 126, 129, 135, 142, 142),
        0: (116, 120, 127, 137, 144, 159),
    }
    bracket = get_age_index(age)
    norms_for_sex = norms_by_sex[sex]
    return norms_for_sex[bracket]

# Row by row, attach the reference systolic pressure for the patient's
# age and sex as a new column.
heart_data['trestbps_mean'] = heart_data.apply(
    lambda row: get_trestbps(row['age'], row['sex']),
    axis='columns',
)
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,old,trestbps_mean
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,1,142
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,0,129
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,0,137
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,0,142
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,0,144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,0,144
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,0,135
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,1,142
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,0,142


### Задание 8.5

Закодируйте вышеперечисленные признаки (cp, restecg, slope, ca, thal) методом OneHotEncoding из библиотеки Category Encoders, удалив исходные признаки. Параметр use_cat_names оставьте по умолчанию.

Сколько признаков получилось в датасете?

In [169]:
# One-hot encode the categorical columns with the encoder's default settings.
encoder = ce.OneHotEncoder()
heart_data_encoder = encoder.fit_transform(
    heart_data[['cp', 'restecg', 'slope', 'ca', 'thal']].astype('category')
)
# Append the indicator columns, then remove the source columns they replace.
heart_data = pd.concat([heart_data, heart_data_encoder], axis=1).drop(
    columns=['cp', 'restecg', 'slope', 'ca', 'thal']
)
heart_data

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,old,...,slope_3,ca_1,ca_2,ca_3,ca_4,ca_5,thal_1,thal_2,thal_3,thal_4
0,63,1,145,233,1,150,0,2.3,1,1,...,0,1,0,0,0,0,1,0,0,0
1,37,1,130,250,0,187,0,3.5,1,0,...,0,1,0,0,0,0,0,1,0,0
2,41,0,130,204,0,172,0,1.4,1,0,...,0,1,0,0,0,0,0,1,0,0
3,56,1,120,236,0,178,0,0.8,1,0,...,0,1,0,0,0,0,0,1,0,0
4,57,0,120,354,0,163,1,0.6,1,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,123,1,0.2,0,0,...,1,1,0,0,0,0,0,0,1,0
299,45,1,110,264,0,132,0,1.2,0,0,...,1,1,0,0,0,0,0,0,1,0
300,68,1,144,193,1,141,0,3.4,0,1,...,1,0,1,0,0,0,0,0,1,0
301,57,1,130,131,0,115,1,1.2,0,0,...,1,0,0,1,0,0,0,0,1,0


### Задание 8.6

Нормализуйте все числовые признаки подходящим способом.

В ответе напишите стандартное отклонение признака chol. Ответ округлите до шести знаков после запятой.

In [170]:
# Continuous columns to rescale; the remaining columns are left untouched.
numeric_columns = ['age', 'oldpeak', 'trestbps', 'chol', 'thalach', 'trestbps_mean']
# Fit the robust scaler on these columns and overwrite them in place.
scaler = preprocessing.RobustScaler()
heart_data[numeric_columns] = scaler.fit_transform(heart_data[numeric_columns])
heart_data

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,old,...,slope_3,ca_1,ca_2,ca_3,ca_4,ca_5,thal_1,thal_2,thal_3,thal_4
0,0.592593,1,0.75,-0.110236,1,-0.092308,0,0.9375,1,1,...,0,1,0,0,0,0,1,0,0,0
1,-1.333333,1,0.00,0.157480,0,1.046154,0,1.6875,1,0,...,0,1,0,0,0,0,0,1,0,0
2,-1.037037,0,0.00,-0.566929,0,0.584615,0,0.3750,1,0,...,0,1,0,0,0,0,0,1,0,0
3,0.074074,1,-0.50,-0.062992,0,0.769231,0,0.0000,1,0,...,0,1,0,0,0,0,0,1,0,0
4,0.148148,0,-0.50,1.795276,0,0.307692,1,-0.1250,1,0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.148148,0,0.50,0.015748,0,-0.923077,1,-0.3750,0,0,...,1,1,0,0,0,0,0,0,1,0
299,-0.740741,1,-1.00,0.377953,0,-0.646154,0,0.2500,0,0,...,1,1,0,0,0,0,0,0,1,0
300,0.962963,1,0.70,-0.740157,1,-0.369231,0,1.6250,0,1,...,1,0,1,0,0,0,0,0,1,0
301,0.148148,1,0.00,-1.716535,0,-1.169231,1,0.2500,0,0,...,1,0,0,1,0,0,0,0,1,0


In [171]:
# Standard deviation of the scaled chol column, rounded to six decimals.
round(heart_data['chol'].std(), ndigits=6)

0.816232

In [172]:
# Columns whose distributions are compared after scaling.
heart_data_name = ['age', 'oldpeak', 'trestbps', 'chol', 'thalach', 'trestbps_mean']
heart_data_list = [heart_data[column] for column in heart_data_name]
# Keep the individual series available under their original names.
age_h, oldpeak_h, trestbps_h, chol_h, thalach_h, trestbps_mean_h = heart_data_list
# Density curves only: the histogram bars are switched off.
fig = ff.create_distplot(heart_data_list, heart_data_name, show_hist=False)
fig.show()

### Задание 8.7

Проведите корреляционный анализ и отберите признаки для будущей модели. Выберите пары сильно скоррелированных признаков.

In [173]:
# Pairwise correlation matrix of the columns, using DataFrame.corr() defaults.
heart_data_corr = heart_data.corr()
# Bare expression: the notebook renders the matrix as the cell output.
heart_data_corr

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,target,old,...,slope_3,ca_1,ca_2,ca_3,ca_4,ca_5,thal_1,thal_2,thal_3,thal_4
age,1.0,-0.098447,0.279351,0.213678,0.121308,-0.398522,0.096801,0.210013,-0.225439,0.718028,...,0.177201,-0.350844,0.226491,0.179284,0.162243,-0.122378,0.063758,-0.135891,0.110752,-0.016779
sex,-0.098447,1.0,-0.056769,-0.197912,0.045032,-0.04402,0.141664,0.096093,-0.280937,-0.144941,...,-0.009157,-0.122854,-0.020571,0.096673,0.066758,0.088212,0.141135,-0.376365,0.321559,-0.032093
trestbps,0.279351,-0.056769,1.0,0.123174,0.177531,-0.046698,0.067616,0.193216,-0.144931,0.180631,...,0.025207,-0.055011,0.088386,-0.059485,0.080109,0.019059,0.076369,-0.140712,0.109573,-0.01687
chol,0.213678,-0.197912,0.123174,1.0,0.013294,-0.00994,0.067023,0.053952,-0.085239,0.159388,...,0.051177,-0.0806,0.059887,0.018775,0.098568,-0.095785,-0.09356,-0.004591,0.059621,-0.057127
fbs,0.121308,0.045032,0.177531,0.013294,1.0,-0.008567,0.025665,0.005747,-0.028046,0.02679,...,-0.03336,-0.112548,0.12208,-0.014774,0.075867,0.018754,0.091351,-0.086774,0.030953,0.080568
thalach,-0.398522,-0.04402,-0.046698,-0.00994,-0.008567,1.0,-0.378812,-0.344187,0.421741,-0.271898,...,-0.419627,0.273136,-0.056905,-0.195502,-0.175506,0.060911,-0.159748,0.294801,-0.215417,-0.050429
exang,0.096801,0.141664,0.067616,0.067023,0.025665,-0.378812,1.0,0.288223,-0.436757,0.019043,...,0.257687,-0.187734,0.097399,0.150206,0.013188,-0.035001,0.063073,-0.328539,0.300223,0.030113
oldpeak,0.210013,0.096093,0.193216,0.053952,0.005747,-0.344187,0.288223,1.0,-0.430696,0.159997,...,0.310986,-0.204803,0.218319,-0.011613,0.189296,-0.104998,0.104635,-0.339086,0.302145,-0.037946
target,-0.225439,-0.280937,-0.144931,-0.085239,-0.028046,0.421741,-0.436757,-0.430696,1.0,-0.121056,...,-0.362053,0.46559,-0.273998,-0.232412,-0.210615,0.066441,-0.106589,0.527334,-0.486112,-0.007293
old,0.718028,-0.144941,0.180631,0.159388,0.02679,-0.271898,0.019043,0.159997,-0.121056,1.0,...,0.128139,-0.222607,0.160985,0.055906,0.17516,-0.076925,0.009761,-0.004237,0.007644,-0.048408


In [177]:
# Heatmap of the correlation matrix with two-decimal annotations in each cell.
fig = px.imshow(
    heart_data_corr,
    text_auto='.2f',
    aspect='auto',
    color_continuous_scale='RdBu_r',
    height=1000,
)
fig.show()