In [1]:
import pandas as pd
import numpy as np

#### 기획 
각종 질병 및 생활습관에 따른 심장마비 위험도 예측 

### 심장마비 환자 예측 
#### 오차행렬

- 양성(Positive): 심장마비 위험 환자를 의미.
- 음성(Negative): 정상 환자를 의미.


> True Positive (TP): 모델이 심장마비 위험 환자를 올바르게 분류 한 경우
>  
> False Positive (FP): 모델이 정상 환자를 심장마비 위험 환자로 잘못 분류한 경우(오진, error I)
>  
> True Negative (TN): 모델이 정상 환자를 정상으로 올바르게 분류한 경우 
> 
> False Negative (FN): 모델이 심장마비 위험 환자를 정상으로 잘못 분류한 경우 (놓침, error II)  

> 정밀도 (Precision): 실제 심장마비 위험 환자 중에서 모델이 심장마비 위험군으로 정확하게 예측한 환자의 비율을 나타낸다. 즉 모델이 얼마나 많은 심장마비 위험 환자를 "놓치지 않고" 감지 했는지 측정한다.
> 
> 
> 🏆임계치를 낮춰서 재현율을 높여주는 것이 적합하다.


1) id: unique identifier
2) gender: "Male", "Female" or "Other"
3) age: age of the patient
4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6) ever_married: "No" or "Yes"
7) work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"
8) Residence_type: "Rural" or "Urban"
9) avg_glucose_level: average glucose level in blood
10) bmi: body mass index
11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*
12) stroke: 1 if the patient had a stroke or 0 if not

In [28]:
heart_df = pd.read_csv('./datasets/healthcare-dataset-stroke-data.csv')
heart_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [29]:
heart_df.duplicated().sum() # 중복된 행도 없음 

0

In [30]:
heart_df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [36]:
# heart_df['bmi'].describe().T
mean = heart_df['bmi'].mean
print(mean)

<bound method Series.mean of 0       36.6
1        NaN
2       32.5
3       34.4
4       24.0
        ... 
5105     NaN
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 5110, dtype: float64>


In [37]:
heart_df['bmi'].fillna(mean)

0                                                    36.6
1       <bound method Series.mean of 0       36.6\n1  ...
2                                                    32.5
3                                                    34.4
4                                                    24.0
                              ...                        
5105    <bound method Series.mean of 0       36.6\n1  ...
5106                                                 40.0
5107                                                 30.6
5108                                                 25.6
5109                                                 26.2
Name: bmi, Length: 5110, dtype: object

In [38]:
heart_df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [39]:
heart_df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64