In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('./ml_source_files/heart_failure_clinical_records_dataset.csv')
df.head(3)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1


In [5]:
df.isna().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [36]:
y = df.DEATH_EVENT
x = df.drop('DEATH_EVENT', axis=1)

In [37]:
x.head(2)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6


In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [38]:
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [39]:
lr = LogisticRegression()
rid = Ridge()
las = Lasso()
tree = DecisionTreeClassifier()
rfc = RandomForestClassifier()
svc = SVC()
lgb = LGBMClassifier()
xx = XGBClassifier()

In [43]:
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [44]:
from sklearn.metrics import roc_auc_score

In [42]:
models = [lr, tree, rfc, svc, lgb, xx]

In [46]:
for m in models:
    m.fit(x_train, y_train)
    pred = m.predict(x_test)
#     pro = m.predict_proba(x_test)[:,1]
    print(m.__class__.__name__)
    print(classification_report(y_test, pred))
#     print(roc_auc_score(y_test, pro))
    print(m.score(x_test, y_test))
    print('='*100)

LogisticRegression
              precision    recall  f1-score   support

           0       0.76      0.90      0.82        39
           1       0.71      0.48      0.57        21

    accuracy                           0.75        60
   macro avg       0.74      0.69      0.70        60
weighted avg       0.74      0.75      0.74        60

0.75
DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        39
           1       0.69      0.52      0.59        21

    accuracy                           0.75        60
   macro avg       0.73      0.70      0.71        60
weighted avg       0.74      0.75      0.74        60

0.75
RandomForestClassifier
              precision    recall  f1-score   support

           0       0.80      0.95      0.87        39
           1       0.86      0.57      0.69        21

    accuracy                           0.82        60
   macro avg       0.83      0.76      0.78        60

In [47]:
models = [lr, rfc, lgb, xx]

In [48]:
for m in models:
    m.fit(x_train, y_train)
    pred = m.predict(x_test)
    pro = m.predict_proba(x_test)[:,1]
    print(m.__class__.__name__)
    print(classification_report(y_test, pred))
    print(roc_auc_score(y_test, pro))
    print(m.score(x_test, y_test))
    print('='*100)

LogisticRegression
              precision    recall  f1-score   support

           0       0.76      0.90      0.82        39
           1       0.71      0.48      0.57        21

    accuracy                           0.75        60
   macro avg       0.74      0.69      0.70        60
weighted avg       0.74      0.75      0.74        60

0.7716727716727717
0.75
RandomForestClassifier
              precision    recall  f1-score   support

           0       0.80      0.92      0.86        39
           1       0.80      0.57      0.67        21

    accuracy                           0.80        60
   macro avg       0.80      0.75      0.76        60
weighted avg       0.80      0.80      0.79        60

0.905982905982906
0.8
LGBMClassifier
              precision    recall  f1-score   support

           0       0.80      0.90      0.84        39
           1       0.75      0.57      0.65        21

    accuracy                           0.78        60
   macro avg       0.77  

In [25]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [26]:
std = StandardScaler()
mim = MinMaxScaler()

In [49]:
std.fit(x_train)
x_train_std = std.transform(x_train)
x_test_std = std.transform(x_test)

In [51]:
models = [lr, rfc, lgb, xx]

for m in models:
    m.fit(x_train_std, y_train)
    pred = m.predict(x_test_std)
    pro = m.predict_proba(x_test_std)[:,1]
    print(m.__class__.__name__)
    print(classification_report(y_test, pred))
    print(roc_auc_score(y_test, pro))
    print(m.score(x_test_std, y_test))
    print('='*100)

LogisticRegression
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        39
           1       0.69      0.52      0.59        21

    accuracy                           0.75        60
   macro avg       0.73      0.70      0.71        60
weighted avg       0.74      0.75      0.74        60

0.7997557997557998
0.75
RandomForestClassifier
              precision    recall  f1-score   support

           0       0.77      0.92      0.84        39
           1       0.77      0.48      0.59        21

    accuracy                           0.77        60
   macro avg       0.77      0.70      0.71        60
weighted avg       0.77      0.77      0.75        60

0.9041514041514042
0.7666666666666667
LGBMClassifier
              precision    recall  f1-score   support

           0       0.80      0.92      0.86        39
           1       0.80      0.57      0.67        21

    accuracy                           0.80        60
   macro 

In [24]:
rid.fit(x_train, y_train)
las.fit(x_train, y_train)

rid_lr = rid.predict(x_test)
las_lr = las.predict(x_test)

print('rig_rmse : ', rmse(y_test, rid_lr), 'las_lr : ', rmse(y_test, las_lr))

NameError: name 'rmse' is not defined

In [52]:
from scipy.stats import skew

In [53]:
df.apply(lambda x: skew(x))

age                         0.420937
anaemia                     0.276863
creatinine_phosphokinase    4.440689
diabetes                    0.332251
ejection_fraction           0.552593
high_blood_pressure         0.623583
platelets                   1.454975
serum_creatinine            4.433610
serum_sodium               -1.042870
sex                        -0.623583
smoking                     0.766479
time                        0.127161
DEATH_EVENT                 0.766479
dtype: float64

In [None]:
# ?????????????????????모르겠다