In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score

In [2]:
# Load the heart-disease sample and take a first look at its size, schema and rows.
df = pd.read_csv("heart_disease-mid.csv")
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
df.info()
display(df.head())

(2000, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   HeartDisease     2000 non-null   int64  
 1   BMI              2000 non-null   float64
 2   Smoking          2000 non-null   int64  
 3   AlcoholDrinking  2000 non-null   int64  
 4   Stroke           2000 non-null   int64  
 5   M/F              2000 non-null   object 
 6   AgeCategory      2000 non-null   int64  
 7   GenHealth        2000 non-null   int64  
 8   SleepTime        2000 non-null   float64
dtypes: float64(2), int64(6), object(1)
memory usage: 140.8+ KB
None


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,M/F,AgeCategory,GenHealth,SleepTime
0,0,24.37,0,0,0,M,0,4,7.0
1,0,41.2,0,0,0,F,1,3,7.0
2,0,19.8,0,0,0,M,10,4,6.0
3,0,24.33,1,0,0,M,8,2,8.0
4,0,19.74,0,0,0,F,0,4,8.0


### 説明変数と目的変数を分ける

In [6]:
# Target: the HeartDisease column.
y = df.loc[:, "HeartDisease"]
# Features: label-based slice from "BMI" through the last column of df.
X = df.loc[:, "BMI":]

# Quick sanity check of both pieces: shape first, then a preview.
print("x: ", X.shape)
display(X.head())
print("y: ", y.shape)
print(y.head())

x:  (2000, 8)


Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,M/F,AgeCategory,GenHealth,SleepTime
0,24.37,0,0,0,M,0,4,7.0
1,41.2,0,0,0,F,1,3,7.0
2,19.8,0,0,0,M,10,4,6.0
3,24.33,1,0,0,M,8,2,8.0
4,19.74,0,0,0,F,0,4,8.0


y:  (2000,)
0    0
1    0
2    0
3    0
4    0
Name: HeartDisease, dtype: int64


In [7]:
# Class balance check: number of rows per HeartDisease label.
class_counts = y.value_counts()
print(class_counts)

HeartDisease
0    1000
1    1000
Name: count, dtype: int64


### ダミー変数化

In [9]:
# Dummy-encode the feature frame. drop_first=True drops the first level of each
# encoded column, and the indicators are stored as uint8.
# NOTE(review): in the preview below only "M/F" is replaced (by "M/F_M");
# the numeric columns appear to pass through unchanged.
X_dumm = pd.get_dummies(data=X, dtype="uint8", drop_first=True)

print("X_dumm: ", X_dumm.shape)
display(X_dumm.head())

X_dumm:  (2000, 8)


Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,AgeCategory,GenHealth,SleepTime,M/F_M
0,24.37,0,0,0,0,4,7.0,1
1,41.2,0,0,0,1,3,7.0,0
2,19.8,0,0,0,10,4,6.0,1
3,24.33,1,0,0,8,2,8.0,1
4,19.74,0,0,0,0,4,8.0,0


### 標準化なしでロジスティック回帰分析

In [11]:
# Logistic regression on the raw (unstandardized) features.
# add_constant adds the "const" column that serves as the intercept term.
X_dumm_c = sm.add_constant(X_dumm)

model = sm.Logit(endog=y, exog=X_dumm_c)
results = model.fit()

# Full coefficient table plus fit statistics.
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.488728
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:                 2000
Model:                          Logit   Df Residuals:                     1991
Method:                           MLE   Df Model:                            8
Date:                Fri, 07 Jun 2024   Pseudo R-squ.:                  0.2949
Time:                        16:54:11   Log-Likelihood:                -977.46
converged:                       True   LL-Null:                       -1386.3
Covariance Type:            nonrobust   LLR p-value:                3.189e-171
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -2.7243      0.449     -6.072      0.000      -3.604      -1.845
BMI           

### 命中率

In [12]:
# Predicted values from the fitted model for the same design matrix it was fitted on.
y_pred = results.predict(X_dumm_c)
print("Predicted y: ")
print(y_pred.head())

# Convert to hard 0/1 labels using a 0.5 cut-off (values >= 0.5 become 1).
y_pred_bin = y_pred.ge(0.5).astype("int")
print(y_pred_bin.head())

Predicted y: 
0    0.016228
1    0.041271
2    0.263944
3    0.616141
4    0.007364
dtype: float64
0    0
1    0
2    0
3    1
4    0
dtype: int32


In [21]:
# Share of rows whose thresholded prediction matches the observed label.
# NOTE(review): y_pred_bin was predicted on the same rows the model was fitted
# on, so this is an in-sample figure, not a held-out estimate.
acc = accuracy_score(y_true=y, y_pred=y_pred_bin)
print("命中率: ", acc)


命中率:  0.768


### 回帰係数からオッズ比を求める

In [14]:
# Exponentiate the fitted coefficients of the unstandardized model.
# The "const" entry is exp(intercept), not a ratio tied to any feature.
print("Odds ratio")
odds_ratios = np.exp(results.params)
print(odds_ratios)

Odds ratio
const              0.065594
BMI                1.035041
Smoking            1.938841
AlcoholDrinking    0.559827
Stroke             2.784954
AgeCategory        1.379735
GenHealth          0.506740
SleepTime          0.982617
M/F_M              1.862844
dtype: float64


### 全説明変数を標準化してロジスティック回帰分析

In [15]:
# Logistic regression on standardized features, so coefficient magnitudes can
# be compared across variables measured on different scales.
# scale() returns a plain ndarray, so the column labels AND the row index are
# restored explicitly. Without index=..., the new frame gets a fresh
# RangeIndex and only lines up with y when the data happens to carry a default
# index (it would break after any row filtering / re-indexing upstream).
X_scaled_ar = scale(X_dumm)
X_scaled = pd.DataFrame(X_scaled_ar, columns=X_dumm.columns, index=X_dumm.index)
# The constant column is added after scaling, so it is not standardized itself.
X_scaled_c = sm.add_constant(X_scaled)

model = sm.Logit(y, X_scaled_c)
results_scaled = model.fit()
print(results_scaled.summary())

Optimization terminated successfully.
         Current function value: 0.488728
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:                 2000
Model:                          Logit   Df Residuals:                     1991
Method:                           MLE   Df Model:                            8
Date:                Fri, 07 Jun 2024   Pseudo R-squ.:                  0.2949
Time:                        17:01:42   Log-Likelihood:                -977.46
converged:                       True   LL-Null:                       -1386.3
Covariance Type:            nonrobust   LLR p-value:                3.189e-171
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.0702      0.058     -1.221      0.222      -0.183       0.043
BMI           

### 標準化回帰係数を比較

In [16]:
# Rank the coefficients of the standardized model by absolute size, largest first.
# The intercept ("const") is part of params and therefore appears in the ranking too.
ranked_coefs = results_scaled.params.sort_values(key=np.abs, ascending=False)
print(ranked_coefs)

AgeCategory        1.092171
GenHealth         -0.766751
Smoking            0.331045
M/F_M              0.310176
Stroke             0.300320
BMI                0.228071
AlcoholDrinking   -0.120897
const             -0.070229
SleepTime         -0.027165
dtype: float64


### 得られたモデルを用いて、予測を行う

In [19]:
# Two hand-written rows to score. Values are positional and are labelled with
# the columns of X_dumm, so they must follow that column order.
new_rows = [
    [20.0, 0, 1, 0, 0, 4, 8.0, 0],
    [70.0, 1, 0, 1, 12, 0, 4.0, 1],
]
X_test = pd.DataFrame(new_rows, columns=X_dumm.columns)

print("X for prediction: ")
display(X_test)

X for prediction: 


Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,AgeCategory,GenHealth,SleepTime,M/F_M
0,20.0,0,1,0,0,4,8.0,0
1,70.0,1,0,1,12,0,4.0,1


In [20]:
# Score the hand-written rows with `results`, the model fitted on the
# unstandardized features, so the raw values in X_test are passed as they are.
# has_constant="add" requests the const column explicitly.
X_test_c = sm.add_constant(X_test, has_constant="add")

# Despite the name, y_test holds model predictions, not observed labels.
y_test = results.predict(exog=X_test_c)
print("Predicted y: ")
print(y_test)

Predicted y: 
0    0.004173
1    0.996944
dtype: float64
