# 로지스틱 회귀 연습문제
### 문제 1
- 피마 인디언 당뇨병 발병여부를 예측할 수 있는 분석 모델을 구현하기 위해 아래와 같은 항목들을 조사하였다. 분석하라.
> https://data.hossam.kr/E05/indian_diabetes.xlsx
- 단, 모든 독립변수는 명목형 변수를 포함하지 않으며(→ 더미변수 변환 불필요), 정규분포를 만족한다고 가정한다(→ 표준화가 필요한지는 아래에서 표준화 전·후 결과를 비교하여 확인).



In [1]:
from pandas import DataFrame, merge, read_excel
from matplotlib import pyplot as pyplot
import seaborn as sb
import numpy as np
from patsy import dmatrix
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from helper import my_logit, scalling

#### 데이터 가져오기

In [2]:
# Load the Pima Indians diabetes workbook straight from its URL into a DataFrame.
data_url = "https://data.hossam.kr/E05/indian_diabetes.xlsx"
df = read_excel(data_url)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


#### 데이터 전처리 없이 분석 수행

In [3]:
# Predictor names: every column of df except the target column.
x = [*df.columns]
# Drop the first 'Outcome' entry; raises ValueError if the column is absent.
del x[x.index('Outcome')]
x

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [4]:
# Fit a logistic regression of Outcome on every predictor in x, using the raw
# (unscaled) data. my_logit is a project helper imported from `helper`; the
# printed output has the layout of a statsmodels Logit summary, so it
# presumably wraps a statsmodels fit -- confirm against the helper.
logit_result = my_logit(df, y="Outcome", x=x)
# NOTE(review): `summary` is read as an attribute here, not called -- the
# helper appears to store the already-rendered summary object.
print(logit_result.summary)

Optimization terminated successfully.
         Current function value: 0.470993
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  768
Model:                          Logit   Df Residuals:                      759
Method:                           MLE   Df Model:                            8
Date:                Tue, 01 Aug 2023   Pseudo R-squ.:                  0.2718
Time:                        14:50:34   Log-Likelihood:                -361.72
converged:                       True   LL-Null:                       -496.74
Covariance Type:            nonrobust   LLR p-value:                 9.652e-54
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -8.4047      0.717    -11.728      0.000      -9.809

In [5]:
# Display the `cmdf` table held on the fit result. The rendered output is a
# 2x2 count table (rows True/False, columns Negative/Positive), i.e. it looks
# like the confusion matrix -- confirm the exact row/column meaning in helper.
logit_result.cmdf

Unnamed: 0,Negative,Positive
True,445,156
False,112,55


In [7]:
# Display the evaluation metrics held in `result_df`, transposed so that each
# metric (pseudo R-squared, accuracy, precision, recall, ...) is shown on its
# own row.
logit_result.result_df.T

Unnamed: 0,0
설명력(Pseudo-Rsqe),0.27181
정확도(Accuracy),0.782552
정밀도(Precision),0.739336
"재현율(Recall, TPR)",0.58209
"위양성율(Fallout, FPR)",0.11
"특이성(Specificity, TNR)",0.89
RAS,0.736045
f1_score,0.651357


In [8]:
# Display the per-term odds ratios. The Intercept entry (0.000224) equals
# exp(-8.4047), the Intercept coefficient printed in the summary, so these
# values appear to be exp(coef) for each term.
logit_result.odds_rate_df

Unnamed: 0,odds_rate
Intercept,0.000224
Pregnancies,1.131091
Glucose,1.035789
BloodPressure,0.986792
SkinThickness,1.000619
Insulin,0.998809
BMI,1.093847
DiabetesPedigreeFunction,2.573276
Age,1.01498


#### 표준화 적용하기

In [12]:
# Scale the predictors with the project helper `scalling` (the section heading
# and the output indicate standardization), leaving the target untouched.
x_train_std_df = scalling(df.drop(columns='Outcome'))
# Keep the target as a one-column DataFrame so it can be joined back by index.
y_train = df.filter(items=['Outcome'])
# Re-attach the target to the scaled predictors, matching rows on the index.
result_df = x_train_std_df.merge(y_train, left_index=True, right_index=True)
result_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995,1
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672,0
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584,1
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549,0
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496,1
...,...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136,0
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023,0
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760,0
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732,1


In [13]:
# Refit the same model on the standardized predictors (result_df).
# The printed fit statistics (log-likelihood -361.72, pseudo R-squared 0.2718)
# are identical to the unscaled fit: rescaling predictors only reparameterizes
# a logistic regression, so the coefficients change but the fitted
# probabilities and goodness-of-fit do not.
logit_result = my_logit(result_df, y="Outcome", x=x)
# `summary` is read as an attribute, not called (same as the unscaled fit).
print(logit_result.summary)

Optimization terminated successfully.
         Current function value: 0.470993
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  768
Model:                          Logit   Df Residuals:                      759
Method:                           MLE   Df Model:                            8
Date:                Tue, 01 Aug 2023   Pseudo R-squ.:                  0.2718
Time:                        14:55:56   Log-Likelihood:                -361.72
converged:                       True   LL-Null:                       -496.74
Covariance Type:            nonrobust   LLR p-value:                 9.652e-54
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept                   -0.8711      0.097     -8.986      0.000      -1.061

In [15]:
# Display the evaluation metrics for the standardized-data fit, transposed so
# that each metric is shown on its own row.
logit_result.result_df.T

Unnamed: 0,0
설명력(Pseudo-Rsqe),0.27181
정확도(Accuracy),0.782552
정밀도(Precision),0.739336
"재현율(Recall, TPR)",0.58209
"위양성율(Fallout, FPR)",0.11
"특이성(Specificity, TNR)",0.89
RAS,0.736045
f1_score,0.651357
