### 파킨슨병 데이터
- 환자들의 뇌를 촬영한 사진의 상태를 기록한 자료에 각 환자의 상태 status(1: 파킨슨병 진단, 0: 파킨슨병 아님)로 추가한 테이블
- (data/parkinsons.csv)
1. 파킨슨 병을 예측하는 모델로 로지스틱 회귀모형을 적용하여 생성
2. 파킨슨병을 예측하는데 영향을 미치는 변수를 중요한 순서대로 3개 선정
3. 파킨슨 병을 진단하는 기준를 함수로 생성하여(매개변수명 = threshold, 함수명 = cutoff)을 0.5로 했을 때와 0.8로 했을 때 F1-스코어를 비교
    - 분석 조건
        - 필요 없는 컬럼 name을 삭제
        - 데이터의 정규화는 min-max 스케일러 사용
        - 로지스틱 회귀를 위한 상수항 추가
        - status는 카테고리 타입으로 변환
        - 트레이닝셋과 테스트셋 비율은 9:1
        - 모델은 로지스틱 회귀분석 사용
        - 모델의 최적화 방법론은 "lbfgs" 사용

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./data/parkinsons.csv")
df

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,...,0.08270,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.10470,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,phon_R01_S50_2,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,...,0.07008,0.02764,19.517,0,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050
191,phon_R01_S50_3,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,...,0.04812,0.01810,19.147,0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
192,phon_R01_S50_4,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,...,0.03804,0.10715,17.883,0,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728
193,phon_R01_S50_5,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,...,0.03794,0.07223,19.020,0,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306


In [4]:
# name 컬럼 삭제
df.drop('name', axis = 1 ,inplace = True)

In [13]:
x = df.drop('status', axis = 1)
y = df['status']

In [15]:
# 데이터 정규화 (Min-Max 스케일링)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

x_sc = scaler.fit_transform(x)

In [16]:
# 상수항 추가
df['intercept'] = 1

In [17]:
# status를 카테고리 타입으로 변환
df['status'] = df['status'].astype('category')

In [None]:
# 데이터 분할
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_sc, y, test_size = 0.1, stratify = y, random_state = 42)

In [19]:
# 로지스틱 회귀 모델 생성 및 학습
from sklearn.linear_model import LogisticRegression
logR = LogisticRegression(solver = 'lbfgs')
logR.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# 변수 중요도 순서
feature_importance = pd.DataFrame({'Feature' : x_train.columns, "Importance" : logR.coef_[0]})
top_3 = feature_importance.sort_values(by='Importance', ascending=False).head(3)
print(top_3)

             Feature  Importance
20                D2    1.988830
9   MDVP:Shimmer(dB)    1.555016
18           spread1    1.244809


In [38]:
from sklearn.metrics import f1_score

In [39]:
# 파킨슨병 진단 함수
def cutoff(threshold):
    y_pred = logR.predict_proba(x_test)[:, 1]
    y_pred_binary = []
    for prob in y_pred:
        if prob >= threshold:
            y_pred_binary.append(1)
        else:
            y_pred_binary.append(0)
    f1 = f1_score(y_test, y_pred_binary)
    return f1

In [40]:
# F1-스코어 비교

f1_score_1 = cutoff(0.5)
f1_score_2 = cutoff(0.8)

print(f"F1-Score (threshold=0.5): {f1_score_1}")
print(f"F1-Score (threshold=0.8): {f1_score_2}")

F1-Score (threshold=0.5): 0.9032258064516129
F1-Score (threshold=0.8): 0.608695652173913
