In [1]:
import pandas as pd

#### 목표변수와 설명변수
- Subject: 환자
- Goal: Features가 Target인 수술실패여부에 미치는 영향 분석
- Target: '수술실패여부'
- Features: '고혈압여부', '성별', '신부전여부', '연령', '체중', '수술시간'

In [5]:
df_ROS = pd.read_csv('../../datasets/RecurrenceOfSurgery.csv')
df_ROS_select = df_ROS[['수술실패여부', '고혈압여부', '성별', '신부전여부', '연령', '체중', '수술시간']]
df_ROS_select[:2]

Unnamed: 0,수술실패여부,고혈압여부,성별,신부전여부,연령,체중,수술시간
0,0,0,2,0,66,60.3,68.0
1,0,0,1,0,47,71.7,31.0


In [14]:
df_ROS_select.isnull().sum()

수술실패여부     0
고혈압여부      0
성별         0
신부전여부      0
연령         0
체중         0
수술시간      54
dtype: int64

In [18]:
# 수술시간이 Null인 행 삭제
df_ROS_select = df_ROS_select.dropna(subset=['수술시간'], how='any', axis=0)

In [19]:
df_ROS_select.isnull().sum()

수술실패여부    0
고혈압여부     0
성별        0
신부전여부     0
연령        0
체중        0
수술시간      0
dtype: int64

#### 정형화

In [20]:
target = df_ROS_select['수술실패여부']
features = df_ROS_select[['연령', '체중', '수술시간']]
target.shape, features.shape

((1840,), (1840, 3))

In [21]:
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state = 111)
features_train.shape, features_test.shape, target_train.shape, target_test.shape

((1380, 3), (460, 3), (1380,), (460,))

#### 모델학습

In [24]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
model = DecisionTreeClassifier()
# 연속형 변수와 관련된 작업은 Tree로

In [25]:
from sklearn.model_selection import GridSearchCV
hyper_params = {'min_samples_leaf' : range(2,5),
               'max_depth' : range(2,5),
               'min_samples_split' : range(2,5)}

#### 평가 Score Default, 분류(Accuracy), 예측(R square)

In [26]:
from sklearn.metrics import f1_score, make_scorer
scoring = make_scorer(f1_score)

In [27]:
grid_search = GridSearchCV(model, param_grid = hyper_params, cv=2, verbose=1, scoring=scoring)
grid_search.fit(features, target)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


In [28]:
grid_search.best_estimator_

In [29]:
grid_search.best_score_, grid_search.best_params_

(0.028571428571428574,
 {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2})

In [30]:
best_model = grid_search.best_estimator_
best_model

In [31]:
target_test_predict = best_model.predict(features_test)
target_test_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [32]:
from sklearn.metrics import classification_report

In [33]:
print(classification_report(target_test, target_test_predict))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97       429
           1       0.00      0.00      0.00        31

    accuracy                           0.93       460
   macro avg       0.47      0.50      0.48       460
weighted avg       0.87      0.93      0.90       460



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
