#### 목표변수와 설명변수
- Subject: 환자
- Goal: Features가 Target인 수술실패여부에 미치는 영향 분석
- Target: '수술실패여부'
- Features: '고혈압여부', '성별', '신부전여부', '연령', '체중', '수술시간'

In [1]:
import pandas as pd

In [2]:
df_ROS = pd.read_csv('../datasets/RecurrenceOfSurgery.csv')
df_ROS_select = df_ROS[['수술실패여부', '고혈압여부', '성별', '신부전여부', '연령', '체중', '수술시간']]
df_ROS_select[:2]

Unnamed: 0,수술실패여부,고혈압여부,성별,신부전여부,연령,체중,수술시간
0,0,0,2,0,66,60.3,68.0
1,0,0,1,0,47,71.7,31.0


In [3]:
df_ROS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1894 entries, 0 to 1893
Data columns (total 52 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              1894 non-null   int64  
 1   환자ID                    1894 non-null   object 
 2   Large Lymphocyte        1894 non-null   float64
 3   Location of herniation  1894 non-null   int64  
 4   ODI                     462 non-null    float64
 5   가족력                     1843 non-null   float64
 6   간질성폐질환                  1894 non-null   int64  
 7   고혈압여부                   1894 non-null   int64  
 8   과거수술횟수                  1894 non-null   int64  
 9   당뇨여부                    1894 non-null   int64  
 10  말초동맥질환여부                1894 non-null   int64  
 11  빈혈여부                    1894 non-null   int64  
 12  성별                      1894 non-null   int64  
 13  스테로이드치료                 1894 non-null   int64  
 14  신부전여부                   1894 non-null   

#### 전처리

In [4]:
df_ROS_select = df_ROS[['수술실패여부', '고혈압여부', '성별', '신부전여부', '연령', '체중', '수술시간']]
df_ROS_select[:2]

Unnamed: 0,수술실패여부,고혈압여부,성별,신부전여부,연령,체중,수술시간
0,0,0,2,0,66,60.3,68.0
1,0,0,1,0,47,71.7,31.0


#### Scaling & Encoding & Concat

##### - OneHotEncoding

In [5]:
# 범주형 데이터 확인 : '고혈압여부', '성별', '신부전여부'
df_ROS_select['고혈압여부'].value_counts(),df_ROS_select['성별'].value_counts(),df_ROS_select['신부전여부'].value_counts()

(고혈압여부
 0    1646
 1     248
 Name: count, dtype: int64,
 성별
 1    1168
 2     726
 Name: count, dtype: int64,
 신부전여부
 0    1846
 1      48
 Name: count, dtype: int64)

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
# 범주형 설명변수 OneHotEncoding
oneHotEncoder = OneHotEncoder() # 인스턴스화
oneHotEncoder.fit(df_ROS_select[['고혈압여부', '성별', '신부전여부']])

In [8]:
oneHotEncoder.categories_

[array([0, 1], dtype=int64),
 array([1, 2], dtype=int64),
 array([0, 1], dtype=int64)]

In [9]:
encoded_data = oneHotEncoder.transform(df_ROS_select[['고혈압여부', '성별', '신부전여부']]).toarray()
encoded_data.shape

(1894, 6)

In [10]:
df_encoded_data = pd.DataFrame(data=encoded_data, columns=oneHotEncoder.get_feature_names_out(['고혈압여부', '성별', '신부전여부']))
df_encoded_data[:2]

Unnamed: 0,고혈압여부_0,고혈압여부_1,성별_1,성별_2,신부전여부_0,신부전여부_1
0,1.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0


##### - 병합(Concat)

In [11]:
df_ROS_select= pd.concat([df_ROS_select.reset_index(drop=True), df_encoded_data.reset_index(drop=True)], axis=1)
df_ROS_select[:2]

Unnamed: 0,수술실패여부,고혈압여부,성별,신부전여부,연령,체중,수술시간,고혈압여부_0,고혈압여부_1,성별_1,성별_2,신부전여부_0,신부전여부_1
0,0,0,2,0,66,60.3,68.0,1.0,0.0,0.0,1.0,1.0,0.0
1,0,0,1,0,47,71.7,31.0,1.0,0.0,1.0,0.0,1.0,0.0


In [12]:
df_ROS_select.shape

(1894, 13)

##### - Scaling

In [13]:
df_ROS_select.columns

Index(['수술실패여부', '고혈압여부', '성별', '신부전여부', '연령', '체중', '수술시간', '고혈압여부_0',
       '고혈압여부_1', '성별_1', '성별_2', '신부전여부_0', '신부전여부_1'],
      dtype='object')

In [14]:
target = df_ROS_select['수술실패여부']
features = df_ROS_select.drop(columns=['수술실패여부', '고혈압여부', '성별', '신부전여부'])

In [15]:
features.columns

Index(['연령', '체중', '수술시간', '고혈압여부_0', '고혈압여부_1', '성별_1', '성별_2', '신부전여부_0',
       '신부전여부_1'],
      dtype='object')

##### - MinMaxScaler

In [16]:
from sklearn.preprocessing import MinMaxScaler

In [17]:
minMaxScaler = MinMaxScaler() #인스턴스화
features= minMaxScaler.fit_transform(features)
features.shape

(1894, 9)

#### 모델학습, apply()함수를 사용하여 null값 채우기
- null 값 채우기 : '수술시간'

In [18]:
# null값 확인 -> 수술시간 null값 존재
df_ROS_select.isnull().sum()

수술실패여부      0
고혈압여부       0
성별          0
신부전여부       0
연령          0
체중          0
수술시간       54
고혈압여부_0     0
고혈압여부_1     0
성별_1        0
성별_2        0
신부전여부_0     0
신부전여부_1     0
dtype: int64

In [19]:
# null값 삭제 -> null값이 없는 데이터로 모델 학습시키기 위함
df_ROS_select_drop = df_ROS_select.dropna()
df_ROS_select_drop.isnull().sum()

수술실패여부     0
고혈압여부      0
성별         0
신부전여부      0
연령         0
체중         0
수술시간       0
고혈압여부_0    0
고혈압여부_1    0
성별_1       0
성별_2       0
신부전여부_0    0
신부전여부_1    0
dtype: int64

In [20]:
# null값이 없는 데이터로 모델학습 준비
# 1. target : '수술시간', feature: '성별'
target = df_ROS_select_drop[['수술시간']]
feature = df_ROS_select_drop[['성별']]
target.shape, feature.shape

((1840, 1), (1840, 1))

In [21]:
# 2. null값이 없는 데이터와 실제값을 사용하여 회귀모델 훈련
from sklearn.linear_model import LinearRegression
model = LinearRegression() # 인스턴스화(초기화)
model.fit(feature, target) # 모델 훈련

In [22]:
# 모델 예측 확인해보기 (type : numpy의 array)
model.predict(feature)

array([[62.40798859],
       [61.85601405],
       [61.85601405],
       ...,
       [61.85601405],
       [61.85601405],
       [62.40798859]])

In [23]:
# apply()
import numpy as np
def convert_notnull(row) :
    if pd.isnull(row) : # 변수 row의 값이 null이라면
        feature = df_ROS_select[['성별']]
        result = model.predict(feature)
        return result
    else :
        return row  # null이 아니면 원래 데이터 값 반환

In [24]:
df_ROS_select['수술시간'] = df_ROS_select['수술시간'].apply(convert_notnull)
df_ROS_select['수술시간']

0       68.0
1       31.0
2       78.0
3       73.0
4       29.0
        ... 
1889    80.0
1890    20.0
1891    50.0
1892    25.0
1893    45.0
Name: 수술시간, Length: 1894, dtype: object

In [25]:
# apply() 적용 후 null값 확인
df_ROS_select['수술시간'].isnull().sum()

0

In [26]:
df_ROS_select.isnull().sum()

수술실패여부     0
고혈압여부      0
성별         0
신부전여부      0
연령         0
체중         0
수술시간       0
고혈압여부_0    0
고혈압여부_1    0
성별_1       0
성별_2       0
신부전여부_0    0
신부전여부_1    0
dtype: int64

#### Imbalanced Data Sampling
- under sampling : Tomek's Link

In [48]:
from imblearn.under_sampling import TomekLinks

In [49]:
from sklearn.datasets import make_classification

In [50]:
features, target = make_classification(n_classes=2, class_sep=2,
                    weights=[0.4, 0.6], n_informative=3, n_redundant=1, flip_y=0,
                    n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

In [51]:
features.shape, target.shape

((1000, 20), (1000,))

In [52]:
from collections import Counter

In [53]:
Counter(target)

Counter({0: 400, 1: 600})

In [54]:
tomekLinks = TomekLinks() #인스턴스화
features_resample, target_resample = tomekLinks.fit_resample(features, target) #교육

In [55]:
features_resample.shape, target_resample.shape

((995, 20), (995,))

In [56]:
Counter(target_resample)

Counter({0: 400, 1: 595})

#### 정형화

In [37]:
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.3, random_state = 10)
features_train.shape, features_test.shape, target_train.shape, target_test.shape

((700, 20), (300, 20), (700,), (300,))

#### 모델학습

In [38]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
model = DecisionTreeClassifier()
# Target이 범주형이므로 Classifier을 사용

In [39]:
from sklearn.model_selection import GridSearchCV
hyper_params = {'min_samples_leaf' : range(2,5),
               'max_depth' : range(2,5),
               'min_samples_split' : range(2,5)}

#### 평가 Score Default, 분류(Accuracy), 예측(R square)

In [40]:
from sklearn.metrics import f1_score, make_scorer
scoring = make_scorer(f1_score)

In [41]:
grid_search = GridSearchCV(model, param_grid = hyper_params, cv=2, verbose=1, scoring=scoring)
grid_search.fit(features, target)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


In [42]:
grid_search.best_estimator_

In [43]:
grid_search.best_score_, grid_search.best_params_

# 전처리 전의 정확도(accuracy) : 0.028571428571428574
# 낮은 정확도, 모델이 예측을 잘 수행하지 못함

(0.993282922517922,
 {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2})

In [44]:
best_model = grid_search.best_estimator_
best_model

In [45]:
target_test_predict = best_model.predict(features_test)
target_test_predict

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0])

In [46]:
from sklearn.metrics import classification_report

In [47]:
print(classification_report(target_test, target_test_predict))

# 전처리 전의 값
#  precision  recall  f1-score   support
#0  0.94      1.00      0.97       516
#1  1.00      0.03      0.05        36

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       131
           1       1.00      0.99      1.00       169

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

