### Cross Validation Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 C, 약물 X 및 약물 Y 중 하나에 반응했다.
-  미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

##### feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨 대 칼륨 비율

##### target
- Drug: 의약품, 환자에게 효과가 있었던 약

In [1]:
import pandas as pd

# Load the drug-response dataset from the local datasets folder.
drugs_df = pd.read_csv(filepath_or_buffer='./datasets/drugs.csv')

# Leaving the frame as the last expression makes the notebook render it.
drugs_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [2]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct drug name to an integer code.
drugs_encoder = LabelEncoder()

# Fit on the 'Drug' column and overwrite it with the resulting integer codes.
# The chained assignment binds `targets` first and then the column, the same
# order as two separate statements would.
targets = drugs_df['Drug'] = drugs_encoder.fit_transform(drugs_df['Drug'])

In [3]:
# Render the frame with the 'Drug' column now holding integer codes.
display(drugs_df)

# Map the first row's code back to its original drug name via the encoder's
# fitted class list.
drugs_encoder.classes_[drugs_df.at[0, 'Drug']]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,4
1,47,M,LOW,HIGH,13.093,2
2,47,M,LOW,HIGH,10.114,2
3,28,F,NORMAL,HIGH,7.798,3
4,61,F,LOW,HIGH,18.043,4
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,2
196,16,M,LOW,HIGH,12.006,2
197,52,M,NORMAL,HIGH,9.894,3
198,23,M,NORMAL,NORMAL,14.020,3


'drugY'

In [4]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct 'Sex' value to an integer code.
gender_encoder = LabelEncoder()

# Fit on the 'Sex' column and overwrite it with the integer codes.
# NOTE: `targets` is reused here as a scratch name even though 'Sex' is a
# feature, not the prediction target.
targets = drugs_df['Sex'] = gender_encoder.fit_transform(drugs_df['Sex'])

In [5]:
# Render the frame with the 'Sex' column now holding integer codes.
display(drugs_df)

# Map the first row's code back to its original 'Sex' label.
gender_encoder.classes_[drugs_df.at[0, 'Sex']]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,HIGH,HIGH,25.355,4
1,47,1,LOW,HIGH,13.093,2
2,47,1,LOW,HIGH,10.114,2
3,28,0,NORMAL,HIGH,7.798,3
4,61,0,LOW,HIGH,18.043,4
...,...,...,...,...,...,...
195,56,0,LOW,HIGH,11.567,2
196,16,1,LOW,HIGH,12.006,2
197,52,1,NORMAL,HIGH,9.894,3
198,23,1,NORMAL,NORMAL,14.020,3


'F'

In [6]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct blood-pressure level to an integer code.
blood_pressure_encoder = LabelEncoder()

# Fit on the 'BP' column and overwrite it with the integer codes.
# NOTE: `targets` is reused here as a scratch name even though 'BP' is a
# feature, not the prediction target.
targets = drugs_df['BP'] = blood_pressure_encoder.fit_transform(drugs_df['BP'])

In [7]:
# Render the frame with the 'BP' column now holding integer codes.
display(drugs_df)

# Map the first row's code back to its original blood-pressure label.
blood_pressure_encoder.classes_[drugs_df.at[0, 'BP']]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,HIGH,25.355,4
1,47,1,1,HIGH,13.093,2
2,47,1,1,HIGH,10.114,2
3,28,0,2,HIGH,7.798,3
4,61,0,1,HIGH,18.043,4
...,...,...,...,...,...,...
195,56,0,1,HIGH,11.567,2
196,16,1,1,HIGH,12.006,2
197,52,1,2,HIGH,9.894,3
198,23,1,2,NORMAL,14.020,3


'HIGH'

In [8]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps each distinct cholesterol level to an integer code.
cholesterol_encoder = LabelEncoder()

# Fit on the 'Cholesterol' column and overwrite it with the integer codes.
# NOTE: `targets` is reused here as a scratch name even though 'Cholesterol'
# is a feature, not the prediction target.
targets = drugs_df['Cholesterol'] = cholesterol_encoder.fit_transform(drugs_df['Cholesterol'])

In [9]:
# Render the frame with the 'Cholesterol' column now holding integer codes.
display(drugs_df)

# Map the first row's code back to its original cholesterol label.
cholesterol_encoder.classes_[drugs_df.at[0, 'Cholesterol']]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4
...,...,...,...,...,...,...
195,56,0,1,0,11.567,2
196,16,1,1,0,12.006,2
197,52,1,2,0,9.894,3
198,23,1,2,1,14.020,3


'HIGH'

In [19]:
import pandas as pd

# Narrow the feature frame to its first and last columns (selected by
# position).
# NOTE(review): `features` is built in a cell that is not shown here;
# presumably these two columns are Age and Na_to_K — confirm against that cell.
features = features.iloc[:, [0, -1]]

# 'Drug' was already replaced by integer codes when drugs_encoder was first
# fitted. Re-fitting the encoder on those codes would overwrite its classes_
# with the integers themselves and lose the code -> drug-name mapping, so the
# stored codes are reused. The encoder is only fitted here if the column still
# holds raw (non-numeric) labels.
if pd.api.types.is_numeric_dtype(drugs_df['Drug']):
    # Copy so `targets` is an independent array, as fit_transform would return.
    targets = drugs_df['Drug'].to_numpy(copy=True)
else:
    targets = drugs_encoder.fit_transform(drugs_df['Drug'])

# Class distribution of the encoded target; the last expression is rendered
# by the notebook.
target_df = pd.DataFrame(targets, columns=['Drug'])
target_df.value_counts()

Drug
4       91
3       54
0       23
1       16
2       16
Name: count, dtype: int64

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

# Splitter producing 5 train/validation partitions. No shuffling is requested,
# so each validation fold is a consecutive run of row positions.
kfold = KFold(n_splits=5)

# Decision tree with a fixed seed for repeatable results; every leaf must
# hold at least 6 training samples.
decision_tree_classifier = DecisionTreeClassifier(
    random_state=124,
    min_samples_leaf=6,
)

In [23]:
# Show which row positions land in the training and validation part of each
# fold, with a separator line after every fold.
for train_index, test_index in kfold.split(features):
    print(train_index, test_index, "=" * 80, sep="\n")

[ 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57
  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111
 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95

In [27]:
# Validation accuracy of each fold, in fold order.
cv_accuracy = []

# `count` is the 1-based fold number used in the report lines below.
for count, (train_index, test_index) in enumerate(kfold.split(features), start=1):
    # Slice out this fold's rows: features by position, targets by array index.
    X_train = features.iloc[train_index]
    X_test = features.iloc[test_index]
    y_train = targets[train_index]
    y_test = targets[test_index]

    # Wrap the target arrays in frames so value_counts() can report the
    # class distribution of each split.
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    # Train on this fold's training rows, then predict its validation rows.
    decision_tree_classifier.fit(X_train, y_train)
    prediction = decision_tree_classifier.predict(X_test)

    # Accuracy for this fold, rounded to 4 decimal places.
    accuracy = np.round(accuracy_score(y_test, prediction), decimals=4)
    cv_accuracy.append(accuracy)

    # Number of rows in each split.
    train_size = len(X_train)
    test_size = len(X_test)

    # Per-fold report: accuracy, split sizes, class distributions, indices.
    print(f"\n# {count} 교차 검증 정확도: {accuracy}, 학습 데이터 크기: {train_size}, 검증 데이터 크기: {test_size}")
    print(f"#{count} 학습 타겟 데이터 분포: \n{train_targets.value_counts()}")
    print(f"#{count} 검증 타겟 데이터 분포: \n{test_targets.value_counts()}")
    print(f"#{count} 학습 세트 인덱스: {train_index}")
    print(f"#{count} 검증 세트 인덱스: {test_index}")
    print("=" * 100)

# Mean of the per-fold validation accuracies.
print(f"▶ 평균 검증 정확도: {np.mean(cv_accuracy)}")


# 1 교차 검증 정확도: 0.7, 학습 데이터 크기: 160, 검증 데이터 크기: 40
#1 학습 타겟 데이터 분포: 
4    71
3    42
0    21
1    15
2    11
Name: count, dtype: int64
#1 검증 타겟 데이터 분포: 
4    20
3    12
2     5
0     2
1     1
Name: count, dtype: int64
#1 학습 세트 인덱스: [ 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57
  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111
 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199]
#1 검증 세트 인덱스: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33