## 데이터셋 출처
* [Pima Indians Diabetes Database | Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database)
* https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html


### 데이터 구성

* Pregnancies : 임신 횟수
* Glucose : 2시간 동안의 경구 포도당 내성 검사에서 혈장 포도당 농도 
* BloodPressure : 이완기 혈압 (mm Hg)
* SkinThickness : 삼두근 피부 주름 두께 (mm), 체지방을 추정하는데 사용되는 값
* Insulin : 2시간 혈청 인슐린 (mu U / ml)
* BMI : 체질량 지수 (체중kg / 키(m)^2)
* DiabetesPedigreeFunction : 당뇨병 혈통 기능
* Age : 나이
* Outcome : 768개 중에 268개의 결과 클래스 변수(0 또는 1)는 1이고 나머지는 0입니다.


## 필요한 라이브러리 로드

In [2]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy
# 시각화를 위한 seaborn, matplotlib.pyplot 을 로드합니다. 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## 데이터셋 로드

In [7]:
df = pd.read_csv("data/diabetes_feature.csv")
df.shape

(768, 16)

In [8]:
df_insulin = pd.read_csv("data/diabetes_fill_insulin.csv")
df["Insulin"] = df_insulin["Insulin"]
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,182.66561,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,57.73325,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,True
2,8,183,64,0,186.519678,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False
3,1,89,66,23,94.0,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168.0,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


In [9]:
# 데이터셋을 미리보기 합니다.

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,182.66561,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,57.73325,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,True
2,8,183,64,0,186.519678,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False
3,1,89,66,23,94.0,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168.0,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


## 학습과 예측에 사용할 데이터셋 만들기

In [10]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [11]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
       'Insulin']]
X.shape

(768, 8)

In [12]:
y = df['Outcome']
y.shape

(768,)

In [13]:
# 사이킷런에서 제공하는 model_selection 의 train_test_split 으로 만듭니다.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [14]:
# train 세트의 문제와 정답의 데이터 수를 확인해 주세요.

X_train.shape, y_train.shape

((614, 8), (614,))

In [15]:
# test 세트의 문제와 정답의 데이터 수를 확인해 주세요.

X_test.shape, y_test.shape

((154, 8), (154,))

## 여러 개의 알고리즘을 사용해서 비교하기

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42),
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)
            ]
estimators

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=None, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=42, splitter='best'),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 GradientBoost

In [17]:
max_depth = np.random.randint(2, 20, 10)
max_depth

array([17, 17, 11,  9,  4,  8,  2,  3, 18, 14])

In [18]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.35552391, 0.60442609, 0.89168654, 0.60985534, 0.96218866,
       0.59671363, 0.35642827, 0.35723371, 0.47558156, 0.72363583])

In [19]:
results = []
for estimator in estimators:
    result = []
    result.append(estimator.__class__.__name__)
    results.append(result)
results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [20]:
# param_distributions["n_estimators"] = np.random.randint(100, 1000, 10)
# param_distributions

In [21]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = np.random.randint(2, 20, 10)
max_features = np.random.uniform(0.3, 1.0, 10)

param_distributions = {"max_depth": max_depth, 
                       "max_features": max_features}

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions["n_estimators"] = np.random.randint(100, 200, 10)
        
    clf = RandomizedSearchCV(estimator, 
                       param_distributions, 
                       n_iter=100,
                       scoring="accuracy",
                       n_jobs=-1,
                       cv=5, 
                       verbose=2
                      )

    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test, y_test))
    result.append(clf.cv_results_)
    results.append(result)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.5min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished


In [22]:
df = pd.DataFrame(results, 
             columns=["estimator", "best_params", "train_score", "test_score", "cv_result"])
df

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.8180028032092377, 'max_dept...",0.758963,0.766234,"{'mean_fit_time': [0.007220268249511719, 0.007..."
1,RandomForestClassifier,"{'n_estimators': 147, 'max_features': 0.822606...",0.793216,0.733766,"{'mean_fit_time': [0.5428557872772217, 0.68795..."
2,GradientBoostingClassifier,"{'n_estimators': 132, 'max_features': 0.304626...",0.781807,0.779221,"{'mean_fit_time': [1.2075350761413575, 0.29707..."


In [23]:
pd.DataFrame(df.loc[1, "cv_result"]).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
89,0.707551,0.027319,0.048029,0.009240,147,0.822606,4,"{'n_estimators': 147, 'max_features': 0.822606...",0.813008,0.829268,0.764228,0.731707,0.827869,0.793216,0.038771,1
94,0.915177,0.047109,0.054570,0.007325,197,0.523535,6,"{'n_estimators': 197, 'max_features': 0.523535...",0.804878,0.829268,0.756098,0.747967,0.819672,0.791577,0.033309,2
38,0.728567,0.033881,0.037683,0.004911,164,0.822606,17,"{'n_estimators': 164, 'max_features': 0.822606...",0.804878,0.821138,0.764228,0.739837,0.819672,0.789951,0.032409,3
30,0.499072,0.016985,0.035902,0.008088,147,0.714029,4,"{'n_estimators': 147, 'max_features': 0.714028...",0.813008,0.829268,0.747967,0.731707,0.819672,0.788325,0.040256,4
82,0.673295,0.006454,0.039532,0.005676,183,0.714029,6,"{'n_estimators': 183, 'max_features': 0.714028...",0.813008,0.829268,0.756098,0.739837,0.803279,0.788298,0.034348,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.692040,0.042733,0.036579,0.001114,197,0.822606,3,"{'n_estimators': 197, 'max_features': 0.822606...",0.764228,0.804878,0.739837,0.715447,0.778689,0.760616,0.030889,96
3,0.415954,0.017077,0.025037,0.003560,147,0.615267,3,"{'n_estimators': 147, 'max_features': 0.615266...",0.756098,0.780488,0.731707,0.723577,0.803279,0.759030,0.029767,97
69,0.484662,0.005894,0.028507,0.002981,147,0.563565,3,"{'n_estimators': 147, 'max_features': 0.563565...",0.756098,0.780488,0.731707,0.723577,0.803279,0.759030,0.029767,97
32,0.538900,0.025098,0.033733,0.002514,172,0.615267,3,"{'n_estimators': 172, 'max_features': 0.615266...",0.756098,0.788618,0.731707,0.715447,0.795082,0.757390,0.031037,99
