In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
#from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score

## 데이터 불러오기

In [6]:
# Read the feature-engineered diabetes table and preview its first rows.
DATA_PATH = '../data/diabetes_feature.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,preg_high,Age_low,Age_middle,Age_high,Insulin_n,Insulin_log,low_glu_insulin
0,6,0.911763,72,35,0,33.6,0.495819,50,1,False,False,True,False,206.846154,5.336798,False
1,1,-1.10014,66,29,0,26.6,-0.363426,31,0,False,False,True,False,130.287879,4.877392,True
2,8,2.029486,64,0,0,23.3,0.635913,32,1,True,False,True,False,206.846154,5.336798,False
3,1,-0.9724,66,23,94,28.1,-0.936255,21,0,False,True,False,False,94.0,4.553877,True
4,0,0.560478,40,35,168,43.1,5.66685,33,1,False,False,True,False,168.0,5.129899,False


## Train, Test split

In [7]:
# Independent variables: X, dependent variable: y
feature_columns = [
    'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin_n', 'BMI',
    'DiabetesPedigreeFunction', 'Age', 'preg_high', 'low_glu_insulin',
]
X = df[feature_columns]
y = df['Outcome']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1111,
)

## RandomForest 

### Fit, predict

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees and a fixed seed, fitted on the training split.
model = RandomForestClassifier(n_estimators=1000, random_state=1111)
model.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=1000, random_state=1111)

In [9]:
# Predicted class labels for the held-out test rows.
predict = model.predict(X=X_test)

### accuracy

In [10]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predict)

0.9060402684563759

## Gradient Boosting

### Fit, Predict

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 1000 boosting stages and a fixed seed, fitted on the
# training split. Note: this rebinds `model`, replacing the random forest above.
model = GradientBoostingClassifier(n_estimators=1000, random_state=1111)
model.fit(X=X_train, y=y_train)

GradientBoostingClassifier(n_estimators=1000, random_state=1111)

In [12]:
# Predicted class labels for the held-out test rows (overwrites the earlier `predict`).
predict = model.predict(X=X_test)

### accuracy

In [13]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predict)

0.8926174496644296

## 여러 개의 알고리즘을 사용해서 비교하기

In [15]:
from sklearn.tree import DecisionTreeClassifier

# One untuned, identically seeded instance of each tree-based estimator to compare.
# RandomForestClassifier and GradientBoostingClassifier are imported in earlier cells.
models = [
    estimator_cls(random_state=1111)
    for estimator_cls in (
        DecisionTreeClassifier,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Seeded generator so the candidate values below are the same on every
# Restart & Run All (the global np.random state is never seeded in this notebook).
rng = np.random.RandomState(1111)

# Hyper-parameter candidates shared by every estimator in `models`.
base_param_distributions = {
    "max_depth": rng.randint(2, 20, 10),
    "max_features": rng.uniform(0.3, 1.0, 10),
}

# One [estimator name, best params, best mean CV accuracy] row per estimator.
results = []
for model in models:
    # Start from a fresh copy for each estimator instead of mutating a shared
    # dict: otherwise `n_estimators` added for an ensemble would leak into a
    # later DecisionTreeClassifier search if the order of `models` changed.
    param_distributions = dict(base_param_distributions)

    # For random forest / gradient boosting, also search over the number of
    # estimators (only estimators that actually expose that parameter).
    if "n_estimators" in model.get_params():
        param_distributions["n_estimators"] = rng.randint(100, 200, 10)

    clf = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=100,
        scoring="accuracy",
        n_jobs=-1,
        cv=5,
        verbose=1,
        random_state=1111,  # make the sampled candidates reproducible
    )
    clf.fit(X_train, y_train)

    results.append([model.__class__.__name__, clf.best_params_, clf.best_score_])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.5min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min


In [None]:
# Tabulate the search results with named columns (instead of anonymous 0/1/2)
# and list the best cross-validated accuracy first.
pd.DataFrame(
    results,
    columns=['model', 'best_params', 'best_score'],
).sort_values('best_score', ascending=False)