In [1]:
import numpy as np
import pandas as pd

1. 데이터 전처리

In [2]:
# Load the Pima Indians diabetes CSV. The first 9 lines of the file are
# skipped and no row is used as a header, so the columns receive integer
# labels. (Presumably the skipped lines are a descriptive preamble --
# confirm against the file.)
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# `pima` holds the DataFrame's contents as a NumPy n-dimensional array.
# to_numpy() is the accessor pandas recommends over the older `.values`
# attribute; columns with different numeric dtypes are combined into a
# single common dtype in the resulting array.
pima = df.to_numpy()
pima.shape

(768, 9)

In [4]:
# Split the array column-wise: every column except the last is a feature,
# the last column is the label.
X, y = pima[:, :-1], pima[:, -1]

In [5]:
# Sanity check: display the dimensions of the feature matrix and label vector.
X.shape, y.shape

((768, 8), (768,))

In [6]:
# Rebuild X and y straight from the DataFrame (this rebinds both names).
# Features and target are both selected by position so the two lines stay
# in agreement: all columns but the last are features, the last is the label.
# Taking the label column on its own keeps that column's dtype instead of
# the common dtype produced when the whole frame is converted at once.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

2. 훈련/테스트 데이터셋 분리

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (default settings apart from the seed) on the complete
# dataset; fit() returns the estimator itself, so `dtc` is the fitted tree.
dtc = DecisionTreeClassifier(random_state=2021).fit(X, y)
# Accuracy measured on the very rows used for fitting, i.e. training
# accuracy rather than a held-out estimate.
dtc.score(X, y)

1.0

In [9]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the full dataset with the library's default
# fold setting; the result is a dict of fit times, score times and test scores.
res = cross_validate(estimator=dtc, X=X, y=y)
res

{'fit_time': array([0.00399113, 0.00371051, 0.00199556, 0.00399494, 0.00306106]),
 'score_time': array([0.00099516, 0.        , 0.0010078 , 0.00100112, 0.        ]),
 'test_score': array([0.67532468, 0.67532468, 0.7012987 , 0.77777778, 0.73856209])}

In [10]:
from sklearn.model_selection import cross_val_score

# Same cross-validation as a plain array of per-fold accuracies.
cross_val_score(
    estimator=dtc,
    X=X,
    y=y,
    scoring='accuracy',
)

array([0.67532468, 0.67532468, 0.7012987 , 0.77777778, 0.73856209])

3. GridSearchCV

In [11]:
import warnings
# NOTE(review): 'ignore' with no category or module filter suppresses every
# warning raised from this point on -- consider narrowing it to the specific
# warning that motivated this line.
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [12]:
# Hyper-parameter grid for the tree: depths 2 through 6 crossed with three
# minimum-split sizes (15 combinations in total).
params = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': [2, 3, 4],
}
# Fresh, unfitted tree to serve as the base estimator for the search.
dtc = DecisionTreeClassifier(random_state=2021)

In [13]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`, ranking candidates by 5-fold
# cross-validated accuracy.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    cv=5,
    scoring='accuracy',
)

In [14]:
# Run the grid search on the training split only; the test split is not touched here.
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [15]:
# Parameter combination with the best mean cross-validated accuracy.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [16]:
# Estimator built with the winning parameters. GridSearchCV was created
# without a `refit` argument, so its default applies (refit on the full
# training data passed to fit()).
best_clf = grid_dt.best_estimator_
best_clf

DecisionTreeClassifier(max_depth=2, random_state=2021)

In [17]:
# Accuracy of the tuned tree on the held-out test split.
best_clf.score(X_test, y_test)

0.7337662337662337

4. Support Vector Machine

In [18]:
from sklearn.svm import SVC
# Support vector classifier with default hyper-parameters and a fixed seed.
# NOTE(review): no feature scaling is visible between loading the data and
# this point; kernel SVMs are generally sensitive to feature scale, so
# consider standardising X (e.g. in a pipeline) -- confirm whether that is
# intended to be out of scope here.
svc = SVC(random_state=2021)
# List every hyper-parameter and its current value.
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2021,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [19]:
# Coarse sweep of the SVC regularisation parameter C over powers of ten,
# ranked by 5-fold cross-validated accuracy on the training split.
params = {'C': [0.01, 0.1, 1, 10, 100]}
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)  # fit() returns the fitted search object itself
grid_sv.best_params_

{'C': 1}

In [20]:
# Finer sweep of C in the neighbourhood of 1, using the same scoring and
# fold count as the coarse search. This rebinds `params` and `grid_sv`.
params = {'C': [0.5, 1, 1.5]}
grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)  # fit() returns the fitted search object itself
grid_sv.best_params_

{'C': 1.5}

In [21]:
# Take the SVC selected by the most recent grid search and measure its
# accuracy on the held-out test split.
best_svc = grid_sv.best_estimator_
best_svc.score(X_test, y_test)

0.7467532467532467