## 붓꽃(Iris) 품종 예측 예제

- 필요 패키지 로딩

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

- 데이터 로딩

In [2]:
iris = load_iris()
iris_data = iris.data
iris_label = iris.target
print('iris target값', iris_label)
print('iris target명', iris.target_names)

iris target값 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
iris target명 ['setosa' 'versicolor' 'virginica']


- 데이터 프레임으로 변환

In [3]:
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


- 난수표 지정

In [4]:
np.random.seed(1234)

- train/test set 분할

In [5]:
x = iris_df.drop('label', axis='columns')
x_train, x_test, y_train, y_test = train_test_split(x, iris_df.label, test_size=0.3)

- 모델 정의

In [6]:
dt = DecisionTreeClassifier()

- 모델 학습

In [7]:
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

- 추론(예측값 생성)

In [8]:
pred = dt.predict(x_test)
pred

array([1, 1, 2, 0, 1, 0, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2, 0, 2, 1, 1, 1,
       1, 1, 2, 0, 2, 1, 2, 0, 1, 2, 0, 2, 2, 0, 0, 0, 0, 1, 0, 1, 0, 2,
       2])

- 모델 평가

In [9]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9777777777777777

## 교차 검증(Cross Validation)

- k-fold 교차검증 예제

In [10]:
from sklearn.model_selection import KFold

In [11]:
kfold = KFold(n_splits=5)
cv_accuracy=[]

In [12]:
for train_idx, test_idx in kfold.split(x):
    x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
    y_train, y_test = iris_df.label[train_idx], iris_df.label[test_idx]
    
    dt = DecisionTreeClassifier()
    dt.fit(x_train, y_train)
    pred = dt.predict(x_test)
    
    acc = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(acc)
    
print(cv_accuracy)
print('평균 검증 정확도:', np.mean(cv_accuracy))

[1.0, 0.9667, 0.8667, 0.9333, 0.7333]
평균 검증 정확도: 0.9


In [13]:
iris_df.label.value_counts()

2    50
1    50
0    50
Name: label, dtype: int64

In [14]:
n_iter = 0
for train_idx, test_idx in kfold.split(x):
    n_iter += 1
    y_train, y_test = iris_df.label[train_idx], iris_df.label[test_idx]
    print('#### 교차 검증:', n_iter)
    print('학습 데이터 분포\n', y_train.value_counts())
    print('검증 데이터 분포\n', y_test.value_counts())
    print('')

#### 교차 검증: 1
학습 데이터 분포
 2    50
1    50
0    20
Name: label, dtype: int64
검증 데이터 분포
 0    30
Name: label, dtype: int64

#### 교차 검증: 2
학습 데이터 분포
 2    50
1    40
0    30
Name: label, dtype: int64
검증 데이터 분포
 0    20
1    10
Name: label, dtype: int64

#### 교차 검증: 3
학습 데이터 분포
 2    50
0    50
1    20
Name: label, dtype: int64
검증 데이터 분포
 1    30
Name: label, dtype: int64

#### 교차 검증: 4
학습 데이터 분포
 0    50
1    40
2    30
Name: label, dtype: int64
검증 데이터 분포
 2    20
1    10
Name: label, dtype: int64

#### 교차 검증: 5
학습 데이터 분포
 1    50
0    50
2    20
Name: label, dtype: int64
검증 데이터 분포
 2    30
Name: label, dtype: int64



- Stratified k-fold

In [15]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3)
n_iter = 0
for train_idx, test_idx in skf.split(x, iris_df.label):
    n_iter += 1
    y_train, y_test = iris_df.label[train_idx], iris_df.label[test_idx]
    print('#### 교차 검증:', n_iter)
    print('학습 데이터 분포\n', y_train.value_counts())
    print('검증 데이터 분포\n', y_test.value_counts())
    print('')

#### 교차 검증: 1
학습 데이터 분포
 2    33
1    33
0    33
Name: label, dtype: int64
검증 데이터 분포
 2    17
1    17
0    17
Name: label, dtype: int64

#### 교차 검증: 2
학습 데이터 분포
 2    33
1    33
0    33
Name: label, dtype: int64
검증 데이터 분포
 2    17
1    17
0    17
Name: label, dtype: int64

#### 교차 검증: 3
학습 데이터 분포
 2    34
1    34
0    34
Name: label, dtype: int64
검증 데이터 분포
 2    16
1    16
0    16
Name: label, dtype: int64



In [16]:
for train_idx, test_idx in skf.split(x, iris_df.label):
    x_train, x_test = x.iloc[train_idx], x.iloc[test_idx]
    y_train, y_test = iris_df.label[train_idx], iris_df.label[test_idx]
    
    dt = DecisionTreeClassifier()
    dt.fit(x_train, y_train)
    pred = dt.predict(x_test)
    
    acc = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(acc)
    
print(cv_accuracy)
print('평균 검증 정확도:', np.mean(cv_accuracy))

[1.0, 0.9667, 0.8667, 0.9333, 0.7333, 0.9804, 0.9216, 0.9792]
평균 검증 정확도: 0.92265


- cross validation

In [17]:
from sklearn.model_selection import cross_val_score

dt = DecisionTreeClassifier()
scores = cross_val_score(dt, x, iris_df.label, scoring='accuracy', cv=5)
print(scores)
print('평균 검증 정확도:', np.mean(scores))

[0.96666667 0.96666667 0.9        1.         1.        ]
평균 검증 정확도: 0.9666666666666668


- GridSearch

In [18]:
dt_param = {'max_depth': [1, 2, 3],
              'min_samples_split': [2,3]
             }

In [19]:
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = train_test_split(x, iris_df.label, test_size=0.3)

dt = DecisionTreeClassifier()

grid_dt = GridSearchCV(dt, param_grid=dt_param, cv=5, refit=True)
grid_dt.fit(x_train, y_train)

score_df = pd.DataFrame(grid_dt.cv_results_)
score_df



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001524,8.3e-05,0.000682,2.7e-05,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.727273,0.727273,0.714286,0.7,0.7,0.714286,0.012183,5
1,0.001179,0.000104,0.000547,1e-05,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.727273,0.727273,0.714286,0.7,0.7,0.714286,0.012183,5
2,0.001142,1.4e-05,0.000538,1.2e-05,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.909091,0.863636,0.904762,1.0,0.95,0.92381,0.045967,4
3,0.00113,1.4e-05,0.00059,8.2e-05,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.909091,0.909091,0.904762,1.0,0.95,0.933333,0.03618,1
4,0.001152,1.5e-05,0.000548,1e-05,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.909091,0.909091,0.904762,1.0,0.95,0.933333,0.03618,1
5,0.00114,1.6e-05,0.000552,1.2e-05,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.909091,0.909091,0.904762,1.0,0.95,0.933333,0.03618,1


In [20]:
print(grid_dt.best_params_)
print(grid_dt.best_score_)

{'max_depth': 2, 'min_samples_split': 3}
0.9333333333333333


In [21]:
best_dt = grid_dt.best_estimator_
pred = best_dt.predict(x_test)
print(accuracy_score(y_test, pred))

0.9555555555555556


- Feature Scaling

In [22]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
ss = StandardScaler()
mm = MinMaxScaler()

ss.fit(iris_df)
mm.fit(iris_df)

iris_ss = ss.transform(iris_df)
iris_mm = mm.transform(iris_df)

In [23]:
print(pd.DataFrame(data = iris_ss, columns=iris_df.columns).mean())
print(pd.DataFrame(data = iris_ss, columns=iris_df.columns).var())

sepal length (cm)   -2.775558e-16
sepal width (cm)    -9.695948e-16
petal length (cm)   -8.652338e-16
petal width (cm)    -4.662937e-16
label                1.184238e-17
dtype: float64
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
label                1.006711
dtype: float64


In [24]:
print(pd.DataFrame(data = iris_mm, columns=iris_df.columns).min())
print(pd.DataFrame(data = iris_mm, columns=iris_df.columns).max())

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
label                0.0
dtype: float64
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
label                1.0
dtype: float64
