## 5장 5절 머신러닝 분석과정 빠르게 맛보기 - 분류분석

### 1. 데이터 확인

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

In [2]:
# Load scikit-learn's iris dataset; keep the feature matrix and the label
# vector under their own names because the split cell below uses them directly.
iris = load_iris()
iris_dt, iris_label = iris.data, iris.target

# Tabular view for inspection: one column per feature name, plus the class
# label appended as the 'Species' column.
df_iris = pd.DataFrame(iris_dt, columns=iris.feature_names).assign(Species=iris_label)

df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [3]:
# List the distinct class labels stored in the 'Species' column.
df_iris['Species'].unique()

array([0, 1, 2])

In [4]:
# Show the (rows, columns) dimensions of the iris DataFrame.
df_iris.shape

(150, 5)

### 2. 데이터 분할

In [5]:
from sklearn.model_selection import train_test_split



In [6]:
# Hold out 20% of the samples as a test set. Stratifying on the labels keeps
# the class proportions the same in both splits, and the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dt,
    iris_label,
    test_size=0.2,
    random_state=0,
    stratify=iris_label,
)

In [7]:
# Display the training feature matrix produced by the split above.
X_train

array([[4.8, 3. , 1.4, 0.3],
       [4.9, 3. , 1.4, 0.2],
       [4.4, 3. , 1.3, 0.2],
       [5. , 3.4, 1.5, 0.2],
       [5.8, 2.7, 3.9, 1.2],
       [4.9, 3.6, 1.4, 0.1],
       [6.7, 2.5, 5.8, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 3. , 4.9, 1.8],
       [5.9, 3. , 5.1, 1.8],
       [5.6, 2.7, 4.2, 1.3],
       [5.4, 3.9, 1.7, 0.4],
       [4.9, 2.4, 3.3, 1. ],
       [7.7, 2.6, 6.9, 2.3],
       [6.9, 3.1, 5.1, 2.3],
       [5.2, 4.1, 1.5, 0.1],
       [5.7, 2.9, 4.2, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [5.2, 3.4, 1.4, 0.2],
       [6.5, 3. , 5.8, 2.2],
       [4.6, 3.6, 1. , 0.2],
       [5.2, 3.5, 1.5, 0.2],
       [6.7, 3.1, 5.6, 2.4],
       [6.8, 3. , 5.5, 2.1],
       [5.1, 2.5, 3. , 1.1],
       [5.6, 2.5, 3.9, 1.1],
       [5. , 3.4, 1.6, 0.4],
       [5.8, 2.8, 5.1, 2.4],
       [7.2, 3.2, 6. , 1.8],
       [6. , 2.7, 5.1, 1.6],
       [6.1, 3. , 4.6, 1.4],
       [5.4, 3.4, 1.5, 0.4],
       [6.5, 3.2, 5.1, 2. ],
       [6.2, 2

### 3. 전처리

In [8]:
# Count missing values per column to decide whether any imputation is needed.
df_iris.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
Species              0
dtype: int64

### 4. 모델학습

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Three candidate decision trees that differ only in max_depth (5, 3, 1).
# All share random_state=100 so that repeated runs build the same trees.
dtree_clf_5, dtree_clf_3, dtree_clf_1 = (
    DecisionTreeClassifier(max_depth=depth, random_state=100)
    for depth in (5, 3, 1)
)

In [11]:
from sklearn.model_selection import cross_val_score
import numpy as np

In [13]:
# 10-fold cross-validation of the depth-5 tree on the training split only;
# print the per-fold accuracies and then their mean.
scores = cross_val_score(
    estimator=dtree_clf_5,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='accuracy',
)
print('교차검증 정확도: ', scores.round(3))
print('평균검증 정확도: ', np.round(scores.mean(), 4))

교차검증 정확도:  [0.917 1.    0.917 1.    1.    0.833 1.    0.917 1.    0.833]
평균검증 정확도:  0.9417


In [14]:
# 10-fold cross-validation of the depth-3 tree on the training split only;
# print the per-fold accuracies and then their mean.
scores = cross_val_score(
    estimator=dtree_clf_3,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='accuracy',
)
print('교차검증 정확도: ', scores.round(3))
print('평균검증 정확도: ', np.round(scores.mean(), 4))

교차검증 정확도:  [0.917 1.    0.917 0.917 1.    0.833 1.    0.917 0.917 0.833]
평균검증 정확도:  0.925


In [15]:
# 10-fold cross-validation of the depth-1 tree on the training split only;
# print the per-fold accuracies and then their mean.
scores = cross_val_score(
    estimator=dtree_clf_1,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='accuracy',
)
print('교차검증 정확도: ', scores.round(3))
print('평균검증 정확도: ', np.round(scores.mean(), 4))

교차검증 정확도:  [0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667]
평균검증 정확도:  0.6667


### 5. 성능평가 및 예측값 저장

In [18]:
from sklearn.metrics import accuracy_score

# Fit the depth-5 tree on the full training split and predict the held-out
# test split; fit() returns the estimator itself, so the calls can be chained.
pred = dtree_clf_5.fit(X_train, y_train).predict(X_test)

# Report the test-set accuracy with five decimal places.
print(f'Decision tree(교차검증 후) 예측 정확도: {accuracy_score(y_test, pred):.5f}')

Decision tree(교차검증 후) 예측 정확도: 0.96667


In [19]:
# Wrap the predicted labels in a single-column DataFrame for later comparison.
df_pred = pd.DataFrame({'pred Species': pred})
df_pred

Unnamed: 0,pred Species
0,0
1,1
2,0
3,2
4,0
5,1
6,2
7,0
8,0
9,1


In [20]:
# Wrap the true test labels in a single-column DataFrame for later comparison.
df_actual = pd.DataFrame({'actual Species': y_test})
df_actual

Unnamed: 0,actual Species
0,0
1,1
2,0
3,2
4,0
5,1
6,2
7,0
8,0
9,1


In [21]:
# Put the actual and predicted labels side by side (column-wise concatenation
# aligned on the row index) and preview the first five rows.
classify_result = pd.concat((df_actual, df_pred), axis='columns')
classify_result.head(5)

Unnamed: 0,actual Species,pred Species
0,0,0
1,1,1
2,0,0
3,2,2
4,0,0
