# 붓꽃 분류 모델

## 빅데이터: 붓꽃 데이터 결정트리 / 랜덤포레스트 ROC 커브

### 이영석, 문현수

#### munhyunsu@cs-cnu.org

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

## 붓꽃 데이터셋 불러오기

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load the iris dataset object; later cells read its data, target,
# target_names, feature_names and DESCR entries.
iris = load_iris()

In [None]:
# Show the description text stored under the dataset's DESCR key.
print(iris['DESCR'])

In [None]:
# Summarise the dataset: class names, feature names, then the first five samples.
print(f'분류 대상: {iris.target_names}')
# In the feature names, "petal" is the flower petal and "sepal" is the calyx leaf.
print(f'특징: {iris.feature_names}')
print('특징 예제')
# One sample per output line.
print(*iris.data[:5], sep='\n')

In [None]:
# Build the working frame: one column per measurement (taken from the columns
# of iris.data, in order) plus the integer class id under 'species'.
_measurement_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
_frame_columns = dict(zip(_measurement_names, iris.data.T))
_frame_columns['species'] = iris.target
df = pd.DataFrame(_frame_columns)
df.head()

In [None]:
# Print the column means separately for each of the three class ids.
for species_id in (0, 1, 2):
    rows_of_species = df[df['species'] == species_id]
    print(rows_of_species.mean())

## 붓꽃 의사결정트리

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs are the four measurement columns; the target is the class id.
features = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
X, y = df[features], df['species']

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Display the target series (integer class ids).
y

In [None]:
# Hold out 20% of the rows for evaluation. random_state stays None, so no
# seed is fixed for the shuffle.
test_size, random_state = 0.2, None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Unconstrained decision tree; no seed is fixed (random_state is None).
random_state = None
_tree_options = {'random_state': random_state}
clf = DecisionTreeClassifier(**_tree_options)

In [None]:
# Train the decision tree on the training split only.
clf.fit(X_train, y_train)

In [None]:
# Compare mean accuracy on the training split with the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'학습 평균 정확도: {train_accuracy}')
print(f'예측 평균 정확도: {test_accuracy}')

In [None]:
# Pair each feature name with the fitted tree's importance value and list them.
importance_by_feature = dict(zip(features, clf.feature_importances_))
for name, importance in importance_by_feature.items():
    print(f'{name} = {importance}')

In [None]:
# Horizontal bar chart of the feature importances on a fixed 0..1 axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(y=features, width=clf.feature_importances_)
ax.set_xlim(0, 1)

In [None]:
# Draw the fitted decision tree, limited to depth 3, with filled nodes.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()
# NOTE: class_names=iris.target_names was disabled in the original cell and is
# left off here, so nodes show no class-name labels.
_ = plot_tree(
    clf,
    feature_names=features,
    max_depth=3,
    filled=True,
    ax=ax,
    # plot_tree documents fontsize as an int (or None for automatic sizing);
    # the string 'medium' is rejected by scikit-learn's parameter validation.
    # 10 pt is what 'medium' resolves to under matplotlib's default rcParams.
    fontsize=10,
)

## 의사결정트리 최대 깊이 N

In [None]:
# Re-select the four measurement columns as inputs and the class id as target.
features = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]
X, y = df[features], df['species']

In [None]:
# Fresh 80/20 split for the depth-limited tree; no seed is fixed.
test_size, random_state = 0.2, None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# Decision tree capped at max_depth levels; no seed is fixed.
max_depth, random_state = 3, None
clf = DecisionTreeClassifier(
    max_depth=max_depth,
    random_state=random_state,
)

In [None]:
# Train the depth-limited tree on the training split only.
clf.fit(X_train, y_train)

In [None]:
# Compare mean accuracy on the training split with the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'학습 평균 정확도: {train_accuracy}')
print(f'예측 평균 정확도: {test_accuracy}')

In [None]:
# Draw the depth-limited decision tree with filled nodes.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot()
# NOTE: class_names=iris.target_names was disabled in the original cell and is
# left off here, so nodes show no class-name labels.
_ = plot_tree(
    clf,
    feature_names=features,
    max_depth=3,
    filled=True,
    ax=ax,
    # plot_tree documents fontsize as an int (or None for automatic sizing);
    # the string 'medium' is rejected by scikit-learn's parameter validation.
    # 10 pt is what 'medium' resolves to under matplotlib's default rcParams.
    fontsize=10,
)

## 랜덤포레스트 분류기

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fresh 80/20 split for the random forest; no seed is fixed.
test_size, random_state = 0.2, None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# Random forest with library-default settings; no seed is fixed.
random_state = None
_forest_options = {'random_state': random_state}
clf = RandomForestClassifier(**_forest_options)

In [None]:
# Train the random forest on the training split only.
clf.fit(X_train, y_train)

In [None]:
# Compare mean accuracy on the training split with the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'학습 평균 정확도: {train_accuracy}')
print(f'예측 평균 정확도: {test_accuracy}')

## 멀티클래스 ROC 커브와 AUC

### 멀티클래스 이진분류 클래스 전처리

In [None]:
from sklearn.preprocessing import label_binarize

In [None]:
# Rebuild the frame with one 0/1 indicator column per class in place of the
# single integer 'species' column. The indicator matrix is computed once and
# sliced per class.
_class_indicator = label_binarize(iris.target, classes=[0, 1, 2])
df = pd.DataFrame({
    'sepal length': iris.data[:, 0],
    'sepal width': iris.data[:, 1],
    'petal length': iris.data[:, 2],
    'petal width': iris.data[:, 3],
    'setosa': _class_indicator[:, 0],
    'versicolor': _class_indicator[:, 1],
    'virginica': _class_indicator[:, 2],
})
df.head()

In [None]:
# From here on the features are the raw data array rather than a DataFrame.
X = iris['data']
X[:5]

In [None]:
# Integer class ids, shown before they are binarized in the next cell.
y = iris['target']
y[:5]

In [None]:
# Replace the integer targets with the binarized form for classes 0, 1, 2;
# later cells index it by column (one column per class) and use y.shape[1]
# as the class count.
y = label_binarize(iris.target, classes=[0, 1, 2])
y[:5]

In [None]:
# 80/20 split of the raw features and the binarized targets; no seed is fixed.
test_size, random_state = 0.2, None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

### 멀티라벨 --> 이진분류: OneVsRest

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Wrap a decision tree in a one-vs-rest classifier; no seed is fixed.
random_state = None
_base_tree = DecisionTreeClassifier(random_state=random_state)
clf = OneVsRestClassifier(_base_tree)

In [None]:
# Train the one-vs-rest tree classifier on the binarized training targets.
clf.fit(X_train, y_train)

### 멀티라벨 ROC 커브

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Predicted probabilities for the test split; the ROC cell below reads them
# one class column at a time.
y_score = clf.predict_proba(X_test)

In [None]:
# Compute one ROC curve and its AUC per class column, keyed by class index.
n_classes = y.shape[1]
fpr, tpr, roc_auc = {}, {}, {}

# Transposing yields one (truth, score) column pair per class.
for i, (class_truth, class_score) in enumerate(zip(y_test.T, y_score.T)):
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_score)
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# Plot one ROC curve per class, with its AUC in the legend.
fig, ax = plt.subplots(figsize=(8, 6))

colors = ['C1', 'C2', 'C3']
for i, color in zip(range(n_classes), colors):
    curve_label = f'ROC curve of class {i} (area = {roc_auc[i]:0.2f})'
    ax.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Dashed reference diagonal from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Some extension of Receiver operating characteristic to multi-class',
)
_ = ax.legend(loc="lower right")

### 랜덤포레스트 ROC 커브

In [None]:
# Fresh 80/20 split of the binarized data for the forest ROC; no seed is fixed.
test_size, random_state = 0.2, None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

In [None]:
# Wrap a random forest in a one-vs-rest classifier; no seed is fixed.
random_state = None
_base_forest = RandomForestClassifier(random_state=random_state)
clf = OneVsRestClassifier(_base_forest)

In [None]:
# Train the one-vs-rest forest classifier on the binarized training targets.
clf.fit(X_train, y_train)

In [None]:
# Predicted probabilities for the test split; the ROC cell below reads them
# one class column at a time.
y_score = clf.predict_proba(X_test)

In [None]:
# Compute one ROC curve and its AUC per class column, keyed by class index.
n_classes = y.shape[1]
fpr, tpr, roc_auc = {}, {}, {}

# Transposing yields one (truth, score) column pair per class.
for i, (class_truth, class_score) in enumerate(zip(y_test.T, y_score.T)):
    fpr[i], tpr[i], _ = roc_curve(class_truth, class_score)
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# Plot one ROC curve per class, with its AUC in the legend.
fig, ax = plt.subplots(figsize=(8, 6))

colors = ['C1', 'C2', 'C3']
for i, color in zip(range(n_classes), colors):
    curve_label = f'ROC curve of class {i} (area = {roc_auc[i]:0.2f})'
    ax.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Dashed reference diagonal from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Some extension of Receiver operating characteristic to multi-class',
)
_ = ax.legend(loc="lower right")