## 0. Load Library

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## 1. Data Load

In [3]:
iris = load_iris()

In [4]:
iris_x = iris.data
iris_y = iris.target

In [5]:
print("[iris.keys()] ", iris.keys())

[iris.keys()]  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [6]:
print("[iris.DESCR] ", iris.DESCR)

[iris.DESCR]  .. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.n

In [7]:
print("[iris.data_module] ", iris.data_module)

[iris.data_module]  sklearn.datasets.data


In [8]:
print("[iris.filename] ", iris.filename)

[iris.filename]  iris.csv


In [9]:
print("[iris.feature_names] ", iris.feature_names)

[iris.feature_names]  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [10]:
print("[iris.target_names] ", iris.target_names)

[iris.target_names]  ['setosa' 'versicolor' 'virginica']


In [11]:
print("[iris_x.shape] ", iris_x.shape)

[iris_x.shape]  (150, 4)


In [12]:
print("[iris_x] ", iris_x)

[iris_x]  [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [

In [13]:
print("[iris_y.shape] ", iris_y.shape)

[iris_y.shape]  (150,)


In [14]:
print(type(iris_y[0]))

<class 'numpy.int32'>


In [None]:
print("[iris_y] ", iris_y)

In [43]:
print("[iris.frame] ", iris.frame)

[iris.frame]  None


## 2. Data Preprocessing

In [3]:
X_train, X_test, y_train, y_test = train_test_split(iris_x,
                                                    iris_y,
                                                    test_size=0.2,
                                                    random_state=7)

## 3. Model Train & Predict & Visualization

### 1) Decision Tree

In [4]:
# 1) Train
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)

# 2) Predict
y_pred = decision_tree.predict(X_test)

In [5]:
# 3) Visualization
con_mat = confusion_matrix(y_test, y_pred) # Confusion Matrix
con_mat

array([[ 7,  0,  0],
       [ 0, 10,  2],
       [ 0,  1, 10]], dtype=int64)

In [6]:
accuracy = accuracy_score(y_test, y_pred) # Accuracy
accuracy

0.9

In [7]:
print(classification_report(y_test, y_pred)) # Precision, Recall, F1 score 지표 정리

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.91      0.83      0.87        12
           2       0.83      0.91      0.87        11

    accuracy                           0.90        30
   macro avg       0.91      0.91      0.91        30
weighted avg       0.90      0.90      0.90        30



### 2) Random Forest

In [8]:
# 1) Train
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)

# 2) Predict
y_pred = random_forest.predict(X_test)

In [9]:
# 3) Visualization
con_mat = confusion_matrix(y_test, y_pred)
con_mat

array([[ 7,  0,  0],
       [ 0, 10,  2],
       [ 0,  2,  9]], dtype=int64)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8666666666666667

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.83      0.83      0.83        12
           2       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



### 3) SVM

In [12]:
# 1) Train
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)

# 2) Predict
y_pred = svm_model.predict(X_test)

In [13]:
# 3) Visualization
con_mat = confusion_matrix(y_test, y_pred)
con_mat

array([[ 7,  0,  0],
       [ 0, 10,  2],
       [ 0,  2,  9]], dtype=int64)

In [None]:
# confusion matrix 그리는 함수 
def plot_confusion_matrix(con_mat, labels, title='Confusion Matrix', cmap=plt.cm.get_cmap('Blues'), normalize=False):
    plt.imshow(con_mat, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    marks = np.arange(len(labels))
    nlabels = []
    for k in range(len(con_mat)):
        n = sum(con_mat[k])
        nlabel = '{0}(n={1})'.format(labels[k],n)
        nlabels.append(nlabel)
    plt.xticks(marks, labels)
    plt.yticks(marks, nlabels)

    thresh = con_mat.max() / 2.
    if normalize:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, '{0}%'.format(con_mat[i, j] * 100 / n), horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    else:
        for i, j in itertools.product(range(con_mat.shape[0]), range(con_mat.shape[1])):
            plt.text(j, i, con_mat[i, j], horizontalalignment="center", color="white" if con_mat[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# 예측값과 참값 
pred_labels = np.argmax(predictions, axis=1)
true_labels = test_set.labels

#메인 실행 
confusion_matrix = confusion_matrix(true_labels, pred_labels)
plot_confusion_matrix(confusion_matrix, labels=labels, normalize=True)

In [14]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8666666666666667

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.83      0.83      0.83        12
           2       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30



### 4) SGD

In [16]:
# 1) Train
sgd_model = SGDClassifier()
sgd_model.fit(X_train, y_train)

# 2) Predict
y_pred = sgd_model.predict(X_test)

In [17]:
# 3) Visualization
con_mat = confusion_matrix(y_test, y_pred) # Confusion Matrix
con_mat

array([[ 7,  0,  0],
       [ 0,  0, 12],
       [ 0,  0, 11]], dtype=int64)

In [18]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.00      0.00      0.00        12
           2       0.48      1.00      0.65        11

    accuracy                           0.60        30
   macro avg       0.49      0.67      0.55        30
weighted avg       0.41      0.60      0.47        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 5) Logistic Regression

In [20]:
# 1) Train
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# 2) Predict
y_pred = logistic_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# 3) Visualization
con_mat = confusion_matrix(y_test, y_pred) # Confusion Matrix
con_mat

array([[ 7,  0,  0],
       [ 0, 10,  2],
       [ 0,  2,  9]], dtype=int64)

In [22]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8666666666666667

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.83      0.83      0.83        12
           2       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.88      0.88      0.88        30
weighted avg       0.87      0.87      0.87        30

