## Scikit-Learn

In [8]:
import numpy as np
import pandas as pd

### 1. Preprocessing

In [1]:
# Load the Iris dataset that ships with scikit-learn. The returned object
# exposes .data, .target, .feature_names, .target_names and .DESCR, all of
# which are inspected in the cells below.
from sklearn.datasets import load_iris
iris = load_iris()

In [2]:
# Feature matrix (X): one row per sample, one column per measured feature.
iris.data.shape

(150, 4)

In [3]:
# Names of the feature columns (reused later as DataFrame column labels).
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
# Target vector (y): one class label per sample.
iris.target.shape

(150,)

In [6]:
# Human-readable names of the classes.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
# Print the description text bundled with the dataset.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [9]:
# Wrap the feature matrix in a DataFrame, labelling each column with its feature name.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [10]:
# Attach the class labels from iris.target as a 'species' column.
# NOTE: this modifies iris_df in place, so the frame displayed by the
# previous cell no longer reflects the current contents of iris_df.
iris_df['species'] = iris.target
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


### 2. Splitting the data into training and test sets

In [15]:
from sklearn.model_selection import train_test_split

# Naming convention: uppercase for 2-D arrays (X), lowercase for 1-D arrays (y).
# 20% of the samples are held out for testing; random_state pins the shuffle.
# NOTE(review): no stratify= argument is passed, and the recorded y_test output
# is class-imbalanced (14 / 10 / 6 samples of classes 0 / 1 / 2) - consider
# stratify=iris.target if balanced evaluation matters.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=2021,
)

In [17]:
# Sanity-check the split: row counts of the feature and label arrays must match per split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [18]:
# True class labels of the held-out test samples.
y_test

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 1, 1, 0, 1, 1, 2,
       2, 0, 2, 1, 1, 1, 0, 0])

### 3. Training machine learning models

- Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
# Create the estimator (model) object; nothing is trained yet.
dt_clf = DecisionTreeClassifier(random_state = 2021)    # constructor arguments are the hyperparameters

In [35]:
# Train the decision tree on the training split.
dt_clf.fit(X_train, y_train)    # fit(training features, training labels) | runs the training

DecisionTreeClassifier(random_state=2021)

- Support Vector Machine

In [36]:
# Support vector classifier with all hyperparameters left at their defaults,
# trained on the same training split as the decision tree.
from sklearn.svm import SVC
sv_clf = SVC()
sv_clf.fit(X_train, y_train)

SVC()

- Logistic Regression

In [37]:
# Logistic regression with all hyperparameters left at their defaults,
# trained on the same training split as the other two models.
# NOTE(review): the features are unscaled and the default iteration limit is
# used; if fit() emits a ConvergenceWarning, raise max_iter or scale the
# features first - TODO confirm on a fresh run.
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

LogisticRegression()

### 4. Making predictions

In [44]:
# Decision Tree: predict labels for the held-out test features.
pred_dt = dt_clf.predict(X_test)

In [47]:
# SVM: predict labels for the held-out test features.
pred_sv = sv_clf.predict(X_test)

In [46]:
# Logistic Regression: predict labels for the held-out test features.
pred_lr = lr_clf.predict(X_test)

### 5. Evaluating the models

In [40]:
from sklearn.metrics import accuracy_score

# Compare the Decision Tree's predictions against the true test labels.
score = accuracy_score(y_true=y_test, y_pred=pred_dt)
print(f'Decision Tree prediction accuracy rate : {score:.4f}')

Decision Tree prediction accuracy rate : 1.0000


In [42]:
# Shortcut: score() predicts on X_test and reports accuracy against y_test
# in one call (same figure as the accuracy_score result in the previous cell).
dt_clf.score(X_test, y_test)

1.0

In [48]:
# Test-set score of the SVM classifier.
sv_clf.score(X_test, y_test)

1.0

In [49]:
# Test-set score of the logistic regression classifier.
lr_clf.score(X_test, y_test)

1.0

In [50]:
# Side-by-side table: true test labels next to each model's predictions,
# one row per test sample.
pred_df = pd.DataFrame(
    data={
        'original': y_test,
        'Decision Tree': pred_dt,
        'SVM': pred_sv,
        'Logistic Regression': pred_lr,
    }
)
pred_df

Unnamed: 0,original,Decision Tree,SVM,Logistic Regression
0,0,0,0,0
1,0,0,0,0
2,1,1,1,1
3,0,0,0,0
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0
8,0,0,0,0
9,0,0,0,0
