In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn.datasets import load_wine


In [2]:
wine = load_wine()

In [3]:
print(dir(wine))

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']


In [4]:
data = wine.data
label = wine.target

In [6]:
print(wine.target_names)

['class_0' 'class_1' 'class_2']


In [7]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.2, stratify = label, random_state = 42)

In [9]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC()
sgd = SGDClassifier()
ld = LogisticRegression()

In [10]:
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(classification_report(y_test, dt_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        12
           1       0.93      0.93      0.93        14
           2       1.00      1.00      1.00        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.94      0.94      0.94        36



In [11]:
print('accuracy :', accuracy_score(y_test, dt_pred))
print('recall :', recall_score(y_test, dt_pred, average='macro'))
print('precision :',precision_score(y_test, dt_pred, average='macro'))
print('f1 score :', f1_score(y_test, dt_pred, average='macro'))

accuracy : 0.9444444444444444
recall : 0.9484126984126985
precision : 0.9484126984126985
f1 score : 0.9484126984126985


In [12]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [13]:
print('accuracy :', accuracy_score(y_test, rf_pred))
print('recall :', recall_score(y_test, rf_pred, average='macro'))
print('precision :',precision_score(y_test, rf_pred, average='macro'))
print('f1 score :', f1_score(y_test, rf_pred, average='macro'))

accuracy : 1.0
recall : 1.0
precision : 1.0
f1 score : 1.0


In [14]:
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        12
           1       0.61      1.00      0.76        14
           2       0.00      0.00      0.00        10

    accuracy                           0.69        36
   macro avg       0.51      0.64      0.56        36
weighted avg       0.54      0.69      0.60        36



In [15]:
print('accuracy :', accuracy_score(y_test, svm_pred))
print('recall :', recall_score(y_test, svm_pred, average='macro'))
print('precision :',precision_score(y_test, svm_pred, average='macro'))
print('f1 score :', f1_score(y_test, svm_pred, average='macro'))

accuracy : 0.6944444444444444
recall : 0.6388888888888888
precision : 0.5084541062801932
f1 score : 0.5578078078078078


In [20]:
print(confusion_matrix(y_test, svm_pred))

[[11  0  1]
 [ 0 14  0]
 [ 1  9  0]]


In [16]:
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
print(classification_report(y_test, sgd_pred))

              precision    recall  f1-score   support

           0       0.58      0.92      0.71        12
           1       0.88      0.50      0.64        14
           2       0.56      0.50      0.53        10

    accuracy                           0.64        36
   macro avg       0.67      0.64      0.62        36
weighted avg       0.69      0.64      0.63        36



In [17]:
print('accuracy :', accuracy_score(y_test, sgd_pred))
print('recall :', recall_score(y_test, sgd_pred, average='macro'))
print('precision :',precision_score(y_test, sgd_pred, average='macro'))
print('f1 score :', f1_score(y_test, sgd_pred, average='macro'))

accuracy : 0.6388888888888888
recall : 0.6388888888888888
precision : 0.6698343079922028
f1 score : 0.6241189483973865


In [18]:
ld.fit(X_train, y_train)
ld_pred = ld.predict(X_test)
print(classification_report(y_test, ld_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.93      1.00      0.97        14
           2       1.00      0.90      0.95        10

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.97        36
weighted avg       0.97      0.97      0.97        36



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
print('accuracy :', accuracy_score(y_test, ld_pred))
print('recall :', recall_score(y_test, ld_pred, average='macro'))
print('precision :',precision_score(y_test, ld_pred, average='macro'))
print('f1 score :', f1_score(y_test, ld_pred, average='macro'))

accuracy : 0.9722222222222222
recall : 0.9666666666666667
precision : 0.9777777777777779
f1 score : 0.9709618874773142


Random Forest가 현재의 test dataset에 대한 label을 모두 예측해서 가장 좋은 성능을 보인다.

SVD 모델의 경우 class2를 아예 예측하지 못한 경우도 보인다. 이러한 경우에는 accuracy 보다는 얼마나 실제 데이터를 잘 나타내었는가를 타나내는 recall 값을 사용하는 것이 타당해보인다.