In [537]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

## load_wine

In [538]:
wine = load_wine()

In [539]:
print(dir(wine))

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']


In [540]:
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

### Feature Data

In [541]:
wine_data= wine.data

In [542]:
print(wine_data.shape)

(178, 13)


In [543]:
wine_data

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

### Lable Data

In [544]:
wine_label = wine.target

In [545]:
print(wine_label.shape)

(178,)


In [546]:
wine_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

### Target Name

In [547]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

### 데이터 Describe

In [548]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [549]:
 wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

### train, test 데이터 분리

In [550]:
X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2)

In [551]:
print("X_train 개수:", len(X_train), "X_test 개수:",len(X_test))

X_train 개수: 142 X_test 개수: 36


## 1. Decision Tree

In [552]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       0.89      0.80      0.84        10
           2       0.90      0.90      0.90        10

    accuracy                           0.92        36
   macro avg       0.91      0.90      0.90        36
weighted avg       0.92      0.92      0.91        36



In [553]:
decision_accuracy = accuracy_score(y_test, y_pred)
decision_accuracy

0.9166666666666666

## 2. RandomForest

In [578]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state = 32)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [579]:
random_accuracy = accuracy_score(y_test, y_pred)
random_accuracy

1.0

## 3. SVM

In [557]:
from sklearn import svm 
svm_model = svm.SVC(kernel='linear')

In [558]:
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        16
           1       0.91      1.00      0.95        10
           2       1.00      1.00      1.00        10

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.97        36
weighted avg       0.97      0.97      0.97        36



In [559]:
svm_accuracy = accuracy_score(y_test, y_pred)
svm_accuracy

0.9722222222222222

### 4. SGD Classifier

In [560]:
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()

In [561]:
sgd_model.fit(X_train,y_train)
y_pred = sgd_model.predict(X_test)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81        16
           1       0.40      0.80      0.53        10
           2       0.00      0.00      0.00        10

    accuracy                           0.58        36
   macro avg       0.40      0.54      0.45        36
weighted avg       0.47      0.58      0.51        36



In [562]:
sgd_accuracy = accuracy_score(y_test, y_pred)
sgd_accuracy

0.5833333333333334

### 5. Logistic Regression

In [563]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(max_iter=5000)

In [564]:
logistic_model.fit(X_train,y_train)
y_pred = logistic_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        16
           1       0.90      0.90      0.90        10
           2       1.00      1.00      1.00        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.94      0.94      0.94        36



In [565]:
log_accuracy = accuracy_score(y_test, y_pred)
log_accuracy

0.9444444444444444

# 정확도

In [566]:
print('의사결정나무 정확도 : ' ,decision_accuracy)
print('랜덤포레스트 정확도 : ' ,random_accuracy)
print('SVM 정확도 :', svm_accuracy)
print('SGD 정확도 : ' ,sgd_accuracy)
print('Logistic Regression 정확도 : ' ,log_accuracy)

의사결정나무 정확도 :  0.9166666666666666
랜덤포레스트 정확도 :  1.0
SVM 정확도 : 0.9722222222222222
SGD 정확도 :  0.5833333333333334
Logistic Regression 정확도 :  0.9444444444444444


# 모델 평가

렌덤 포레스트가 가장 좋은 성능을 보인다. 랜텀 포레스트는 와인의 feature 중 무작위로 일부만 선택을 해 와인의 class를 분류하는 방법이다.
이 방법을 반복하면서 여러 개의 결정 트리를 만들면서 정확하게 와인의 class와 일치하게 되는 경우가 많아 성능이 높아졌다고 생각한다.

## 회고

어려웠던 점은 랜덤포레스트에서 정확도가 1이 나오는 경우가 있어서 수정하냐고 어려웠다.
알아낸 점은 division 0 인 경우 오류가 떠서 구글에 직접 찾아봐서 zero_division=0을 추가 해야 한다는 점을 알게 되었다.
루브릭 평가 지표에 맞추기 위해 어떤 평가 지표로 사용하는게 유리한지를 구글에 실제 모델에 장점과 단점을 찾아보면서 비교했다.
아직은 주어진 데이터를 갖고 모델을 돌리는 것이 어렵지만 나중에는 이해 할 수 있을 것 같다.