In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

wine = load_wine()
wine_data = wine.data
wine_label = wine.target
print(wine.target_names)

x_train, x_test, y_train, y_test = train_test_split(wine_data, wine_label, test_size=0.3, random_state=55)


['class_0' 'class_1' 'class_2']


# 다양한 모델로 학습 + 평가하기

## Decision Tree

In [2]:
from sklearn.tree import DecisionTreeClassifier

decision_tree=DecisionTreeClassifier(random_state=100)
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)

print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        13
           1       0.91      0.91      0.91        23
           2       0.94      0.89      0.91        18

    accuracy                           0.91        54
   macro avg       0.90      0.91      0.91        54
weighted avg       0.91      0.91      0.91        54

accuracy :  0.9074074074074074
[[12  1  0]
 [ 1 21  1]
 [ 1  1 16]]


## Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier

random_forest=RandomForestClassifier(random_state=23)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        13
           1       1.00      0.96      0.98        23
           2       1.00      1.00      1.00        18

    accuracy                           0.98        54
   macro avg       0.98      0.99      0.98        54
weighted avg       0.98      0.98      0.98        54

accuracy :  0.9814814814814815
[[13  0  0]
 [ 1 22  0]
 [ 0  0 18]]


## SVM, Support Vector Machine

In [4]:
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(x_train, y_train)
y_pred = svm_model.predict(x_test)

print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      1.00      0.87        13
           1       0.59      0.96      0.73        23
           2       0.00      0.00      0.00        18

    accuracy                           0.65        54
   macro avg       0.45      0.65      0.53        54
weighted avg       0.44      0.65      0.52        54

accuracy :  0.6481481481481481
[[13  0  0]
 [ 1 22  0]
 [ 3 15  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SGDClassifier

In [5]:
from sklearn.linear_model import SGDClassifier
import warnings

warnings.filterwarnings('ignore')

sgd_model = SGDClassifier()
sgd_model.fit(x_train, y_train)
y_pred = sgd_model.predict(x_test)

print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.92      0.80        13
           1       0.59      0.96      0.73        23
           2       0.00      0.00      0.00        18

    accuracy                           0.63        54
   macro avg       0.43      0.63      0.51        54
weighted avg       0.42      0.63      0.50        54

accuracy :  0.6296296296296297
[[12  1  0]
 [ 1 22  0]
 [ 4 14  0]]


경고 발생 : Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

f1-score와 관련해서 경고가 발생했다

이 레이블에 대해 계산할 f점수가 없으므로 F 점수는 0.0으로 간주된다. 평균 점수를 요청 했기 때문에 점수에 0이 계산에 포함 된 것을 고려해야 한다.

import warnings
warnings.filterwarnings('ignore') 
이 모듈을 통해 경고를 나타나지 않게 할 수 있다 
예측되지 않은 레이블의 수에 관심이 없다고 결정한 다음 원하는 레이블을 명시적으로 지정한다

완벽하게 이해는 안된다

## LogisticRegression

In [6]:
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(solver='liblinear')
logistic_model.fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)

print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.96      1.00      0.98        23
           2       1.00      0.94      0.97        18

    accuracy                           0.98        54
   macro avg       0.99      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54

accuracy :  0.9814814814814815
[[13  0  0]
 [ 0 23  0]
 [ 0  1 17]]


# 정리
wine 데이터 셋의 경우 

- Decision Tree = 0.90 
- Random Forest = 1.0 
- SVM = 0.59
- SGDClassifier = 0.57
- Logistic Regression = 1.0 
으로 각 정확도가 나온다

wine 데이터의 경우 Random Forest와 Logistic Regression 모델이 1.0의 신뢰도를 보이며 오버피팅인지 학습이 정말 잘 된 것인지 모르겠다.
(= random_state의 값을 바꿔 여러번 동작해본 결과 1.0, 1.0 0.98 과 같이 높은 수치를 보인다)

Random Forest는 임의의 feature를 선택할 때 선택하는 갯수는 fearture 개수가 n이라면 $\sqrt{n}$ 개를 활용해 분류에 사용한다
따라서 feature 데이터가 너무 적어 Random Forest가 높은 수치를 받았을지도 모르겠다.

체크 : digits의 어떤 특성이 SVM과 가장 잘 맞는지는 잘 모르겠다.

In [None]:
# 최종 코드

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

wine = load_wine()
wine_data = wine.data
wine_label = wine.target
print(wine.target_names)

x_train, x_test, y_train, y_test = train_test_split(wine_data, wine_label, test_size=0.3, random_state=55)

random_forest=RandomForestClassifier(random_state=23)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)

print(classification_report(y_test, y_pred))
print("accuracy : ", accuracy_score(y_test, y_pred))