In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix # confusion_matrix는 데이터를 평가할 때 사용하는 오차행렬

# Load the handwritten-digits dataset that ships with scikit-learn.
digits = load_digits()

# Keep the features and the labels under the names the later cells use.
digits_data, digits_label = digits.data, digits.target

# Show the class labels present in the dataset.
print(digits.target_names)



[0 1 2 3 4 5 6 7 8 9]


In [2]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the split
# identical on every run, so all models below are compared on the same data.
x_train, x_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.3,
    random_state=10,
)

## 다양한 모델로 학습하기 + 평가하기(정확도, 오차행렬)

### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Fit one decision tree on the training split; random_state is fixed so the
# fitted tree does not vary between runs.
decision_tree = DecisionTreeClassifier(random_state=54).fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")
# Overall accuracy on the held-out split.
print("accuracy : ",accuracy_score(y_test, y_pred))

[[48  1  0  0  1  0  0  0  1  0]
 [ 0 44  3  3  1  0  0  2  4  0]
 [ 0  1 48  3  2  0  0  1  0  0]
 [ 0  1  5 47  1  0  1  0  0  1]
 [ 0  5  0  0 38  5  1  0  1  1]
 [ 0  0  1  0  4 43  1  0  0  2]
 [ 1  0  0  1  2  1 50  0  0  0]
 [ 0  0  0  1  4  0  1 52  0  2]
 [ 1  3  4  2  1  1  0  1 35  2]
 [ 0  0  2  3  0  0  0  2  3 44]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        51
           1       0.80      0.77      0.79        57
           2       0.76      0.87      0.81        55
           3       0.78      0.84      0.81        56
           4       0.70      0.75      0.72        51
           5       0.86      0.84      0.85        51
           6       0.93      0.91      0.92        55
           7       0.90      0.87      0.88        60
           8       0.80      0.70      0.74        50
           9       0.85      0.81      0.83        54

    accuracy                           0.83       540
   macro avg       

### Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest: train on the training split, then label the test split.
random_forest = RandomForestClassifier(random_state=32)
y_pred = random_forest.fit(x_train, y_train).predict(x_test)

# Print the confusion matrix followed by the per-class report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)
# Overall accuracy on the held-out split.
print("accuracy : ", accuracy_score(y_test, y_pred))

[[50  0  0  0  1  0  0  0  0  0]
 [ 0 56  0  0  0  1  0  0  0  0]
 [ 0  0 55  0  0  0  0  0  0  0]
 [ 0  0  0 55  0  0  0  0  1  0]
 [ 0  0  0  0 49  0  0  1  1  0]
 [ 0  0  0  0  0 51  0  0  0  0]
 [ 1  1  0  0  0  0 53  0  0  0]
 [ 0  0  0  0  1  0  0 59  0  0]
 [ 0  2  1  1  0  1  0  0 44  1]
 [ 0  0  0  1  0  1  0  2  0 50]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        51
           1       0.95      0.98      0.97        57
           2       0.98      1.00      0.99        55
           3       0.96      0.98      0.97        56
           4       0.96      0.96      0.96        51
           5       0.94      1.00      0.97        51
           6       1.00      0.96      0.98        55
           7       0.95      0.98      0.97        60
           8       0.96      0.88      0.92        50
           9       0.98      0.93      0.95        54

    accuracy                           0.97       540
   macro avg       

### SVM (Support Vector Machine)

In [22]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings.
svm_model = svm.SVC()
y_pred = svm_model.fit(x_train, y_train).predict(x_test)

# Print the confusion matrix followed by the per-class report.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")
# Overall accuracy on the held-out split.
print("accuracy : ",accuracy_score(y_test, y_pred))

[[50  0  0  0  1  0  0  0  0  0]
 [ 0 57  0  0  0  0  0  0  0  0]
 [ 0  0 55  0  0  0  0  0  0  0]
 [ 0  0  1 55  0  0  0  0  0  0]
 [ 0  0  0  0 49  0  0  0  2  0]
 [ 0  0  0  0  0 51  0  0  0  0]
 [ 0  0  0  0  0  0 55  0  0  0]
 [ 0  0  0  0  1  0  0 59  0  0]
 [ 0  1  0  0  0  0  0  0 48  1]
 [ 0  0  0  0  0  1  0  1  0 52]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.98      1.00      0.99        57
           2       0.98      1.00      0.99        55
           3       1.00      0.98      0.99        56
           4       0.96      0.96      0.96        51
           5       0.98      1.00      0.99        51
           6       1.00      1.00      1.00        55
           7       0.98      0.98      0.98        60
           8       0.96      0.96      0.96        50
           9       0.98      0.96      0.97        54

    accuracy                           0.98       540
   macro avg       

### Stochastic Gradient Descent Classifier (SGDClassifier)

In [23]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data between epochs, so an unseeded
# model yields a different confusion matrix and accuracy on every run.
# Pinning random_state makes this cell reproducible under
# Restart Kernel -> Run All, like the seeded tree/forest cells.
# NOTE: any output saved from an earlier unseeded run will not match exactly;
# re-run the cell once to refresh it.
sgd_model = SGDClassifier(random_state=10)

sgd_model.fit(x_train, y_train)
y_pred = sgd_model.predict(x_test)

# Confusion matrix: rows are true digits, columns are predicted digits.
print(confusion_matrix(y_test, y_pred))

# Per-class precision/recall/F1, then overall accuracy on the held-out split.
print(classification_report(y_test, y_pred))
print("accuracy : ",accuracy_score(y_test, y_pred))

[[50  0  0  0  1  0  0  0  0  0]
 [ 0 56  0  0  1  0  0  0  0  0]
 [ 0  0 55  0  0  0  0  0  0  0]
 [ 0  0  1 53  0  2  0  0  0  0]
 [ 0  2  0  0 48  0  0  0  1  0]
 [ 0  2  0  0  0 49  0  0  0  0]
 [ 0  0  1  0  0  0 54  0  0  0]
 [ 0  0  0  0  2  0  0 58  0  0]
 [ 0 12  1  1  2  0  1  0 33  0]
 [ 0  2  0  0  1  1  0  2  1 47]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.76      0.98      0.85        57
           2       0.95      1.00      0.97        55
           3       0.98      0.95      0.96        56
           4       0.87      0.94      0.91        51
           5       0.94      0.96      0.95        51
           6       0.98      0.98      0.98        55
           7       0.97      0.97      0.97        60
           8       0.94      0.66      0.78        50
           9       1.00      0.87      0.93        54

    accuracy                           0.93       540
   macro avg       

### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the 'newton-cg' solver.
# Alternative solvers previously kept here as commented-out lines:
# 'liblinear' and 'saga'.
logistic_model = LogisticRegression(solver='newton-cg')
y_pred = logistic_model.fit(x_train, y_train).predict(x_test)

# Print the confusion matrix followed by the per-class report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)
# Overall accuracy on the held-out split.
print("accuracy : ",accuracy_score(y_test, y_pred))

[[51  0  0  0  0  0  0  0  0  0]
 [ 0 55  1  0  0  0  0  0  1  0]
 [ 0  0 55  0  0  0  0  0  0  0]
 [ 0  0  1 54  0  1  0  0  0  0]
 [ 0  2  0  0 48  0  0  0  1  0]
 [ 0  1  0  0  0 49  0  1  0  0]
 [ 0  1  0  0  0  0 54  0  0  0]
 [ 0  0  0  1  1  0  0 58  0  0]
 [ 1  0  1  1  0  0  1  0 46  0]
 [ 0  0  0  0  0  1  0  0  2 51]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        51
           1       0.93      0.96      0.95        57
           2       0.95      1.00      0.97        55
           3       0.96      0.96      0.96        56
           4       0.98      0.94      0.96        51
           5       0.96      0.96      0.96        51
           6       0.98      0.98      0.98        55
           7       0.98      0.97      0.97        60
           8       0.92      0.92      0.92        50
           9       1.00      0.94      0.97        54

    accuracy                           0.96       540
   macro avg       

# 모델 평가하기

## 정확도(Accuracy)

In [8]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
# NOTE(review): `y_pred` is whatever the most recently executed model cell
# assigned; this cell does not choose a model itself. Confirm which model is
# meant to be evaluated here.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9592592592592593

## 오차행렬

In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix, then the per-class precision/recall/F1 report.
# NOTE(review): `y_pred` is whatever the most recently executed model cell
# assigned; confirm which model is meant to be evaluated here.
print(
    confusion_matrix(y_true=y_test, y_pred=y_pred),
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)


[[51  0  0  0  0  0  0  0  0  0]
 [ 0 54  1  0  1  0  0  0  1  0]
 [ 0  0 55  0  0  0  0  0  0  0]
 [ 0  0  1 54  0  1  0  0  0  0]
 [ 0  2  0  0 48  0  0  0  1  0]
 [ 0  1  0  0  0 49  0  1  0  0]
 [ 0  1  0  0  0  0 54  0  0  0]
 [ 0  0  0  2  1  0  0 56  0  1]
 [ 0  1  1  1  0  0  1  0 46  0]
 [ 0  0  0  0  0  1  0  0  2 51]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       0.92      0.95      0.93        57
           2       0.95      1.00      0.97        55
           3       0.95      0.96      0.96        56
           4       0.96      0.94      0.95        51
           5       0.96      0.96      0.96        51
           6       0.98      0.98      0.98        55
           7       0.98      0.93      0.96        60
           8       0.92      0.92      0.92        50
           9       0.98      0.94      0.96        54

    accuracy                           0.96       540
   macro avg       

참고 사이트
https://blog.naver.com/PostView.nhn?blogId=wideeyed&logNo=221531940245
https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9
https://eunsukimme.github.io/ml/2019/10/21/Accuracy-Recall-Precision-F1-score/

# 정리

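digits 데이터셋은 클래스별 샘플 수가 비교적 고르게 분포되어 있고(테스트셋 기준 클래스당 50~60개), 오차행렬에서 계산한 precision·recall·f1-score와 정확도(accuracy) 사이에 큰 차이가 없다. 따라서 정확도만으로 모델을 비교해도 무리가 없다고 판단해 정확도를 평가 지표로 사용한다.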

- Decision Tree = 0.83
- Random Forest = 0.97
- SVM = 0.98
- SGDClassifier = 0.93
- Logistic Regression = 0.96
으로 각 정확도가 나온다

digits 데이터셋과 같은 경우 SVM(Support Vector Machine)과 조합이 괜찮은 것을 알 수 있다.
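선형 분류 알고리즘은 기본적으로 이진 분류기이지만, 일대다(one-vs-rest) 방법을 사용하면 다중 클래스 분류에도 쓸 수 있다. 이 방법은 각 클래스를 나머지 모든 클래스와 구분하도록 이진 분류 모델을 하나씩 학습시킨다.
클래스의 수만큼 이진 분류 모델이 만들어지고, 예측할 때는 만들어진 모든 이진 분류기가 작동하여 가장 높은 점수를 낸 분류기의 클래스를 예측값으로 선택한다. (참고: 위에서 사용한 `svm.SVC()`는 기본 커널이 RBF인 비선형 모델이며, 다중 클래스를 내부적으로 일대일(one-vs-one) 방식으로 처리한다.)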

10개의 클래스를 모두 이진 분류를 여러번 하는 학습방법이 digits에겐 알맞다

단, digits의 경우 클래스의 수가 10개밖에 안 돼서 선형 분류를 사용해도 괜찮지만, 클래스 수가 엄청나게 많아진다면 오히려 효율이 떨어질 수도 있을 것으로 예상된다.

체크 : digits의 데이터를 분류하는 것이 SVM과 가장 잘 맞는지는 잘 모르겠다.


In [27]:
# Final code: the SVM workflow on its own, from loading the data to scoring.

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn import svm

# Load the digits dataset and pull out the feature matrix and the labels.
digits = load_digits()
digits_data, digits_label = digits.data, digits.target
print(digits.target_names)  # class labels present in the dataset

# 70/30 train/test split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.3,
    random_state=10,
)

# Support vector classifier with default settings: fit, then predict.
svm_model = svm.SVC()
y_pred = svm_model.fit(x_train, y_train).predict(x_test)

# Overall accuracy on the held-out split.
print("accuracy : ",accuracy_score(y_test, y_pred))

[0 1 2 3 4 5 6 7 8 9]
accuracy :  0.9833333333333333
