## 모듈 import

In [20]:
import sklearn

print(sklearn.__version__)        # Print the installed scikit-learn version right after importing it

1.0.2


In [21]:
from sklearn.datasets import load_digits                  # 필요한 함수들을 라이브러리 모듈에서 가져옴
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## 데이터 준비

In [22]:
digits = load_digits()             # Load the handwritten-digits dataset and keep it in `digits` for the rest of the notebook

print(dir(digits))                 # List the attribute names exposed by the returned object
digits.keys()                      # Show the dataset's keys (displayed as the cell's result)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']


dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [23]:
digits_data = digits.data            # Keep the dataset's `data` field (the features) in digits_data

print(digits_data.shape)             # Prints (1797, 64): 1797 samples, each holding 64 values

digits_data[0]                       # First sample: 64 values, i.e. one flattened 8x8 image of integer pixel intensities (pixels, not bits)

(1797, 64)


array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [24]:
digits_label = digits.target         # Keep the dataset's `target` field (the labels) in digits_label

print(digits_label.shape)            # Prints (1797,): a single label per sample, each one of the digits 0-9

digits_label[0]                      # Label of the first sample, which is 0 (the first ten labels appear to run 0-9 in order, later ones vary — not verified in this cell)

(1797,)


0

## 데이터 이해하기

* Feature Data 지정하기
* Label Data 지정하기
* Target Names 출력해 보기
* 데이터 Describe 해 보기

In [25]:
## Specify the Feature Data / Label Data

import pandas as pd                 # Import pandas under the conventional alias pd

digits_df = pd.DataFrame(data=digits_data, columns=digits.feature_names)
                                    # Wrap the feature array in a DataFrame, using the dataset's feature names as column names
digits_df                           # Displays the samples with 64 pixel columns (pixel_0_0 .. pixel_7_7, one per cell of the 8x8 image)

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


In [26]:
digits_df["label"] = digits.target  # Append the dataset's targets to digits_df as a new "label" column

digits_df                           # Display the frame again to confirm the column was added

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,label
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


In [27]:
## Print the Target Names

digits.target_names                 # Show the target names (the digits 0-9)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [28]:
## Data Describe

print(digits.DESCR)                 # Print the description text bundled with the digits dataset

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 1797
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

## Train / Test 데이터 분리

In [29]:
X_train, X_test, y_train, y_test = train_test_split(digits_data,                # Feature
                                                    digits_label,               # Label
                                                    test_size=0.2,              # Hold out 20% for evaluation; the remaining 80% is used for training
                                                    random_state=7)             # Fixed seed: the shuffle is reproducible, so every model is evaluated on the same split


print('X_train 개수: ', len(X_train),', X_test 개수: ', len(X_test))            # Check the number of training and test samples via the array lengths
                                                                                # -> the data is now ready for training and evaluation

X_train 개수:  1437 , X_test 개수:  360


## 다양한 모델로 학습시켜보기

* Decision Tree 사용해 보기
* Random Forest 사용해 보기
* SVM 사용해 보기
* SGD Classifier 사용해 보기
* Logistic Regression 사용해 보기

In [30]:
## 학습에 필요한 다양한 모델 import

from sklearn.tree import DecisionTreeClassifier           # 의사결정나무
from sklearn.ensemble import RandomForestClassifier       # 랜덤포레스트
from sklearn import svm                                   # Support Vector Machine
from sklearn.linear_model import SGDClassifier            # 확률적 경사 하강
from sklearn.linear_model import LogisticRegression       # 로지스틱 회귀

사이킷런의 모델들은 모두 `fit`/`predict`라는 동일한 인터페이스를 제공하기 때문에, 다양한 모델을 이용하더라도 코드를 크게 바꾸지 않고 몇 줄만 수정하여 학습과 예측을 수행할 수 있다.

In [31]:
## Decision Tree

# Create the classifier with a fixed seed and train it on the training split
# in one step (fit() returns the fitted estimator itself).
decision_tree = DecisionTreeClassifier(random_state=32).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = decision_tree.predict(X_test)

# Report per-class precision / recall / f1-score and the overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.81      0.81      0.81        42
           2       0.79      0.82      0.80        40
           3       0.79      0.91      0.85        34
           4       0.83      0.95      0.89        37
           5       0.90      0.96      0.93        28
           6       0.84      0.93      0.88        28
           7       0.96      0.82      0.89        33
           8       0.88      0.65      0.75        43
           9       0.78      0.78      0.78        32

    accuracy                           0.86       360
   macro avg       0.86      0.86      0.86       360
weighted avg       0.86      0.86      0.85       360



In [32]:
## Random Forest

# Create the random-forest classifier with a fixed seed and train it on the
# training split (fit() returns the fitted estimator itself).
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = random_forest.predict(X_test)

# Report per-class precision / recall / f1-score and the overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.93      1.00      0.97        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       0.93      1.00      0.96        37
           5       0.90      0.96      0.93        28
           6       1.00      0.96      0.98        28
           7       0.94      0.97      0.96        33
           8       1.00      0.84      0.91        43
           9       0.94      0.94      0.94        32

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.97      0.96      0.96       360



In [33]:
## SVM

# Create a support-vector classifier with default settings and train it on
# the training split (fit() returns the fitted estimator itself).
svm_model = svm.SVC().fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = svm_model.predict(X_test)

# Report per-class precision / recall / f1-score and the overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      1.00      0.98        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       1.00      1.00      1.00        37
           5       0.93      1.00      0.97        28
           6       1.00      1.00      1.00        28
           7       1.00      1.00      1.00        33
           8       1.00      0.93      0.96        43
           9       1.00      0.97      0.98        32

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



In [34]:
## SGD

# SGDClassifier shuffles the training data between epochs, so without a seed
# the fitted model (and therefore the report) changes on every run. Fixing
# random_state makes this cell reproducible, like the seeded train/test split.
# NOTE: the report recorded below this cell came from an unseeded run;
# re-run the cell to refresh it.
sgd_model = SGDClassifier(random_state=32)

sgd_model.fit(X_train, y_train)                       # Train the SGD model on the training split
y_pred = sgd_model.predict(X_test)                    # Predict labels for the test split

print(classification_report(y_test, y_pred))          # Report per-class precision / recall / f1-score and overall accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.92      0.79      0.85        42
           2       1.00      1.00      1.00        40
           3       0.94      0.94      0.94        34
           4       0.97      0.95      0.96        37
           5       0.96      0.93      0.95        28
           6       0.93      0.93      0.93        28
           7       0.97      0.97      0.97        33
           8       0.78      0.93      0.85        43
           9       0.91      0.91      0.91        32

    accuracy                           0.93       360
   macro avg       0.94      0.93      0.93       360
weighted avg       0.94      0.93      0.93       360



In [35]:
## Logistic Regression

# Create the model with the iteration cap raised to 5000 and train it on the
# training split (fit() returns the fitted estimator itself).
logistic_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = logistic_model.predict(X_test)

# Report per-class precision / recall / f1-score and the overall accuracy.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      0.95      0.95        42
           2       0.98      1.00      0.99        40
           3       0.94      0.97      0.96        34
           4       1.00      1.00      1.00        37
           5       0.79      0.96      0.87        28
           6       1.00      0.96      0.98        28
           7       0.97      0.97      0.97        33
           8       0.92      0.84      0.88        43
           9       0.97      0.88      0.92        32

    accuracy                           0.95       360
   macro avg       0.95      0.95      0.95       360
weighted avg       0.96      0.95      0.95       360



## 모델을 평가해 보기

* 손글씨 인식 data에 대해서는 Decision Tree 모델(accuracy 0.86)을 제외한 나머지 모델에서 0.9 이상의 높은 accuracy를 확인할 수 있었다. 그중에서도 SVM 모델이 가장 적합하다고 판단했다.
* SVM은 0~9 모든 class에 대해 precision과 recall이 고르게 높았고(최저 0.93), 전체 accuracy도 0.99로 가장 높았기 때문이다.

## Discussion
손글씨 dataset은 이번에 진행한 세 개의 프로젝트 중 첫 번째로 작성한 것이어서, 파이썬이 익숙하지 않은 나에게는 코드 자체를 이해하는 것이 특히 힘들었다. 하지만 8x8(64픽셀) 이미지를 1차원 배열로 풀어서 처리하는 부분은 관심 있는 분야인 영상처리와도 비슷해서 흥미롭게 공부할 수 있었다. 기회가 된다면 음성 인식과 같은 자연어 처리 분야도 공부해 보고 싶어졌다.