In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_mldata
from scipy import io
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

mnist = io.loadmat('mnist-original.mat') 
X = mnist['data'].T
y = mnist['label'].T

print (X.shape, y.shape)

(70000, 784) (70000, 1)


In [2]:
feat_cols = [ 'pixel'+str(i) for i in range(X.shape[1]) ]
df = pd.DataFrame(X,columns=feat_cols)

In [3]:
df['y'] = y
print('Size of the dataframe: {}'.format(df.shape))

Size of the dataframe: (70000, 785)


In [4]:
# 랜덤하게 10000개 데이터만 뽑기
rndperm = np.random.permutation(df.shape[0])
df1=df.loc[rndperm[:10000],:]
df1.shape

(10000, 785)

In [5]:
#train set과 test set 분류하기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1[feat_cols], df1['y'], test_size=0.2, random_state=1)

# 원본 데이터

### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

In [7]:
# 파라미터 튜닝 - Grid Search
n_estimators = [10, 200]

params = {'n_estimators': n_estimators}

In [8]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)

In [9]:
grid.best_score_

0.944875

In [10]:
grid.best_params_
# 200 근처의 값에서 파라미터 서치를 더 자세히 해보자!

{'n_estimators': 200}

In [11]:
# 파라미터 튜닝 - Grid Search
n_estimators = [180, 200, 220, 250]

params = {'n_estimators': n_estimators}

In [12]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)

In [13]:
grid.best_score_

0.944625

In [14]:
grid.best_params_

{'n_estimators': 250}

In [15]:
# 파라미터 튜닝 - Grid Search
n_estimators = [250, 300]

params = {'n_estimators': n_estimators}

In [16]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(X_train, y_train)

In [17]:
grid.best_score_

0.94575

In [18]:
grid.best_params_

{'n_estimators': 300}

In [19]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

time_start = time.time()

rf = RandomForestClassifier(n_estimators = 300)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(rf_accuracy, time.time() - time_start))

Accuracy : 0.9480 
Time : 39.44 sec


### KNN

In [20]:
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

time_start = time.time()

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(knn_accuracy, time.time() - time_start))

Accuracy : 0.9365 
Time : 73.00 sec


### Naive Bayes

In [21]:
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

time_start = time.time()

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_y_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(nb_accuracy, time.time() - time_start))

Accuracy : 0.5690 
Time : 1.29 sec


# PCA

In [22]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 0.99).fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [23]:
X_train_pca.shape

(8000, 324)

In [24]:
# 동일한 조건에서 시간과 accuracy를 측정하기 위해 원본 데이터에서의 모델과 동일하게 진행하였다

### Random Forest

In [25]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

time_start = time.time()

rf = RandomForestClassifier(n_estimators = 300)
rf.fit(X_train_pca, y_train)
rf_y_pred = rf.predict(X_test_pca)
rf_accuracy = accuracy_score(y_test, rf_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(rf_accuracy, time.time() - time_start))

Accuracy : 0.9125 
Time : 118.86 sec


### KNN

In [26]:
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

time_start = time.time()

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_pca, y_train)
knn_y_pred = knn.predict(X_test_pca)
knn_accuracy = accuracy_score(y_test, knn_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(knn_accuracy, time.time() - time_start))

Accuracy : 0.9385 
Time : 31.36 sec


### Naive Bayes

In [27]:
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

time_start = time.time()

nb = GaussianNB()
nb.fit(X_train_pca, y_train)
nb_y_pred = nb.predict(X_test_pca)
nb_accuracy = accuracy_score(y_test, nb_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(nb_accuracy, time.time() - time_start))

Accuracy : 0.7820 
Time : 0.70 sec


# LDA

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [36]:
X_train_lda = lda.transform(X_train)
X_test_lda = lda.transform(X_test)

In [37]:
X_train_lda.shape

(8000, 9)

### Random Forest

In [38]:
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

time_start = time.time()

rf = RandomForestClassifier(n_estimators = 300)
rf.fit(X_train_lda, y_train)
rf_y_pred = rf.predict(X_test_lda)
rf_accuracy = accuracy_score(y_test, rf_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(rf_accuracy, time.time() - time_start))

Accuracy : 0.8755 
Time : 17.65 sec


### KNN

In [39]:
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

time_start = time.time()

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_lda, y_train)
knn_y_pred = knn.predict(X_test_lda)
knn_accuracy = accuracy_score(y_test, knn_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(knn_accuracy, time.time() - time_start))

Accuracy : 0.8765 
Time : 0.62 sec


### Naive Bayes

In [40]:
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

time_start = time.time()

nb = GaussianNB()
nb.fit(X_train_lda, y_train)
nb_y_pred = nb.predict(X_test_lda)
nb_accuracy = accuracy_score(y_test, nb_y_pred)

print('Accuracy : %.4f \nTime : %.2f sec'
      %(nb_accuracy, time.time() - time_start))

Accuracy : 0.8595 
Time : 0.04 sec


## 결론 해석

In [41]:
"""
    # 원본 데이터 #
    - Random Forest
        Accuracy : 0.9480 
        Time : 39.44 sec
    - KNN
        Accuracy : 0.9365 
        Time : 73.00 sec
    - Naive Bayes
        Accuracy : 0.5690 
        Time : 1.29 sec
        
    # PCA #
    - Random Forest
        Accuracy : 0.9125 
        Time : 118.86 sec
    - KNN
        Accuracy : 0.9385 
        Time : 31.36 sec
    - Naive Bayes
        Accuracy : 0.7820 
        Time : 0.70 sec    
    
    # LDA #
    - Random Forest
        Accuracy : 0.8755 
        Time : 17.65 sec
    - KNN
        Accuracy : 0.8765 
        Time : 0.62 sec
    - Naive Bayes
        Accuracy : 0.8595 
        Time : 0.04 sec
"""

'\n    # 원본 데이터 #\n    - Random Forest\n        Accuracy : 0.9480 \n        Time : 39.44 sec\n    - KNN\n        Accuracy : 0.9365 \n        Time : 73.00 sec\n    - Naive Bayes\n        Accuracy : 0.5690 \n        Time : 1.29 sec\n        \n    # PCA #\n    - Random Forest\n        Accuracy : 0.9125 \n        Time : 118.86 sec\n    - KNN\n        Accuracy : 0.9385 \n        Time : 31.36 sec\n    - Naive Bayes\n        Accuracy : 0.7820 \n        Time : 0.70 sec    \n    \n    # LDA #\n    - Random Forest\n        Accuracy : 0.8755 \n        Time : 17.65 sec\n    - KNN\n        Accuracy : 0.8765 \n        Time : 0.62 sec\n    - Naive Bayes\n        Accuracy : 0.8595 \n        Time : 0.04 sec\n'

* 원본 데이터
    - Random Forest, KNN, Naive Bayes 순서로 정확도가 높았다.
    - Naive Bayes, Random Forest, KNN 순서로 시간이 적게 걸렸다.

* PCA
    - Random Forest를 제외한 분류기에서 정확도가 증가하고 시간도 줄어들었다.
    - Random Forest의 경우 정확도도 줄어들었고 시간도 더 늘어났다.
    - 특히 Naive Bayes의 경우 정확도가 20% 늘어났다.
    
* LDA
    - 차원이 적어 전체적으로 시간이 많이 줄어들었다.
    - 그러나 Random Forest와 KNN의 정확도도 줄어들었다.
    - Naive Bayes의 경우 정확도가 크게 늘어났다.

* Random Forest는 특이하게 원본데이터에서보다 PCA를 적용한 데이터에서 더 시간이 늘어났다. 왜 그럴까?

    - 그 이유는 MNIST 데이터의 클래스가 0, 1로만 되어있어 비교적 DT에서 분류하기 쉬운 반면, 차원축소를 하게 되면 그 값이 연속형 값으로 바뀌어 분류가 더 어려워지게 된다. 이때 DT를 베이스 모델으로 하는 Random Forest도 같은 이유로 시간이 더 오래 걸리게 된 것으로 보인다.