## 1. MNIST 데이터셋으로 테스트 세트에서 97% 정확도 달성하기

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml("mnist_784", version=1)
#mnist

In [3]:
import numpy as np
X, y = mnist["data"], mnist["target"]
y = y.astype(np.int)

X.shape, y.shape

((70000, 784), (70000,))

In [4]:
train_test_idx = 60000
X_train, X_test, y_train, y_test = X[:train_test_idx], X[train_test_idx:], y[:train_test_idx], y[train_test_idx:]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((60000, 784), (10000, 784), (60000,), (10000,))

In [5]:
shuffle_index = np.random.permutation(train_test_idx)

In [6]:
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
X_train.shape, y_train.shape

((60000, 784), (60000,))

In [7]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
standard_scaler = StandardScaler()
X_train_standard = standard_scaler.fit_transform(X_train.astype(np.float64))
X_test_standard = standard_scaler.transform(X_test.astype(np.float64))

minmax_scaler = MinMaxScaler()
X_train_minmax = minmax_scaler.fit_transform(X_train.astype(np.float64))
X_test_minmax = minmax_scaler.transform(X_test.astype(np.float64))

### 1.1 random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf_standard = RandomForestClassifier(random_state=42)
forest_clf_minmax = RandomForestClassifier(random_state=42)

In [11]:
forest_clf.fit(X_train, y_train)
forest_clf_standard.fit(X_train_standard, y_train)
forest_clf_minmax.fit(X_train_minmax, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
from sklearn.model_selection import cross_val_score

In [12]:
from sklearn.model_selection import cross_val_score
print(f"normal: {np.mean(cross_val_score(forest_clf, X_test, y_test, cv=3, scoring='accuracy'))}")
print(f"standard: {np.mean(cross_val_score(forest_clf_standard, X_test_standard, y_test, cv=3, scoring='accuracy'))}")
print(f"minmax: {np.mean(cross_val_score(forest_clf_minmax, X_test_minmax, y_test, cv=3, scoring='accuracy'))}")

normal: 0.8922174893326219
standard: 0.8925174600615137
minmax: 0.8920172289941819


### 1.1.1 random forest 하이퍼파라미터 서치

In [36]:
import time
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "n_estimators": [i for i in range(10, 251, 10)]
    }
]

knn_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(knn_clf, param_grid, cv=3, scoring="accuracy", return_train_score=True)
t = time.time()
grid_search.fit(X_train_minmax, y_train)
print(f"running: {time.time() - t}")

running: 2642.6802349090576


In [37]:
grid_search.best_params_, grid_search.best_estimator_

({'n_estimators': 240},
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=240,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False))

In [38]:
res = grid_search.cv_results_

In [39]:
for mean_score, params in zip(res["mean_test_score"], res["params"]):
    print(round(mean_score, 5), params)

0.94095 {'n_estimators': 10}
0.95487 {'n_estimators': 20}
0.95888 {'n_estimators': 30}
0.96177 {'n_estimators': 40}
0.96253 {'n_estimators': 50}
0.96377 {'n_estimators': 60}
0.9639 {'n_estimators': 70}
0.96447 {'n_estimators': 80}
0.96485 {'n_estimators': 90}
0.96498 {'n_estimators': 100}
0.96525 {'n_estimators': 110}
0.96558 {'n_estimators': 120}
0.9657 {'n_estimators': 130}
0.96562 {'n_estimators': 140}
0.96608 {'n_estimators': 150}
0.966 {'n_estimators': 160}
0.96607 {'n_estimators': 170}
0.9665 {'n_estimators': 180}
0.96635 {'n_estimators': 190}
0.96642 {'n_estimators': 200}
0.96645 {'n_estimators': 210}
0.96662 {'n_estimators': 220}
0.9667 {'n_estimators': 230}
0.967 {'n_estimators': 240}
0.96678 {'n_estimators': 250}


In [40]:
max_score = [0, 0] # [trees, acc]
for trees in range(10, 151, 10):
    forest_clf = RandomForestClassifier(n_estimators=trees, random_state=42)
    forest_clf.fit(X_train_minmax, y_train)
    acc = np.mean(cross_val_score(forest_clf, X_test_minmax, y_test, cv=3, scoring='accuracy'))
    if acc > max_score[1]:
        max_score[0] = trees
        max_score[1] = acc
    print(f"tree: {trees}, acc: {acc}")
print(f"max: {max_score}")

tree: 10, acc: 0.8920172289941819
tree: 20, acc: 0.918218505391749
tree: 30, acc: 0.9276203885344841
tree: 40, acc: 0.9315170038045809
tree: 50, acc: 0.9342172474345353
tree: 60, acc: 0.9366192603447817
tree: 70, acc: 0.9353168191185514
tree: 80, acc: 0.936815922116637
tree: 90, acc: 0.9382170827505059
tree: 100, acc: 0.9371175110664899
tree: 110, acc: 0.9383172425567802
tree: 120, acc: 0.938417343088946
tree: 130, acc: 0.9390172532896724
tree: 140, acc: 0.9395161048524799
tree: 150, acc: 0.9396177050573898
max: [150, 0.9396177050573898]


In [42]:
forest_clf_240 = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf_240.fit(X_train_minmax, y_train)
t = time.time()
print(f"acc: {forest_clf_240.score(X_test, y_test)}")
print(f"running time: {time.time() - t}")

acc: 0.9648
running time: 0.37458372116088867


In [43]:
forest_clf_90 = RandomForestClassifier(n_estimators=90, random_state=42)
forest_clf_90.fit(X_train_minmax, y_train)
forest_clf_90.score(X_test_minmax, y_test)

0.9698

In [44]:
forest_clf_150 = RandomForestClassifier(n_estimators=150, random_state=42)
forest_clf_150.fit(X_train_minmax, y_train)
forest_clf_150.score(X_test_minmax, y_test)

0.9711

### 1.2 KNeighbor

In [48]:
import time
from sklearn.neighbors import KNeighborsClassifier

In [None]:
"""
시간이 너무 오래 걸려서 아래 for loop로 최고 acc 탐색하는 방법으로 탐색 진행
"""
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "n_neighbors": [i for i in range(1, 15, 2)], 
        "weights": ["uniform", "distance"],
        "algorithm": ["ball_tree", "kd_tree", "brute"]
    }
]

knn_clf = KNeighborsClassifier()

grid_search_knn = GridSearchCV(knn_clf, param_grid, cv=3, scoring="accuracy", return_train_score=True)
grid_search_knn.fit(X_train_minmax, y_train)

In [50]:
knn_max_acc = 0
weight = "distance" # uniform or distance
algorithm = "kd_tree"
for neighbors in range(1, 15, 2):
    knn_clf = KNeighborsClassifier(n_neighbors=neighbors, weights=weight, algorithm=algorithm)
    knn_clf.fit(X_train_minmax, y_train)
    acc = knn_clf.score(X_test_minmax, y_test)
    if acc > knn_max_acc:
        knn_max_acc = acc
    print(f"n: {neighbors}, acc: {acc}")
print(f"knn max acc: {knn_max_acc}")

n: 1, acc: 0.9691
n: 3, acc: 0.9717
n: 5, acc: 0.9691
n: 7, acc: 0.97
n: 9, acc: 0.9673
n: 11, acc: 0.9678
n: 13, acc: 0.9665
knn max acc: 0.9717


In [52]:
knn_clf = KNeighborsClassifier(n_neighbors=3, weights="distance", algorithm="ball_tree")
knn_clf.fit(X_train_minmax, y_train)
knn_clf.score(X_test_minmax, y_test)

0.9717

In [53]:
knn_clf = KNeighborsClassifier(n_neighbors=3, weights="uniform", algorithm="ball_tree")
knn_clf.fit(X_train_minmax, y_train)
knn_clf.score(X_test_minmax, y_test)

0.9705