In [4]:
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Hyperparameter tuning
import sklearn.metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Model
from sklearn.ensemble import RandomForestClassifier

# Suppress warning messages.
# NOTE(review): with no category given this silences every warning, including
# library deprecation warnings — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

# Read Data

In [2]:
# Load the labelled training data, report its dimensions and preview one random row.
train = pd.read_csv('Data/digit_recognizer/train.csv')
print(train.shape)
train.sample(n=1)

(42000, 785)


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
37482,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Preprocessing

In [None]:
# pass
# Add any extra preprocessing here.

# Training is computationally heavy and takes a long time.
# To cut the run time, one option is to drop the border pixels,
# another is to keep only the features with high feature importance.

In [7]:
# Separate the target column from the feature columns.
X = train.drop(columns='label')
y = train['label']

# Grid Search

In [8]:
forest = RandomForestClassifier(random_state=42) # bare model: only the seed is fixed, the search fills in the rest

In [11]:
# The grid expands to 4 candidates: max_features [0.3, 0.4] x bootstrap [True, False].
param_grid = [
    dict(
        # Number of decision trees in the forest.
        n_estimators=[5],
        # Share of the features examined when looking for a split
        # (a float is a fraction of all features; None would use all of them).
        max_features=[0.3, 0.4],
        # True: each tree is trained on a sample drawn with replacement.
        # False: each tree is trained on the whole training set.
        bootstrap=[True, False],
    ),
]

In [12]:
# Exhaustive search: every combination in param_grid is cross-validated.
# (GridSearchCV itself takes no random_state argument.)
grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    cv=2,                     # kept small for speed; 5 or more folds is the usual choice
    scoring='accuracy',       # the candidate with the highest accuracy wins
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [9]:
# List the names that can be passed as the `scoring` option.
# Newer scikit-learn releases removed the `sklearn.metrics.SCORERS` dict in
# favour of `get_scorer_names()`; prefer the function when it exists and fall
# back to the dict so older installs keep working.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

# Fit 학습

In [13]:
# Run the cross-validated search over every candidate in param_grid.
grid_search.fit(X, y)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   29.1s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'b

In [14]:
# Show the best parameter combination found by the search
grid_search.best_params_

{'bootstrap': False, 'max_features': 0.3, 'n_estimators': 5}

In [19]:
# Mean cross-validated score of the best candidate
grid_search.best_score_

0.9190238095238096

In [18]:
grid_search.best_estimator_ # returns the forest configured with the best parameters

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=None, max_features=0.3, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
# The estimator holding the best parameters can predict right away; there is
# no need to fit it again by hand.
# predict / predict_proba accept only the feature matrix: passing the labels as
# a second positional argument raises TypeError, so y_test is not passed.
# NOTE(review): X_test is a placeholder for held-out features; it is not
# defined in the cells shown here.
grid_search.best_estimator_.predict(X_test)
grid_search.best_estimator_.predict_proba(X_test) # class probabilities; moving the decision threshold on these fine-tunes the model

# model Validation
#### 1. hold-out validation
    train 과 validation set 을 한 번만 나눠주는 것
#### 2. cross validation
    train 데이터를 여러가지 방법으로 나눠주는 것
    
> mean_test_score: 여러가지 방법으로 validation 한 결과를 다른 모델과 비교하기 위해 평균 mean 값을 계산
>
>                  - std 표준편차가 너무 크다면 데이터가 나눠질 때 잘못 나눠졌거나 모델에 문제가 있다

In [16]:
# Print the mean validation score next to the parameters of each grid candidate.
cvres = grid_search.cv_results_
for idx, candidate in enumerate(cvres['params']):
    print(cvres['mean_test_score'][idx], candidate)

0.9085 {'bootstrap': True, 'max_features': 0.3, 'n_estimators': 5}
0.9074047619047619 {'bootstrap': True, 'max_features': 0.4, 'n_estimators': 5}
0.9190238095238096 {'bootstrap': False, 'max_features': 0.3, 'n_estimators': 5}
0.9165714285714286 {'bootstrap': False, 'max_features': 0.4, 'n_estimators': 5}


파라미터를 최적화한 후에 전체 train set 으로 다시 fit 한다

# Random Search
- parameter 를 더 많이 줘도 연산량이 늘어나지 않는다
- 몇 번 시도하는지 횟수가 연산량을 결정

In [20]:
from scipy.stats import uniform as sp_uniform # float uniform distribution
from scipy.stats import randint as sp_randint # int uniform distribution

In [45]:
# Search space for the randomized search: lists are sampled uniformly,
# scipy distributions are drawn from on every iteration.
param_dist = dict(
    # scipy's uniform(loc, scale) covers [loc, loc + scale], i.e. [0.3, 0.8] here.
    # max_features behaves differently for int and float values:
    #   int   : that many features are considered at each split
    #   float : that fraction of all features is considered at each split
    max_features=sp_uniform(loc=0.3, scale=0.5),
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
    n_estimators=[20],
    # randint's upper bound is exclusive, so depths 5..24 are possible.
    max_depth=sp_randint(5, 25),
)

In [46]:
# Randomized search: n_iter candidates are sampled from param_dist and each one
# is cross-validated.
random_search = RandomizedSearchCV(
    forest,
    param_dist,
    n_iter=20,            # number of sampled candidates; raise it when the space is wide or has many parameters
    cv=2,
    scoring='accuracy',   # stated explicitly so both searches rank candidates by the same metric
    random_state=42,      # fixes the sampled candidates so a re-run reproduces the same search
    verbose=1,
    n_jobs=-1,            # use every available core
)

In [47]:
# Fit the randomized search: each sampled candidate is cross-validated.
random_search.fit(X, y)

Fitting 2 folds for each of 20 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 15.1min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [24]:
random_search.best_params_ # first run, per the author's note: criterion=['gini', 'entropy'], n_iter=5

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 13,
 'max_features': 0.3399633333357592,
 'n_estimators': 5}

In [26]:
random_search.best_score_ # first run, per the author's note: criterion=['gini', 'entropy'], n_iter=5

0.9198809523809524

In [32]:
# Print the mean validation score next to the parameters of each sampled candidate.
# (Author's note for the recorded run: criterion=['gini', 'entropy'], n_iter=5.)
cvres = random_search.cv_results_
for row in range(len(cvres['params'])):
    print(cvres['mean_test_score'][row], cvres['params'][row])

0.7862142857142858 {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 5, 'max_features': 0.3683737697971745, 'n_estimators': 5}
0.8999285714285714 {'bootstrap': True, 'criterion': 'gini', 'max_depth': 13, 'max_features': 0.5907509866064545, 'n_estimators': 5}
0.9198809523809524 {'bootstrap': False, 'criterion': 'gini', 'max_depth': 13, 'max_features': 0.3399633333357592, 'n_estimators': 5}
0.9083571428571429 {'bootstrap': False, 'criterion': 'gini', 'max_depth': 15, 'max_features': 0.7717367598941475, 'n_estimators': 5}
0.8521904761904762 {'bootstrap': False, 'criterion': 'gini', 'max_depth': 7, 'max_features': 0.6416701538101791, 'n_estimators': 5}


In [31]:
random_search.best_estimator_.estimators_ # the fitted trees, each showing the parameters it was built with (not displayed: only a cell's last expression is shown)
random_search.best_estimator_.estimators_[0] # individual trees can be pulled out by index

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
                       max_features=0.3399633333357592, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1608637542, splitter='best')

In [30]:
 # Print the depth every tree reached, to judge which max_depth is worth searching.
 # (Author's note for the recorded run: criterion=['gini', 'entropy'], n_iter=5.)
for tree in random_search.best_estimator_.estimators_:
    print(tree.get_depth())

13
13
13
13
13


# 2nd fit 
n_estimators=[20], n_iter=20

In [48]:
random_search.best_params_ # second run: n_estimators=[20], n_iter=20

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 21,
 'max_features': 0.5170855414812188,
 'n_estimators': 20}

In [49]:
random_search.best_score_ # second run: n_estimators=[20], n_iter=20

0.9483333333333334

In [50]:
# Second run (n_iter=20): list every sampled candidate with its mean validation score.
cvres = random_search.cv_results_
scored_candidates = zip(cvres['mean_test_score'], cvres['params'])
for score, candidate in scored_candidates:
    print(score, candidate)

0.9352619047619047 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 0.5652118699412894, 'n_estimators': 20}
0.9433809523809524 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 12, 'max_features': 0.41701831494872094, 'n_estimators': 20}
0.9122380952380953 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 8, 'max_features': 0.5965995634396907, 'n_estimators': 20}
0.9421666666666667 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 18, 'max_features': 0.6874263442249998, 'n_estimators': 20}
0.9222857142857143 {'bootstrap': False, 'criterion': 'gini', 'max_depth': 24, 'max_features': 0.786105161773868, 'n_estimators': 20}
0.9358809523809524 {'bootstrap': True, 'criterion': 'gini', 'max_depth': 12, 'max_features': 0.617666194713609, 'n_estimators': 20}
0.9391904761904762 {'bootstrap': True, 'criterion': 'gini', 'max_depth': 17, 'max_features': 0.717623611103003, 'n_estimators': 20}
0.9464047619047619 {'bootstrap': True, 'criterion': 'entrop

In [65]:
# Forest configured with the best sampled parameters
random_search.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=21, max_features=0.5170855414812188,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=20, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [51]:
 # Second run (n_iter=20): print the depth every tree reached, to judge which
 # max_depth is worth searching.
fitted_trees = random_search.best_estimator_.estimators_
for tree in fitted_trees:
    print(tree.get_depth())

21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21


# n_estimators 는 클수록 좋으니까
1. best parameter 찾아서 
2. dict 로 저장해서 
3. n_estimators 만 큰 숫자로 바꿔서 model 새로 만들고 fit 한다

In [69]:
# Keep the best parameters in a dict so they can be reused for the final model
best_params = random_search.best_params_
best_params

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 21,
 'max_features': 0.5170855414812188,
 'n_estimators': 20}

In [71]:
# Final model: reuse the tuned parameters stored in best_params and override
# only n_estimators with a larger value. Re-typing each tuned value by hand
# would silently go stale whenever the search is re-run.
final_params = {**best_params, 'n_estimators': 100}
forest = RandomForestClassifier(
    **final_params,
    verbose=True,      # report tree-building progress
    random_state=42,
    n_jobs=-1,         # build trees on every available core
)

In [74]:
# Train the final forest on the full training set
forest.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.0min finished


RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=21, max_features=0.5170855414812188,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=True, warm_start=False)

In [75]:
# NOTE(review): test_X is only assigned in a later cell, so on a fresh
# top-to-bottom run this line raises NameError — confirm the cell order.
prediction = forest.predict(test_X)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


# Test 데이터로 예측

In [52]:
# Load the unlabelled test data, report its dimensions and preview one random row.
test = pd.read_csv('Data/digit_recognizer/test.csv')
print(test.shape)
test.sample(n=1)

(28000, 784)


Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
19664,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# The test frame is used as the feature matrix as-is (same object, not a copy)
test_X = test

In [55]:
 # Predict the test labels with the final forest (the one refit with more trees).
 # Predicting with random_search.best_estimator_ here would overwrite the final
 # model's predictions with those of the smaller search-time forest just before
 # the submission is written.
prediction = forest.predict(test_X)

prediction

array([2, 0, 9, ..., 3, 9, 2], dtype=int64)

# 예측값 csv 로 저장

In [77]:
# Load the submission template here so this cell does not depend on `sample`
# having been created by a cell further down the notebook.
sample = pd.read_csv('Data/digit_recognizer/sample_submission.csv')
# Fill in the predicted labels and write the file without the index column.
sample['Label'] = prediction
sample.to_csv('Data/digit_recognizer/submission.csv', index=False)

In [64]:
# Read back the submission file that was just written, to check its contents.
# (The previous path pointed at 'submission1.csv', which no cell here creates.)
pd.read_csv('Data/digit_recognizer/submission.csv')

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
5,6,9
6,7,0
7,8,3
8,9,0
9,10,3


In [60]:
# Load the submission template, report its dimensions and preview one random row.
sample = pd.read_csv('Data/digit_recognizer/sample_submission.csv')
print(sample.shape)
sample.sample(n=1)

(28000, 2)


Unnamed: 0,ImageId,Label
12437,12438,0


In [66]:
# Display the submission frame
sample

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
5,6,9
6,7,0
7,8,3
8,9,0
9,10,3
