# Deep Forestを実行する
Deep Forestの実装部分については、以下のGitHubリポジトリのコードを利用した。  
https://github.com/leopiney/deep-forest

In [1]:
import random
import time
import uuid
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from keras.datasets import mnist
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from deep_forest import MGCForest

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local deep_forest module) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show up to 1000 columns when displaying DataFrames. The fully-qualified
# option name is required: the abbreviated "max_columns" pattern matches more
# than one option in recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", 1000)

In [3]:
# Directory that receives the CSV result tables written by later cells.
result_dir_path = Path("result")
# exist_ok=True makes the cell idempotent without a separate exists() check
# (no check-then-create race) and still raises if "result" is a regular file.
result_dir_path.mkdir(parents=True, exist_ok=True)

## Deep Forestを構築する

In [4]:
# Number of trees per forest for the 'mgs' and 'cascade' estimator groups.
n_estimators_mgs = 30
n_estimators_cascade = 100

# 'mgs' group: one ExtraTrees forest followed by one RandomForest, both with
# the same parameters.
# NOTE(review): 'mgs' presumably stands for multi-grained scanning — confirm
# against the deep_forest module.
mgs_estimators = [
    {
        'estimator_class': forest_class,
        'estimator_params': {
            'n_estimators': n_estimators_mgs,
            'n_jobs': -1,
        },
    }
    for forest_class in (ExtraTreesClassifier, RandomForestClassifier)
]

# 'cascade' group: for each forest class, one forest with max_features=1 and
# one with max_features='sqrt' (order: ET/1, ET/sqrt, RF/1, RF/sqrt).
cascade_estimators = [
    {
        'estimator_class': forest_class,
        'estimator_params': {
            'n_estimators': n_estimators_cascade,
            'max_features': max_features,
            'n_jobs': -1,
        },
    }
    for forest_class in (ExtraTreesClassifier, RandomForestClassifier)
    for max_features in (1, 'sqrt')
]

mgc_forest = MGCForest(
    estimators_config={
        'mgs': mgs_estimators,
        'cascade': cascade_estimators,
    },
    stride_ratios=[1.0 / 4, 1.0 / 9, 1.0 / 16],
    verbose=False
)

## 比較用のモデルを構築する

In [5]:
# Baseline classifiers compared against the Deep Forest model; both use
# 100 estimators with a maximum tree depth of 10.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10)
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=10)

# Display name -> model; the evaluation helpers iterate in this order.
model_dict = dict(
    rf=rf_model,
    xgb=xgb_model,
    mgc_forest=mgc_forest,
)

In [6]:
def _take_rows(data, indexes):
    """Select rows of ``data`` by integer position for arrays, lists and pandas objects."""
    if hasattr(data, "iloc"):
        # pandas objects: plain [] would be interpreted as label/column lookup.
        return data.iloc[indexes]
    if isinstance(data, (list, tuple)):
        # Python sequences cannot be indexed with an index array directly.
        return np.asarray(data)[indexes]
    return data[indexes]


def cross_val_scores_fn(model, X_data, y_data, n_splits, shuffle=False, random_state=None):
    """Run stratified k-fold cross validation and time each fit.

    The same ``model`` object is re-fitted on every fold.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_data, y_data : array-like
        Features and labels; numpy arrays, lists/tuples and pandas objects
        are supported.
    n_splits : int
        Number of folds passed to ``StratifiedKFold``.
    shuffle : bool, default False
        Whether ``StratifiedKFold`` shuffles the samples before splitting.
    random_state : int or None, default None
        Seed for the shuffling so that splits can be reproduced. Only
        forwarded when ``shuffle`` is true.

    Returns
    -------
    (list of float, list of float)
        Per-fold accuracy rounded to 3 decimals, and per-fold ``fit`` time in
        seconds rounded to 2 decimals (prediction time is not included).
    """
    kfold = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        # A seed combined with shuffle=False is rejected by recent
        # scikit-learn versions, so only pass it when shuffling.
        random_state=random_state if shuffle else None,
    )

    cvs_list, elapsed_time_list = [], []
    for train_indexes, test_indexes in kfold.split(X_data, y_data):

        X_train = _take_rows(X_data, train_indexes)
        y_train = _take_rows(y_data, train_indexes)
        X_test = _take_rows(X_data, test_indexes)
        y_test = _take_rows(y_data, test_indexes)

        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        elapsed_time = time.perf_counter() - start_time

        pred = model.predict(X_test)

        cvs_list.append(accuracy_score(y_test, pred))
        elapsed_time_list.append(elapsed_time)

    return np.round(cvs_list, 3).tolist(), np.round(elapsed_time_list, 2).tolist()

In [7]:
def estimate_models(model_dict, X_data, y_data, n_splits=5, shuffle=True):
    """Cross-validate every model in ``model_dict`` and tabulate the outcome.

    For each ``name -> model`` entry, ``cross_val_scores_fn`` is called with
    the given data and split settings, a one-line summary is printed, and one
    row is collected.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model_name``, ``data_size``
        (``len(X_data)``), ``cvs_mean``, ``cvs_list`` (stringified per-fold
        scores), ``elapsed_time_mean`` and ``elapsed_time_list`` (stringified
        per-fold times).
    """
    column_names = [
        "model_name",
        "data_size",
        "cvs_mean",
        "cvs_list",
        "elapsed_time_mean",
        "elapsed_time_list",
    ]

    rows = []
    for name, estimator in model_dict.items():
        fold_scores, fold_times = cross_val_scores_fn(
            estimator,
            X_data=X_data,
            y_data=y_data,
            n_splits=n_splits,
            shuffle=shuffle,
        )
        mean_score = np.mean(fold_scores)

        summary = "{} : cv score mean : {:.0f}%, scores : {}".format(
            name, mean_score * 100, fold_scores
        )
        print(summary)

        rows.append({
            "model_name": name,
            "data_size": len(X_data),
            "cvs_mean": mean_score,
            "cvs_list": str(fold_scores),
            "elapsed_time_mean": np.mean(fold_times),
            "elapsed_time_list": str(fold_times),
        })

    return pd.DataFrame(rows, columns=column_names)

## irisデータによる検証

In [8]:
from sklearn.datasets import load_iris, load_digits

# Load the iris dataset and expose its feature matrix / label vector under
# the generic names used by the evaluation cells.
iris = load_iris()
X_data, y_data = iris.data, iris.target

In [10]:
# 5-fold shuffled cross validation of every model in model_dict on the
# current X_data / y_data.
results = estimate_models(model_dict, X_data, y_data, n_splits=5, shuffle=True)

# Persist the table, then display it as the cell output.
iris_csv_path = result_dir_path / "model_estimate_iris.csv"
results.to_csv(iris_csv_path, index=False)
results

rf : cv score mean : 97%, scores : [1.0, 0.933, 0.933, 0.967, 1.0]
xgb : cv score mean : 95%, scores : [0.933, 0.933, 0.967, 1.0, 0.933]


<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.95
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.95
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.95
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.975
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 3:: got accuracy 0.975
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.95
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.9583333333333334
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 3:: got accuracy 0.9666666666666667
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 4:: got accuracy 0.95
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.9083333333333333
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.925
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 3:: got accuracy 

mgc_forest : cv score mean : 95%, scores : [0.933, 0.933, 0.933, 0.967, 0.967]


Unnamed: 0,model_name,data_size,cvs_mean,cvs_list,elapsed_time_mean,elapsed_time_list
0,rf,150,0.9666,"[1.0, 0.933, 0.933, 0.967, 1.0]",0.132,"[0.14, 0.15, 0.12, 0.13, 0.12]"
1,xgb,150,0.9532,"[0.933, 0.933, 0.967, 1.0, 0.933]",0.024,"[0.04, 0.02, 0.02, 0.02, 0.02]"
2,mgc_forest,150,0.9466,"[0.933, 0.933, 0.933, 0.967, 0.967]",10.608,"[8.73, 9.34, 12.2, 15.17, 7.6]"


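交差検証の平均正答率は rf が 96.7%、xgb が 95.3%、mgc_forest が 94.7% となり、Deep Forestが最も低い結果となった。また、学習時間も Deep Forest は1分割あたり平均約10.6秒と、他の2モデル（0.2秒未満）に比べて大幅に長い。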

## 乳がんデータによる検証

In [11]:
# Load the breast cancer dataset and rebind X_data / y_data to it, replacing
# the data used in the previous section.
breast_cancer_data = datasets.load_breast_cancer()
X_data, y_data = breast_cancer_data.data, breast_cancer_data.target

In [12]:
# Fresh baseline models for this dataset; the mgc_forest object built earlier
# is reused as-is. Note that XGBoost is created without max_depth here.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10)
xgb_model = xgb.XGBClassifier(n_estimators=100)

model_dict = dict(
    rf=rf_model,
    xgb=xgb_model,
    mgc_forest=mgc_forest,
)

# 5-fold shuffled cross validation on the current X_data / y_data.
results = estimate_models(model_dict, X_data, y_data, n_splits=5, shuffle=True)

# Persist the table, then display it as the cell output.
breast_cancer_csv_path = result_dir_path / "model_estimate_breast_cancer.csv"
results.to_csv(breast_cancer_csv_path, index=False)
results

rf : cv score mean : 96%, scores : [0.939, 0.957, 0.965, 0.973, 0.991]
xgb : cv score mean : 97%, scores : [0.974, 0.983, 0.973, 0.965, 0.947]


<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.9515418502202643
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.947136563876652
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.9581497797356828
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.947136563876652
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.9429824561403509
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.9298245614035088
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.9517543859649122
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: got accuracy 0.956140350877193
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 3:: got accuracy 0.9517543859649122
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 1:: got accuracy 0.9627192982456141
<deep_forest.CascadeForest object at 0x132af0ac8> - Level 2:: g

mgc_forest : cv score mean : 97%, scores : [0.965, 0.957, 0.991, 0.991, 0.947]


Unnamed: 0,model_name,data_size,cvs_mean,cvs_list,elapsed_time_mean,elapsed_time_list
0,rf,569,0.965,"[0.939, 0.957, 0.965, 0.973, 0.991]",0.152,"[0.19, 0.15, 0.14, 0.14, 0.14]"
1,xgb,569,0.9684,"[0.974, 0.983, 0.973, 0.965, 0.947]",0.086,"[0.08, 0.09, 0.09, 0.09, 0.08]"
2,mgc_forest,569,0.9702,"[0.965, 0.957, 0.991, 0.991, 0.947]",10.314,"[10.87, 9.72, 9.67, 11.67, 9.64]"


## MNISTデータによる検証

In [13]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten each image into a single feature vector (one row per sample).
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Evaluate on the first 1000 test samples only.
X_test, y_test = X_test[:1000], y_test[:1000]

In [112]:
# Fresh baseline models for MNIST; the mgc_forest object built earlier is reused.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10)
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=10)

model_dict = {
    "rf" : rf_model,
    "xgb" : xgb_model,
    "mgc_forest" : mgc_forest
}

# One row per (model, training-set size): fit on the first `data_size`
# training samples and score on the fixed X_test / y_test subset.
# Rows are collected under their own name so that `results` always refers to
# the final DataFrame, never to the intermediate list.
mnist_rows = []
for model_name, model in model_dict.items():

    for data_size in [100, 200, 500, 1000, 2000]:

        X_train_ = X_train[:data_size]
        y_train_ = y_train[:data_size]

        # perf_counter is monotonic and high-resolution; time.time() can jump
        # if the system clock is adjusted during a long fit.
        start_time = time.perf_counter()
        model.fit(X_train_, y_train_)
        elapsed_time = time.perf_counter() - start_time

        # Only the fit is timed; prediction happens after the timer stops.
        acc = accuracy_score(y_test, model.predict(X_test))
        mnist_rows.append([model_name, data_size, elapsed_time, acc])
        print("End {} {} : {:.0f}%".format(model_name, data_size, acc*100))

results = pd.DataFrame(mnist_rows, columns=["model_name", "data_size", "elapsed_time", "accuracy"])
results.to_csv(result_dir_path.joinpath("model_estimate_mnist.csv"), index=False)
results

End rf 100 : 61%
End rf 200 : 69%
End rf 500 : 80%
End rf 1000 : 86%
End rf 2000 : 90%
End xgb 100 : 50%
End xgb 200 : 63%
End xgb 500 : 79%
End xgb 1000 : 84%
End xgb 2000 : 86%


<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 1:: got accuracy 0.8
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 2:: got accuracy 0.78


End mgc_forest 100 : 72%


<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 1:: got accuracy 0.86
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 2:: got accuracy 0.85


End mgc_forest 200 : 80%


<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 1:: got accuracy 0.914
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 2:: got accuracy 0.922
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 3:: got accuracy 0.922


End mgc_forest 500 : 91%


<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 1:: got accuracy 0.937
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 2:: got accuracy 0.941
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 3:: got accuracy 0.942
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 4:: got accuracy 0.945
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 5:: got accuracy 0.941


End mgc_forest 1000 : 93%


<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 1:: got accuracy 0.958
<deep_forest.CascadeForest object at 0x12c08fcf8> - Level 2:: got accuracy 0.958


End mgc_forest 2000 : 95%


Unnamed: 0,model_name,data_size,elapsed_time,accuracy
0,rf,100,0.133338,0.611
1,rf,200,0.148678,0.686
2,rf,500,0.208254,0.803
3,rf,1000,0.34221,0.855
4,rf,2000,0.625249,0.895
5,xgb,100,3.732999,0.503
6,xgb,200,7.309971,0.634
7,xgb,500,18.023113,0.787
8,xgb,1000,42.206646,0.843
9,xgb,2000,89.158615,0.863
