In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest

  from numpy.core.umath_tests import inner1d


hyperparameters

In [2]:
random_seed = 42
cv=5
score = 'f1_weighted'

#### somte sampling

In [3]:
def Smoter(X, y, is_random=False):
    if is_random == True:
        random_lst = list(np.random.randint(0, 1000, 4))
    elif is_random == False:
        random_lst = [0] * 4

    print("rs:", random_lst)
    sm = SMOTE(random_state=random_lst[2], kind = 0.24)
    X_smote, y_smote = sm.fit_sample(X, y)

    return X_smote, y_smote

In [4]:
def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = random_seed
    ca_config["max_layers"] = 20
    ca_config["early_stopping_rounds"] = 5
    ca_config["n_classes"] = 6
    ca_config["estimators"] = []
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "random_state" : random_seed})
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "random_state" : random_seed})
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "random_state" : random_seed})
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "random_state" : random_seed})
    ca_config["estimators"].append({"n_folds": 5, "type": "DecisionTreeClassifier"})
    ca_config["estimators"].append({"n_folds": 5, "type": "DecisionTreeClassifier"})
    ca_config["estimators"].append({"n_folds": 5, "type": "DecisionTreeClassifier"})
    ca_config["estimators"].append({"n_folds": 5, "type": "LogisticRegression"})
    config["cascade"] = ca_config
    return config

# load data_four_features

In [5]:
path = os.getcwd()+'/../data/20122018freshwater_four_feature.csv'
data_four_features = pd.read_csv(path, na_values = np.nan)

print(data_four_features.dtypes)
print(data_four_features.shape)

pH             float64
DO(mg/l)       float64
CODMn(mg/l)    float64
NH3-N(mg/l)    float64
本周水质             int64
dtype: object
(33612, 5)


In [6]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质']-1 # Series

In [7]:
print("水质分布情况:")
print(y.value_counts())
print("\n各特征类型分布情况:")
print(data_four_features.dtypes.value_counts())

水质分布情况:
1    13272
2     8797
3     5472
0     2438
5     2146
4     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
float64    4
int64      1
dtype: int64


In [8]:
data_four_features.head()

Unnamed: 0,pH,DO(mg/l),CODMn(mg/l),NH3-N(mg/l),本周水质
0,7.09,10.0,5.7,0.33,3
1,6.94,12.0,5.4,0.4,3
2,7.2,9.6,4.9,0.34,3
3,6.8,11.6,6.3,0.59,4
4,6.75,11.0,6.2,0.64,4


train 80%, test 20%

In [9]:
print("============ train_test_split ============")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                       stratify = y, random_state = random_seed)
print("80%% train: %d/%d, 20%% test: %d/%d" %(X_train.shape[0], X.shape[0], X_test.shape[0], X.shape[0]))

80% train: 26889/33612, 20% test: 6723/33612


### normalize  train data

fulfill the Na with median, then standardized the data, output type ndarray

In [10]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.fit_transform(X_test)

# model train & test

In [11]:
y_train = y_train.values
y_test = y_test.values

In [12]:
config = get_toy_config()

models = [
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
    SVC(probability=True),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=random_seed),
    ExtraTreesClassifier(random_state=random_seed),
    GCForest(config)
]

In [16]:
test_entries = []
train_entries = []

for model in models:
    model_name = model.__class__.__name__
    if model_name == 'GCForest':
        model.fit_transform(X_train, y_train, X_test, y_test)
    else:
        model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    f1_train = f1_score(y_train, y_train_pred, average='weighted')
    f1_test = f1_score(y_test, y_test_pred, average='weighted')
    acc_train = accuracy_score(y_train, y_train_pred)
    acc_test = accuracy_score(y_test, y_test_pred)
    train_entries.append((model_name, f1_train, acc_train))
    test_entries.append((model_name, f1_test, acc_test))
    # print("%s, test set: weighted F1 score:%f, Accuracy:%f" %(model_name, f1_score(y_valid, y_pred, average="weighted"), accuracy_score(y_valid, y_pred)))
    
train_df = pd.DataFrame(train_entries, columns=['model_name', 'train_f1_weighted', 'train_accuracy'])
test_df = pd.DataFrame(test_entries, columns=['model_name', 'test_f1_weighted', 'test_accuracy'])

  'precision', 'predicted', average, warn_for)
[ 2019-04-02 23:36:24,565][cascade_classifier.fit_transform] X_groups_train.shape=[(26889, 4)],y_train.shape=(26889,),X_groups_test.shape=[(6723, 4)],y_test.shape=(6723,)
[ 2019-04-02 23:36:24,566][cascade_classifier.fit_transform] group_dims=[4]
[ 2019-04-02 23:36:24,567][cascade_classifier.fit_transform] group_starts=[0]
[ 2019-04-02 23:36:24,568][cascade_classifier.fit_transform] group_ends=[4]
[ 2019-04-02 23:36:24,570][cascade_classifier.fit_transform] X_train.shape=(26889, 4),X_test.shape=(6723, 4)
[ 2019-04-02 23:36:24,577][cascade_classifier.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(26889, 4), X_cur_test.shape=(6723, 4)
[ 2019-04-02 23:36:24,975][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_0 - estimator_0 - 5_folds.train_0.predict)=99.42%
[ 2019-04-02 23:36:25,223][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_0 - estimator_0 - 5_folds.train_1.predict)=99.33%
[ 2019-04-02 23:36:25,420][kfold_wrapper.l

[ 2019-04-02 23:36:30,481][cascade_classifier.calc_f1] Weighted F1 (layer_0 - train.classifier_average)=99.36%
[ 2019-04-02 23:36:30,483][cascade_classifier.calc_f1] Weighted F1 (layer_0 - test.classifier_average)=97.56%
[ 2019-04-02 23:36:30,495][cascade_classifier.fit_transform] [layer=1] look_indexs=[0], X_cur_train.shape=(26889, 52), X_cur_test.shape=(6723, 52)
[ 2019-04-02 23:36:30,922][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_1 - estimator_0 - 5_folds.train_0.predict)=99.46%
[ 2019-04-02 23:36:31,282][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_1 - estimator_0 - 5_folds.train_1.predict)=99.44%
[ 2019-04-02 23:36:31,558][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_1 - estimator_0 - 5_folds.train_2.predict)=99.46%
[ 2019-04-02 23:36:31,918][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_1 - estimator_0 - 5_folds.train_3.predict)=99.39%
[ 2019-04-02 23:36:32,280][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_1 - estimator_0 - 5_folds.train_4.predict)

[ 2019-04-02 23:36:48,460][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_0 - 5_folds.train_3.predict)=99.54%
[ 2019-04-02 23:36:48,740][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_0 - 5_folds.train_4.predict)=99.48%
[ 2019-04-02 23:36:48,755][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_0 - 5_folds.train_cv.predict)=99.44%
[ 2019-04-02 23:36:48,760][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_0 - 5_folds.test.predict)=97.59%
[ 2019-04-02 23:36:49,044][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_1 - 5_folds.train_0.predict)=99.33%
[ 2019-04-02 23:36:49,316][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_1 - 5_folds.train_1.predict)=99.57%
[ 2019-04-02 23:36:49,597][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_1 - 5_folds.train_2.predict)=99.35%
[ 2019-04-02 23:36:49,870][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_2 - estimator_1 - 5_folds.tr

[ 2019-04-02 23:37:03,627][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_1 - 5_folds.train_2.predict)=99.50%
[ 2019-04-02 23:37:03,884][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_1 - 5_folds.train_3.predict)=99.44%
[ 2019-04-02 23:37:04,141][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_1 - 5_folds.train_4.predict)=99.50%
[ 2019-04-02 23:37:04,155][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_1 - 5_folds.train_cv.predict)=99.44%
[ 2019-04-02 23:37:04,159][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_1 - 5_folds.test.predict)=97.60%
[ 2019-04-02 23:37:04,429][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_2 - 5_folds.train_0.predict)=99.55%
[ 2019-04-02 23:37:04,678][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_2 - 5_folds.train_1.predict)=99.41%
[ 2019-04-02 23:37:04,933][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_3 - estimator_2 - 5_folds.tr

[ 2019-04-02 23:37:17,014][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_2 - 5_folds.train_1.predict)=99.50%
[ 2019-04-02 23:37:17,298][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_2 - 5_folds.train_2.predict)=99.41%
[ 2019-04-02 23:37:17,588][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_2 - 5_folds.train_3.predict)=99.59%
[ 2019-04-02 23:37:17,834][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_2 - 5_folds.train_4.predict)=99.31%
[ 2019-04-02 23:37:17,848][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_2 - 5_folds.train_cv.predict)=99.43%
[ 2019-04-02 23:37:17,852][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_2 - 5_folds.test.predict)=97.59%
[ 2019-04-02 23:37:18,106][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_3 - 5_folds.train_0.predict)=99.46%
[ 2019-04-02 23:37:18,418][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_4 - estimator_3 - 5_folds.tr

[ 2019-04-02 23:37:31,185][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.train_0.predict)=99.33%
[ 2019-04-02 23:37:31,431][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.train_1.predict)=99.59%
[ 2019-04-02 23:37:31,692][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.train_2.predict)=99.29%
[ 2019-04-02 23:37:31,950][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.train_3.predict)=99.52%
[ 2019-04-02 23:37:32,222][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.train_4.predict)=99.40%
[ 2019-04-02 23:37:32,245][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.train_cv.predict)=99.43%
[ 2019-04-02 23:37:32,253][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_3 - 5_folds.test.predict)=97.62%
[ 2019-04-02 23:37:32,680][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_5 - estimator_4 - 5_folds.tr

[ 2019-04-02 23:37:47,589][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_3 - 5_folds.test.predict)=97.63%
[ 2019-04-02 23:37:47,891][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.train_0.predict)=98.66%
[ 2019-04-02 23:37:48,321][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.train_1.predict)=98.85%
[ 2019-04-02 23:37:48,827][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.train_2.predict)=98.90%
[ 2019-04-02 23:37:49,130][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.train_3.predict)=98.81%
[ 2019-04-02 23:37:49,413][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.train_4.predict)=98.87%
[ 2019-04-02 23:37:49,422][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.train_cv.predict)=98.82%
[ 2019-04-02 23:37:49,424][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_6 - estimator_4 - 5_folds.te

[ 2019-04-02 23:38:05,180][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_4 - 5_folds.train_cv.predict)=98.69%
[ 2019-04-02 23:38:05,183][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_4 - 5_folds.test.predict)=97.60%
[ 2019-04-02 23:38:05,447][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_5 - 5_folds.train_0.predict)=98.88%
[ 2019-04-02 23:38:05,801][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_5 - 5_folds.train_1.predict)=98.75%
[ 2019-04-02 23:38:06,063][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_5 - 5_folds.train_2.predict)=98.75%
[ 2019-04-02 23:38:06,322][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_5 - 5_folds.train_3.predict)=98.59%
[ 2019-04-02 23:38:06,584][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_5 - 5_folds.train_4.predict)=98.70%
[ 2019-04-02 23:38:06,593][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_7 - estimator_5 - 5_folds.tr

[ 2019-04-02 23:38:21,674][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_5 - 5_folds.train_4.predict)=98.96%
[ 2019-04-02 23:38:21,682][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_5 - 5_folds.train_cv.predict)=98.80%
[ 2019-04-02 23:38:21,685][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_5 - 5_folds.test.predict)=97.59%
[ 2019-04-02 23:38:22,012][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_6 - 5_folds.train_0.predict)=98.63%
[ 2019-04-02 23:38:22,291][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_6 - 5_folds.train_1.predict)=98.64%
[ 2019-04-02 23:38:22,566][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_6 - 5_folds.train_2.predict)=98.96%
[ 2019-04-02 23:38:22,838][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_6 - 5_folds.train_3.predict)=98.72%
[ 2019-04-02 23:38:23,133][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_8 - estimator_6 - 5_folds.tr

[ 2019-04-02 23:38:36,610][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_6 - 5_folds.train_3.predict)=98.73%
[ 2019-04-02 23:38:36,894][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_6 - 5_folds.train_4.predict)=98.59%
[ 2019-04-02 23:38:36,903][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_6 - 5_folds.train_cv.predict)=98.71%
[ 2019-04-02 23:38:36,905][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_6 - 5_folds.test.predict)=97.62%
[ 2019-04-02 23:38:37,766][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_7 - 5_folds.train_0.predict)=99.39%
[ 2019-04-02 23:38:38,600][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_7 - 5_folds.train_1.predict)=99.35%
[ 2019-04-02 23:38:39,625][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_7 - 5_folds.train_2.predict)=99.59%
[ 2019-04-02 23:38:40,821][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_9 - estimator_7 - 5_folds.tr

[ 2019-04-02 23:38:54,435][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_10 - estimator_7 - 5_folds.train_1.predict)=99.42%
[ 2019-04-02 23:38:55,263][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_10 - estimator_7 - 5_folds.train_2.predict)=99.55%
[ 2019-04-02 23:38:56,157][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_10 - estimator_7 - 5_folds.train_3.predict)=99.42%
[ 2019-04-02 23:38:57,023][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_10 - estimator_7 - 5_folds.train_4.predict)=99.39%
[ 2019-04-02 23:38:57,032][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_10 - estimator_7 - 5_folds.train_cv.predict)=99.42%
[ 2019-04-02 23:38:57,035][kfold_wrapper.log_eval_metrics] Weighted F1 (layer_10 - estimator_7 - 5_folds.test.predict)=97.65%
[ 2019-04-02 23:38:57,043][cascade_classifier.calc_f1] Weighted F1 (layer_10 - train.classifier_average)=99.42%
[ 2019-04-02 23:38:57,046][cascade_classifier.calc_f1] Weighted F1 (layer_10 - test.classifier_average)=97.63%
[ 201

In [17]:
train_df

Unnamed: 0,model_name,train_f1_weighted,train_accuracy
0,LogisticRegression,0.617887,0.668712
1,LinearDiscriminantAnalysis,0.570242,0.619956
2,SVC,0.899929,0.901038
3,DecisionTreeClassifier,1.0,1.0
4,ExtraTreeClassifier,1.0,1.0
5,GaussianNB,0.815955,0.817807
6,KNeighborsClassifier,0.943844,0.943992
7,RandomForestClassifier,0.999182,0.999182
8,ExtraTreesClassifier,1.0,1.0
9,GCForest,0.99803,0.998029


In [18]:
test_df

Unnamed: 0,model_name,test_f1_weighted,test_accuracy
0,LogisticRegression,0.624301,0.672765
1,LinearDiscriminantAnalysis,0.580324,0.62844
2,SVC,0.897883,0.898855
3,DecisionTreeClassifier,0.970668,0.970549
4,ExtraTreeClassifier,0.875281,0.875056
5,GaussianNB,0.826875,0.828797
6,KNeighborsClassifier,0.903135,0.903019
7,RandomForestClassifier,0.97588,0.975755
8,ExtraTreesClassifier,0.967961,0.967871
9,GCForest,0.97619,0.976052


In [27]:
for model in models:
    model_name = model.__class__.__name__
    y_pred = model.predict(X_test)
    print("=================" + model_name + "=================")
    print(classification_report(y_test, y_pred, digits=4))

    cm = confusion_matrix(y_test, y_pred)
    i=0
    acc_all = np.zeros(6)
    for c in cm:
        acc_all[i] = c[i]/np.sum(c)
        print("%d accuaracy: %f" %(i, acc_all[i]))
        i=i+1
    print("acc:", np.sum(y_test == y_pred)/y_pred.shape[0])
    print('f1_weighted', f1_score(y_test, y_pred, average='weighted'))

             precision    recall  f1-score   support

          0     0.9524    0.0820    0.1509       488
          1     0.6700    0.9951    0.8008      2655
          2     0.6630    0.5534    0.6033      1760
          3     0.6325    0.4799    0.5457      1094
          4     0.0000    0.0000    0.0000       297
          5     0.7808    0.7972    0.7889       429

avg / total     0.6601    0.6728    0.6243      6723

0 accuaracy: 0.081967
1 accuaracy: 0.995104
2 accuaracy: 0.553409
3 accuaracy: 0.479890
4 accuaracy: 0.000000
5 accuaracy: 0.797203
acc: 0.6727651346125242
f1_weighted 0.6243014542088469
             precision    recall  f1-score   support

          0     0.0000    0.0000    0.0000       488
          1     0.6057    0.9985    0.7540      2655
          2     0.6062    0.3989    0.4812      1760
          3     0.7087    0.5293    0.6060      1094
          4     0.5366    0.2963    0.3818       297
          5     0.9903    0.4779    0.6447       429

avg / total  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.7791    0.6721    0.7217       488
          1     0.8829    0.9311    0.9063      2655
          2     0.9089    0.8784    0.8934      1760
          3     0.9388    0.9260    0.9324      1094
          4     0.9201    0.8923    0.9060       297
          5     0.9654    0.9767    0.9710       429

avg / total     0.8982    0.8989    0.8979      6723

0 accuaracy: 0.672131
1 accuaracy: 0.931073
2 accuaracy: 0.878409
3 accuaracy: 0.925960
4 accuaracy: 0.892256
5 accuaracy: 0.976690
acc: 0.8988546779711438
f1_weighted 0.8978825354498516
             precision    recall  f1-score   support

          0     0.9009    0.9877    0.9423       488
          1     0.9866    0.9729    0.9797      2655
          2     0.9788    0.9716    0.9752      1760
          3     0.9714    0.9634    0.9674      1094
          4     0.9254    0.9192    0.9223       297
          5     0.9549    0.9860    0.9702       429

avg / total  

[ 2019-04-02 23:53:25,827][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]
[ 2019-04-02 23:53:25,830][cascade_classifier.transform] group_dims=[4]
[ 2019-04-02 23:53:25,831][cascade_classifier.transform] X_test.shape=(6723, 4)
[ 2019-04-02 23:53:25,835][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(6723, 4)


             precision    recall  f1-score   support

          0     0.9049    0.9754    0.9389       488
          1     0.9829    0.9755    0.9792      2655
          2     0.9737    0.9670    0.9704      1760
          3     0.9656    0.9506    0.9581      1094
          4     0.9231    0.9293    0.9262       297
          5     0.9658    0.9860    0.9758       429

avg / total     0.9683    0.9679    0.9680      6723

0 accuaracy: 0.975410
1 accuaracy: 0.975518
2 accuaracy: 0.967045
3 accuaracy: 0.950640
4 accuaracy: 0.929293
5 accuaracy: 0.986014
acc: 0.9678714859437751
f1_weighted 0.9679610964861342


[ 2019-04-02 23:53:26,161][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2019-04-02 23:53:26,578][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2019-04-02 23:53:26,959][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2019-04-02 23:53:27,393][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2019-04-02 23:53:27,740][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(6723, 52)


             precision    recall  f1-score   support

          0     0.9065    0.9939    0.9482       488
          1     0.9927    0.9755    0.9840      2655
          2     0.9835    0.9801    0.9818      1760
          3     0.9788    0.9689    0.9738      1094
          4     0.9267    0.9360    0.9313       297
          5     0.9593    0.9883    0.9736       429

avg / total     0.9767    0.9761    0.9762      6723

0 accuaracy: 0.993852
1 accuaracy: 0.975518
2 accuaracy: 0.980114
3 accuaracy: 0.968921
4 accuaracy: 0.936027
5 accuaracy: 0.988345
acc: 0.976052357578462
f1_weighted 0.9761901664502985


In [28]:
for model in models:
    model_name = model.__class__.__name__
    y_pred = model.predict(X_train)
    print("=================" + model_name + "=================")
    print(classification_report(y_train, y_pred, digits=4))

    cm = confusion_matrix(y_train, y_pred)
    i=0
    acc_all = np.zeros(6)
    for c in cm:
        acc_all[i] = c[i]/np.sum(c)
        print("%d accuaracy: %f" %(i, acc_all[i]))
        i=i+1
    print("acc:", np.sum(y_train == y_pred)/y_pred.shape[0])
    print('f1_weighted', f1_score(y_train, y_pred, average='weighted'))

             precision    recall  f1-score   support

          0     0.9182    0.0518    0.0981      1950
          1     0.6734    0.9919    0.8022     10617
          2     0.6428    0.5761    0.6076      7037
          3     0.6242    0.4504    0.5233      4378
          4     0.0000    0.0000    0.0000      1190
          5     0.7899    0.7705    0.7801      1717

avg / total     0.6528    0.6687    0.6179     26889

0 accuaracy: 0.051795
1 accuaracy: 0.991900
2 accuaracy: 0.576098
3 accuaracy: 0.450434
4 accuaracy: 0.000000
5 accuaracy: 0.770530
acc: 0.6687121127598646
f1_weighted 0.6178872582244622
             precision    recall  f1-score   support

          0     0.0000    0.0000    0.0000      1950
          1     0.6017    0.9986    0.7509     10617
          2     0.5950    0.3879    0.4697      7037
          3     0.6923    0.5114    0.5883      4378
          4     0.4900    0.2689    0.3473      1190
          5     0.9861    0.4537    0.6215      1717

avg / total  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.8553    0.6759    0.7551      1950
          1     0.8906    0.9358    0.9126     10617
          2     0.8862    0.8957    0.8909      7037
          3     0.9419    0.9073    0.9243      4378
          4     0.8997    0.8891    0.8943      1190
          5     0.9733    0.9563    0.9647      1717

avg / total     0.9009    0.9010    0.8999     26889

0 accuaracy: 0.675897
1 accuaracy: 0.935763
2 accuaracy: 0.895694
3 accuaracy: 0.907264
4 accuaracy: 0.889076
5 accuaracy: 0.956319
acc: 0.9010375990181859
f1_weighted 0.8999290952737513
             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000      1950
          1     1.0000    1.0000    1.0000     10617
          2     1.0000    1.0000    1.0000      7037
          3     1.0000    1.0000    1.0000      4378
          4     1.0000    1.0000    1.0000      1190
          5     1.0000    1.0000    1.0000      1717

avg / total  

[ 2019-04-02 23:53:39,533][cascade_classifier.transform] X_groups_test.shape=[(26889, 4)]
[ 2019-04-02 23:53:39,540][cascade_classifier.transform] group_dims=[4]
[ 2019-04-02 23:53:39,545][cascade_classifier.transform] X_test.shape=(26889, 4)
[ 2019-04-02 23:53:39,548][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(26889, 4)


f1_weighted 0.999181826982915
             precision    recall  f1-score   support

          0     1.0000    1.0000    1.0000      1950
          1     1.0000    1.0000    1.0000     10617
          2     1.0000    1.0000    1.0000      7037
          3     1.0000    1.0000    1.0000      4378
          4     1.0000    1.0000    1.0000      1190
          5     1.0000    1.0000    1.0000      1717

avg / total     1.0000    1.0000    1.0000     26889

0 accuaracy: 1.000000
1 accuaracy: 1.000000
2 accuaracy: 1.000000
3 accuaracy: 1.000000
4 accuaracy: 1.000000
5 accuaracy: 1.000000
acc: 1.0
f1_weighted 1.0


[ 2019-04-02 23:53:40,717][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(26889, 52)
[ 2019-04-02 23:53:42,164][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(26889, 52)
[ 2019-04-02 23:53:43,573][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(26889, 52)
[ 2019-04-02 23:53:44,700][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(26889, 52)
[ 2019-04-02 23:53:45,725][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(26889, 52)


             precision    recall  f1-score   support

          0     0.9949    0.9990    0.9969      1950
          1     0.9999    0.9968    0.9983     10617
          2     0.9970    0.9989    0.9979      7037
          3     0.9973    0.9989    0.9981      4378
          4     0.9933    0.9992    0.9962      1190
          5     0.9994    0.9983    0.9988      1717

avg / total     0.9980    0.9980    0.9980     26889

0 accuaracy: 0.998974
1 accuaracy: 0.996798
2 accuaracy: 0.998863
3 accuaracy: 0.998858
4 accuaracy: 0.999160
5 accuaracy: 0.998253
acc: 0.9980289337647366
f1_weighted 0.9980295751139766


MeanSD

1. MeanSD weighted F1

In [30]:
from sklearn.model_selection import StratifiedKFold

for model in models:
    model_name = model.__class__.__name__
    
    # Kfold
    scores = []
    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X_train, y_train):
        K_train_x, K_test_x = X_train[train_index], X_train[test_index]
        K_train_y, K_test_y = y_train[train_index], y_train[test_index]
        
        y_pred = model.predict(K_test_x)
        scores.append(f1_score(K_test_y, y_pred,average="weighted"))
    print("%s    Weighted F1: %0.2f ± %0.2f %%" % (model_name, np.mean(scores)*100, np.std(scores)*100))

  'precision', 'predicted', average, warn_for)


LogisticRegression    Weighted F1: 61.78 ± 0.96 %
LinearDiscriminantAnalysis    Weighted F1: 57.02 ± 0.25 %
SVC    Weighted F1: 89.99 ± 0.32 %
DecisionTreeClassifier    Weighted F1: 100.00 ± 0.00 %
ExtraTreeClassifier    Weighted F1: 100.00 ± 0.00 %
GaussianNB    Weighted F1: 81.59 ± 0.41 %


[ 2019-04-03 00:10:16,654][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]
[ 2019-04-03 00:10:16,657][cascade_classifier.transform] group_dims=[4]
[ 2019-04-03 00:10:16,658][cascade_classifier.transform] X_test.shape=(5380, 4)
[ 2019-04-03 00:10:16,660][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5380, 4)


KNeighborsClassifier    Weighted F1: 94.38 ± 0.30 %
RandomForestClassifier    Weighted F1: 99.92 ± 0.03 %
ExtraTreesClassifier    Weighted F1: 100.00 ± 0.00 %


[ 2019-04-03 00:10:16,872][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:10:17,060][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:10:17,238][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:10:17,424][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:10:17,725][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:10:17,912][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]
[ 2019-04-03 00:10:17,914][cascade_classifier.transform] group_dims=[4]
[ 2019-04-03 00:10:17,918][cascade_classifier.transform] X_test.shape=(5380, 4)
[ 2019-04-03 00:10:17,919][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5380, 4)
[ 2019-04-03 00:10:18,078][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cu

GCForest    Weighted F1: 99.80 ± 0.06 %


2. MeanSD Acc

In [31]:
for model in models:
    model_name = model.__class__.__name__
    
    # Kfold
    scores = []
    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X_train, y_train):
        K_train_x, K_test_x = X_train[train_index], X_train[test_index]
        K_train_y, K_test_y = y_train[train_index], y_train[test_index]
        
        y_pred = model.predict(K_test_x)
        scores.append(accuracy_score(K_test_y, y_pred))
    print("%s    Acc: %0.2f ± %0.2f %%" % (model_name, np.mean(scores)*100, np.std(scores)*100))

LogisticRegression    Acc: 66.87 ± 0.83 %
LinearDiscriminantAnalysis    Acc: 62.00 ± 0.21 %
SVC    Acc: 90.10 ± 0.32 %
DecisionTreeClassifier    Acc: 100.00 ± 0.00 %
ExtraTreeClassifier    Acc: 100.00 ± 0.00 %
GaussianNB    Acc: 81.78 ± 0.39 %


[ 2019-04-03 00:11:17,345][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]
[ 2019-04-03 00:11:17,352][cascade_classifier.transform] group_dims=[4]
[ 2019-04-03 00:11:17,354][cascade_classifier.transform] X_test.shape=(5380, 4)
[ 2019-04-03 00:11:17,356][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5380, 4)


KNeighborsClassifier    Acc: 94.40 ± 0.30 %
RandomForestClassifier    Acc: 99.92 ± 0.03 %
ExtraTreesClassifier    Acc: 100.00 ± 0.00 %


[ 2019-04-03 00:11:17,561][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:11:17,730][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:11:17,896][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:11:18,058][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:11:18,288][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-04-03 00:11:18,463][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]
[ 2019-04-03 00:11:18,464][cascade_classifier.transform] group_dims=[4]
[ 2019-04-03 00:11:18,465][cascade_classifier.transform] X_test.shape=(5380, 4)
[ 2019-04-03 00:11:18,467][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5380, 4)
[ 2019-04-03 00:11:18,636][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cu

GCForest    Acc: 99.80 ± 0.06 %
