In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest

  from numpy.core.umath_tests import inner1d


hyperparameters

In [2]:
# Experiment-wide settings.
random_seed = 42        # random_state handed to both train_test_split calls
kf = 10                 # NOTE(review): not referenced by the visible cells (they hard-code n_splits=5)
score = "f1_weighted"   # NOTE(review): not referenced by the visible cells; presumably a scikit-learn scoring name

# load data_four_features

In [3]:
# Resolve the four-feature CSV relative to the current working directory.
path = os.getcwd()+'/../data/20122018freshwater_four_feature.csv'

# Load the table into a DataFrame.
data_four_features = pd.read_csv(
    path,
    na_values=np.nan,
)

# Quick sanity check of the load: (rows, columns).
print(data_four_features.shape)

(33612, 5)


- CE(1-9)标签y为1-6
- CE(1-10)标签y为0-5

先进行计算CE(1-9)，在载入CE(1-10)的时候要将标签统一减去1

In [4]:
# Split the table into the target column ('本周水质', "this week's water
# quality") and the remaining feature columns.
y = data_four_features['本周水质']                  # Series holding the class labels
X = data_four_features.drop(['本周水质'], axis=1)   # DataFrame with every other column

# Class balance of the target.
print("水质分布情况:")
print(y.value_counts())

# Number of columns per dtype across the whole table.
print("\n各特征类型分布情况:")
print(data_four_features.dtypes.value_counts())

水质分布情况:
2    13272
3     8797
4     5472
1     2438
6     2146
5     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
float64    4
int64      1
dtype: int64


In [5]:
# Convert the pandas objects to plain ndarrays (later cells index them with
# integer position arrays).
# np.asarray is used instead of `.values` so that re-running this cell is a
# no-op: `.values` raises AttributeError once X and y are already ndarrays.
X = np.asarray(X)
y = np.asarray(y)

In [6]:
print("============ train_test_split ============")

# Hold out a stratified fraction of the samples as the test set.
test_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction,
                                       stratify = y, random_state = random_seed)

# Derive the reported percentages from the fraction actually used, so the
# message cannot drift away from the split (it previously claimed 67%/33%
# while test_size was 0.2).
train_pct = round((1 - test_fraction) * 100)
test_pct = round(test_fraction * 100)
print("%d%% train: %d/%d, %d%% test: %d/%d" % (train_pct, X_train.shape[0], X.shape[0],
                                               test_pct, X_test.shape[0], X.shape[0]))

67% train: 26889/33612, 33% test: 6723/33612


### normalize  train data

Fill missing values (NaN) with the column median, then standardize the data; the output is an ndarray.

In [7]:
# Preprocessing: replace NaN with the per-column median, then standardize.
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22
# (replaced by sklearn.impute.SimpleImputer); kept here to match the
# environment this notebook was written for.
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])

# Learn the medians / means / variances from the training split only, then
# apply those same statistics to the test split. Calling fit_transform on the
# test split (as before) re-fits the pipeline on test data, so train and test
# end up scaled differently and test statistics leak into preprocessing.
# NOTE(review): X_train is overwritten in place, so re-running this cell
# re-fits the pipeline on already-transformed data.
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.transform(X_test)

print("============ train_valid_split ============")
# Carve a stratified 25% validation split out of the training data.
X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train, test_size=0.25,
                                       stratify = y_train, random_state = random_seed)




- X_train2: training set
- X_valid: validation set
- X_test: test set

## Accuracy MEAN (SD) based on 5-fold cross-validation
1. 载入CE（1-9）的子模型，仅计算CE（1-9）
2. 载入CE（1-10）的子模型覆盖之前的模型，计算10个子模型
3. 计算CE（1-10）
4. 共得到12个模型的MEAN(SD)

- 载入CE（1-9）的子模型，仅计算CE（1-9），标签为1-6

argmax后标签为0-5，需要加1

In [8]:
# File-name stems of the nine pre-trained CE(1-9) base learners.
model_names = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier"
]

# Load every pickled base learner. `models` is built as a fresh list instead
# of overwriting the list of names while iterating over it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
models = []
for model_name in model_names:
    with open("../pkl/CE_97661/CE_" + model_name + ".pkl", "rb") as f:
        models.append(pickle.load(f))

# Ensemble weights, indexed below by base-learner position.
population_best_weight = np.load("../npy/CE_best_weights.npy")

classifier_num = len(models)

scores = []
skf = StratifiedKFold(n_splits=5)
# NOTE(review): the models are loaded pre-trained and are not refit inside the
# loop; only the held-out part of each fold is used, so this measures the
# spread of accuracy over five partitions of X_train rather than a
# refit-per-fold cross-validation.
for _, test_index in skf.split(X_train, y_train):
    K_test_x, K_test_y = X_train[test_index], y_train[test_index]

    # Every base learner outputs a class-probability matrix for this fold.
    y_test_pred_proba_all = [model.predict_proba(K_test_x) for model in models]

    # Accumulator for the weighted ensemble probabilities (6 classes).
    y_test_pred_ensemble_proba = np.zeros((len(K_test_y), 6))

    # Weighted sum of the base learners' probability outputs.
    for k in range(classifier_num):
        y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
    # argmax yields column indices 0-5 while the labels here are 1-6, hence +1.
    y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1) + 1

    scores.append(accuracy_score(K_test_y, y_test_pred_ensemble))
# "\\pm" avoids the invalid "\p" escape sequence; the printed text is unchanged.
print("CE(1-9)  Accuracy: %0.2f $\\pm$ %0.2f %%" % (np.mean(scores)*100, np.std(scores)*100))

CE(1-9)  Accuracy: 99.77 $\pm$ 0.05 %


- 载入CE（1-10）的子模型覆盖之前的模型，计算10个子模型，标签为 0-5

In [9]:
# The CE(1-10) sub-models use labels 0-5, while the data is labelled 1-6.
# Shift only while the labels are still 1-based, so that re-running this cell
# does not shift them a second time.
if y_train.min() == 1:
    y_train = y_train - 1

# File-name stems of the ten pre-trained CE(1-10) base learners.
model_names = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier",
    "GCForest"
]

# `models` ends up holding the loaded model objects in the same order as
# `model_names`; it is built as a fresh list instead of overwriting the list
# of names while iterating over it.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
models = []
for model_name in model_names:
    model_path = "../pkl/CE_97661_10/CE_" + model_name + ".pkl"
    with open(model_path, "rb") as f:
        model = pickle.load(f)
    models.append(model)

    # Score the loaded model on the held-out part of each of 5 stratified
    # folds of X_train.
    # NOTE(review): the model is not refit per fold, so this reports the
    # spread of accuracy over five partitions of X_train.
    scores = []
    skf = StratifiedKFold(n_splits=5)
    for _, test_index in skf.split(X_train, y_train):
        K_test_x, K_test_y = X_train[test_index], y_train[test_index]

        y_pred = model.predict(K_test_x)
        scores.append(accuracy_score(K_test_y, y_pred))
    # "\\pm" avoids the invalid "\p" escape sequence; the printed text is unchanged.
    print("%s    Accuracy: %0.2f $\\pm$ %0.2f %%" % (model_name, np.mean(scores)*100, np.std(scores)*100))

LogisticRegression    Accuracy: 66.73 $\pm$ 0.75 %
LinearDiscriminantAnalysis    Accuracy: 61.81 $\pm$ 0.09 %
SVC    Accuracy: 89.44 $\pm$ 0.42 %
DecisionTreeClassifier    Accuracy: 99.69 $\pm$ 0.05 %
ExtraTreeClassifier    Accuracy: 97.39 $\pm$ 0.16 %
GaussianNB    Accuracy: 81.44 $\pm$ 0.48 %
KNeighborsClassifier    Accuracy: 92.69 $\pm$ 0.45 %
RandomForestClassifier    Accuracy: 99.78 $\pm$ 0.05 %
ExtraTreesClassifier    Accuracy: 99.33 $\pm$ 0.14 %


[ 2019-02-11 15:44:22,194][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]
[ 2019-02-11 15:44:22,195][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 15:44:22,196][cascade_classifier.transform] X_test.shape=(5380, 4)
[ 2019-02-11 15:44:22,197][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5380, 4)
[ 2019-02-11 15:44:22,335][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:22,488][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:22,632][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:22,772][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:22,909][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:23,052][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]

GCForest    Accuracy: 99.78 $\pm$ 0.08 %


- 计算CE（1-10）

In [11]:
# Ensemble weights for the ten CE(1-10) base learners, indexed by position.
population_best_weight = np.load("../npy/CE_best_weights(1-10).npy")

classifier_num = 10

# This cell relies on `models` holding the ten CE(1-10) learners loaded by an
# earlier cell. Fail with a clear message if the kernel state is stale,
# rather than with an IndexError deep inside the loop.
if len(models) != classifier_num:
    raise ValueError("expected %d loaded models, found %d" % (classifier_num, len(models)))

scores = []
skf = StratifiedKFold(n_splits=5)
# NOTE(review): the models are not refit inside the loop; only the held-out
# part of each fold is used for scoring.
for _, test_index in skf.split(X_train, y_train):
    K_test_x, K_test_y = X_train[test_index], y_train[test_index]

    # Every base learner outputs a class-probability matrix for this fold.
    y_test_pred_proba_all = [model.predict_proba(K_test_x) for model in models]

    # Accumulator for the weighted ensemble probabilities (6 classes).
    y_test_pred_ensemble_proba = np.zeros((len(K_test_y), 6))

    # Weighted sum of the base learners' probability outputs.
    for k in range(classifier_num):
        y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
    # Labels are expected to be 0-5 at this point, matching the argmax column
    # index directly (no +1).
    y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1)

    scores.append(accuracy_score(K_test_y, y_test_pred_ensemble))
# "\\pm" avoids the invalid "\p" escape sequence; the printed text is unchanged.
print("CE(1-10)  Accuracy: %0.2f $\\pm$ %0.2f %%" % (np.mean(scores)*100, np.std(scores)*100))

[ 2019-02-11 15:44:50,518][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]
[ 2019-02-11 15:44:50,521][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 15:44:50,522][cascade_classifier.transform] X_test.shape=(5380, 4)
[ 2019-02-11 15:44:50,523][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(5380, 4)
[ 2019-02-11 15:44:50,673][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:50,819][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:50,963][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:51,111][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:51,255][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(5380, 52)
[ 2019-02-11 15:44:52,290][cascade_classifier.transform] X_groups_test.shape=[(5380, 4)]

CE(1-10)  Accuracy: 99.74 $\pm$ 0.09 %
