In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pickle

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest

  from numpy.core.umath_tests import inner1d


hyperparameters

In [2]:
random_seed = 42
kf=10
score = 'f1_weighted'

# load data_four_features

In [3]:
path = os.getcwd()+'/../data/20122018freshwater_four_feature.csv'
data_four_features = pd.read_csv(path, na_values = np.nan)

print(data_four_features.shape)

(33612, 5)


- CE(1-9)标签y为1-6
- CE(1-10)标签y为0-5

先进行计算CE(1-9)，在载入CE(1-10)的时候要将标签统一减去1

In [4]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质'] # Series

In [5]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质'] # Series
print("水质分布情况:")
print(y.value_counts())
print("\n各特征类型分布情况:")
print(data_four_features.dtypes.value_counts())

水质分布情况:
2    13272
3     8797
4     5472
1     2438
6     2146
5     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
float64    4
int64      1
dtype: int64


In [6]:
# Convert series to ndarray
X = X.values
y = y.values

In [7]:
print("============ train_test_split ============")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                       stratify = y, random_state = random_seed)
print("67%% train: %d/%d, 33%% test: %d/%d" %(X_train.shape[0], X.shape[0], X_test.shape[0], X.shape[0]))

67% train: 26889/33612, 33% test: 6723/33612


### normalize  train data

fulfill the Na with median, then standardized the data, output type ndarray

In [8]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.fit_transform(X_test)

print("============ train_valid_split ============")
X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train, test_size=0.25, 
                                       stratify = y_train, random_state = random_seed)



- X_train2: training set
- X_valid: validation set
- X_test: test set

## Accuracy on 3 parts of data

1. 载入CE（1-9）的子模型，仅计算CE（1-9），标签1-6
2. 载入CE（1-10）的子模型覆盖之前的模型，计算10个子模型,标签0-5
3. 计算CE（1-10）

- 载入CE（1-9）的子模型，仅计算CE（1-9），标签1-6

In [9]:
models = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier"
]
i=0
for model in models:
    model_name = model
    with open("../pkl/CE_97661/CE_" + model_name + ".pkl", "rb") as f:
        models[i] = pickle.load(f)
    i = i+1
# models 不再是字符数组，而是模型数组

population_best_weight = np.load("../npy/CE_best_weights.npy")

classifier_num = 9

# 所有学习器都输出概率向量，最后投票
y_train_pred_proba_all = []
y_valid_pred_proba_all = []
y_test_pred_proba_all = []

# 取训练好的模型，计算各模型”验证集“上输出概率向量
for model in models:
    train_pred_proba = model.predict_proba(X_train2)
    valid_pred_proba = model.predict_proba(X_valid)
    test_pred_proba = model.predict_proba(X_test)
    y_train_pred_proba_all.append(train_pred_proba)
    y_valid_pred_proba_all.append(valid_pred_proba)
    y_test_pred_proba_all.append(test_pred_proba)
    
y_train_pred_ensemble_proba = np.zeros((len(y_train2), 6)) # 初始化集成器概率向量
y_valid_pred_ensemble_proba = np.zeros((len(y_valid), 6)) # 初始化集成器概率向量
y_test_pred_ensemble_proba = np.zeros((len(y_test), 6)) # 初始化集成器概率向量

# 为每一个基学习器乘上权重
for k in range(classifier_num):
    y_train_pred_ensemble_proba += y_train_pred_proba_all[k] * population_best_weight[k]
    y_valid_pred_ensemble_proba += y_valid_pred_proba_all[k] * population_best_weight[k]
    y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
y_train_pred_ensemble = np.argmax(y_train_pred_ensemble_proba, axis=1) + 1
y_valid_pred_ensemble = np.argmax(y_valid_pred_ensemble_proba, axis=1) + 1
y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1) + 1

# 计算各水质等级的得分
print("=================CE(1-9)=================")

print(classification_report(y_train2, y_train_pred_ensemble, digits=4))
print(classification_report(y_valid, y_valid_pred_ensemble, digits=4))
print(classification_report(y_test, y_test_pred_ensemble, digits=4))

             precision    recall  f1-score   support

          1     1.0000    0.9945    0.9973      1462
          2     0.9987    0.9999    0.9993      7962
          3     0.9991    0.9994    0.9992      5278
          4     0.9994    0.9991    0.9992      3283
          5     0.9989    0.9966    0.9978       893
          6     0.9984    0.9984    0.9984      1288

avg / total     0.9990    0.9990    0.9990     20166

             precision    recall  f1-score   support

          1     0.9898    0.9918    0.9908       488
          2     0.9959    0.9951    0.9955      2655
          3     0.9938    0.9943    0.9940      1759
          4     0.9945    0.9927    0.9936      1095
          5     0.9833    0.9899    0.9866       297
          6     0.9907    0.9907    0.9907       429

avg / total     0.9938    0.9938    0.9938      6723

             precision    recall  f1-score   support

          1     0.9099    0.9939    0.9500       488
          2     0.9927    0.9763    0.9

- 载入CE（1-10）的子模型覆盖之前的模型，计算10个子模型,标签0-5

In [10]:
y_train2 = y_train2-1
y_valid = y_valid-1
y_test = y_test-1

models = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier",
    "GCForest"
]

i=0
for name in models:
    model_path = "../pkl/CE_97661_10/CE_" + name + ".pkl"
    with open(model_path, "rb") as f:
        models[i] = pickle.load(f)
    i = i+1

In [11]:
for model in models:
    model_name = model.__class__.__name__
    train_pred = model.predict(X_train2)
    valid_pred = model.predict(X_valid)
    test_pred = model.predict(X_test)
    print("=================" + model_name + "=================")
    print(classification_report(y_train2, train_pred, digits=4))
    print(classification_report(y_valid, valid_pred, digits=4))
    print(classification_report(y_test, test_pred, digits=4))

             precision    recall  f1-score   support

          0     0.8654    0.0308    0.0594      1462
          1     0.6703    0.9921    0.8000      7962
          2     0.6432    0.5722    0.6056      5278
          3     0.6217    0.4505    0.5224      3283
          4     0.0000    0.0000    0.0000       893
          5     0.7904    0.7702    0.7802      1288

avg / total     0.6474    0.6662    0.6136     20166

             precision    recall  f1-score   support

          0     0.9375    0.0307    0.0595       488
          1     0.6764    0.9913    0.8042      2655
          2     0.6439    0.5850    0.6130      1759
          3     0.6338    0.4521    0.5277      1095
          4     0.0000    0.0000    0.0000       297
          5     0.7735    0.7879    0.7806       429

avg / total     0.6562    0.6707    0.6181      6723

             precision    recall  f1-score   support

          0     0.9333    0.0574    0.1081       488
          1     0.6672    0.9951    0.7

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0     0.8607    0.6464    0.7383      1462
          1     0.8850    0.9362    0.9099      7962
          2     0.8812    0.8920    0.8865      5278
          3     0.9375    0.9007    0.9188      3283
          4     0.8914    0.8735    0.8824       893
          5     0.9686    0.9573    0.9629      1288

avg / total     0.8964    0.8964    0.8949     20166

             precision    recall  f1-score   support

          0     0.8509    0.6434    0.7328       488
          1     0.8802    0.9322    0.9054      2655
          2     0.8760    0.8874    0.8817      1759
          3     0.9239    0.8977    0.9106      1095
          4     0.8826    0.7845    0.8307       297
          5     0.9421    0.9487    0.9454       429

avg / total     0.8881    0.8884    0.8868      6723

             precision    recall  f1-score   support

          0     0.7970    0.6516    0.7170       488
          1     0.8790    0.9333    0.9

[ 2019-02-11 17:04:46,419][cascade_classifier.transform] X_groups_test.shape=[(20166, 4)]
[ 2019-02-11 17:04:46,420][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 17:04:46,421][cascade_classifier.transform] X_test.shape=(20166, 4)
[ 2019-02-11 17:04:46,422][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(20166, 4)


             precision    recall  f1-score   support

          0     0.8741    0.8406    0.8570      1462
          1     0.9355    0.9651    0.9500      7962
          2     0.9457    0.9343    0.9400      5278
          3     0.9520    0.9421    0.9470      3283
          4     0.9208    0.8589    0.8888       893
          5     0.9776    0.9488    0.9630      1288

avg / total     0.9384    0.9385    0.9383     20166

             precision    recall  f1-score   support

          0     0.7908    0.7746    0.7826       488
          1     0.8996    0.9348    0.9169      2655
          2     0.9013    0.8931    0.8972      1759
          3     0.9092    0.8868    0.8978      1095
          4     0.7879    0.7003    0.7415       297
          5     0.9440    0.9044    0.9238       429

avg / total     0.8916    0.8922    0.8916      6723

             precision    recall  f1-score   support

          0     0.7530    0.7684    0.7606       488
          1     0.9016    0.9315    0.9

[ 2019-02-11 17:04:46,869][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:47,333][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:47,781][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:48,235][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:48,692][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:49,178][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]
[ 2019-02-11 17:04:49,179][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 17:04:49,180][cascade_classifier.transform] X_test.shape=(6723, 4)
[ 2019-02-11 17:04:49,181][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(6723, 4)
[ 2019-02-11 17:04:49,378][cascade_classifier.transform] [layer=1] look_indexs=[0],

             precision    recall  f1-score   support

          0     0.9979    0.9966    0.9973      1462
          1     0.9992    0.9995    0.9994      7962
          2     1.0000    0.9983    0.9991      5278
          3     0.9985    0.9997    0.9991      3283
          4     0.9944    1.0000    0.9972       893
          5     1.0000    1.0000    1.0000      1288

avg / total     0.9991    0.9991    0.9991     20166

             precision    recall  f1-score   support

          0     0.9898    0.9939    0.9918       488
          1     0.9970    0.9951    0.9960      2655
          2     0.9938    0.9949    0.9943      1759
          3     0.9927    0.9927    0.9927      1095
          4     0.9833    0.9899    0.9866       297
          5     0.9930    0.9907    0.9918       429

avg / total     0.9941    0.9941    0.9941      6723

             precision    recall  f1-score   support

          0     0.9082    0.9939    0.9491       488
          1     0.9927    0.9759    0.9

- 计算CE（1-10）

In [12]:
population_best_weight = np.load("../npy/CE_best_weights(1-10).npy")

classifier_num = 10

# 所有学习器都输出概率向量，最后投票
y_train_pred_proba_all = []
y_valid_pred_proba_all = []
y_test_pred_proba_all = []

# 取训练好的模型，计算各模型”验证集“上输出概率向量
for model in models:
    train_pred_proba = model.predict_proba(X_train2)
    valid_pred_proba = model.predict_proba(X_valid)
    test_pred_proba = model.predict_proba(X_test)
    y_train_pred_proba_all.append(train_pred_proba)
    y_valid_pred_proba_all.append(valid_pred_proba)
    y_test_pred_proba_all.append(test_pred_proba)
    
y_train_pred_ensemble_proba = np.zeros((len(y_train2), 6)) # 初始化集成器概率向量
y_valid_pred_ensemble_proba = np.zeros((len(y_valid), 6)) # 初始化集成器概率向量
y_test_pred_ensemble_proba = np.zeros((len(y_test), 6)) # 初始化集成器概率向量

# 为每一个基学习器乘上权重
for k in range(classifier_num):
    y_train_pred_ensemble_proba += y_train_pred_proba_all[k] * population_best_weight[k]
    y_valid_pred_ensemble_proba += y_valid_pred_proba_all[k] * population_best_weight[k]
    y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
y_train_pred_ensemble = np.argmax(y_train_pred_ensemble_proba, axis=1)
y_valid_pred_ensemble = np.argmax(y_valid_pred_ensemble_proba, axis=1)
y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1)

# 计算各水质等级的得分
print("=================CE(1-10)=================")

print(classification_report(y_train2, y_train_pred_ensemble, digits=4))
print(classification_report(y_valid, y_valid_pred_ensemble, digits=4))
print(classification_report(y_test, y_test_pred_ensemble, digits=4))

[ 2019-02-11 17:04:57,190][cascade_classifier.transform] X_groups_test.shape=[(20166, 4)]
[ 2019-02-11 17:04:57,194][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 17:04:57,195][cascade_classifier.transform] X_test.shape=(20166, 4)
[ 2019-02-11 17:04:57,196][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(20166, 4)
[ 2019-02-11 17:04:57,627][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:58,090][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:58,536][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:58,978][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:59,425][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 17:04:59,880][cascade_classifier.transform] X_groups_test.shape=[(6

             precision    recall  f1-score   support

          0     0.9979    0.9938    0.9959      1462
          1     0.9985    0.9986    0.9986      7962
          2     0.9989    0.9985    0.9987      5278
          3     0.9979    0.9991    0.9985      3283
          4     0.9978    1.0000    0.9989       893
          5     0.9992    1.0000    0.9996      1288

avg / total     0.9985    0.9985    0.9985     20166

             precision    recall  f1-score   support

          0     0.9898    0.9939    0.9918       488
          1     0.9970    0.9951    0.9960      2655
          2     0.9938    0.9949    0.9943      1759
          3     0.9936    0.9927    0.9931      1095
          4     0.9800    0.9899    0.9849       297
          5     0.9930    0.9907    0.9918       429

avg / total     0.9941    0.9941    0.9941      6723

             precision    recall  f1-score   support

          0     0.9099    0.9939    0.9500       488
          1     0.9927    0.9763    0.9