In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest

  from numpy.core.umath_tests import inner1d


hyperparameters

In [2]:
random_seed = 42
kf=10
score = 'f1_weighted'

# load data_four_features

In [3]:
path = os.getcwd()+'/../data/20122018freshwater_four_feature.csv'
data_four_features = pd.read_csv(path, na_values = np.nan)

print(data_four_features.shape)

(33612, 5)


- CE(1-9)标签y为1-6
- CE(1-10)标签y为0-5

先进行计算CE(1-9)，在载入CE(1-10)的时候要将标签统一减去1

In [4]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质'] # Series

In [5]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质'] # Series
print("水质分布情况:")
print(y.value_counts())
print("\n各特征类型分布情况:")
print(data_four_features.dtypes.value_counts())

水质分布情况:
2    13272
3     8797
4     5472
1     2438
6     2146
5     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
float64    4
int64      1
dtype: int64


In [6]:
# Convert series to ndarray
X = X.values
y = y.values

In [7]:
print("============ train_test_split ============")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                       stratify = y, random_state = random_seed)
print("67%% train: %d/%d, 33%% test: %d/%d" %(X_train.shape[0], X.shape[0], X_test.shape[0], X.shape[0]))

67% train: 26889/33612, 33% test: 6723/33612


### normalize  train data

fulfill the Na with median, then standardized the data, output type ndarray

In [8]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.fit_transform(X_test)

print("============ train_valid_split ============")
X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train, test_size=0.25, 
                                       stratify = y_train, random_state = random_seed)



- X_train2: training set
- X_valid: validation set
- X_test: test set

## Accuracy on 3 parts of data

1. 载入CE（1-9）的子模型，仅计算CE（1-9），标签1-6
2. 载入CE（1-10）的子模型覆盖之前的模型，计算10个子模型,标签0-5
3. 计算CE（1-10）

- 载入CE（1-9）的子模型，仅计算CE（1-9），标签1-6

In [9]:
models = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier"
]
i=0
for model in models:
    model_name = model
    with open("../pkl/CE_97661/CE_" + model_name + ".pkl", "rb") as f:
        models[i] = pickle.load(f)
    i = i+1
# models 不再是字符数组，而是模型数组

population_best_weight = np.load("../npy/CE_best_weights.npy")

classifier_num = 9

# 所有学习器都输出概率向量，最后投票
y_train_pred_proba_all = []
y_valid_pred_proba_all = []
y_test_pred_proba_all = []

# 取训练好的模型，计算各模型”验证集“上输出概率向量
for model in models:
    train_pred_proba = model.predict_proba(X_train2)
    valid_pred_proba = model.predict_proba(X_valid)
    test_pred_proba = model.predict_proba(X_test)
    y_train_pred_proba_all.append(train_pred_proba)
    y_valid_pred_proba_all.append(valid_pred_proba)
    y_test_pred_proba_all.append(test_pred_proba)
    
y_train_pred_ensemble_proba = np.zeros((len(y_train2), 6)) # 初始化集成器概率向量
y_valid_pred_ensemble_proba = np.zeros((len(y_valid), 6)) # 初始化集成器概率向量
y_test_pred_ensemble_proba = np.zeros((len(y_test), 6)) # 初始化集成器概率向量

# 为每一个基学习器乘上权重
for k in range(classifier_num):
    y_train_pred_ensemble_proba += y_train_pred_proba_all[k] * population_best_weight[k]
    y_valid_pred_ensemble_proba += y_valid_pred_proba_all[k] * population_best_weight[k]
    y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
y_train_pred_ensemble = np.argmax(y_train_pred_ensemble_proba, axis=1) + 1
y_valid_pred_ensemble = np.argmax(y_valid_pred_ensemble_proba, axis=1) + 1
y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1) + 1

# 计算各水质等级的得分
print("=================CE(1-9)=================")
train_cm = confusion_matrix(y_train2, y_train_pred_ensemble)
valid_cm = confusion_matrix(y_valid, y_valid_pred_ensemble)
test_cm = confusion_matrix(y_test, y_test_pred_ensemble)
i=0
train_acc_all = np.zeros(6)
for c in train_cm:
    train_acc_all[i] = c[i]/np.sum(c)
    print("%d train_acc: %.2f" %(i+1, 100*train_acc_all[i]))
    i=i+1
print("average: %.2f" % (100*np.mean(train_acc_all)))
i=0
valid_acc_all = np.zeros(6)
for c in valid_cm:
    valid_acc_all[i] = c[i]/np.sum(c)
    print("%d valid_acc: %.2f" %(i+1, 100*valid_acc_all[i]))
    i=i+1
print("average: %.2f" % (100*np.mean(valid_acc_all)))
i=0
test_acc_all = np.zeros(6)
for c in test_cm:
    test_acc_all[i] = c[i]/np.sum(c)
    print("%d test_acc: %.2f" %(i+1, 100*test_acc_all[i]))
    i=i+1
print("average: %.2f" % (100*np.mean(test_acc_all)))

1 train_acc: 99.45
2 train_acc: 99.99
3 train_acc: 99.94
4 train_acc: 99.91
5 train_acc: 99.66
6 train_acc: 99.84
average: 99.80
1 valid_acc: 99.18
2 valid_acc: 99.51
3 valid_acc: 99.43
4 valid_acc: 99.27
5 valid_acc: 98.99
6 valid_acc: 99.07
average: 99.24
1 test_acc: 99.39
2 test_acc: 97.63
3 test_acc: 98.01
4 test_acc: 97.07
5 test_acc: 93.27
6 test_acc: 98.83
average: 97.37


- 载入CE（1-10）的子模型覆盖之前的模型，计算10个子模型,标签0-5

In [10]:
y_train2 = y_train2-1
y_valid = y_valid-1
y_test = y_test-1

models = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier",
    "GCForest"
]

i=0
for name in models:
    model_path = "../pkl/CE_97661_10/CE_" + name + ".pkl"
    with open(model_path, "rb") as f:
        models[i] = pickle.load(f)
    i = i+1

In [11]:
for model in models:
    model_name = model.__class__.__name__
    train_pred = model.predict(X_train2)
    valid_pred = model.predict(X_valid)
    test_pred = model.predict(X_test)
    print("=================" + model_name + "=================")
    train_cm = confusion_matrix(y_train2, train_pred)
    valid_cm = confusion_matrix(y_valid, valid_pred)
    test_cm = confusion_matrix(y_test, test_pred)
    i=0
    train_acc_all = np.zeros(6)
    for c in train_cm:
        train_acc_all[i] = c[i]/np.sum(c)
        print("%d train_acc: %.2f" %(i+1, 100*train_acc_all[i]))
        i=i+1
    print("average: %.2f" % (100*np.mean(train_acc_all)))
    i=0
    valid_acc_all = np.zeros(6)
    for c in valid_cm:
        valid_acc_all[i] = c[i]/np.sum(c)
        print("%d valid_acc: %.2f" %(i+1, 100*valid_acc_all[i]))
        i=i+1
    print("average: %.2f" % (100*np.mean(valid_acc_all)))
    i=0
    test_acc_all = np.zeros(6)
    for c in test_cm:
        test_acc_all[i] = c[i]/np.sum(c)
        print("%d test_acc: %.2f" %(i+1, 100*test_acc_all[i]))
        i=i+1
    print("average: %.2f" % (100*np.mean(test_acc_all)))

1 train_acc: 3.08
2 train_acc: 99.21
3 train_acc: 57.22
4 train_acc: 45.05
5 train_acc: 0.00
6 train_acc: 77.02
average: 46.93
1 valid_acc: 3.07
2 valid_acc: 99.13
3 valid_acc: 58.50
4 valid_acc: 45.21
5 valid_acc: 0.00
6 valid_acc: 78.79
average: 47.45
1 test_acc: 5.74
2 test_acc: 99.51
3 test_acc: 55.17
4 test_acc: 48.08
5 test_acc: 0.00
6 test_acc: 80.19
average: 48.11
1 train_acc: 0.00
2 train_acc: 99.90
3 train_acc: 38.08
4 train_acc: 50.62
5 train_acc: 28.33
6 train_acc: 44.57
average: 43.58
1 valid_acc: 0.00
2 valid_acc: 99.85
3 valid_acc: 38.49
4 valid_acc: 51.42
5 valid_acc: 26.94
6 valid_acc: 45.45
average: 43.69
1 test_acc: 0.00
2 test_acc: 99.92
3 test_acc: 38.58
4 test_acc: 52.83
5 test_acc: 29.97
6 test_acc: 47.32
average: 44.77
1 train_acc: 64.64
2 train_acc: 93.62
3 train_acc: 89.20
4 train_acc: 90.07
5 train_acc: 87.35
6 train_acc: 95.73
average: 86.77
1 valid_acc: 64.34
2 valid_acc: 93.22
3 valid_acc: 88.74
4 valid_acc: 89.77
5 valid_acc: 78.45
6 valid_acc: 94.87
aver

[ 2019-02-11 16:31:14,501][cascade_classifier.transform] X_groups_test.shape=[(20166, 4)]
[ 2019-02-11 16:31:14,503][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 16:31:14,504][cascade_classifier.transform] X_test.shape=(20166, 4)
[ 2019-02-11 16:31:14,505][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(20166, 4)


1 train_acc: 84.06
2 train_acc: 96.51
3 train_acc: 93.43
4 train_acc: 94.21
5 train_acc: 85.89
6 train_acc: 94.88
average: 91.50
1 valid_acc: 77.46
2 valid_acc: 93.48
3 valid_acc: 89.31
4 valid_acc: 88.68
5 valid_acc: 70.03
6 valid_acc: 90.44
average: 84.90
1 test_acc: 76.84
2 test_acc: 93.15
3 test_acc: 88.75
4 test_acc: 89.58
5 test_acc: 81.82
6 test_acc: 93.94
average: 87.35
1 train_acc: 99.66
2 train_acc: 99.97
3 train_acc: 99.94
4 train_acc: 99.85
5 train_acc: 99.66
6 train_acc: 99.92
average: 99.84
1 valid_acc: 99.18
2 valid_acc: 99.51
3 valid_acc: 99.55
4 valid_acc: 99.27
5 valid_acc: 98.65
6 valid_acc: 99.30
average: 99.24
1 test_acc: 99.39
2 test_acc: 97.55
3 test_acc: 98.01
4 test_acc: 96.98
5 test_acc: 93.27
6 test_acc: 98.83
average: 97.34
1 train_acc: 100.00
2 train_acc: 100.00
3 train_acc: 100.00
4 train_acc: 100.00
5 train_acc: 100.00
6 train_acc: 100.00
average: 100.00
1 valid_acc: 98.36
2 valid_acc: 98.64
3 valid_acc: 97.56
4 valid_acc: 96.35
5 valid_acc: 87.88
6 valid

[ 2019-02-11 16:31:14,921][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:31:15,398][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:31:15,900][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:31:16,339][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:31:16,815][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:31:17,266][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]
[ 2019-02-11 16:31:17,267][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 16:31:17,268][cascade_classifier.transform] X_test.shape=(6723, 4)
[ 2019-02-11 16:31:17,269][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(6723, 4)
[ 2019-02-11 16:31:17,424][cascade_classifier.transform] [layer=1] look_indexs=[0],

1 train_acc: 99.66
2 train_acc: 99.95
3 train_acc: 99.83
4 train_acc: 99.97
5 train_acc: 100.00
6 train_acc: 100.00
average: 99.90
1 valid_acc: 99.39
2 valid_acc: 99.51
3 valid_acc: 99.49
4 valid_acc: 99.27
5 valid_acc: 98.99
6 valid_acc: 99.07
average: 99.29
1 test_acc: 99.39
2 test_acc: 97.59
3 test_acc: 98.01
4 test_acc: 96.98
5 test_acc: 93.27
6 test_acc: 99.07
average: 97.38


- 计算CE（1-10）

In [14]:
population_best_weight = np.load("../npy/CE_best_weights(1-10).npy")

classifier_num = 10

# 所有学习器都输出概率向量，最后投票
y_train_pred_proba_all = []
y_valid_pred_proba_all = []
y_test_pred_proba_all = []

# 取训练好的模型，计算各模型”验证集“上输出概率向量
for model in models:
    train_pred_proba = model.predict_proba(X_train2)
    valid_pred_proba = model.predict_proba(X_valid)
    test_pred_proba = model.predict_proba(X_test)
    y_train_pred_proba_all.append(train_pred_proba)
    y_valid_pred_proba_all.append(valid_pred_proba)
    y_test_pred_proba_all.append(test_pred_proba)
    
y_train_pred_ensemble_proba = np.zeros((len(y_train2), 6)) # 初始化集成器概率向量
y_valid_pred_ensemble_proba = np.zeros((len(y_valid), 6)) # 初始化集成器概率向量
y_test_pred_ensemble_proba = np.zeros((len(y_test), 6)) # 初始化集成器概率向量

# 为每一个基学习器乘上权重
for k in range(classifier_num):
    y_train_pred_ensemble_proba += y_train_pred_proba_all[k] * population_best_weight[k]
    y_valid_pred_ensemble_proba += y_valid_pred_proba_all[k] * population_best_weight[k]
    y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
y_train_pred_ensemble = np.argmax(y_train_pred_ensemble_proba, axis=1)
y_valid_pred_ensemble = np.argmax(y_valid_pred_ensemble_proba, axis=1)
y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1)

# 计算各水质等级的得分
print("=================CE(1-10)=================")
train_cm = confusion_matrix(y_train2, y_train_pred_ensemble)
valid_cm = confusion_matrix(y_valid, y_valid_pred_ensemble)
test_cm = confusion_matrix(y_test, y_test_pred_ensemble)
i=0
train_acc_all = np.zeros(6)
for c in train_cm:
    train_acc_all[i] = c[i]/np.sum(c)
    print("%d train_acc: %.2f" %(i+1, 100*train_acc_all[i]))
    i=i+1
print("average: %.2f" % (100*np.mean(train_acc_all)))
i=0
valid_acc_all = np.zeros(6)
for c in valid_cm:
    valid_acc_all[i] = c[i]/np.sum(c)
    print("%d valid_acc: %.2f" %(i+1, 100*valid_acc_all[i]))
    i=i+1
print("average: %.2f" % (100*np.mean(valid_acc_all)))
i=0
test_acc_all = np.zeros(6)
for c in test_cm:
    test_acc_all[i] = c[i]/np.sum(c)
    print("%d test_acc: %.2f" %(i+1, 100*test_acc_all[i]))
    i=i+1
print("average: %.2f" % (100*np.mean(test_acc_all)))

[ 2019-02-11 16:33:19,455][cascade_classifier.transform] X_groups_test.shape=[(20166, 4)]
[ 2019-02-11 16:33:19,458][cascade_classifier.transform] group_dims=[4]
[ 2019-02-11 16:33:19,459][cascade_classifier.transform] X_test.shape=(20166, 4)
[ 2019-02-11 16:33:19,461][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(20166, 4)
[ 2019-02-11 16:33:19,883][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:33:20,357][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:33:20,794][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:33:21,232][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:33:21,669][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(20166, 52)
[ 2019-02-11 16:33:22,114][cascade_classifier.transform] X_groups_test.shape=[(6

1 train_acc: 99.38
2 train_acc: 99.86
3 train_acc: 99.85
4 train_acc: 99.91
5 train_acc: 100.00
6 train_acc: 100.00
average: 99.83
1 valid_acc: 99.39
2 valid_acc: 99.51
3 valid_acc: 99.49
4 valid_acc: 99.27
5 valid_acc: 98.99
6 valid_acc: 99.07
average: 99.29
1 test_acc: 99.39
2 test_acc: 97.63
3 test_acc: 98.01
4 test_acc: 96.98
5 test_acc: 93.27
6 test_acc: 99.07
average: 97.39
