In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest

hyperparameters

In [2]:
test_size = 0.33
random_seed = 42
cv=5
score = 'f1_weighted'
max_iteration = 100

# load data_four_features

In [3]:
path = os.getcwd()+'/../data/20122018freshwater_four_feature.csv'
data_four_features = pd.read_csv(path, na_values = np.nan)

print(data_four_features.dtypes)
print(data_four_features.shape)

pH             float64
DO(mg/l)       float64
CODMn(mg/l)    float64
NH3-N(mg/l)    float64
本周水质             int64
dtype: object
(33612, 5)


因为加入gcforest，gcforest默认的y是从0开始计数的，所以将原本从1开始计数的y统一减去1

In [4]:
X = data_four_features.drop(['本周水质'], axis=1) # Series
y = data_four_features['本周水质']-1 # Series

In [5]:
print("水质分布情况:")
print(y.value_counts())
print("\n各特征类型分布情况:")
print(data_four_features.dtypes.value_counts())

水质分布情况:
1    13272
2     8797
3     5472
0     2438
5     2146
4     1487
Name: 本周水质, dtype: int64

各特征类型分布情况:
float64    4
int64      1
dtype: int64


In [6]:
data_four_features.head()

Unnamed: 0,pH,DO(mg/l),CODMn(mg/l),NH3-N(mg/l),本周水质
0,7.09,10.0,5.7,0.33,3
1,6.94,12.0,5.4,0.4,3
2,7.2,9.6,4.9,0.34,3
3,6.8,11.6,6.3,0.59,4
4,6.75,11.0,6.2,0.64,4


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                       stratify = y, random_state = random_seed)

### normalize  train data

fulfill the Na with median, then standardized the data, output type ndarray

In [8]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_test = clean_pipeline.fit_transform(X_test)

## TEST CE

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, 
                                       stratify = y_train, random_state = random_seed)

In [10]:
models = [
    "LogisticRegression",
    "LinearDiscriminantAnalysis",
    "SVC",
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "RandomForestClassifier",
    "ExtraTreesClassifier",
    "GCForest"
]


y_pred_proba_all = []

import pickle
i=0
for model in models:
    model_name = model
    model_path = "../pkl/CE_97661_10/CE_" + model_name + ".pkl"
    with open(model_path, "rb") as f:
        models[i] = pickle.load(f)
    y_pred_proba = models[i].predict_proba(X_valid)
    y_pred = models[i].predict(X_valid)
    print("%s, valid weighted f1 score:%f" %(model_name, f1_score(y_valid, y_pred, average="weighted")))
    y_pred_proba_all.append(y_pred_proba)
    i = i+1

  'precision', 'predicted', average, warn_for)


LogisticRegression, valid weighted f1 score:0.618051
LinearDiscriminantAnalysis, valid weighted f1 score:0.569881
SVC, valid weighted f1 score:0.886775
DecisionTreeClassifier, valid weighted f1 score:0.987514
ExtraTreeClassifier, valid weighted f1 score:0.895863
GaussianNB, valid weighted f1 score:0.812976
KNeighborsClassifier, valid weighted f1 score:0.891579
RandomForestClassifier, valid weighted f1 score:0.994051
ExtraTreesClassifier, valid weighted f1 score:0.973277


[ 2018-12-21 00:06:43,500][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]
[ 2018-12-21 00:06:43,502][cascade_classifier.transform] group_dims=[4]
[ 2018-12-21 00:06:43,504][cascade_classifier.transform] X_test.shape=(6723, 4)
[ 2018-12-21 00:06:43,507][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(6723, 4)
[ 2018-12-21 00:06:43,743][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:43,973][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:44,190][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:44,401][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:44,617][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:44,857][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]

GCForest, valid weighted f1 score:0.994052


测试集

In [11]:
population_best_weight = np.load("../npy/CE_best_weights(1-10).npy")

classifier_num = 10

In [12]:
y_test_pred_proba_all = []
for model in models:
    model_name = model.__class__.__name__
    y_test_pred_proba = model.predict_proba(X_test)
    y_test_pred = model.predict(X_test)
    print("%s, test weighted f1 score:%f" %(model_name, f1_score(y_test, y_test_pred, average="weighted")))
    y_test_pred_proba_all.append(y_test_pred_proba)
    
    
y_test_pred_ensemble_proba = np.zeros((len(y_test), 6)) # 集成器概率向量

# 为每一个基学习器乘上权重
for k in range(classifier_num):
    y_test_pred_ensemble_proba += y_test_pred_proba_all[k] * population_best_weight[k]
y_test_pred_ensemble = np.argmax(y_test_pred_ensemble_proba, axis=1)

print("NCE")
print(classification_report(y_test, y_test_pred_ensemble, digits=4))

cm = confusion_matrix(y_test, y_test_pred_ensemble)
i=0
acc_all = np.zeros(6)
for c in cm:
    acc_all[i] = c[i]/np.sum(c)
    print("%d accuaracy: %f" %(i+1, acc_all[i]))
    i=i+1
print("acc:", np.sum(y_test == y_test_pred_ensemble)/y_test_pred_ensemble.shape[0])
print('f1_weighted', f1_score(y_test, y_test_pred_ensemble, average='weighted'))

LogisticRegression, test weighted f1 score:0.620574
LinearDiscriminantAnalysis, test weighted f1 score:0.576099


  'precision', 'predicted', average, warn_for)


SVC, test weighted f1 score:0.894545
DecisionTreeClassifier, test weighted f1 score:0.970527
ExtraTreeClassifier, test weighted f1 score:0.895450
GaussianNB, test weighted f1 score:0.822296
KNeighborsClassifier, test weighted f1 score:0.897799
RandomForestClassifier, test weighted f1 score:0.976184
ExtraTreesClassifier, test weighted f1 score:0.962632

[ 2018-12-21 00:06:48,780][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]
[ 2018-12-21 00:06:48,781][cascade_classifier.transform] group_dims=[4]
[ 2018-12-21 00:06:48,782][cascade_classifier.transform] X_test.shape=(6723, 4)
[ 2018-12-21 00:06:48,783][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(6723, 4)
[ 2018-12-21 00:06:48,977][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:49,197][cascade_classifier.transform] [layer=2] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:49,407][cascade_classifier.transform] [layer=3] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:49,614][cascade_classifier.transform] [layer=4] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:49,851][cascade_classifier.transform] [layer=5] look_indexs=[0], X_cur_test.shape=(6723, 52)
[ 2018-12-21 00:06:50,097][cascade_classifier.transform] X_groups_test.shape=[(6723, 4)]


GCForest, test weighted f1 score:0.976472
NCE
             precision    recall  f1-score   support

          0     0.9099    0.9939    0.9500       488
          1     0.9927    0.9763    0.9844      2655
          2     0.9835    0.9801    0.9818      1760
          3     0.9779    0.9698    0.9738      1094
          4     0.9327    0.9327    0.9327       297
          5     0.9594    0.9907    0.9748       429

avg / total     0.9771    0.9765    0.9766      6723

1 accuaracy: 0.993852
2 accuaracy: 0.976271
3 accuaracy: 0.980114
4 accuaracy: 0.969835
5 accuaracy: 0.932660
6 accuaracy: 0.990676
acc: 0.976498586940354
f1_weighted 0.9766154358913746


In [23]:
np.around(population_best_weight/np.sum(population_best_weight), 3)

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.379, 0.   , 0.   , 0.   ,
       0.62 ])