In [1]:
import os
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn import preprocessing
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score



#### somte sampling

In [2]:
def Smoter(X, y, is_random=False):
    if is_random == True:
        sm = SMOTE(random_state=random_seed)
    elif is_random == False:
        sm = SMOTE(random_state=0)
    X_smote, y_smote = sm.fit_sample(X, y)

    return X_smote, y_smote

#### evaluate function

In [3]:
def evaluate(true, pred):
    # compute accuracy, precision and recall
    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(0, len(pred)):
        if pred[i] == true[i] and true[i] == 1:
            TP += 1
        elif pred[i] == true[i] and true[i] == 0:
            TN += 1
        elif pred[i] != true[i] and true[i] == 0:
            FP += 1
        elif pred[i] != true[i] and true[i] == 1:
            FN += 1

    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    accuracy = (TP+TN)/(TP+TN+FN+FP)
    
    print('TP=',TP,'FP=',FP,'TN=',TN,'FN=',FN)
    F1 = 2*precision*recall / (precision + recall)
    print("precision", precision,"\nrecall", recall,"\naccuracy", accuracy)
    print('F1=',F1)
    return F1, accuracy, precision, recall

### HyperParameter

In [4]:
batch_size = 3
test_size = 0.33
random_seed = 42
cv = 5

# Load 2018 Train Set

In [5]:
data_all = pd.read_csv("../data/water/csv/train2018.csv")

X = data_all.values[:, 0:-1]
y = data_all.values[:, -1]

### clean the data

fulfill the Na with median, then standardized the data, output type ndarray

In [6]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                               ('std_scaler', preprocessing.StandardScaler()),])

# Load 2018 Test Set

In [7]:
test = pd.read_csv("../data/water/csv/test2018.csv")

X_test = test.values[:, 0:-1]
y_test = test.values[:, -1]

X_test = clean_pipeline.fit_transform(X_test)

In [8]:
print("样本不平衡 %f" %(np.sum(y==1)/len(y)))

样本不平衡 0.012460


In [9]:
# model = XGBClassifier(learning_rate=0.1,
#                       n_estimators=100,         # 树的个数--1000棵树建立xgboost
#                       max_depth=6,               # 树的深度
#                       min_child_weight = 1,      # 叶子节点最小权重
#                       gamma=0.,                  # 惩罚项中叶子结点个数前的参数
#                       subsample=0.8,             # 随机选择80%样本建立决策树
#                       colsample_btree=0.8,       # 随机选择80%特征建立决策树
#                       objective='binary:logistic', # 指定损失函数
#                       scale_pos_weight=90,        # 解决样本个数不平衡的问题
#                       random_state=random_seed            # 随机数
#                       )

model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)
xgb_acc_valid = []
xgb_precision_valid = []
xgb_recall_valid = []
xgb_f1_valid = []
entries = []

skf = StratifiedKFold(n_splits=cv, random_state=random_seed)
skf.get_n_splits(X, y)
for train_index, valid_index in skf.split(X, y):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    X_train = clean_pipeline.fit_transform(X_train)
    X_valid = clean_pipeline.fit_transform(X_valid)
    # X_train_smote, y_train_smote = Smoter(X_train, y_train, is_random = True)
    model.fit(X_train,
              y_train,
              eval_set = [(X_valid, y_valid)],
              early_stopping_rounds = 50,
              )
    y_valid_pred = model.predict(X_valid)
    xgb_acc_valid.append(accuracy_score(y_valid, y_valid_pred))
    xgb_precision_valid.append(precision_score(y_valid, y_valid_pred))
    xgb_recall_valid.append(recall_score(y_valid, y_valid_pred))
    xgb_f1_valid.append(f1_score(y_valid, y_valid_pred))
entries.append((np.mean(xgb_acc_valid), np.mean(xgb_precision_valid), np.mean(xgb_recall_valid), np.mean(xgb_f1_valid)))
xgb_df = pd.DataFrame(entries, columns=['valid_accuracy', 'valid_precision', 'valid_recall', 'valid_f1'])

[0]	validation_0-error:0.009168
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.008049
[2]	validation_0-error:0.008049
[3]	validation_0-error:0.007796
[4]	validation_0-error:0.007796
[5]	validation_0-error:0.007616
[6]	validation_0-error:0.007147
[7]	validation_0-error:0.007652
[8]	validation_0-error:0.007796
[9]	validation_0-error:0.007291
[10]	validation_0-error:0.007652
[11]	validation_0-error:0.007219
[12]	validation_0-error:0.007219
[13]	validation_0-error:0.007183
[14]	validation_0-error:0.006714
[15]	validation_0-error:0.006677
[16]	validation_0-error:0.006677
[17]	validation_0-error:0.006317
[18]	validation_0-error:0.006461
[19]	validation_0-error:0.006317
[20]	validation_0-error:0.006317
[21]	validation_0-error:0.006461
[22]	validation_0-error:0.006244
[23]	validation_0-error:0.005847
[24]	validation_0-error:0.00592
[25]	validation_0-error:0.00592
[26]	validation_0-error:0.005739
[27]	validation_0-error:0.005595
[28]	validation_0-erro

  if diff:


[0]	validation_0-error:0.019961
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.017723
[2]	validation_0-error:0.004584
[3]	validation_0-error:0.00444
[4]	validation_0-error:0.006064
[5]	validation_0-error:0.005234
[6]	validation_0-error:0.0061
[7]	validation_0-error:0.00527
[8]	validation_0-error:0.005378
[9]	validation_0-error:0.005378
[10]	validation_0-error:0.005378
[11]	validation_0-error:0.005378
[12]	validation_0-error:0.005378
[13]	validation_0-error:0.005378
[14]	validation_0-error:0.005487
[15]	validation_0-error:0.005487
[16]	validation_0-error:0.006461
[17]	validation_0-error:0.00527
[18]	validation_0-error:0.005559
[19]	validation_0-error:0.005198
[20]	validation_0-error:0.004656
[21]	validation_0-error:0.004584
[22]	validation_0-error:0.00462
[23]	validation_0-error:0.00462
[24]	validation_0-error:0.006822
[25]	validation_0-error:0.006858
[26]	validation_0-error:0.006822
[27]	validation_0-error:0.006786
[28]	validation_0-error:0.0

  if diff:


[0]	validation_0-error:0.014655
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.014655
[2]	validation_0-error:0.01729
[3]	validation_0-error:0.021008
[4]	validation_0-error:0.018553
[5]	validation_0-error:0.018156
[6]	validation_0-error:0.021405
[7]	validation_0-error:0.024653
[8]	validation_0-error:0.021405
[9]	validation_0-error:0.023895
[10]	validation_0-error:0.024653
[11]	validation_0-error:0.023895
[12]	validation_0-error:0.023895
[13]	validation_0-error:0.020611
[14]	validation_0-error:0.022849
[15]	validation_0-error:0.021585
[16]	validation_0-error:0.021621
[17]	validation_0-error:0.019311
[18]	validation_0-error:0.021549
[19]	validation_0-error:0.019059
[20]	validation_0-error:0.019023
[21]	validation_0-error:0.019059
[22]	validation_0-error:0.019311
[23]	validation_0-error:0.019528
[24]	validation_0-error:0.018986
[25]	validation_0-error:0.019275
[26]	validation_0-error:0.019889
[27]	validation_0-error:0.019961
[28]	validation_0-err

  if diff:


[0]	validation_0-error:0.002779
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.001263
[2]	validation_0-error:0.001263
[3]	validation_0-error:0.001263
[4]	validation_0-error:0.001263
[5]	validation_0-error:0.001263
[6]	validation_0-error:0.001263
[7]	validation_0-error:0.001263
[8]	validation_0-error:0.001263
[9]	validation_0-error:0.001263
[10]	validation_0-error:0.001263
[11]	validation_0-error:0.001263
[12]	validation_0-error:0.001263
[13]	validation_0-error:0.001263
[14]	validation_0-error:0.001263
[15]	validation_0-error:0.001263
[16]	validation_0-error:0.001263
[17]	validation_0-error:0.001263
[18]	validation_0-error:0.001263
[19]	validation_0-error:0.001263
[20]	validation_0-error:0.001263
[21]	validation_0-error:0.001263
[22]	validation_0-error:0.001263
[23]	validation_0-error:0.001263
[24]	validation_0-error:0.001263
[25]	validation_0-error:0.001263
[26]	validation_0-error:0.001263
[27]	validation_0-error:0.001263
[28]	validation_0-er

  if diff:


[0]	validation_0-error:0.006642
Will train until validation_0-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.011226
[2]	validation_0-error:0.009999
[3]	validation_0-error:0.009999
[4]	validation_0-error:0.009999
[5]	validation_0-error:0.009999
[6]	validation_0-error:0.009999
[7]	validation_0-error:0.009999
[8]	validation_0-error:0.009999
[9]	validation_0-error:0.009999
[10]	validation_0-error:0.009999
[11]	validation_0-error:0.009999
[12]	validation_0-error:0.009999
[13]	validation_0-error:0.009999
[14]	validation_0-error:0.009999
[15]	validation_0-error:0.009999
[16]	validation_0-error:0.009999
[17]	validation_0-error:0.009999
[18]	validation_0-error:0.009999
[19]	validation_0-error:0.009999
[20]	validation_0-error:0.009999
[21]	validation_0-error:0.009999
[22]	validation_0-error:0.009999
[23]	validation_0-error:0.009999
[24]	validation_0-error:0.009999
[25]	validation_0-error:0.009999
[26]	validation_0-error:0.009999
[27]	validation_0-error:0.009999
[28]	validation_0-er

  if diff:


In [None]:
xgb_df

Unnamed: 0,valid_accuracy,valid_precision,valid_recall,valid_f1
0,0.989799,0.737498,0.491867,0.544718


In [None]:
param_test1 = {
    'max_depth':np.arange(2,6,2),
}

gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, 
                                                  n_estimators=1000, 
                                                  gamma=0, 
                                                  subsample=0.8, 
                                                  colsample_bytree=0.8,
                                                  objective= 'binary:logistic', 
                                                  scale_pos_weight=1, 
                                                  seed=random_seed), 
                                                 param_grid = param_test1, scoring="f1", n_jobs=-1, cv=cv)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, stratify = y, random_state = random_seed)
gsearch1.fit(X_train, y_train)

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

  if diff:
  if diff:
  if diff:
  if diff:


In [None]:
best_clf = gsearch1.best_estimator_
best_clf.fit(X_train, y_train)

y_valid_pred = best_clf.predict(X_valid)

print("Valid f1: %f" %(f1_score(y_valid, y_valid_pred)))

In [None]:
print(gsearch1.best_estimator_)

In [None]:
y_test_pred = best_clf.predict(X_test)

print("Test f1: %f" %(f1_score(y_test, y_test_pred)))

In [None]:
y_valid_pred = best_clf.predict(X_valid)

print("Valid f1: %f" %(f1_score(y_valid, y_valid_pred)))