In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import pickle

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

In [2]:
def Smoter(X, y, is_random=False):
    if is_random == True:
        # random_lst = list(np.random.randint(0, 1000, 4))
        sm = SMOTE(random_state=random_seed)
    elif is_random == False:
        sm = SMOTE(random_state=0)

    # sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)

    return X_smote, y_smote

#### evaluate function

In [3]:
def evaluate(true, pred):
    # compute accuracy, precision and recall
    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(0, len(pred)):
        if pred[i] == true[i] and true[i] == 1:
            TP += 1
        elif pred[i] == true[i] and true[i] == 0:
            TN += 1
        elif pred[i] != true[i] and true[i] == 0:
            FP += 1
        elif pred[i] != true[i] and true[i] == 1:
            FN += 1

    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    accuracy = (TP+TN)/(TP+TN+FN+FP)
    
    print('TP=',TP,'FP=',FP,'TN=',TN,'FN=',FN)
    F1 = 2*precision*recall / (precision + recall)
    print("precision", precision,"\nrecall", recall,"\naccuracy", accuracy)
    print('F1=',F1)
    return F1, accuracy, precision, recall

#### Batch

combine serveral datas‘ features together

In [4]:
def Batch(X, y, size):
    batch_size = size

    X_trim = X
    y_trim = y

    if len(X) % batch_size != 0:
        extra_num = len(X) % batch_size
        X_trim = np.delete(X, range(len(X) - extra_num, len(X)), axis = 0)
        y_trim = np.delete(y, range(len(y) - extra_num, len(y)), axis = 0)

    X_batch = np.split(X_trim, len(X_trim)/batch_size)
    y_batch = np.split(y_trim, len(y_trim)/batch_size)

    num_batch = 0

    for each_batch in X_batch:
        X_batch[num_batch] = np.reshape(X_batch[num_batch], (9*batch_size))
        y_batch[num_batch] = y_batch[num_batch][-1]
        num_batch += 1

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    return X_batch, y_batch

#### gc_config

In [5]:
def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = random_seed
    ca_config["max_layers"] = 10
    ca_config["early_stopping_rounds"] = 3
    ca_config["n_classes"] = 2
    ca_config["estimators"] = []
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, 
                                    "type": "XGBClassifier", 
                                    "n_estimators": 1150, 
                                    "learning_rate": 0.1,
                                    "max_depth":4,
                                    "gamma":0,
                                    "subsample":0.8,
                                    "colsample_bytree":0.8,
                                    "objective":'binary:logistic',
                                    "scale_pos_weight":0.7999,
                                    "seed":random_seed,
                                    "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, 
                                    "type": "XGBClassifier", 
                                    "n_estimators": 1000, 
                                    "max_depth":4,
                                    "learning_rate": 0.1,
                                    "gamma":0,
                                    "subsample":0.8,
                                    "colsample_bytree":0.8,
                                    "objective":'binary:logistic',
                                    "scale_pos_weight":1,
                                    "seed":random_seed,
                                    "n_jobs": -1})
    config["cascade"] = ca_config
    return config

### HyperParameter

In [6]:
batch_size = 3
test_size = 0.2
random_seed = 42
cv = 5

# load train

In [7]:
data_all = pd.read_csv("../data/water/csv/train2019.csv")

X = data_all.values[:, 0:-1]
y = data_all.values[:, -1]

#### train_valid_split

In [8]:
print("============ train_valid_split ============")
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size=test_size, stratify=y, random_state=random_seed)
print("train: %d, valid: %d" %(X_train.shape[0], X_valid.shape[0]))

train: 105769, valid: 26443


### clean the data before somte

fulfill the Na with median, then standardized the data, output type ndarray

In [9]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_valid = clean_pipeline.fit_transform(X_valid)



#### Do somte sampling on the train data to solve data imblance problem

In [10]:
X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)
print("============ SMOTE ============")
print("train: %d, contains %.4f of 0 , after SMOTE: train: %d contains %.4f of 1" %(X_train.shape[0], (y_train == 0).sum()/y_train.shape[0], X_train_oversampled.shape[0], (y_train_oversampled == 0).sum()/y_train_oversampled.shape[0]))

train: 105769, contains 0.9984 of 0 , after SMOTE: train: 211204 contains 0.5000 of 1


In [11]:
# X_train_oversampled_batch, y_train_oversampled_batch = Batch(X_train_oversampled, y_train_oversampled, batch_size)
# X_train_batch, y_train_batch = Batch(X_train, y_train, batch_size)
# X_valid_batch, y_valid_batch = Batch(X_valid, y_valid, batch_size)

# GcForest

## test gc

# load 2019 Test datasets

In [12]:
# test = pd.read_csv("../data/water/csv/test2018.csv")

# X_test = test.values[:, 0:-1]
# y_test = test.values[:, -1]

# X_test = clean_pipeline.fit_transform(X_test)

In [13]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, stratify = y, random_state = random_seed)

# X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)
config = get_toy_config()
gc = GCForest(config)

gc.fit_transform(X_train_oversampled, y_train_oversampled, X_valid, y_valid)
# y_valid_pred = gc.predict(X_valid)

[ 2019-03-23 15:41:34,633][cascade_classifier.fit_transform] X_groups_train.shape=[(211204, 6)],y_train.shape=(211204,),X_groups_test.shape=[(26443, 6)],y_test.shape=(26443,)
[ 2019-03-23 15:41:34,650][cascade_classifier.fit_transform] group_dims=[6]
[ 2019-03-23 15:41:34,652][cascade_classifier.fit_transform] group_starts=[0]
[ 2019-03-23 15:41:34,656][cascade_classifier.fit_transform] group_ends=[6]
[ 2019-03-23 15:41:34,660][cascade_classifier.fit_transform] X_train.shape=(211204, 6),X_test.shape=(26443, 6)
[ 2019-03-23 15:41:34,686][cascade_classifier.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(211204, 6), X_cur_test.shape=(26443, 6)
[ 2019-03-23 15:41:37,152][kfold_wrapper.log_eval_metrics] F1 (layer_0 - estimator_0 - 5_folds.train_0.predict)=99.98%
[ 2019-03-23 15:41:39,170][kfold_wrapper.log_eval_metrics] F1 (layer_0 - estimator_0 - 5_folds.train_1.predict)=99.99%
[ 2019-03-23 15:41:41,222][kfold_wrapper.log_eval_metrics] F1 (layer_0 - estimator_0 - 5_folds.trai

[ 2019-03-23 16:03:55,565][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_1 - 5_folds.train_cv.predict)=99.99%
[ 2019-03-23 16:03:57,786][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_1 - 5_folds.test.predict)=83.72%
[ 2019-03-23 16:05:07,192][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_2 - 5_folds.train_0.predict)=99.99%
[ 2019-03-23 16:06:17,212][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_2 - 5_folds.train_1.predict)=99.99%
[ 2019-03-23 16:07:26,595][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_2 - 5_folds.train_2.predict)=100.00%
[ 2019-03-23 16:08:32,762][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_2 - 5_folds.train_3.predict)=99.99%
[ 2019-03-23 16:09:41,437][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_2 - 5_folds.train_4.predict)=99.99%
[ 2019-03-23 16:09:41,784][kfold_wrapper.log_eval_metrics] F1 (layer_1 - estimator_2 - 5_folds.train_cv.predict)=99.99%
[ 2019-03-23 16:09:41,795][kfold_wrapper.log_eva

[ 2019-03-23 16:22:53,051][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_0 - 5_folds.train_cv.predict)=99.99%
[ 2019-03-23 16:22:53,059][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_0 - 5_folds.test.predict)=77.50%
[ 2019-03-23 16:24:19,724][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_1 - 5_folds.train_0.predict)=99.99%
[ 2019-03-23 16:25:49,927][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_1 - 5_folds.train_1.predict)=99.99%
[ 2019-03-23 16:27:11,043][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_1 - 5_folds.train_2.predict)=99.99%
[ 2019-03-23 16:28:43,236][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_1 - 5_folds.train_3.predict)=100.00%
[ 2019-03-23 16:30:10,767][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_1 - 5_folds.train_4.predict)=99.99%
[ 2019-03-23 16:30:11,189][kfold_wrapper.log_eval_metrics] F1 (layer_3 - estimator_1 - 5_folds.train_cv.predict)=99.99%
[ 2019-03-23 16:30:11,199][kfold_wrapper.log_eva

[ 2019-03-23 16:49:21,943][kfold_wrapper.log_eval_metrics] F1 (layer_4 - estimator_2 - 5_folds.train_3.predict)=99.99%
[ 2019-03-23 16:50:38,783][kfold_wrapper.log_eval_metrics] F1 (layer_4 - estimator_2 - 5_folds.train_4.predict)=99.99%
[ 2019-03-23 16:50:39,174][kfold_wrapper.log_eval_metrics] F1 (layer_4 - estimator_2 - 5_folds.train_cv.predict)=99.99%
[ 2019-03-23 16:50:39,182][kfold_wrapper.log_eval_metrics] F1 (layer_4 - estimator_2 - 5_folds.test.predict)=74.70%
[ 2019-03-23 16:50:39,261][cascade_classifier.calc_f1] F1 (layer_4 - train.classifier_average)=99.99%
[ 2019-03-23 16:50:39,270][cascade_classifier.calc_f1] F1 (layer_4 - test.classifier_average)=74.70%
[ 2019-03-23 16:50:39,272][cascade_classifier.fit_transform] [Result][Optimal Level Detected] opt_layer_num=2, f1_train=99.99%, f1_test=83.15%


(array([[1.0000000e+00, 0.0000000e+00, 9.9999762e-01, 2.3572004e-06,
         9.9999553e-01, 4.4722751e-06],
        [1.0000000e+00, 0.0000000e+00, 9.9995077e-01, 4.9242321e-05,
         9.9998266e-01, 1.7364351e-05],
        [1.0000000e+00, 0.0000000e+00, 9.9999982e-01, 1.5563374e-07,
         9.9999905e-01, 9.7724114e-07],
        ...,
        [0.0000000e+00, 1.0000000e+00, 4.7683716e-07, 9.9999952e-01,
         1.5497208e-06, 9.9999845e-01],
        [0.0000000e+00, 1.0000000e+00, 1.1920929e-06, 9.9999881e-01,
         1.0728836e-06, 9.9999893e-01],
        [0.0000000e+00, 1.0000000e+00, 2.0265579e-06, 9.9999797e-01,
         9.5367432e-07, 9.9999905e-01]], dtype=float32),
 array([[1.00000000e+00, 0.00000000e+00, 9.99999881e-01, 1.08268885e-07,
         9.99999821e-01, 1.86877656e-07],
        [1.00000000e+00, 0.00000000e+00, 9.99997497e-01, 2.46044442e-06,
         9.99997616e-01, 2.40211511e-06],
        [1.00000000e+00, 0.00000000e+00, 9.99999404e-01, 6.25788005e-07,
         9.99

In [13]:
# dump
with open("../pkl/2019_gc.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
    
# # load
# with open("../pkl/2018_gc.pkl", "rb") as f:
#     gc = pickle.load(f)