In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import pickle

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

In [2]:
def Smoter(X, y, is_random=False):
    if is_random == True:
        # random_lst = list(np.random.randint(0, 1000, 4))
        sm = SMOTE(random_state=random_seed)
    elif is_random == False:
        sm = SMOTE(random_state=0)

    # sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)

    return X_smote, y_smote

#### evaluate function

In [3]:
def evaluate(true, pred):
    # compute accuracy, precision and recall
    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(0, len(pred)):
        if pred[i] == true[i] and true[i] == 1:
            TP += 1
        elif pred[i] == true[i] and true[i] == 0:
            TN += 1
        elif pred[i] != true[i] and true[i] == 0:
            FP += 1
        elif pred[i] != true[i] and true[i] == 1:
            FN += 1

    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    accuracy = (TP+TN)/(TP+TN+FN+FP)
    
    print('TP=',TP,'FP=',FP,'TN=',TN,'FN=',FN)
    F1 = 2*precision*recall / (precision + recall)
    print("precision", precision,"\nrecall", recall,"\naccuracy", accuracy)
    print('F1=',F1)
    return F1, accuracy, precision, recall

#### Batch

combine serveral datas‘ features together

In [4]:
def Batch(X, y, size):
    batch_size = size

    X_trim = X
    y_trim = y

    if len(X) % batch_size != 0:
        extra_num = len(X) % batch_size
        X_trim = np.delete(X, range(len(X) - extra_num, len(X)), axis = 0)
        y_trim = np.delete(y, range(len(y) - extra_num, len(y)), axis = 0)

    X_batch = np.split(X_trim, len(X_trim)/batch_size)
    y_batch = np.split(y_trim, len(y_trim)/batch_size)

    num_batch = 0

    for each_batch in X_batch:
        X_batch[num_batch] = np.reshape(X_batch[num_batch], (9*batch_size))
        y_batch[num_batch] = y_batch[num_batch][-1]
        num_batch += 1

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    return X_batch, y_batch

#### gc_config

In [5]:
def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = random_seed
    ca_config["max_layers"] = 10
    ca_config["early_stopping_rounds"] = 3
    ca_config["n_classes"] = 2
    ca_config["estimators"] = []
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, 
                                    "type": "XGBClassifier", 
                                    "n_estimators": 1150, 
                                    "learning_rate": 0.1,
                                    "max_depth":4,
                                    "gamma":0,
                                    "subsample":0.8,
                                    "colsample_bytree":0.8,
                                    "objective":'binary:logistic',
                                    "scale_pos_weight":0.7999,
                                    "seed":random_seed,
                                    "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, 
                                    "type": "XGBClassifier", 
                                    "n_estimators": 1000, 
                                    "max_depth":4,
                                    "learning_rate": 0.1,
                                    "gamma":0,
                                    "subsample":0.8,
                                    "colsample_bytree":0.8,
                                    "objective":'binary:logistic',
                                    "scale_pos_weight":1,
                                    "seed":random_seed,
                                    "n_jobs": -1})
    config["cascade"] = ca_config
    return config

### HyperParameter

In [6]:
batch_size = 3
test_size = 0.2
random_seed = 42
cv = 5

# load train

In [7]:
data_all = pd.read_csv("../data/water/csv/train2019.csv")

X = data_all.values[:, 0:-1]
y = data_all.values[:, -1]

#### train_valid_split

In [8]:
print("============ train_valid_split ============")
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size=test_size, stratify=y, random_state=random_seed)
print("train: %d, valid: %d" %(X_train.shape[0], X_valid.shape[0]))

train: 105769, valid: 26443


### clean the data before somte

fulfill the Na with median, then standardized the data, output type ndarray

In [9]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
# X_valid = clean_pipeline.fit_transform(X_valid)



#### Do somte sampling on the train data to solve data imblance problem

In [10]:
# X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)
# print("============ SMOTE ============")
# print("train: %d, contains %.4f of 0 , after SMOTE: train: %d contains %.4f of 1" %(X_train.shape[0], (y_train == 0).sum()/y_train.shape[0], X_train_oversampled.shape[0], (y_train_oversampled == 0).sum()/y_train_oversampled.shape[0]))

In [11]:
# X_train_oversampled_batch, y_train_oversampled_batch = Batch(X_train_oversampled, y_train_oversampled, batch_size)
# X_train_batch, y_train_batch = Batch(X_train, y_train, batch_size)
# X_valid_batch, y_valid_batch = Batch(X_valid, y_valid, batch_size)

# GcForest

## test gc

# load 2019 Test datasets

In [12]:
# test = pd.read_csv("../data/water/csv/test2018.csv")

# X_test = test.values[:, 0:-1]
# y_test = test.values[:, -1]

# X_test = clean_pipeline.fit_transform(X_test)

In [13]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, stratify = y, random_state = random_seed)

# X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)
config = get_toy_config()
gc = GCForest(config)

gc.fit_transform(X_train, y_train, X_valid, y_valid)
# y_valid_pred = gc.predict(X_valid)

[ 2019-03-23 23:11:47,367][cascade_classifier.fit_transform] X_groups_train.shape=[(105769, 6)],y_train.shape=(105769,),X_groups_test.shape=[(26443, 6)],y_test.shape=(26443,)
[ 2019-03-23 23:11:47,376][cascade_classifier.fit_transform] group_dims=[6]
[ 2019-03-23 23:11:47,378][cascade_classifier.fit_transform] group_starts=[0]
[ 2019-03-23 23:11:47,381][cascade_classifier.fit_transform] group_ends=[6]
[ 2019-03-23 23:11:47,383][cascade_classifier.fit_transform] X_train.shape=(105769, 6),X_test.shape=(26443, 6)
[ 2019-03-23 23:11:47,396][cascade_classifier.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(105769, 6), X_cur_test.shape=(26443, 6)
[ 2019-03-23 23:11:48,493][kfold_wrapper.log_eval_metrics] F1 (layer_0 - estimator_0 - 5_folds.train_0.predict)=85.25%
[ 2019-03-23 23:11:49,485][kfold_wrapper.log_eval_metrics] F1 (layer_0 - estimator_0 - 5_folds.train_1.predict)=83.33%
[ 2019-03-23 23:11:50,322][kfold_wrapper.log_eval_metrics] F1 (layer_0 - estimator_0 - 5_folds.trai

[ 2019-03-23 23:30:15,890][kfold_wrapper.log_eval_metrics] F1 (layer_2 - estimator_2 - 5_folds.train_2.predict)=85.71%
[ 2019-03-23 23:30:55,392][kfold_wrapper.log_eval_metrics] F1 (layer_2 - estimator_2 - 5_folds.train_3.predict)=90.91%
[ 2019-03-23 23:31:33,368][kfold_wrapper.log_eval_metrics] F1 (layer_2 - estimator_2 - 5_folds.train_4.predict)=91.80%
[ 2019-03-23 23:31:33,657][kfold_wrapper.log_eval_metrics] F1 (layer_2 - estimator_2 - 5_folds.train_cv.predict)=91.41%
[ 2019-03-23 23:31:33,675][kfold_wrapper.log_eval_metrics] F1 (layer_2 - estimator_2 - 5_folds.test.predict)=0.32%
[ 2019-03-23 23:31:33,710][cascade_classifier.calc_f1] F1 (layer_2 - train.classifier_average)=91.69%
[ 2019-03-23 23:31:33,719][cascade_classifier.calc_f1] F1 (layer_2 - test.classifier_average)=0.32%
[ 2019-03-23 23:31:33,738][cascade_classifier.fit_transform] [layer=3] look_indexs=[0], X_cur_train.shape=(105769, 12), X_cur_test.shape=(26443, 12)
[ 2019-03-23 23:31:34,500][kfold_wrapper.log_eval_metrics

(array([[1.0000000e+00, 0.0000000e+00, 9.9999964e-01, 3.8453169e-07,
         9.9999976e-01, 2.1194627e-07],
        [1.0000000e+00, 0.0000000e+00, 9.9999845e-01, 1.5463777e-06,
         9.9999458e-01, 5.4086931e-06],
        [1.0000000e+00, 0.0000000e+00, 9.9998903e-01, 1.0966282e-05,
         9.9999839e-01, 1.6241576e-06],
        ...,
        [1.0000000e+00, 0.0000000e+00, 9.9999940e-01, 5.6761570e-07,
         9.9999183e-01, 8.1667431e-06],
        [1.0000000e+00, 0.0000000e+00, 9.9999535e-01, 4.6226655e-06,
         9.9998307e-01, 1.6925143e-05],
        [1.0000000e+00, 0.0000000e+00, 9.9999970e-01, 3.1187909e-07,
         9.9999934e-01, 6.6976378e-07]], dtype=float32),
 array([[0.48      , 0.52      , 0.97852594, 0.02147408, 0.9195851 ,
         0.08041488],
        [0.48      , 0.52      , 0.97852594, 0.02147408, 0.9195851 ,
         0.08041488],
        [0.48      , 0.52      , 0.97852594, 0.02147408, 0.9195851 ,
         0.08041488],
        ...,
        [0.48      , 0.52     

In [13]:
# dump
with open("../pkl/2019_gc.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)
    
# # load
# with open("../pkl/2018_gc.pkl", "rb") as f:
#     gc = pickle.load(f)