In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import pickle

import sys
sys.path.append("..") 
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json

#### somte sampling

In [2]:
def Smoter(X, y, is_random=False):
    if is_random == True:
        # random_lst = list(np.random.randint(0, 1000, 4))
        sm = SMOTE(random_state=random_seed)
    elif is_random == False:
        sm = SMOTE(random_state=0)

    # sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)

    return X_smote, y_smote

#### evaluate function

In [3]:
def evaluate(true, pred):
    # compute accuracy, precision and recall
    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(0, len(pred)):
        if pred[i] == true[i] and true[i] == 1:
            TP += 1
        elif pred[i] == true[i] and true[i] == 0:
            TN += 1
        elif pred[i] != true[i] and true[i] == 0:
            FP += 1
        elif pred[i] != true[i] and true[i] == 1:
            FN += 1

    precision = TP/(TP + FP)
    recall = TP/(TP + FN)
    accuracy = (TP+TN)/(TP+TN+FN+FP)
    
    print('TP=',TP,'FP=',FP,'TN=',TN,'FN=',FN)
    F1 = 2*precision*recall / (precision + recall)
    print("precision", precision,"\nrecall", recall,"\naccuracy", accuracy)
    print('F1=',F1)
    return F1, accuracy, precision, recall

#### Batch

combine serveral datasâ€˜ features together

In [4]:
def Batch(X, y, size):
    batch_size = size

    X_trim = X
    y_trim = y

    if len(X) % batch_size != 0:
        extra_num = len(X) % batch_size
        X_trim = np.delete(X, range(len(X) - extra_num, len(X)), axis = 0)
        y_trim = np.delete(y, range(len(y) - extra_num, len(y)), axis = 0)

    X_batch = np.split(X_trim, len(X_trim)/batch_size)
    y_batch = np.split(y_trim, len(y_trim)/batch_size)

    num_batch = 0

    for each_batch in X_batch:
        X_batch[num_batch] = np.reshape(X_batch[num_batch], (9*batch_size))
        y_batch[num_batch] = y_batch[num_batch][-1]
        num_batch += 1

    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    return X_batch, y_batch

#### gc_config

In [5]:
def get_toy_config():
    config = {}
    ca_config = {}
    ca_config["random_state"] = random_seed
    ca_config["max_layers"] = 10
    ca_config["early_stopping_rounds"] = 3
    ca_config["n_classes"] = 2
    ca_config["estimators"] = []
    ca_config["estimators"].append({"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1})
    ca_config["estimators"].append({"n_folds": 5, "type": "LogisticRegression"})
    config["cascade"] = ca_config
    return config

### HyperParameter

In [6]:
batch_size = 3
test_size = 0.33
random_seed = 42

# load train

In [7]:
df = pd.read_table('../data/water/txt/2018waterDataTraining.txt',delim_whitespace=True)

In [8]:
df = df.reset_index()
Time = np.zeros(df.shape[0]).astype("str")
for i in range(len(df)):
    Time[i] = df['index'][i]+" "+ df['Time'][i]
df['Time'] = Time
df = df.drop(['index'], axis=1)

# feature engineering


It looks like we have 14 columns to help us predict our classification. We will drop fnlwgt and education and then convert our categorical features to dummy variables. We will also convert our label to 0 and 1 where 1 means the person made more than $50k



In [9]:
drop_columns = ['Time']
continuous_features = ['Tp', 'Cl', 'pH', 'Redox', 'Leit', 'Trueb', 'Cl_2', 'Fm', 'Fm_2']
cat_features =[]

In [10]:
all_df_dummies = pd.get_dummies(df, columns=cat_features)

In [11]:
all_df_dummies.drop(drop_columns, 1, inplace=True)
# delte NA datas
all_df_dummies = all_df_dummies.dropna(axis=0)

In [12]:
X = all_df_dummies.drop(['EVENT'], axis=1) # Series
y = all_df_dummies['EVENT'].apply(lambda x: 0 if x == False else 1) # Series

In [13]:
data_all = pd.concat([X,y], axis=1)

In [14]:
data_all.head()

Unnamed: 0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
0,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1677.0,695.0,0
1,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1561.0,696.0,0
2,6.5,0.17,8.35,749.0,211.0,0.011,0.117,1581.0,696.0,0
3,6.5,0.17,8.35,749.0,211.0,0.011,0.118,1579.0,693.0,0
4,6.5,0.17,8.35,749.0,211.0,0.011,0.118,1567.0,689.0,0


#### layer sampling

In [15]:
array = data_all.values
X = array[:, 0:-1] # ndarray
y = array[:, -1] # ndarray

#### train_valid_split

In [16]:
print("============ train_valid_split ============")
X_train, X_valid, y_train, y_valid = train_test_split(X, y,test_size=test_size, stratify=y, random_state=random_seed)
print("train: %d, valid: %d" %(X_train.shape[0], X_valid.shape[0]))

train: 92809, valid: 45712


### clean the data before somte

fulfill the Na with median, then standardized the data, output type ndarray

In [17]:
clean_pipeline = Pipeline([('imputer', preprocessing.Imputer(missing_values='NaN',strategy="median")),
                           ('std_scaler', preprocessing.StandardScaler()),])
X_train = clean_pipeline.fit_transform(X_train)
X_valid = clean_pipeline.fit_transform(X_valid)

#### Do somte sampling on the train data to solve data imblance problem

In [18]:
X_train_oversampled, y_train_oversampled = Smoter(X_train, y_train, is_random=True)
print("============ SMOTE ============")
print("train: %d, contains %.4f of 0 , after SMOTE: train: %d contains %.4f of 1" %(X_train.shape[0], (y_train == 0).sum()/y_train.shape[0], X_train_oversampled.shape[0], (y_train_oversampled == 0).sum()/y_train_oversampled.shape[0]))

train: 92809, contains 0.9875 of 0 , after SMOTE: train: 183306 contains 0.5000 of 1


In [19]:
X_train_oversampled_batch, y_train_oversampled_batch = Batch(X_train_oversampled, y_train_oversampled, batch_size)
X_train_batch, y_train_batch = Batch(X_train, y_train, batch_size)
X_valid_batch, y_valid_batch = Batch(X_valid, y_valid, batch_size)

# GcForest

## train gc

#### 1.train GcForest on oversampled datasets

In [20]:
config = get_toy_config()
gc = GCForest(config)

X_train_enc = gc.fit_transform(X_train_oversampled, y_train_oversampled)

[ 2018-11-28 16:49:20,081][cascade_classifier.fit_transform] X_groups_train.shape=[(183306, 9)],y_train.shape=(183306,),X_groups_test.shape=no_test,y_test.shape=no_test
[ 2018-11-28 16:49:20,097][cascade_classifier.fit_transform] group_dims=[9]
[ 2018-11-28 16:49:20,099][cascade_classifier.fit_transform] group_starts=[0]
[ 2018-11-28 16:49:20,102][cascade_classifier.fit_transform] group_ends=[9]
[ 2018-11-28 16:49:20,103][cascade_classifier.fit_transform] X_train.shape=(183306, 9),X_test.shape=(0, 9)
[ 2018-11-28 16:49:20,120][cascade_classifier.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(183306, 9), X_cur_test.shape=(0, 9)
[ 2018-11-28 16:49:21,229][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 5_folds.train_0.predict)=99.98%
[ 2018-11-28 16:49:22,309][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 5_folds.train_1.predict)=99.98%
[ 2018-11-28 16:49:23,277][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 5_folds.tra

[ 2018-11-28 16:49:45,367][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_1.predict)=99.99%
[ 2018-11-28 16:49:45,817][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_2.predict)=99.99%
[ 2018-11-28 16:49:46,272][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_3.predict)=99.99%
[ 2018-11-28 16:49:46,725][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_4.predict)=99.99%
[ 2018-11-28 16:49:46,733][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_cv.predict)=99.99%
[ 2018-11-28 16:49:47,242][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_1 - 5_folds.train_0.predict)=99.99%
[ 2018-11-28 16:49:47,695][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_1 - 5_folds.train_1.predict)=99.99%
[ 2018-11-28 16:49:48,148][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_1 - 5_folds.train_2.predict)=99.99%
[ 2018-

In [21]:
# dump
with open("../pkl/2018_test.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

In [22]:
# load
with open("../pkl/2018_test.pkl", "rb") as f:
    gc = pickle.load(f)

#### test GcForest on valid datasets

In [23]:
y_valid_pred = gc.predict(X_valid)

y_valid_nonezero = np.count_nonzero(y_valid)
y_valid_pred_nonezero = np.count_nonzero(y_valid_pred)

print("y_valid, 1 contains: ", y_valid_nonezero/len(y_valid))
print("y_valid_pred, 1 contains: ", y_valid_pred_nonezero/len(y_valid_pred))

print("============= 2018 datasets' results on valid =============")
gc_f1, gc_accraucy, gc_precision, gc_recall = evaluate(y_valid, y_valid_pred)

[ 2018-11-28 16:50:02,340][cascade_classifier.transform] X_groups_test.shape=[(45712, 9)]
[ 2018-11-28 16:50:02,345][cascade_classifier.transform] group_dims=[9]
[ 2018-11-28 16:50:02,347][cascade_classifier.transform] X_test.shape=(45712, 9)
[ 2018-11-28 16:50:02,352][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(45712, 9)
[ 2018-11-28 16:50:03,435][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(45712, 15)


y_valid, 1 contains:  0.012469373468673434
y_valid_pred, 1 contains:  0.012469373468673434
TP= 559 FP= 11 TN= 45131 FN= 11
precision 0.980701754385965 
recall 0.980701754385965 
accuracy 0.9995187259362969
F1= 0.980701754385965


# load 2018 Test datasets

In [24]:
lines = open("../data/water/txt/2018waterDataTesting.txt").readlines()
num_lines = len(lines) - 1

X_test = np.ones((num_lines, 9))
y_test = np.ones((num_lines, 1))
flag = 0

lines = np.delete(lines, 0, axis = 0)
i = 0

for line in lines:
    data_line = line.split()
    feature = data_line[2:11]
    for k in range(9):
        if feature[k] == 'NA':
            flag = 1
            break
    if flag == 1:
        flag = 0
        continue    # jump out of the loop
    X_test[i] = feature    
    if data_line[11] == 'FALSE':
        y_test[i] = 0
    elif data_line[11] == 'TRUE':
        y_test[i] = 1
    i += 1
    
X_test = clean_pipeline.transform(X_test) 

#### test gcForest on 2018 Test datasets

In [25]:
y_test_pred = gc.predict(X_test)

y_test_nonezero = np.count_nonzero(y_test)
y_test_pred_nonezero = np.count_nonzero(y_test_pred)

print("y_test: {:d}, 1 contains: {:6f}".format(len(y_test), y_test_nonezero/len(y_test)))
print("y_test_pred: {:d}, 1 contains: {:6f}".format(len(y_test_pred), y_test_pred_nonezero/len(y_test_pred)))


print("============= 2018 datasets' results on test =============")
gc_f1, gc_accraucy, gc_precision, gc_recall = evaluate(y_test, y_test_pred)

[ 2018-11-28 16:50:05,748][cascade_classifier.transform] X_groups_test.shape=[(139566, 9)]
[ 2018-11-28 16:50:05,751][cascade_classifier.transform] group_dims=[9]
[ 2018-11-28 16:50:05,752][cascade_classifier.transform] X_test.shape=(139566, 9)
[ 2018-11-28 16:50:05,755][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(139566, 9)
[ 2018-11-28 16:50:06,918][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(139566, 15)


y_test: 139566, 1 contains: 0.192088
y_test_pred: 139566, 1 contains: 0.179220
TP= 25013 FP= 0 TN= 112757 FN= 1796
precision 1.0 
recall 0.9330075720840016 
accuracy 0.9871315363340641
F1= 0.9653429045579097


#### 2. train GcForest on batched datasets

In [26]:
X_train_batch_enc = gc.fit_transform(X_train_batch, y_train_batch)

[ 2018-11-28 16:50:08,750][cascade_classifier.fit_transform] X_groups_train.shape=[(30936, 27)],y_train.shape=(30936,),X_groups_test.shape=no_test,y_test.shape=no_test
[ 2018-11-28 16:50:08,752][cascade_classifier.fit_transform] group_dims=[27]
[ 2018-11-28 16:50:08,754][cascade_classifier.fit_transform] group_starts=[0]
[ 2018-11-28 16:50:08,755][cascade_classifier.fit_transform] group_ends=[27]
[ 2018-11-28 16:50:08,756][cascade_classifier.fit_transform] X_train.shape=(30936, 27),X_test.shape=(0, 27)
[ 2018-11-28 16:50:08,759][cascade_classifier.fit_transform] [layer=0] look_indexs=[0], X_cur_train.shape=(30936, 27), X_cur_test.shape=(0, 27)
[ 2018-11-28 16:50:09,119][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 5_folds.train_0.predict)=99.81%
[ 2018-11-28 16:50:09,452][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 5_folds.train_1.predict)=99.84%
[ 2018-11-28 16:50:09,783][kfold_wrapper.log_eval_metrics] Accuracy(layer_0 - estimator_0 - 5_folds.

[ 2018-11-28 16:50:20,625][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_1.predict)=99.90%
[ 2018-11-28 16:50:20,952][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_2.predict)=99.89%
[ 2018-11-28 16:50:21,284][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_3.predict)=99.87%
[ 2018-11-28 16:50:21,613][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_4.predict)=99.89%
[ 2018-11-28 16:50:21,615][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_0 - 5_folds.train_cv.predict)=99.88%
[ 2018-11-28 16:50:21,949][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_1 - 5_folds.train_0.predict)=99.92%
[ 2018-11-28 16:50:22,287][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_1 - 5_folds.train_1.predict)=99.87%
[ 2018-11-28 16:50:22,523][kfold_wrapper.log_eval_metrics] Accuracy(layer_3 - estimator_1 - 5_folds.train_2.predict)=99.82%
[ 2018-

In [None]:
# dump
with open("../pkl/2018_test_batch.pkl", "wb") as f:
    pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# load
with open("../pkl/2018_test_batch.pkl", "rb") as f:
    gc = pickle.load(f)

### test GcForest on batched valid datasets

In [27]:
y_valid_pred = gc.predict(X_valid_batch)
print("============= 2018 datasets' results on %d batched valid =============" %(batch_size))
gc_f1, gc_accraucy, gc_precision, gc_recall = evaluate(y_valid_batch, y_valid_pred)

[ 2018-11-28 16:50:28,048][cascade_classifier.transform] X_groups_test.shape=[(15237, 27)]
[ 2018-11-28 16:50:28,051][cascade_classifier.transform] group_dims=[27]
[ 2018-11-28 16:50:28,052][cascade_classifier.transform] X_test.shape=(15237, 27)
[ 2018-11-28 16:50:28,059][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(15237, 27)
[ 2018-11-28 16:50:29,122][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(15237, 33)


TP= 167 FP= 3 TN= 15057 FN= 10
precision 0.9823529411764705 
recall 0.943502824858757 
accuracy 0.999146813677233
F1= 0.9625360230547549


#### test gcForest on 2018 batched Test datasets

In [28]:
X_test_batch, y_test_batch = Batch(X_test, y_test, batch_size)

y_test_pred = gc.predict(X_test_batch)
print("============= 2018 datasets' results on %d batched test =============" %(batch_size))
gc_f1, gc_accraucy, gc_precision, gc_recall = evaluate(y_test_batch, y_test_pred)

[ 2018-11-28 16:50:30,534][cascade_classifier.transform] X_groups_test.shape=[(46522, 27)]
[ 2018-11-28 16:50:30,541][cascade_classifier.transform] group_dims=[27]
[ 2018-11-28 16:50:30,542][cascade_classifier.transform] X_test.shape=(46522, 27)
[ 2018-11-28 16:50:30,549][cascade_classifier.transform] [layer=0] look_indexs=[0], X_cur_test.shape=(46522, 27)
[ 2018-11-28 16:50:31,663][cascade_classifier.transform] [layer=1] look_indexs=[0], X_cur_test.shape=(46522, 33)


TP= 8474 FP= 61 TN= 37523 FN= 464
precision 0.9928529584065612 
recall 0.9480868203177445 
accuracy 0.9887150165513091
F1= 0.9699536427631202
