In [25]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import train_test_split  
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
from dateutil.parser import parse
import gc

In [26]:
def creatDtype():
    """Build the column-name -> dtype mapping handed to ``pd.read_csv``.

    Returns:
        dict: dtype strings keyed by column name:
            * ``id`` -> object, ``label`` -> int8, ``date`` -> int64, ``ndays`` -> uint8
            * ``f1``..``f4`` and ``f6``..``f19`` -> uint8
            * ``f5`` and ``f20``..``f297`` -> float32
    """
    dtype = {'id': 'object', 'label': 'int8', 'date': 'int64', 'ndays': 'uint8'}
    # Integer-coded feature columns.
    uint8_ids = list(range(1, 5)) + list(range(6, 20))
    dtype.update(('f' + str(n), 'uint8') for n in uint8_ids)
    # Floating-point feature columns.
    dtype['f5'] = 'float32'
    dtype.update(('f' + str(n), 'float32') for n in range(20, 298))
    return dtype

In [28]:
def appendlist(col,a,b):
    """Append the names ``f<a>``, ``f<a+1>``, ..., ``f<b>`` (both ends inclusive) to ``col`` in place.

    Args:
        col: list that receives the generated names.
        a: first feature number.
        b: last feature number; nothing is appended when ``b < a``.
    """
    col.extend("f" + str(n) for n in range(a, b + 1))

In [29]:
# Read the train and test CSVs using the explicit dtype map from creatDtype().
print("Loading data!")
column_dtypes = creatDtype()
train = pd.read_csv('atec_anti_fraud_train.csv', dtype=column_dtypes)
test = pd.read_csv('atec_anti_fraud_test_b.csv', dtype=column_dtypes)
print(train.shape, test.shape)

Loading data!
((994731, 300), (500538, 299))


In [30]:
# Stack train and test into a single frame; len_train records where the train rows end.
print("Concat train and test!")
test['label'] = 2  # placeholder label for the test rows (the test file has one column fewer than train)
len_train = len(train)
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
# ignore_index=True renumbers the rows 0..n-1, which is what the separate
# reset_index(drop=True) call did, without building an extra intermediate copy.
train = pd.concat([train, test], ignore_index=True)
print(train.shape)
del test
gc.collect()

Concat train and test!
(1495269, 300)


83

In [31]:
# Undo the concatenation by position: the first len_train rows are train, the rest are test.
print("Split data into train test when finish FE!")
test, train = train.iloc[len_train:], train.iloc[:len_train]
print(train.shape,test.shape)

Split data into train test when finish FE!
((994731, 300), (500538, 300))


In [32]:
# Choose the model's feature columns: everything except the bookkeeping columns,
# minus the feature-number ranges listed below.
print("Preparing remain columns!")
non_feature_cols = ['id', 'label', 'date', 'ndays']
pre = [name for name in train.columns if name not in non_feature_cols]
need_del = []
# Inclusive (first, last) feature-number ranges to drop.
for first, last in [(20, 27), (32, 35), (48, 53), (64, 71), (111, 165), (211, 233)]:
    appendlist(need_del, first, last)
pred = [name for name in pre if name not in need_del]
print("pred ", len(pred))

Preparing remain columns!
('pred ', 193)


In [33]:
# Reduce the training set: add a day offset, then filter on f28..f31 and on that offset.
print("Decline samples!")
# Day offset of each row's date from 2017-09-05. The reference date is parsed once
# instead of once per row.
reference_day = parse(str(20170905))
# assign() returns a new frame, so the new column is not written into a slice of the
# concatenated frame (which is what `train['ndays'] = ...` did).
train = train.assign(ndays=train['date'].apply(lambda x: (parse(str(x)) - reference_day).days))
# print(...) with a single argument behaves the same on Python 2 and 3, unlike `print x`.
print(train.shape)
# Keep rows where each of f28..f31 is <= 1 or missing.
# (The same filter on f24..f27 was disabled in the original notebook and stays disabled.)
for col in ['f28', 'f29', 'f30', 'f31']:
    train = train[(train[col] <= 1) | train[col].isnull()]
# Keep rows dated at most 40 days after the reference date.
train = train[train['ndays']<=40]
print(train.shape)
gc.collect()

Decline samples!
(994731, 301)
(643383, 301)


0

In [34]:
# Binarise the label: 0 stays 0 and every other value becomes 1.
print("Change -1 to 1!")
print(train.label.value_counts())
train['label'] = train['label'].ne(0).astype('int64')
print(train.label.value_counts())

Change -1 to 1!
 0    635627
 1      4734
-1      3022
Name: label, dtype: int64
0    635627
1      7756
Name: label, dtype: int64


In [46]:
# Booster parameters for the first training run (max_depth 3, eta 0.02).
xgbparams = dict(
    eta=0.02,
    tree_method="auto",
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.1,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    random_state=99,
    silent=True,
)

In [47]:
# First training run: 5-fold XGBoost with the current xgbparams. Each fold's test
# prediction is added to test['score'] with a weight of 1 / (2 * number of folds).
print("K-fold xgb training!")
# Build the training matrix and labels here so the cell runs on a fresh kernel.
# NOTE(review): no visible cell defined X or y. These definitions are inferred from the
# test-time features (test[pred]) and the binarised 'label' column. Confirm.
X = train[pred].values
y = train['label'].values
kf1 = KFold(n_splits=5, shuffle=True, random_state=998)
# Replaces the undefined name `kfold`. The factor 2 matches the two training runs that
# both accumulate into test['score'].
# NOTE(review): assumes `kfold` was the fold count. Confirm.
n_averaged = 2 * kf1.get_n_splits()
test['score'] = 0
# The test matrix does not change between folds, so build it once.
d_test = xgb.DMatrix(test[pred].values)
i = 0
for trn, tst in kf1.split(X):
    i += 1
    print("No ",i)
    trn_X, tst_X = X[trn], X[tst]
    trn_Y, tst_Y = y[trn], y[tst]
    d_train = xgb.DMatrix(trn_X, trn_Y)
    d_valid = xgb.DMatrix(tst_X, tst_Y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Up to 100000 rounds; stops once valid-auc has not improved for 100 rounds.
    xgb_model = xgb.train(xgbparams, d_train, 100000, watchlist, early_stopping_rounds=100,maximize=True, verbose_eval=100)
    # NOTE(review): the tree limit is kept at best_ntree_limit + 150 exactly as before,
    # which is past the best iteration. Confirm this is intended.
    test['score'] += xgb_model.predict(d_test, ntree_limit=xgb_model.best_ntree_limit+150) / n_averaged
gc.collect()

K-fold xgb training!
('No ', 1)
[0]	train-auc:0.866589	valid-auc:0.871669
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[100]	train-auc:0.94697	valid-auc:0.949504
[200]	train-auc:0.95334	valid-auc:0.95529
[300]	train-auc:0.960548	valid-auc:0.961602
[400]	train-auc:0.965653	valid-auc:0.966123
[500]	train-auc:0.96892	valid-auc:0.96898
[600]	train-auc:0.971324	valid-auc:0.971063
[700]	train-auc:0.972901	valid-auc:0.972498
[800]	train-auc:0.974102	valid-auc:0.973476
[900]	train-auc:0.97505	valid-auc:0.974207
[1000]	train-auc:0.975899	valid-auc:0.974871
[1100]	train-auc:0.976579	valid-auc:0.97539
[1200]	train-auc:0.97717	valid-auc:0.975767
[1300]	train-auc:0.977692	valid-auc:0.97608
[1400]	train-auc:0.978178	valid-auc:0.976331
[1500]	train-auc:0.978644	valid-auc:0.9766
[1600]	train-auc:0.979061	valid-auc:0.976798
[1700]	train-auc:0.979442	valid-auc:0.97695
[1800]	train-auc:0.979816	valid-auc:0.

[400]	train-auc:0.966566	valid-auc:0.960532
[500]	train-auc:0.969946	valid-auc:0.964252
[600]	train-auc:0.972178	valid-auc:0.966595
[700]	train-auc:0.973598	valid-auc:0.968056
[800]	train-auc:0.974681	valid-auc:0.969203
[900]	train-auc:0.975591	valid-auc:0.970121
[1000]	train-auc:0.976345	valid-auc:0.970785
[1100]	train-auc:0.977021	valid-auc:0.97147
[1200]	train-auc:0.977603	valid-auc:0.971981
[1300]	train-auc:0.978139	valid-auc:0.972401
[1400]	train-auc:0.978637	valid-auc:0.972807
[1500]	train-auc:0.979089	valid-auc:0.973191
[1600]	train-auc:0.979501	valid-auc:0.973444
[1700]	train-auc:0.979913	valid-auc:0.973731
[1800]	train-auc:0.980288	valid-auc:0.973937
[1900]	train-auc:0.980654	valid-auc:0.974175
[2000]	train-auc:0.980977	valid-auc:0.974299
[2100]	train-auc:0.981267	valid-auc:0.974383
[2200]	train-auc:0.98157	valid-auc:0.974537
[2300]	train-auc:0.981845	valid-auc:0.974714
[2400]	train-auc:0.982111	valid-auc:0.974811
[2500]	train-auc:0.982372	valid-auc:0.974878
[2600]	train-auc:0

28

In [48]:
# Booster parameters for the second training run (max_depth 8, eta 0.2).
xgbparams = dict(
    eta=0.2,
    tree_method="auto",
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.1,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    random_state=99,
    silent=True,
)

In [49]:
# Second training run: same 5-fold scheme with the current xgbparams. Predictions are
# added onto the existing test['score'], so X, y and test['score'] must already exist
# from the previous training cell.
print("K-fold xgb training!")
kf = KFold(n_splits=5, shuffle=True, random_state=998)
# Replaces the undefined name `kfold`. The factor 2 matches the two training runs that
# both accumulate into test['score'].
# NOTE(review): assumes `kfold` was the fold count. Confirm.
n_averaged = 2 * kf.get_n_splits()
# The test matrix does not change between folds, so build it once.
d_test = xgb.DMatrix(test[pred].values)
i = 0
for trn, tst in kf.split(X):
    i += 1
    print("No ",i)
    trn_X, tst_X = X[trn], X[tst]
    trn_Y, tst_Y = y[trn], y[tst]
    d_train = xgb.DMatrix(trn_X, trn_Y)
    d_valid = xgb.DMatrix(tst_X, tst_Y)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Up to 100000 rounds; stops once valid-auc has not improved for 100 rounds.
    xgb_model = xgb.train(xgbparams, d_train, 100000, watchlist, early_stopping_rounds=100,maximize=True, verbose_eval=100)
    # NOTE(review): the tree limit is kept at best_ntree_limit + 150 exactly as before,
    # which is past the best iteration. Confirm this is intended.
    test['score'] += xgb_model.predict(d_test, ntree_limit=xgb_model.best_ntree_limit+150) / n_averaged

K-fold xgb training!
('No ', 1)
[0]	train-auc:0.904146	valid-auc:0.911153
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[100]	train-auc:0.99394	valid-auc:0.978098
[200]	train-auc:0.997267	valid-auc:0.978685
Stopping. Best iteration:
[180]	train-auc:0.996938	valid-auc:0.979004

('No ', 2)
[0]	train-auc:0.905312	valid-auc:0.905185
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[100]	train-auc:0.993878	valid-auc:0.977893
[200]	train-auc:0.997329	valid-auc:0.979318
Stopping. Best iteration:
[185]	train-auc:0.997134	valid-auc:0.979615

('No ', 3)
[0]	train-auc:0.905311	valid-auc:0.901288
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[100]	train-auc:0.994159	valid-auc:0.977746
[200]	train-auc:0.997151	va

In [1]:
# Write the id and accumulated score columns of the test frame as the submission CSV.
submission = test[['id', 'score']]
submission.to_csv('sub_xgb_final.csv', index=False)

NameError: name 'test' is not defined

In [2]:
# Scratch value; not referenced by any other visible cell.
a = list(range(1, 4))

In [3]:
# Scratch value; not referenced by any other visible cell.
b = list(range(1, 2))

In [None]:
c = 