In [1]:
import os
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy.stats import entropy, kurtosis
import xgboost as xgb
import seaborn as sns
from xgboost import plot_importance
import warnings
import matplotlib.pyplot as plt
import pandas as pd
from math import *
import numpy as np

pd.set_option('display.max_columns', None)
from IPython.display import display
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
pathf = os.path.join("..", "data", "particles")

trainpd = pd.read_csv(os.path.join(pathf, "train.csv"))
print(trainpd.head(1))
trainshape = trainpd.shape
print(trainshape)
eventpd = pd.read_csv(os.path.join(pathf, "event.csv"))
print(eventpd.head(1))
print(eventpd.shape)
testpd = pd.read_csv(os.path.join(pathf, "test.csv"))
testshape = testpd.shape
print(testpd.head(1))
print(testpd.shape)

data = pd.concat([trainpd, testpd], ignore_index=True)
data = pd.merge(data, eventpd, on='event_id', how='left')

       x      y  z        t   terror        q  flag  event_id  hit_id
0 -142.5 -147.5  0  767.879  2.02966  1.05052     0         7       1
(9473201, 9)
   event_id  nhit  nhitreal  energymc  thetamc    phimc   xcmc    ycmc
0         7   426        70   48348.9  63.1686  11.0982 -40.83  114.03
(13315, 8)
       x      y  z        t  terror        q  event_id  hit_id
0 -142.5 -127.5  0  848.061  1.9984  1.15067         9       1
(4086511, 8)


In [3]:
#组合特征 
#train表的特征与event表的特征交互
data['fx'] = data['x']-data['xcmc']
data['fy'] = data['y']-data['ycmc']
data['fdis'] = np.sqrt(data['fx']**2+data['fy']**2)
data['fscala'] = np.sin(data['thetamc']) * data['t']
data['fphi'] = np.arctan2(data['fy'], data['fx']) * 180
data['fttrue'] = data['t']/data['terror']

data['nhitratio'] = data['nhit']/data['nhitreal']
del data['fx']
del data['fy']
del data['x']
del data['y']
del data['z']

In [4]:
print(trainshape[0])
print(type(trainshape[0]))
print(data.shape)

trainpd = data[:trainshape[0]].reset_index()
testpd = data[trainshape[0]:].reset_index()
del data

9473201
<class 'int'>
(13559712, 18)


In [5]:
print(trainpd.columns)
feature= [x for x in trainpd.columns if x not in ['flag','index','hit_id','event_id']]
labels = trainpd['flag']
del trainpd['flag']
del testpd['flag']

Index(['index', 'event_id', 'flag', 'hit_id', 'q', 't', 'terror', 'nhit',
       'nhitreal', 'energymc', 'thetamc', 'phimc', 'xcmc', 'ycmc', 'fdis',
       'fscala', 'fphi', 'fttrue', 'nhitratio'],
      dtype='object')


In [6]:
#这里用的gpu版的xgb，cpu的话实在太慢了，建议装个gpu版
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=4399)
y_pp_xgb = np.zeros(testshape[0])
y_pp_xgb_stacking = np.zeros(len(labels))
for train_index, test_index in kf.split(trainpd):
    print ( ">>>", train_index )
    clf = xgb.XGBClassifier(tree_method='gpu_hist',max_depth=8,learning_rate=0.1,verbosity=3,
                           eval_metric='auc',n_estimators=2000)
    clf.fit(
        trainpd[feature].iloc[train_index], labels[train_index],
        eval_set=[(trainpd[feature].iloc[train_index], labels[train_index]),
                  (trainpd[feature].iloc[test_index], labels[test_index])],
        early_stopping_rounds=50,
        verbose=5,
    )

    y_pred = clf.predict(trainpd[feature].iloc[test_index]) 
    y_predprob = clf.predict_proba(trainpd[feature].iloc[test_index])[:, 1] 
    
    y_pp_xgb_stacking[test_index] = y_predprob
      
    auc = metrics.roc_auc_score(labels[test_index], y_predprob)
    print("AUC Score (Train): %f" % auc) 
    
    y_pp_xgb += clf.predict_proba(testpd[feature])[:, 1] / n_splits

>>> [      0       2       3 ... 9473198 9473199 9473200]
[19:30:20] DEBUG: C:/Users/Administrator/workspace/xgboost-win64_release_1.0.0/src/tree/updater_gpu_hist.cu:1167: [GPU Hist]: Configure
[19:30:20] Init: 0.00357s, 1 calls @ 3570us

[0]	validation_0-auc:0.96587	validation_1-auc:0.96574
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[5]	validation_0-auc:0.97283	validation_1-auc:0.97266
[10]	validation_0-auc:0.97542	validation_1-auc:0.97522
[15]	validation_0-auc:0.97689	validation_1-auc:0.97673
[20]	validation_0-auc:0.97802	validation_1-auc:0.97789
[25]	validation_0-auc:0.97864	validation_1-auc:0.97852
[30]	validation_0-auc:0.97924	validation_1-auc:0.97909
[35]	validation_0-auc:0.97982	validation_1-auc:0.97967
[40]	validation_0-auc:0.98050	validation_1-auc:0.98033
[45]	validation_0-auc:0.98106	validation_1-auc:0.98089
[50]	validation_0-auc:0.98144	validation_1-auc:0.98128
[

[685]	validation_0-auc:0.99727	validation_1-auc:0.99660
[690]	validation_0-auc:0.99728	validation_1-auc:0.99661
[695]	validation_0-auc:0.99728	validation_1-auc:0.99661
[700]	validation_0-auc:0.99729	validation_1-auc:0.99661
[705]	validation_0-auc:0.99729	validation_1-auc:0.99661
[710]	validation_0-auc:0.99731	validation_1-auc:0.99662
[715]	validation_0-auc:0.99732	validation_1-auc:0.99663
[720]	validation_0-auc:0.99733	validation_1-auc:0.99663
[725]	validation_0-auc:0.99734	validation_1-auc:0.99664
[730]	validation_0-auc:0.99735	validation_1-auc:0.99665
[735]	validation_0-auc:0.99736	validation_1-auc:0.99665
[740]	validation_0-auc:0.99737	validation_1-auc:0.99665
[745]	validation_0-auc:0.99738	validation_1-auc:0.99666
[750]	validation_0-auc:0.99740	validation_1-auc:0.99666
[755]	validation_0-auc:0.99740	validation_1-auc:0.99667
[760]	validation_0-auc:0.99741	validation_1-auc:0.99667
[765]	validation_0-auc:0.99742	validation_1-auc:0.99668
[770]	validation_0-auc:0.99743	validation_1-auc:

[1410]	validation_0-auc:0.99822	validation_1-auc:0.99701
[1415]	validation_0-auc:0.99823	validation_1-auc:0.99701
[1420]	validation_0-auc:0.99823	validation_1-auc:0.99701
[1425]	validation_0-auc:0.99823	validation_1-auc:0.99701
[1430]	validation_0-auc:0.99824	validation_1-auc:0.99701
[1435]	validation_0-auc:0.99824	validation_1-auc:0.99701
[1440]	validation_0-auc:0.99825	validation_1-auc:0.99702
[1445]	validation_0-auc:0.99825	validation_1-auc:0.99702
[1450]	validation_0-auc:0.99826	validation_1-auc:0.99702
[1455]	validation_0-auc:0.99826	validation_1-auc:0.99702
[1460]	validation_0-auc:0.99826	validation_1-auc:0.99702
[1465]	validation_0-auc:0.99826	validation_1-auc:0.99702
[1470]	validation_0-auc:0.99827	validation_1-auc:0.99702
[1475]	validation_0-auc:0.99827	validation_1-auc:0.99702
[1480]	validation_0-auc:0.99828	validation_1-auc:0.99703
[1485]	validation_0-auc:0.99828	validation_1-auc:0.99703
[1490]	validation_0-auc:0.99829	validation_1-auc:0.99703
[1495]	validation_0-auc:0.99829

[20:46:22] DEBUG: C:/Users/Administrator/workspace/xgboost-win64_release_1.0.0/src/tree/updater_gpu_hist.cu:1167: [GPU Hist]: Configure
[20:46:23] Init: 0.002908s, 1 calls @ 2908us

[0]	validation_0-auc:0.96605	validation_1-auc:0.96550
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[5]	validation_0-auc:0.97276	validation_1-auc:0.97221
[10]	validation_0-auc:0.97556	validation_1-auc:0.97500
[15]	validation_0-auc:0.97714	validation_1-auc:0.97658
[20]	validation_0-auc:0.97819	validation_1-auc:0.97763
[25]	validation_0-auc:0.97875	validation_1-auc:0.97821
[30]	validation_0-auc:0.97940	validation_1-auc:0.97887
[35]	validation_0-auc:0.97999	validation_1-auc:0.97947
[40]	validation_0-auc:0.98080	validation_1-auc:0.98028
[45]	validation_0-auc:0.98146	validation_1-auc:0.98095
[50]	validation_0-auc:0.98213	validation_1-auc:0.98162
[55]	validation_0-auc:0.98270	validation_1-auc:0.98221
[60

[690]	validation_0-auc:0.99734	validation_1-auc:0.99647
[695]	validation_0-auc:0.99734	validation_1-auc:0.99648
[700]	validation_0-auc:0.99735	validation_1-auc:0.99648
[705]	validation_0-auc:0.99736	validation_1-auc:0.99649
[710]	validation_0-auc:0.99736	validation_1-auc:0.99649
[715]	validation_0-auc:0.99737	validation_1-auc:0.99649
[720]	validation_0-auc:0.99738	validation_1-auc:0.99650
[725]	validation_0-auc:0.99739	validation_1-auc:0.99650
[730]	validation_0-auc:0.99741	validation_1-auc:0.99651
[735]	validation_0-auc:0.99742	validation_1-auc:0.99652
[740]	validation_0-auc:0.99743	validation_1-auc:0.99653
[745]	validation_0-auc:0.99745	validation_1-auc:0.99654
[750]	validation_0-auc:0.99745	validation_1-auc:0.99655
[755]	validation_0-auc:0.99746	validation_1-auc:0.99655
[760]	validation_0-auc:0.99746	validation_1-auc:0.99655
[765]	validation_0-auc:0.99747	validation_1-auc:0.99655
[770]	validation_0-auc:0.99748	validation_1-auc:0.99655
[775]	validation_0-auc:0.99748	validation_1-auc:

[1415]	validation_0-auc:0.99825	validation_1-auc:0.99690
[1420]	validation_0-auc:0.99825	validation_1-auc:0.99690
[1425]	validation_0-auc:0.99825	validation_1-auc:0.99690
[1430]	validation_0-auc:0.99826	validation_1-auc:0.99690
[1435]	validation_0-auc:0.99826	validation_1-auc:0.99690
[1440]	validation_0-auc:0.99827	validation_1-auc:0.99690
[1445]	validation_0-auc:0.99827	validation_1-auc:0.99691
[1450]	validation_0-auc:0.99828	validation_1-auc:0.99691
[1455]	validation_0-auc:0.99828	validation_1-auc:0.99691
[1460]	validation_0-auc:0.99829	validation_1-auc:0.99691
[1465]	validation_0-auc:0.99829	validation_1-auc:0.99691
[1470]	validation_0-auc:0.99829	validation_1-auc:0.99691
[1475]	validation_0-auc:0.99830	validation_1-auc:0.99691
[1480]	validation_0-auc:0.99830	validation_1-auc:0.99691
[1485]	validation_0-auc:0.99830	validation_1-auc:0.99691
[1490]	validation_0-auc:0.99831	validation_1-auc:0.99692
[1495]	validation_0-auc:0.99831	validation_1-auc:0.99692
[1500]	validation_0-auc:0.99832

[22:01:21] DEBUG: C:/Users/Administrator/workspace/xgboost-win64_release_1.0.0/src/tree/updater_gpu_hist.cu:1167: [GPU Hist]: Configure
[22:01:21] Init: 0.002873s, 1 calls @ 2873us

[0]	validation_0-auc:0.96587	validation_1-auc:0.96603
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[5]	validation_0-auc:0.97329	validation_1-auc:0.97339
[10]	validation_0-auc:0.97531	validation_1-auc:0.97539
[15]	validation_0-auc:0.97703	validation_1-auc:0.97709
[20]	validation_0-auc:0.97786	validation_1-auc:0.97792
[25]	validation_0-auc:0.97872	validation_1-auc:0.97875
[30]	validation_0-auc:0.97937	validation_1-auc:0.97940
[35]	validation_0-auc:0.97990	validation_1-auc:0.97992
[40]	validation_0-auc:0.98055	validation_1-auc:0.98056
[45]	validation_0-auc:0.98099	validation_1-auc:0.98099
[50]	validation_0-auc:0.98165	validation_1-auc:0.98165
[55]	validation_0-auc:0.98281	validation_1-auc:0.98281
[60

[690]	validation_0-auc:0.99733	validation_1-auc:0.99657
[695]	validation_0-auc:0.99733	validation_1-auc:0.99657
[700]	validation_0-auc:0.99734	validation_1-auc:0.99657
[705]	validation_0-auc:0.99735	validation_1-auc:0.99658
[710]	validation_0-auc:0.99736	validation_1-auc:0.99659
[715]	validation_0-auc:0.99737	validation_1-auc:0.99660
[720]	validation_0-auc:0.99738	validation_1-auc:0.99660
[725]	validation_0-auc:0.99739	validation_1-auc:0.99660
[730]	validation_0-auc:0.99740	validation_1-auc:0.99661
[735]	validation_0-auc:0.99740	validation_1-auc:0.99661
[740]	validation_0-auc:0.99740	validation_1-auc:0.99661
[745]	validation_0-auc:0.99741	validation_1-auc:0.99662
[750]	validation_0-auc:0.99743	validation_1-auc:0.99663
[755]	validation_0-auc:0.99744	validation_1-auc:0.99663
[760]	validation_0-auc:0.99744	validation_1-auc:0.99664
[765]	validation_0-auc:0.99745	validation_1-auc:0.99664
[770]	validation_0-auc:0.99746	validation_1-auc:0.99665
[775]	validation_0-auc:0.99747	validation_1-auc:

[1415]	validation_0-auc:0.99822	validation_1-auc:0.99697
[1420]	validation_0-auc:0.99823	validation_1-auc:0.99697
[1425]	validation_0-auc:0.99823	validation_1-auc:0.99697
[1430]	validation_0-auc:0.99823	validation_1-auc:0.99697
[1435]	validation_0-auc:0.99824	validation_1-auc:0.99698
[1440]	validation_0-auc:0.99824	validation_1-auc:0.99698
[1445]	validation_0-auc:0.99824	validation_1-auc:0.99698
[1450]	validation_0-auc:0.99825	validation_1-auc:0.99698
[1455]	validation_0-auc:0.99825	validation_1-auc:0.99698
[1460]	validation_0-auc:0.99826	validation_1-auc:0.99698
[1465]	validation_0-auc:0.99826	validation_1-auc:0.99699
[1470]	validation_0-auc:0.99827	validation_1-auc:0.99699
[1475]	validation_0-auc:0.99827	validation_1-auc:0.99699
[1480]	validation_0-auc:0.99828	validation_1-auc:0.99699
[1485]	validation_0-auc:0.99828	validation_1-auc:0.99699
[1490]	validation_0-auc:0.99828	validation_1-auc:0.99699
[1495]	validation_0-auc:0.99829	validation_1-auc:0.99699
[1500]	validation_0-auc:0.99829

[105]	validation_0-auc:0.99081	validation_1-auc:0.99060
[110]	validation_0-auc:0.99113	validation_1-auc:0.99092
[115]	validation_0-auc:0.99166	validation_1-auc:0.99144
[120]	validation_0-auc:0.99180	validation_1-auc:0.99157
[125]	validation_0-auc:0.99216	validation_1-auc:0.99193
[130]	validation_0-auc:0.99235	validation_1-auc:0.99211
[135]	validation_0-auc:0.99255	validation_1-auc:0.99231
[140]	validation_0-auc:0.99269	validation_1-auc:0.99244
[145]	validation_0-auc:0.99306	validation_1-auc:0.99281
[150]	validation_0-auc:0.99332	validation_1-auc:0.99307
[155]	validation_0-auc:0.99345	validation_1-auc:0.99320
[160]	validation_0-auc:0.99357	validation_1-auc:0.99331
[165]	validation_0-auc:0.99374	validation_1-auc:0.99348
[170]	validation_0-auc:0.99386	validation_1-auc:0.99359
[175]	validation_0-auc:0.99395	validation_1-auc:0.99367
[180]	validation_0-auc:0.99405	validation_1-auc:0.99377
[185]	validation_0-auc:0.99415	validation_1-auc:0.99386
[190]	validation_0-auc:0.99421	validation_1-auc:

[840]	validation_0-auc:0.99752	validation_1-auc:0.99665
[845]	validation_0-auc:0.99752	validation_1-auc:0.99665
[850]	validation_0-auc:0.99753	validation_1-auc:0.99665
[855]	validation_0-auc:0.99754	validation_1-auc:0.99665
[860]	validation_0-auc:0.99754	validation_1-auc:0.99665
[865]	validation_0-auc:0.99755	validation_1-auc:0.99666
[870]	validation_0-auc:0.99756	validation_1-auc:0.99667
[875]	validation_0-auc:0.99757	validation_1-auc:0.99667
[880]	validation_0-auc:0.99758	validation_1-auc:0.99668
[885]	validation_0-auc:0.99759	validation_1-auc:0.99668
[890]	validation_0-auc:0.99760	validation_1-auc:0.99669
[895]	validation_0-auc:0.99761	validation_1-auc:0.99669
[900]	validation_0-auc:0.99762	validation_1-auc:0.99669
[905]	validation_0-auc:0.99762	validation_1-auc:0.99670
[910]	validation_0-auc:0.99763	validation_1-auc:0.99670
[915]	validation_0-auc:0.99764	validation_1-auc:0.99670
[920]	validation_0-auc:0.99765	validation_1-auc:0.99671
[925]	validation_0-auc:0.99765	validation_1-auc:

[1565]	validation_0-auc:0.99833	validation_1-auc:0.99697
[1570]	validation_0-auc:0.99834	validation_1-auc:0.99697
[1575]	validation_0-auc:0.99834	validation_1-auc:0.99697
[1580]	validation_0-auc:0.99834	validation_1-auc:0.99698
[1585]	validation_0-auc:0.99835	validation_1-auc:0.99698
[1590]	validation_0-auc:0.99835	validation_1-auc:0.99698
[1595]	validation_0-auc:0.99835	validation_1-auc:0.99698
[1600]	validation_0-auc:0.99836	validation_1-auc:0.99698
[1605]	validation_0-auc:0.99836	validation_1-auc:0.99698
[1610]	validation_0-auc:0.99837	validation_1-auc:0.99698
[1615]	validation_0-auc:0.99837	validation_1-auc:0.99699
[1620]	validation_0-auc:0.99838	validation_1-auc:0.99699
[1625]	validation_0-auc:0.99838	validation_1-auc:0.99699
[1630]	validation_0-auc:0.99838	validation_1-auc:0.99699
[1635]	validation_0-auc:0.99838	validation_1-auc:0.99699
[1640]	validation_0-auc:0.99839	validation_1-auc:0.99699
[1645]	validation_0-auc:0.99839	validation_1-auc:0.99699
[1650]	validation_0-auc:0.99840

[290]	validation_0-auc:0.99564	validation_1-auc:0.99534
[295]	validation_0-auc:0.99566	validation_1-auc:0.99536
[300]	validation_0-auc:0.99570	validation_1-auc:0.99539
[305]	validation_0-auc:0.99575	validation_1-auc:0.99544
[310]	validation_0-auc:0.99577	validation_1-auc:0.99545
[315]	validation_0-auc:0.99580	validation_1-auc:0.99548
[320]	validation_0-auc:0.99587	validation_1-auc:0.99554
[325]	validation_0-auc:0.99591	validation_1-auc:0.99557
[330]	validation_0-auc:0.99596	validation_1-auc:0.99562
[335]	validation_0-auc:0.99601	validation_1-auc:0.99566
[340]	validation_0-auc:0.99606	validation_1-auc:0.99570
[345]	validation_0-auc:0.99607	validation_1-auc:0.99571
[350]	validation_0-auc:0.99611	validation_1-auc:0.99575
[355]	validation_0-auc:0.99617	validation_1-auc:0.99580
[360]	validation_0-auc:0.99619	validation_1-auc:0.99582
[365]	validation_0-auc:0.99621	validation_1-auc:0.99583
[370]	validation_0-auc:0.99623	validation_1-auc:0.99584
[375]	validation_0-auc:0.99624	validation_1-auc:

[1025]	validation_0-auc:0.99780	validation_1-auc:0.99684
[1030]	validation_0-auc:0.99781	validation_1-auc:0.99684
[1035]	validation_0-auc:0.99781	validation_1-auc:0.99684
[1040]	validation_0-auc:0.99782	validation_1-auc:0.99684
[1045]	validation_0-auc:0.99782	validation_1-auc:0.99684
[1050]	validation_0-auc:0.99783	validation_1-auc:0.99684
[1055]	validation_0-auc:0.99783	validation_1-auc:0.99685
[1060]	validation_0-auc:0.99784	validation_1-auc:0.99685
[1065]	validation_0-auc:0.99785	validation_1-auc:0.99685
[1070]	validation_0-auc:0.99785	validation_1-auc:0.99686
[1075]	validation_0-auc:0.99786	validation_1-auc:0.99686
[1080]	validation_0-auc:0.99786	validation_1-auc:0.99686
[1085]	validation_0-auc:0.99788	validation_1-auc:0.99687
[1090]	validation_0-auc:0.99788	validation_1-auc:0.99687
[1095]	validation_0-auc:0.99788	validation_1-auc:0.99687
[1100]	validation_0-auc:0.99789	validation_1-auc:0.99687
[1105]	validation_0-auc:0.99789	validation_1-auc:0.99687
[1110]	validation_0-auc:0.99790

[1745]	validation_0-auc:0.99847	validation_1-auc:0.99705
[1750]	validation_0-auc:0.99847	validation_1-auc:0.99705
[1755]	validation_0-auc:0.99847	validation_1-auc:0.99705
[1760]	validation_0-auc:0.99848	validation_1-auc:0.99705
[1765]	validation_0-auc:0.99848	validation_1-auc:0.99705
[1770]	validation_0-auc:0.99848	validation_1-auc:0.99705
[1775]	validation_0-auc:0.99849	validation_1-auc:0.99705
[1780]	validation_0-auc:0.99849	validation_1-auc:0.99705
[1785]	validation_0-auc:0.99849	validation_1-auc:0.99705
[1790]	validation_0-auc:0.99850	validation_1-auc:0.99705
[1795]	validation_0-auc:0.99850	validation_1-auc:0.99705
[1800]	validation_0-auc:0.99850	validation_1-auc:0.99705
[1805]	validation_0-auc:0.99851	validation_1-auc:0.99706
[1810]	validation_0-auc:0.99851	validation_1-auc:0.99706
[1815]	validation_0-auc:0.99852	validation_1-auc:0.99705
[1820]	validation_0-auc:0.99852	validation_1-auc:0.99705
[1825]	validation_0-auc:0.99852	validation_1-auc:0.99705
[1830]	validation_0-auc:0.99852

In [7]:
#阈值大概在0.2-0.4之间 本题对召回率较敏感，可适当降低一下阈值
thre = 0.25

In [8]:
#生成提交文件
sub = pd.DataFrame()
sub['hit_id']=testpd['hit_id']
sub['flag_pred'] = y_pp_xgb
sub['event_id'] = testpd['event_id']
sub['flag_pred'] = sub['flag_pred'].apply(lambda x: 1 if x >= thre else 0)
sub.to_csv(os.path.join(pathf, "subsample.csv").format(sub['flag_pred'].mean()),index=False)