In [1]:
import os
from tqdm import tqdm
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy.stats import entropy, kurtosis
import xgboost as xgb
import seaborn as sns
from xgboost import plot_importance
import warnings
import matplotlib.pyplot as plt
import pandas as pd
from math import *
import numpy as np

# Show every column when a DataFrame is printed/displayed (no column truncation).
pd.set_option('display.max_columns', None)
from IPython.display import display
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the hit-level train/test tables and the event-level table, previewing
# the first row and the shape of each one.
pathf = os.path.join("..", "data", "particles")

trainpd = pd.read_csv(os.path.join(pathf, "train.csv"))
print(trainpd.head(1))
trainshape = trainpd.shape
print(trainshape)
eventpd = pd.read_csv(os.path.join(pathf, "event.csv"))
print(eventpd.head(1))
print(eventpd.shape)
testpd = pd.read_csv(os.path.join(pathf, "test.csv"))
testshape = testpd.shape
print(testpd.head(1))
print(testpd.shape)

# Stack train on top of test so the features are computed once for both.
# Columns missing from one side (e.g. 'flag' in test) are filled with NaN.
data = pd.concat([trainpd, testpd], ignore_index=True)
# Attach the event-level columns to every hit. The notebook later splits
# `data` back into train/test purely by row position, so the merge must not
# add rows: validate='many_to_one' raises a MergeError if event.csv contains
# a duplicated event_id instead of silently duplicating hits.
data = pd.merge(data, eventpd, on='event_id', how='left', validate='many_to_one')


       x      y  z        t   terror        q  flag  event_id  hit_id
0 -142.5 -147.5  0  767.879  2.02966  1.05052     0         7       1
(9473201, 9)
   event_id  nhit  nhitreal  energymc  thetamc    phimc   xcmc    ycmc
0         7   426        70   48348.9  63.1686  11.0982 -40.83  114.03
(13315, 8)
       x      y  z        t  terror        q  event_id  hit_id
0 -142.5 -127.5  0  848.061  1.9984  1.15067         9       1
(4086511, 8)


In [3]:
def group_fea(df, key, target, flag):
    """Aggregate ``target`` within each ``key`` group into summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``key`` and ``target`` columns.
    key : str
        Name of the column to group by.
    target : str
        Name of the column whose values are aggregated per group.
    flag : str
        Prefix used when naming the generated columns.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``index`` (row number), ``key``
        and ``{flag}_{key}_{target}_{stat}`` for ``stat`` in ``median``,
        ``mean``, ``std``, ``skew``, ``kurt`` and ``sum``. ``kurt`` is computed
        with ``scipy.stats.kurtosis``; the others are pandas aggregations
        selected by name.
    """
    prefix = flag + '_' + key + '_' + target
    stats = {
        prefix + '_median': 'median',
        prefix + '_mean': 'mean',
        prefix + '_std': 'std',
        prefix + '_skew': 'skew',
        prefix + '_kurt': kurtosis,
        prefix + '_sum': 'sum',
    }
    # Keyword ("named") aggregation: handing a {new_name: func} dict to the
    # agg of a single selected column is rejected by pandas >= 1.0
    # (SpecificationError: nested renamer is not supported).
    tmp = df.groupby(key)[target].agg(**stats).reset_index()
    # The original finished with an extra reset_index() on an already flat
    # frame, which prepends a row-number column named 'index'. It is kept so
    # the returned columns stay the same for existing callers.
    tmp = tmp.reset_index()
    print("**************************{}**************************".format(target))
    return tmp

In [4]:
# Combined features:
# interact the hit-level (train/test) columns with the event-level columns.
data['fx'] = data['x']-data['xcmc']  # hit x offset relative to xcmc
data['fy'] = data['y']-data['ycmc']  # hit y offset relative to ycmc
data['fdis'] = np.sqrt(data['fx']**2+data['fy']**2)  # Euclidean length of (fx, fy)
# np.sin expects radians. thetamc appears to be stored in degrees (the event
# preview shows 63.1686), so convert before taking the sine.
# NOTE(review): confirm the unit of thetamc against the dataset description.
data['fscala'] = np.sin(np.deg2rad(data['thetamc'])) * data['t']
# arctan2 returns radians; the original multiplied by 180 without dividing by
# pi, so the value was not an angle in degrees. np.degrees does the conversion.
data['fphi'] = np.degrees(np.arctan2(data['fy'], data['fx']))

# Relative difference between nhit and nhitreal; the global mean of nhitreal is
# added to the denominator, which keeps it away from zero.
data['nhit_bias'] = (data['nhit']-data['nhitreal'])/(data['nhitreal']+data['nhitreal'].mean())


In [None]:
# Undo the train/test stacking: the first `trainshape[0]` rows of `data` are
# the training hits and the remainder are the test hits.
n_train_rows = trainshape[0]
print(n_train_rows)
print(type(n_train_rows))
print(data.shape)

# reset_index() without drop=True keeps the previous row labels in a new
# 'index' column on both frames.
trainpd, testpd = (
    part.reset_index()
    for part in (data.iloc[:n_train_rows], data.iloc[n_train_rows:])
)
# Release the combined frame; it is no longer needed after the split.
del data


9473201
<class 'int'>
(13559712, 22)


In [None]:
print(trainpd.columns)
# Model inputs: every column except the label, the identifiers and any
# leftover index columns. Column order follows trainpd.
feature = [
    col
    for col in trainpd.columns
    if col not in {'flag', 'index', 'index_x', 'index_y', 'hit_id', 'event_id'}
]


In [None]:
# This uses the GPU build of xgboost; training on CPU is far too slow, so
# installing the GPU build is recommended.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=4399)

# Target vector. `labels` was used below without ever being defined, so the
# cell failed with NameError on a fresh kernel. 'flag' became float when train
# was concatenated with test (test has no 'flag' column), hence the int cast.
labels = trainpd['flag'].values.astype(int)

# Slice the feature columns once instead of rebuilding them several times per fold.
X_all = trainpd[feature]
X_test = testpd[feature]

y_pp_xgb = np.zeros(testshape[0])            # fold-averaged test probabilities
y_pp_xgb_stacking = np.zeros(len(labels))    # out-of-fold probabilities on train

# NOTE(review): each row is a hit, so plain KFold puts hits of the same
# event_id into both the training and the validation part of a fold. This may
# make the validation AUC optimistic; consider grouping folds by event_id.
for train_index, test_index in kf.split(trainpd):
    print ( ">>>", train_index )
    X_tr, y_tr = X_all.iloc[train_index], labels[train_index]
    X_val, y_val = X_all.iloc[test_index], labels[test_index]

    clf = xgb.XGBClassifier(tree_method='gpu_hist',max_depth=8,learning_rate=0.1,verbosity=3,
                           eval_metric='auc',n_estimators=2000)
    clf.fit(
        X_tr, y_tr,
        # The validation fold is listed last, so early stopping monitors it.
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=50,
        verbose=5,
    )

    # Probability of the positive class on the held-out fold.
    y_predprob = clf.predict_proba(X_val)[:, 1]
    y_pp_xgb_stacking[test_index] = y_predprob

    # This AUC is measured on the held-out fold, not on the training rows.
    auc = metrics.roc_auc_score(y_val, y_predprob)
    print("AUC Score (Valid): %f" % auc)

    # Accumulate the mean of the per-fold test predictions.
    y_pp_xgb += clf.predict_proba(X_test)[:, 1] / n_splits
    

In [None]:
# The threshold is roughly between 0.2 and 0.4. This task is fairly sensitive
# to recall, so the threshold can be lowered a little.
thre = 0.25 

In [None]:
# Generate the submission file.
sub = pd.DataFrame()
sub['hit_id']=testpd['hit_id']
# Binarise the averaged probabilities: 1 when the probability reaches `thre`,
# else 0. The vectorised comparison replaces a row-by-row apply(lambda ...).
sub['flag_pred'] = (y_pp_xgb >= thre).astype(int)
sub['event_id'] = testpd['event_id']
# The original called .format(...) on this path, but the file name has no
# placeholder, so the call did nothing; the output file is still "sample.csv".
sub.to_csv(os.path.join(pathf, "sample.csv"), index=False)