In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np

# submit CSV to hdfs
import submittools as sub

# Input data files are available with function competitionData
from Turing import competitionData, userData

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import time
import gc
import warnings
# Suppress every warning for the rest of the session (this also hides pandas/LightGBM
# deprecation messages, so re-enable warnings when debugging).
warnings.filterwarnings('ignore')
# Lift pandas' display limits so that all columns and all rows of a frame are printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Read the train, test and event tables, printing each shape as it is loaded
# (for train, the mean of the 'flag' column is printed as well).
train_df = pd.read_csv('data/train.csv')
print(train_df.shape, train_df['flag'].mean())
test_df = pd.read_csv('data/test.csv')
print(test_df.shape)
event_df = pd.read_csv('data/event.csv')
print(event_df.shape)

# Stack train on top of test, attach the event-level columns through a left join on
# 'event_id', then order the rows by event and by t within each event.
df = (
    pd.concat([train_df, test_df], axis=0, ignore_index=True)
    .merge(event_df, on='event_id', how='left')
    .sort_values(['event_id', 't'])
    .reset_index(drop=True)
)
print(df.shape)


In [None]:
# Small constant added to denominators so that a value of exactly 0 does not divide by zero.
EPS = 1e-5

# Short aliases for the hit coordinates (x, y) and the core position (xcmc, ycmc).
hit_x, hit_y = df['x'], df['y']
core_x, core_y = df['xcmc'], df['ycmc']

## Signal ratio of the event
df['nhitreal_prop'] = df['nhitreal'].div(df['nhit'])

## Timing error relative to the hit time
df['terror_prop'] = df['terror'] / (df['t'] + EPS)

## Angles
df['angle'] = np.arctan(hit_y / (hit_x + EPS))
df['c_angle'] = np.arctan(core_y / (core_x + EPS))

## Relation to the core position: per-axis offsets, Euclidean distance, cosine similarity
df['x_xcmc_dist'] = hit_x - core_x
df['y_ycmc_dist'] = hit_y - core_y
df['c_dist'] = np.sqrt(np.square(df['x_xcmc_dist']) + np.square(df['y_ycmc_dist']))

dot_product = hit_x * core_x + hit_y * core_y
hit_norm = np.sqrt(hit_x * hit_x + hit_y * hit_y)
core_norm = np.sqrt(core_x * core_x + core_y * core_y)
df['c_cos'] = dot_product / (hit_norm * core_norm + EPS)

## Relation between the charge and the primary-particle energy
df['q_energymc_ratio'] = df['q'] / (df['energymc'] + EPS)

In [None]:
                    
## Count encoding (still somewhat useful)
base_cols = ['x', 'y', 'terror', 'q']

# Single-column frequency: how many rows share the same value.
for col in tqdm(base_cols):
    value_freq = df[col].value_counts()
    df['{}_count'.format(col)] = df[col].map(value_freq)

# Pairwise frequency: every unordered pair of base columns, in the order
# (x,y), (x,terror), (x,q), (y,terror), (y,q), (terror,q).
pair_cols = [
    [first, second]
    for pos, first in enumerate(base_cols)
    for second in base_cols[pos + 1:]
]
for pair in tqdm(pair_cols):
    df['{}_count'.format('_'.join(pair))] = df.groupby(pair)['hit_id'].transform('count')

In [None]:
## Relative change along the time axis; an earlier open-source solution by the team named
## "第二次打比赛" ("second time competing") used a similar trick.
## df was sorted by ['event_id', 't'] when it was built, so shifting inside an event group
## compares a hit with the hits that come i positions later / earlier in that ordering.
g = df.groupby('event_id')
for f in tqdm(['x', 'y', 'terror', 'q', 't', 'c_dist']):
    # Both the per-event column selection and its zero-shift are the same for every offset,
    # so build them once per feature instead of once per (offset, direction).
    grouped = g[f]
    current = grouped.shift(0)
    df['event_id_{}_mean'.format(f)] = grouped.transform('mean')
    for i in [1, 2, 4, 8, 10, 12, 15, 18, 20]:
        # Difference to the hit i rows after / before within the same event; rows without
        # such a neighbour get NaN from the shift.
        df['event_id_{}_diff_next_{}'.format(f, i)] = current - grouped.shift(-i)
        df['event_id_{}_diff_prev_{}'.format(f, i)] = current - grouped.shift(i)

In [None]:
                    
## The recorded times do not start at 0; they are relative to the earliest trigger, so
## compute each hit's trigger time relative to the earliest hit of its event
df['event_id_t_min'] = df.groupby('event_id')['t'].transform('min')
df['t_gap'] = df['t'].sub(df['event_id_t_min'])

## Signal hits of an event are said to cluster in a middle time window, so measure how far
## each hit's trigger time is from the midpoint of the event's time span
df['event_id_t_gap_mid'] = df.groupby('event_id')['t_gap'].transform('max').div(2)
df['t_mid_gap'] = df['t_gap'].sub(df['event_id_t_gap_mid'])
df['t_mid_gap_abs'] = df['t_mid_gap'].abs()

## Relation between the total charge of all hits in an event and the primary-particle energy
event_charge = df.groupby('event_id')['q'].transform('sum')
df['event_id_q_sum'] = event_charge
df['q_prop'] = df['q'] / (event_charge + 1e-5)
df['event_id_q_sum_energymc_diff'] = df['energymc'] - event_charge
df['event_id_q_sum_energymc_prop'] = event_charge / (df['energymc'] + 1e-5)

## x and y are said to be detector coordinates; joined together they identify one detector
df['detector'] = df['x'].astype('str').str.cat(df['y'].astype('str'), sep='_')


In [None]:
                    
# Split the combined frame back apart: rows that have a 'flag' value become the training
# set, rows whose 'flag' is missing become the test set. Both get a fresh 0..n-1 index.
has_label = df['flag'].notna()
train_df = df.loc[has_label].reset_index(drop=True)
test_df = df.loc[~has_label].reset_index(drop=True)

In [None]:
## 5-fold target encoding
## For each categorical key, a training row is encoded with the mean 'flag' of its category
## computed on the other four folds; a test row gets the average of the five fold encodings.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

# Fallback for categories that do not occur in a fold's fitting part. It does not depend on
# the fold or on the key, so it is computed once rather than inside the loops.
global_rate = train_df['flag'].mean()

for f in tqdm(['detector', 'terror']):
    enc_col = f + '_target_enc'
    # Initialise as float: the columns are filled with fractional means below, and writing
    # floats into an integer column through .loc is an incompatible-dtype assignment.
    train_df[enc_col] = 0.0
    test_df[enc_col] = 0.0
    for trn_idx, val_idx in skf.split(train_df, train_df['flag']):
        trn_x = train_df[[f, 'flag']].iloc[trn_idx].reset_index(drop=True)
        val_x = train_df[[f]].iloc[val_idx].reset_index(drop=True)
        # Per-category mean of the label on the fitting part only. Written as
        # mean/rename/reset_index because passing a {new_name: func} dict to
        # SeriesGroupBy.agg is no longer accepted by pandas >= 1.0.
        enc_df = trn_x.groupby(f)['flag'].mean().rename(enc_col).reset_index()
        val_x = val_x.merge(enc_df, on=f, how='left')
        test_x = test_df[[f]].merge(enc_df, on=f, how='left')
        val_x[enc_col] = val_x[enc_col].fillna(global_rate)
        test_x[enc_col] = test_x[enc_col].fillna(global_rate)
        # train_df carries a 0..n-1 index, so the positional fold indices are valid labels.
        train_df.loc[val_idx, enc_col] = val_x[enc_col].values
        test_df[enc_col] += test_x[enc_col].values / skf.n_splits

In [None]:
# Columns that are not fed to the model: z, the two identifiers, the string detector key
# and the label itself.
excluded = ['z', 'hit_id', 'event_id', 'detector', 'flag']
cols = [f for f in train_df.columns if f not in excluded]

# Pull the labels out before the frames are narrowed to the feature columns.
labels = train_df['flag'].values
train_df = train_df[cols]

# Identifier frame that will receive the predictions. Take an explicit copy so that adding
# a column to it later writes to an independent frame rather than to a slice of test_df.
# Note: this rebinds `sub`, which the first cell imported as the alias of `submittools`.
sub = test_df[['hit_id', 'event_id']].copy()
test_df = test_df[cols]

In [None]:
# Cross-validated LightGBM training: collects out-of-fold predictions for the training rows,
# fold-averaged predictions for the test rows and fold-averaged feature importances.
print(train_df.shape)
oof = np.zeros(len(train_df))
sub['flag_pred'] = 0
feat_imp_df = pd.DataFrame({'feats': train_df.columns.values, 'imp': 0})

# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq (bagging_freq) is non-zero; it is not set here — confirm the intent.
lgb_params = dict(
    learning_rate=0.05,
    n_estimators=3000,
    num_leaves=2047,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=2020,
    metric=None,
)
clf = LGBMClassifier(**lgb_params)

for fold, (fit_rows, held_rows) in enumerate(skf.split(train_df, labels)):
    print('--------------------- {} fold ---------------------'.format(fold))
    started = time.time()

    trn_x = train_df.iloc[fit_rows].reset_index(drop=True)
    trn_y = labels[fit_rows]
    val_x = train_df.iloc[held_rows].reset_index(drop=True)
    val_y = labels[held_rows]

    # NOTE(review): newer LightGBM releases expect early stopping and log frequency to be
    # passed as callbacks — confirm the installed version still accepts these keywords.
    clf.fit(
        trn_x,
        trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        early_stopping_rounds=50,
        verbose=20,
    )

    # Each fold contributes 1/n_splits to the averaged importances and test predictions;
    # the held-out rows receive this fold's prediction.
    feat_imp_df['imp'] += clf.feature_importances_ / skf.n_splits
    oof[held_rows] += clf.predict_proba(val_x)[:, 1]
    sub['flag_pred'] += clf.predict_proba(test_df)[:, 1] / skf.n_splits
    print('runtime: {}\n'.format(time.time() - started))

# AUC of the out-of-fold predictions over the whole training set.
auc = roc_auc_score(labels, oof)
print('\ncv auc:', auc)


In [None]:
## Feature importance
# Horizontal bar chart of the fold-averaged importances, sorted in ascending order.
fig, ax = plt.subplots(figsize=(15, 35))
feat_imp_df = feat_imp_df.sort_values('imp').reset_index(drop=True)
sns.barplot(x='imp', y='feats', data=feat_imp_df, ax=ax)

In [None]:
                    
## Generate the submission files

# 1) Raw probabilities, ordered by hit_id; the CV AUC is embedded in the file name.
sub = sub.sort_values('hit_id').reset_index(drop=True)
prob_path = 'sub_prob_{}.csv'.format(auc)
sub.to_csv(prob_path, index=False)

# 2) Binary labels: within each event, rank the hits by predicted probability (highest
#    first, ties broken by row order) and mark the top (nhitreal + 2) hits as 1.
event_df = (
    event_df[['event_id', 'nhitreal']]
    .drop_duplicates('event_id')
    .reset_index(drop=True)
)
sub = sub.merge(event_df, on='event_id', how='left')
sub['rank'] = sub.groupby('event_id')['flag_pred'].rank(ascending=False, method='first')
sub['flag_pred'] = (sub['rank'] <= (sub['nhitreal'] + 2)).astype('int')

# The file name carries the CV AUC and the share of hits labelled 1.
label_path = 'sub_{}_{}.csv'.format(auc, sub['flag_pred'].mean())
sub.loc[:, ['hit_id', 'flag_pred', 'event_id']].to_csv(label_path, index=False)