In [30]:
import numpy as np
import pandas as pd
import shap
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: default figure size in inches (11.7 x 8.27 are the A4 landscape dimensions).
sns.set(rc={'figure.figsize':(11.7,8.27)})

import lightgbm as lgb

from sklearn.metrics import roc_auc_score, confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.model_selection import cross_validate

from utils import *
from aux_functions import *

In [2]:
# --- Data loading -------------------------------------------------------------
# Every table is a CSV export whose saved index lives in the 'Unnamed: 0' column.
data_folder = 'csv_data/'
suffix = ''

df_public, df_private, df_ER, df_NR = (
    pd.read_csv(f'{data_folder}{stem}{suffix}.csv', index_col='Unnamed: 0')
    for stem in ('df_public_test', 'df_private_test', 'df_ER', 'df_NR')
)

# Derived features: absolute offset of the mu_x / mu_y columns from 288.
# NOTE(review): 288 is presumably the image centre in pixels -- confirm.
for frame in (df_NR, df_ER, df_public, df_private):
    for axis in ('x', 'y'):
        frame[f'abs_dmu_{axis}'] = (frame[f'mu_{axis}'] - 288).abs()

# Labelled sample used for fitting: NR rows followed by ER rows.
df = pd.concat([df_NR, df_ER])

In [3]:
# LightGBM hyper-parameters for the binary high-energy classifier.
# 'early_stopping_rounds' is supplied through the params dict; LightGBM reports
# that it uses this value instead of the corresponding train() argument.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    max_depth=4,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    early_stopping_rounds=10,
)

# Feature columns per energy regime (HE uses wiggly_vars; MID and LOW use sigma_y only).
wiggly_vars = ['chi2_pvalue_x', 'chi2_pvalue_y', 'abs_dmu_x', 'abs_dmu_y']
MID_clf_vars = ['sigma_y']
LOW_clf_vars = ['sigma_y']

# Line coefficients from the project helper; later cells evaluate (sig_count - b) / a with them.
a, b = fit_line(df)

## High energies (HE)

In [4]:
# Training sample for the high-energy classifier:
#   * rows at energy 20 whose class is not "ER"  -> label False
#   * rows at energy 30 whose class is not "NR"  -> label True
df_20 = df.query('event_energy == 20 and event_class != "ER"')
df_30 = df.query('event_energy == 30 and event_class != "NR"')
df_w_vs_all = pd.concat([df_20, df_30])

X = df_w_vs_all[wiggly_vars]
y = df_w_vs_all['event_energy'].eq(30)

lgb_train = lgb.Dataset(X, y, free_raw_data=False)
# NOTE(review): the evaluation set is built from the very same X, y as the
# training set, so the early-stopping criterion monitors training loss rather
# than held-out loss.
lgb_eval = lgb.Dataset(X, y, reference=lgb_train)

HE_clf = lgb.train(params, lgb_train,
                   num_boost_round=1000,
                   valid_sets=lgb_eval,
                   verbose_eval=True)


[1]	valid_0's binary_logloss: 0.598701
Training until validation scores don't improve for 10 rounds
[2]	valid_0's binary_logloss: 0.521372
[3]	valid_0's binary_logloss: 0.456511
[4]	valid_0's binary_logloss: 0.401695
[5]	valid_0's binary_logloss: 0.354887
[6]	valid_0's binary_logloss: 0.314776
[7]	valid_0's binary_logloss: 0.27999
[8]	valid_0's binary_logloss: 0.249393
[9]	valid_0's binary_logloss: 0.222585
[10]	valid_0's binary_logloss: 0.199078
[11]	valid_0's binary_logloss: 0.178256
[12]	valid_0's binary_logloss: 0.159822
[13]	valid_0's binary_logloss: 0.143467
[14]	valid_0's binary_logloss: 0.128919
[15]	valid_0's binary_logloss: 0.115959
[16]	valid_0's binary_logloss: 0.104464
[17]	valid_0's binary_logloss: 0.0941166
[18]	valid_0's binary_logloss: 0.084918
[19]	valid_0's binary_logloss: 0.0766048
[20]	valid_0's binary_logloss: 0.0691496
[21]	valid_0's binary_logloss: 0.0624295
[22]	valid_0's binary_logloss: 0.0564012
[23]	valid_0's binary_logloss: 0.0509771
[24]	valid_0's binary_l

Found `early_stopping_rounds` in params. Will use it instead of argument


In [5]:
# Sanity check on the class/energy combinations that were excluded from training:
# print the classifier scores for ER rows at energy 20, then NR rows at energy 30.
for chk_energy, chk_class in ((20, 'ER'), (30, 'NR')):
    chk_rows = df.query(f'event_energy == {chk_energy} and event_class == "{chk_class}"')
    print(HE_clf.predict(chk_rows[wiggly_vars]))

[0.99999999 0.99999997]
[1.37109072e-08 1.36715695e-07]


#### public

In [6]:
# Per-row output of the project helper invert_regr for the public set
# (presumably an energy estimate derived from sig_count_y -- confirm in the helper).
E_pred_public = invert_regr(a, b, df_public['sig_count_y'].values)
# Positional indices of the public rows routed to the high-energy regime (estimate >= 14).
idx_HE_public = [pos for pos, est in enumerate(E_pred_public) if est >= 14]

In [7]:
# High-energy public rows: classifier score, derived energy and image name.
HE_public_rows = df_public.iloc[idx_HE_public]
HE_cl_pred_public = np.array(HE_clf.predict(HE_public_rows[wiggly_vars]))
# Public mapping: score > 0.5 -> 30, otherwise 20.
HE_reg_pred_public = np.array([30 if score > 0.5 else 20 for score in HE_cl_pred_public])
HE_im_names_public = np.array(HE_public_rows['image_name'].tolist())

#### private

In [8]:
# Per-row output of the project helper invert_regr for the private set
# (presumably an energy estimate derived from sig_count_y -- confirm in the helper).
E_pred_private = invert_regr(a, b, df_private['sig_count_y'].values)
# Positional indices of the private rows routed to the high-energy regime (estimate >= 14).
idx_HE_private = [pos for pos, est in enumerate(E_pred_private) if est >= 14]

In [9]:
# High-energy private rows: classifier score, derived energy and image name.
HE_private_rows = df_private.iloc[idx_HE_private]
HE_cl_pred = np.array(HE_clf.predict(HE_private_rows[wiggly_vars]))
# Private mapping: score > 0.5 -> 20, otherwise 30.
# NOTE(review): this is the mirror image of the public mapping (> 0.5 -> 30);
# presumably the class/energy pairing is swapped in the private set -- confirm.
HE_reg_pred = np.array([20 if score > 0.5 else 30 for score in HE_cl_pred])
HE_im_names = np.array(HE_private_rows['image_name'].tolist())

## Mid energies

In [10]:
# Mid-energy classifier, built with the helper's default settings.
# NOTE(review): Sigma_classifier arrives via a star import (utils / aux_functions);
# its defaults are not visible in this notebook.
MID_clf = Sigma_classifier()

In [11]:
# Sanity check: print the mid-energy classifier scores for ER rows at energy 6,
# then for NR rows at energy 10.
for chk_energy, chk_class in ((6, 'ER'), (10, 'NR')):
    chk_rows = df.query(f'event_energy == {chk_energy} and event_class == "{chk_class}"')
    print(MID_clf.predict(chk_rows[MID_clf_vars]))

[0.5736627428329505]
[0.40333655461645157, 0.47694434648894934]


#### public

In [12]:
# Per-row output of the project helper invert_regr for the public set
# (presumably an energy estimate derived from sig_count_y -- confirm in the helper).
E_pred_public = invert_regr(a, b, df_public['sig_count_y'].values)
# Positional indices of the public rows routed to the mid-energy regime (6 < estimate < 14).
idx_MID_public = [pos for pos, est in enumerate(E_pred_public) if 6 < est < 14]

In [13]:
# Mid-energy public rows: classifier score, derived energy and image name.
MID_public_rows = df_public.iloc[idx_MID_public]
MID_cl_pred_public = np.array(MID_clf.predict(MID_public_rows[MID_clf_vars]))
# Public mapping: score > 0.5 -> 10, otherwise 6.
MID_reg_pred_public = np.array([10 if score > 0.5 else 6 for score in MID_cl_pred_public])
MID_im_names_public = np.array(MID_public_rows['image_name'].tolist())

#### private

In [14]:
# Per-row output of the project helper invert_regr for the private set
# (presumably an energy estimate derived from sig_count_y -- confirm in the helper).
E_pred_private = invert_regr(a, b, df_private['sig_count_y'].values)
# Positional indices of the private rows routed to the mid-energy regime (6 < estimate < 14).
idx_MID_private = [pos for pos, est in enumerate(E_pred_private) if 6 < est < 14]

In [15]:
# Mid-energy private rows: classifier score, derived energy and image name.
MID_private_rows = df_private.iloc[idx_MID_private]
MID_cl_pred = np.array(MID_clf.predict(MID_private_rows[MID_clf_vars]))
# Private mapping: score > 0.5 -> 6, otherwise 10.
# NOTE(review): mirror image of the public mapping (> 0.5 -> 10); presumably the
# class/energy pairing is swapped in the private set -- confirm.
MID_reg_pred = np.array([6 if score > 0.5 else 10 for score in MID_cl_pred])
MID_im_names = np.array(MID_private_rows['image_name'].tolist())

## Low energies

In [16]:
# Low-energy classifier: same helper as the mid-energy one, but with sigma_y_th set to 5.25.
# NOTE(review): presumably a threshold applied to the sigma_y feature -- confirm in the
# helper's definition (star-imported from utils / aux_functions).
LOW_clf = Sigma_classifier(sigma_y_th=5.25)

In [17]:
# Sanity check: print the low-energy classifier scores for ER rows at energy 1,
# then for NR rows at energy 3.
for chk_energy, chk_class in ((1, 'ER'), (3, 'NR')):
    chk_rows = df.query(f'event_energy == {chk_energy} and event_class == "{chk_class}"')
    print(LOW_clf.predict(chk_rows[LOW_clf_vars]))

[0.6662561911943932, 0.0, 0.8270720783593146]
[0.22770937218962195, 0.4063753223558354]


#### public

In [18]:
# Per-row output of the project helper invert_regr for the public set
# (presumably an energy estimate derived from sig_count_y -- confirm in the helper).
E_pred_public = invert_regr(a, b, df_public['sig_count_y'].values)
# Positional indices of the public rows routed to the low-energy regime (estimate <= 6).
idx_LOW_public = [pos for pos, est in enumerate(E_pred_public) if est <= 6]

In [19]:
# Low-energy public rows: classifier score, derived energy and image name.
LOW_public_rows = df_public.iloc[idx_LOW_public]
LOW_cl_pred_public = np.array(LOW_clf.predict(LOW_public_rows[LOW_clf_vars]))
# Public mapping: score > 0.5 -> 3, otherwise 1.
LOW_reg_pred_public = np.array([3 if score > 0.5 else 1 for score in LOW_cl_pred_public])
LOW_im_names_public = np.array(LOW_public_rows['image_name'].tolist())

#### private

In [20]:
# Per-row output of the project helper invert_regr for the private set
# (presumably an energy estimate derived from sig_count_y -- confirm in the helper).
E_pred_private = invert_regr(a, b, df_private['sig_count_y'].values)
# Positional indices of the private rows routed to the low-energy regime (estimate <= 6).
idx_LOW_private = [pos for pos, est in enumerate(E_pred_private) if est <= 6]

In [21]:
# Low-energy private rows: classifier score, derived energy and image name.
LOW_private_rows = df_private.iloc[idx_LOW_private]
LOW_cl_pred = np.array(LOW_clf.predict(LOW_private_rows[LOW_clf_vars]))
# Private mapping: score > 0.5 -> 1, otherwise 3.
# NOTE(review): mirror image of the public mapping (> 0.5 -> 3); presumably the
# class/energy pairing is swapped in the private set -- confirm.
LOW_reg_pred = np.array([1 if score > 0.5 else 3 for score in LOW_cl_pred])
LOW_im_names = np.array(LOW_private_rows['image_name'].tolist())

## Final adjustments

In [22]:
# Score every private-test row with the high-energy classifier (the full frame, not just
# the HE subset); the tail corrections in the following cells filter on these scores.
priv_cl_pred = HE_clf.predict(df_private[wiggly_vars])

In [23]:
# left tail 

In [24]:
# Left tail: private rows scored > 0.5 by the HE classifier whose sig_count_y-based
# value (sig_count_y - b) / a is below 1.5 while the sig_count_x-based one exceeds 9.
left_e_y = (df_private['sig_count_y'].values - b) / a
left_e_x = (df_private['sig_count_x'].values - b) / a
left_tail_mask = (priv_cl_pred > 0.5) & (left_e_y < 1.5) & (left_e_x > 9)
v1 = np.flatnonzero(left_tail_mask).tolist()
s = list(df_private['image_name'].iloc[v1].values)
# t = plot_images('../../idao_dataset/', im_filename=s, max_num_images=100, rand_seed=None)
# Locate each selected image inside the low-energy private group (first match) and
# override its prediction to energy 20 / class 1.
incor_idx = [np.flatnonzero(LOW_im_names == name)[0] for name in s]
LOW_reg_pred[incor_idx] = 20
LOW_cl_pred[incor_idx] = 1

In [25]:
# right tail

In [26]:
# Right tail: private rows whose sig_count_y-based value (sig_count_y - b) / a
# exceeds 35 while the sig_count_x-based one stays below 20.
right_e_y = (df_private['sig_count_y'].values - b) / a
right_e_x = (df_private['sig_count_x'].values - b) / a
v1 = np.flatnonzero((right_e_y > 35) & (right_e_x < 20)).tolist()
s = list(df_private['image_name'].iloc[v1].values)
# Locate each selected image inside the high-energy private group (first match)
# and override its energy to 1.
incor_idx = [np.flatnonzero(HE_im_names == name)[0] for name in s]
HE_reg_pred[incor_idx] = 1
# Keep the class in step with the overridden energy: the private low-energy
# mapping in this notebook pairs energy 1 with a score > 0.5, and the other two
# corrections set the class together with the energy. Without this line a row
# could be submitted as (class 0, energy 1).
# NOTE(review): confirm that energy 1 is class 1 in the private set.
HE_cl_pred[incor_idx] = 1
# t = plot_images('../../idao_dataset/', im_filename=s, max_num_images=37, rand_seed=None)

In [27]:
# intersection of NR and ER at high energies

In [28]:
# NR/ER overlap at high energies: private rows scored < 0.5 by the HE classifier
# whose sig_count_y-based value (sig_count_y - b) / a lies strictly between 14 and 26.
overlap_e_y = (df_private['sig_count_y'].values - b) / a
overlap_mask = (priv_cl_pred < 0.5) & (overlap_e_y > 14) & (overlap_e_y < 26)
v1 = np.flatnonzero(overlap_mask).tolist()
s = list(df_private['image_name'].iloc[v1].values)
# Only three of the candidates are corrected, picked by position.
# NOTE(review): the positions 0, 5 and 11 are hard-coded (presumably chosen by
# visual inspection); they raise IndexError or point at different images if the
# selection above changes.
s = [s[k] for k in (0, 5, 11)]
# Locate each picked image inside the high-energy private group (first match) and
# override its prediction to energy 20 / class 1.
incor_idx = [np.flatnonzero(HE_im_names == name)[0] for name in s]
HE_reg_pred[incor_idx] = 20
HE_cl_pred[incor_idx] = 1
# t = plot_images('../../idao_dataset/', im_filename=s, max_num_images=50, rand_seed=None)

## Full submission

In [29]:
# Stack the per-regime results in one fixed order so that class, energy and image
# name stay aligned row by row:
#   LOW public, LOW private, MID public, MID private, HE public, HE private.
# Class scores are rounded to 0/1 before stacking.
cl_pred = np.concatenate([
    np.ravel(np.round(scores))
    for scores in (
        LOW_cl_pred_public, LOW_cl_pred,
        MID_cl_pred_public, MID_cl_pred,
        HE_cl_pred_public, HE_cl_pred,
    )
])

reg_pred = np.concatenate([
    np.ravel(energies)
    for energies in (
        LOW_reg_pred_public, LOW_reg_pred,
        MID_reg_pred_public, MID_reg_pred,
        HE_reg_pred_public, HE_reg_pred,
    )
])

im_names = np.concatenate([
    np.ravel(names)
    for names in (
        LOW_im_names_public, LOW_im_names,
        MID_im_names_public, MID_im_names,
        HE_im_names_public, HE_im_names,
    )
])

# The three columns must have equal length, and every public and private row
# must have landed in exactly one regime.
assert len(im_names) == len(reg_pred)
assert len(im_names) == len(cl_pred)
assert len(im_names) == len(df_private) + len(df_public)

generate_submission(cl_pred, reg_pred, im_names, 'pipeline_prediction')