In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,auc
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import QuantileTransformer


from lightgbm import LGBMClassifier

In [2]:
random.seed(42)
np.random.seed(42)

In [27]:
train_raw = pd.read_csv("../data/train.csv").drop(columns="BUTTER")
test_raw = pd.read_csv("../data/test.csv").drop(columns="BUTTER")

In [28]:
train_raw['train'] = 1
test_raw['train'] = 0

In [29]:
all_df = pd.concat([train_raw, test_raw]).reset_index(drop=True)

In [30]:
all_df.columns = [col.replace(" ", "") for col in all_df.columns]

In [31]:
# cos -> sin transformation
all_df["Kst_892_0_sinThetaH"] = np.sqrt(1 - all_df["Kst_892_0_cosThetaH"]**2)
all_df["B_DIRA_OWNPV_sin"] = np.sqrt(1 - all_df["B_DIRA_OWNPV"]**2)

In [32]:
# x and y P components
all_df["Kplus_P_x"] = all_df["Kplus_P"] * all_df["Kst_892_0_sinThetaH"]
all_df["Kplus_P_y"] = all_df["Kplus_P"] * all_df["Kst_892_0_cosThetaH"]
all_df["B_PT_x"] = all_df["B_PT"] * all_df["B_DIRA_OWNPV"]
all_df["B_PT_y"] = all_df["B_PT"] * all_df["B_DIRA_OWNPV_sin"]

# things in hbar units
all_df["B_hbar"] = all_df["B_PT"] * all_df["B_IPCHI2_OWNPV"]
all_df["B_hbar_2"] = all_df["B_PT"] * all_df["B_FDCHI2_OWNPV"]
all_df["K_hbar"] = all_df["Kplus_P"] * all_df["Kplus_IP_OWNPV"]
all_df["p_hbar"] = all_df["piminus_P"] * all_df["piminus_IP_OWNPV"]

# hbar ratios
all_df["B_hbar_ratio"] = all_df["B_hbar"] / all_df["B_hbar_2"]
all_df["K_p_hbar_ratio"] = all_df["K_hbar"] / all_df["p_hbar"]
all_df["K_B_hbar_ratio"] = all_df["K_hbar"] / all_df["B_hbar"]

# p ratios
all_df["gamma_B_PT_ratio"] = (all_df["gamma_PT"] / all_df['B_PT'])
all_df["piminus_B_P_ratio"] = (all_df["piminus_P"] / all_df['B_PT'])
all_df["kplus_B_P_ratio"] = (all_df["Kplus_P"] / all_df['B_PT'])
all_df["kplus_piminus_P_ratio"] = (all_df["Kplus_P"] / all_df['piminus_P'])

# distance ratios
all_df["b_distance_ratio"] = all_df['B_IPCHI2_OWNPV'] / all_df['B_FDCHI2_OWNPV']
all_df["k_p_distance_ratio"] = all_df['Kplus_IP_OWNPV'] / all_df['piminus_IP_OWNPV']
all_df["k_b_distance_ratio"] = all_df['Kplus_IP_OWNPV'] / all_df['B_IPCHI2_OWNPV']
all_df["p_b_distance_ratio"] = all_df['piminus_IP_OWNPV'] / all_df['B_IPCHI2_OWNPV']
all_df["k_kst_distance_ratio"] = all_df['Kplus_IP_OWNPV'] / all_df['Kst_892_0_IP_OWNPV']

# shpere radius
all_df["sphere_radius_k_b"] =  all_df['Kplus_IP_OWNPV']**2 + all_df['B_IPCHI2_OWNPV']**2
all_df["sphere_radius_p_b"] =  all_df['piminus_IP_OWNPV']**2 + all_df['B_IPCHI2_OWNPV']**2

# ANGLE ratios
# all_df["b_eta"] = np.arccos(all_df["B_DIRA_OWNPV"])
# all_df["b_K_ratio"] = all_df["b_eta"] / all_df["Kplus_ETA"]
# all_df["b_p_ratio"] = all_df["b_eta"] / all_df["piminus_ETA"]

In [33]:
transformed_values = QuantileTransformer().fit_transform(all_df)
transformed_df = pd.DataFrame(transformed_values)

keep_cols = ["Id", "signal", "train"]

transformed_df.columns = [col if col in keep_cols else f"{col}_q" for col in all_df.columns]

transformed_df = transformed_df.drop(columns=keep_cols)

In [34]:
# full_df = pd.concat([all_df, transformed_df], axis=1)
full_df = pd.concat([all_df[keep_cols], transformed_df], axis=1)

In [35]:
train = full_df[full_df.train == 1].drop(columns=['train', 'Id'])
test = full_df[full_df.train != 1].drop(columns=['train', 'Id', 'signal'])

In [36]:
X_full = train.drop(columns="signal")
X_test = test.copy()
y_full = train.signal

X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, 
    y_full, 
    stratify=train.signal
)

In [37]:
lgb = LGBMClassifier(n_estimators=500)
lgb.fit(X_train, y_train)

LGBMClassifier(n_estimators=500)

In [38]:
pred_valid = lgb.predict_proba(X_valid)[:, 1]
pred_train = lgb.predict_proba(X_train)[:, 1]

In [39]:
roc_auc_score(y_valid, pred_valid)

# Only quantiles, 500 trees: 865 (cv 864)
# Only quantiles + distance ratio, 500 trees: 868 (cv 863)
# Only quantiles + many features, 500 trees: 871 (cv 870)
# Only quantiles + many features, 1500 trees: 872 (cv 870)
# Only quantiles + many features + special distance ratios, 500 trees: 882 (cv 877)
# Only quantiles + many features + special distance ratios + b_px + b_py, 500 trees: 8816 (cv 8778)
# Only quantiles + many features + special distance ratios + b_px + b_py + hbars, 500 trees: 883 (cv 8772)
# Only quantiles + many features + special distance ratios + b_px + b_py + hbars + hbar ratios, 500 trees: 879 (cv 8787)
# Only quantiles + many features + special distance ratios + b_px + b_py + hbars + hbar ratios + sphere radius, 500 trees: 883 (cv 877)
# Only quantiles, 1500 trees: 866 (cv 865)

0.8808428093634422

In [40]:
roc_auc_score(y_train, pred_train)

0.9283420118772603

In [41]:
cross_val_score(
    lgb, 
    X_train, 
    y_train, 
    scoring='roc_auc', 
    cv=3
).mean()


0.8777407219777199

In [18]:
lgb.fit(X_full, y_full)

LGBMClassifier(n_estimators=500)

In [19]:
test_predictions = lgb.predict_proba(X_test)[:, 1]

In [20]:
test_raw['Predicted'] = test_predictions

In [21]:
test_raw

Unnamed: 0,Id,B_OWNPV_CHI2,B_IPCHI2_OWNPV,B_FDCHI2_OWNPV,B_DIRA_OWNPV,B_PT,Kst_892_0_IP_OWNPV,Kst_892_0_cosThetaH,Kplus_IP_OWNPV,Kplus_P,piminus_IP_OWNPV,piminus_P,gamma_PT,piminus_ETA,Kplus_ETA,train,Predicted
0,0,20.173445,3.846368,120.871099,0.999868,6160.068877,0.283769,0.784221,0.221518,14938.796613,0.334843,18885.484755,5739.899165,3.395456,3.520709,0,0.025829
1,1,22.472050,3.463131,23290.089549,1.000000,4421.210474,2.289943,-0.755909,2.675985,27997.248030,1.512916,8658.765482,3511.829361,3.498786,3.427056,0,0.177569
2,2,13.758772,6.239888,2826.232845,0.999946,6271.957675,0.860737,0.321553,0.964498,8383.679923,0.846215,10310.459345,3211.637797,2.401201,2.224787,0,0.076474
3,3,39.320282,0.146056,992.465454,0.999998,7440.960902,0.485886,0.371864,0.457798,23595.091189,0.576270,13975.989410,5466.588206,3.203331,3.491705,0,0.361921
4,4,35.042408,3.912886,830.923177,0.999830,6120.895612,0.620750,0.325027,0.590861,6875.570461,0.721326,5065.980778,5250.449109,2.518374,2.190288,0,0.614534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141771,141771,30.533255,1.699120,247.567972,0.999888,8367.474105,0.221315,0.478428,0.300806,10682.845309,0.187368,25552.669086,6022.967681,3.105870,3.033990,0,0.016016
141772,141772,45.894135,0.304761,98.141535,0.999989,8777.501344,0.265711,-0.512062,0.241356,69159.089014,0.393407,10720.653365,5716.381079,3.508286,3.815329,0,0.436606
141773,141773,43.790756,2.437984,320.666560,0.999974,6736.548775,0.233608,0.073552,0.239403,37153.397051,0.238732,25575.359107,4168.692210,3.629445,3.312150,0,0.512434
141774,141774,43.101467,1.530856,3348.540676,0.999987,9328.262997,1.399666,0.435606,0.416667,21874.563688,1.604297,14153.166709,4692.972961,2.453287,2.540912,0,0.644057


In [22]:
test_raw[['Id', 'Predicted']].to_csv('../submissions/more_features_lgbm.csv', index=False)