In [1]:
import pandas as pd
import numpy as np
import utils
import re
import swifter
import scoring
import math
import catboost
from sklearn.model_selection import train_test_split
import lightgbm
from multiprocessing import Pool, Manager
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Labels for the flattened closest-hit features: four consecutive entries
# ([0]..[3]) for each of x, y, T, z, dx, dy, in that order.
closest_cols = np.empty(utils.N_FOI_FEATURES, dtype=object)
closest_labels = (
    "closest_{}[{}]".format(quantity, idx)
    for quantity in ('x', 'y', 'T', 'z', 'dx', 'dy')
    for idx in range(4)
)
for position, label in enumerate(closest_labels):
    closest_cols[position] = label

In [3]:
# Default worker count and chunk count used by the cells below.
num_cores = 80
num_splits = 80
def parallelize_df(df, func, num_cores, num_splits):
    """Apply ``func`` to row-wise chunks of ``df`` in a pool of worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is cut into ``num_splits`` consecutive row chunks.
    func : callable
        Takes one chunk and returns a pandas object. It is sent to the worker
        processes, so it must be picklable (e.g. a module-level function).
    num_cores : int
        Number of worker processes in the pool.
    num_splits : int
        Number of chunks ``df`` is split into.

    Returns
    -------
    The per-chunk results concatenated with ``pd.concat`` in chunk order.

    Raises
    ------
    Whatever ``func`` raises in a worker is re-raised here by ``pool.map``.
    """
    df_split = np.array_split(df, num_splits)
    print('Splitted')
    # The context manager shuts the workers down even when ``func`` raises;
    # previously a failure in ``pool.map`` skipped close()/join() and left
    # every worker process alive in the kernel.
    with Pool(num_cores) as pool:
        df_map = pool.map(func, df_split)
    print('Proccessed')
    df = pd.concat(df_map)
    print('Concatenated')
    return df
def closest_hits_df(df):
    """Run ``utils.find_closest_hit_per_station`` on every row of ``df``.

    The per-row results are expanded into columns, and the resulting
    positional column labels are replaced by the matching ``closest_cols``
    entries.
    """
    hits = df.apply(
        utils.find_closest_hit_per_station,
        axis=1,
        result_type="expand",
    )
    return hits.rename(columns=lambda position: closest_cols[position])

In [4]:
# Load the two halves of the training set, reporting each shape as it arrives.
train1 = pd.read_csv('data/train_part_1_v2.csv')
print(train1.shape)

train2 = pd.read_csv('data/train_part_2_v2.csv')
print(train2.shape)

# Stack the halves row-wise under a fresh 0..n-1 index.
train = pd.concat([train1, train2], axis=0, ignore_index=True)
print(train.shape)

(2722852, 80)
(2722853, 80)
(5445705, 80)


In [5]:
def re_array(str_):
    """Parse the text form of a 1-D numeric array back into a float ndarray.

    Newlines are removed, the text between the first ``[`` and the last ``]``
    is split on whitespace, and the pieces are converted to floats.

    Parameters
    ----------
    str_ : str
        Text such as ``"[1. 2.5\\n 3.]"``.

    Returns
    -------
    numpy.ndarray
        1-D float array; empty when the brackets enclose nothing.

    Raises
    ------
    AttributeError
        If ``str_`` contains no bracketed section (``re.search`` finds nothing).
    ValueError
        If a whitespace-separated piece is not a valid float.
    """
    # ``.*`` rather than ``.+``: "[]" must parse to an empty array instead of
    # failing to match, consistent with "[ ]" which already parsed as empty.
    inner = re.search(r'\[(.*)\]', str_.replace('\n', '')).group(1)
    return np.array(inner.split()).astype(float)
def re_array_df(df):
    """Convert every cell of ``df`` from array text to a float array.

    Each column is overwritten on ``df`` itself with the result of applying
    ``re_array`` to its values; the same (mutated) frame is returned.
    """
    for column_name in df.columns:
        parsed = df[column_name].apply(re_array)
        df[column_name] = parsed
    return df

In [6]:
%%time
train[utils.FOI_COLUMNS] = parallelize_df(train[utils.FOI_COLUMNS], re_array_df, num_cores, num_splits) 

Splitted
Proccessed
Concatenated
CPU times: user 57.5 s, sys: 29.6 s, total: 1min 27s
Wall time: 1min 29s


In [7]:
%%time
closest_hits_features = parallelize_df(train, closest_hits_df, num_cores, num_splits)

Splitted
Proccessed
Concatenated
CPU times: user 3min 19s, sys: 54.8 s, total: 4min 14s
Wall time: 4min 53s


In [8]:
# Simple per-track columns side by side with the closest-hit features.
simple_features = train[utils.SIMPLE_FEATURE_COLUMNS]
train_FOI = pd.concat([simple_features, closest_hits_features], axis=1)

In [9]:
# Names of the 4*4 + 23*4 + 8 = 116 engineered features, laid out as:
#   16 'd/cord' ratio columns (X, Y, Z, T for indices 0-3),
#   4 consecutive groups of 23 per-index columns,
#   8 trailing summary columns.
ratio_names = [
    'd/cord_{}[{}]'.format(coord, j)
    for coord in ('X', 'Y', 'Z', 'T')
    for j in range(4)
]

per_index_stems = (
    'l2', 'l3', 'dl2', 'dl3', 'dl2/l2', 'dl3/l3',
    's_x', 's_y', 's_dx', 's_dy',
    'math.sqrt(s_dl2_2)/s_l2', 's_l2', 'math.sqrt(s_dl2)', 's_dl2_2',
    'math.sqrt(s_dl2)/s_l2',
    'cl_diff_x', 'cl_diff_y', 'cl_T-T', 'cl_diff_l^2',
    'cl_diff_ex', 'cl_diff_ey', 'cl_diff_el^2',
    'R',
)
per_index_names = [
    '{}[{}]'.format(stem, i)
    for i in range(4)
    for stem in per_index_stems
]

summary_names = [
    'D3^2', 'D2^2', 'D1^2',
    'alpha_30_xy', 'alpha_30_x', 'alpha_30_y',
    'PN', "PT/P",
]

my_columns = np.array(ratio_names + per_index_names + summary_names, dtype=object)

In [10]:
def my_feauter_proc(row):
    """Compute the 116 engineered features for one row.

    Parameters
    ----------
    row : mapping indexable by column name (e.g. a pandas Series)
        Must provide, for every index 0-3, the ``MatchedHit_{X,Y,Z,T}``,
        ``MatchedHit_D{X,Y,Z,T}``, ``Lextra_{X,Y}``, ``Mextra_D{X,Y}2`` and
        ``closest_{x,y,T,z}`` entries, plus ``P`` and ``PT``.

    Returns
    -------
    numpy.ndarray
        float32 vector of length 4*4 + 23*4 + 8:
        slots 0-15 hold MatchedHit_D*/MatchedHit_* ratios, then one group of
        23 values per index 0-3, then 8 summary values.
    """
    X_SIZE = 6000.0*2
    Y_SIZE = 4820.0*2
    n_features = 4*4 + 23*4 + 8
    result = np.empty(n_features, np.float32)

    # Slots 0-15: MatchedHit_D<coord> divided by MatchedHit_<coord>.
    slot = 0
    for coord in ('X', 'Y', 'Z', 'T'):
        for j in range(4):
            spread = row['MatchedHit_D{}[{}]'.format(coord, j)]
            value = row['MatchedHit_{}[{}]'.format(coord, j)]
            result[slot] = spread/value
            slot += 1

    # Running totals of the three normalised squared distances.
    d1_total = 0
    d2_total = 0
    d3_total = 0

    # Nested windows, innermost first: (code, divisor of the full size).
    regions = ((1, 16), (2, 8), (3, 4), (4, 2))

    for i in range(4):
        tag = '[{}]'.format(i)

        x = row['MatchedHit_X' + tag]
        y = row['MatchedHit_Y' + tag]
        z = row['MatchedHit_Z' + tag]
        dx = row['MatchedHit_DX' + tag]
        dy = row['MatchedHit_DY' + tag]
        dz = row['MatchedHit_DZ' + tag]

        e_x = row['Lextra_X' + tag]
        e_y = row['Lextra_Y' + tag]
        e_dx = row['Mextra_DX2' + tag]
        e_dy = row['Mextra_DY2' + tag]

        cl_x = row['closest_x' + tag]
        cl_y = row['closest_y' + tag]
        cl_T = row['closest_T' + tag]
        cl_z = row['closest_z' + tag]  # looked up but not used below

        # Lengths of the (x, y) / (x, y, z) vectors and of their D* counterparts.
        planar_sq = x**2 + y**2
        l2 = math.sqrt(planar_sq)
        l3 = math.sqrt(planar_sq + z**2)
        planar_err_sq = dx**2 + dy**2
        dl2 = math.sqrt(planar_err_sq)
        dl3 = math.sqrt(planar_err_sq + dz**2)

        # Matched hit relative to the Lextra_* position; squared D* combined
        # with Mextra_D*2 by sum and by difference.
        s_x = x - e_x
        s_y = y - e_y
        s_dx = dx**2 + e_dx
        s_dy = dy**2 + e_dy
        s_l2 = math.sqrt(s_x**2 + s_y**2)
        s_dl2 = s_dx + s_dy
        s_dl2_2 = abs(dx**2 - e_dx) + abs(dy**2 - e_dy)

        # Closest hit relative to the matched hit and to the Lextra_* position.
        cl_diff_x = cl_x - x
        cl_diff_y = cl_y - y
        cl_diff_ex = cl_x - e_x
        cl_diff_ey = cl_y - e_y

        # Code of the innermost window containing (x, y); 0 when outside all.
        R = 0
        for code, shrink in regions:
            if abs(x) < X_SIZE/shrink and abs(y) < Y_SIZE/shrink:
                R = code
                break

        group = (
            l2,
            l3,
            dl2,
            dl3,
            dl2/l2,
            dl3/l3,
            s_x,
            s_y,
            s_dx,
            s_dy,
            math.sqrt(s_dl2_2)/s_l2,
            s_l2,
            math.sqrt(s_dl2),
            s_dl2_2,
            math.sqrt(s_dl2)/s_l2,
            cl_diff_x,
            cl_diff_y,
            cl_T - row['MatchedHit_T' + tag],
            cl_diff_x**2 + cl_diff_y**2,
            cl_diff_ex,
            cl_diff_ey,
            cl_diff_ex**2 + cl_diff_ey**2,
            R,
        )
        base = 16 + 23*i
        for offset, entry in enumerate(group):
            result[base + offset] = entry

        d1_total = d1_total + ((cl_diff_x/dx)**2 + (cl_diff_y/dy)**2)
        d3_total = d3_total + ((cl_diff_ex/dx)**2 + (cl_diff_ey/dy)**2)
        d2_total = d2_total + (cl_diff_ex**2/e_dx + cl_diff_ey**2/e_dy)

    first_x = row['MatchedHit_X[0]']
    first_y = row['MatchedHit_Y[0]']
    last_x = row['MatchedHit_X[3]']
    last_y = row['MatchedHit_Y[3]']

    # Last 8 slots: the three totals, changes between index 0 and index 3,
    # and two momentum combinations.
    summary = (
        d3_total,
        d2_total,
        d1_total,
        math.sqrt(last_x**2 + last_y**2)-math.sqrt(first_x**2 + first_y**2),
        last_x-first_x,
        last_y-first_y,
        row['P']**2-row['PT']**2,
        row['PT']/row['P'],
    )
    tail_start = n_features - len(summary)
    for offset, entry in enumerate(summary):
        result[tail_start + offset] = entry
    return result
def my_feauter_proc_df(df):
    """Apply ``my_feauter_proc`` to each row of ``df``.

    The per-row vectors are expanded into columns, and the positional column
    labels are replaced by the matching ``my_columns`` entries.
    """
    features = df.apply(my_feauter_proc, axis=1, result_type="expand")
    return features.rename(columns=lambda position: my_columns[position])

In [11]:
%%time
my_feat = parallelize_df(train_FOI, my_feauter_proc_df, num_cores=num_cores, num_splits=num_splits)

Splitted
Proccessed
Concatenated
CPU times: user 7.54 s, sys: 38.4 s, total: 46 s
Wall time: 3min 27s


In [12]:
# Final training matrix: base + closest-hit columns followed by the engineered ones.
feature_frames = [train_FOI, my_feat]
train_concat = pd.concat(feature_frames, axis=1)
print(train_concat.shape)
train_concat.head(2)

(5445705, 205)


Unnamed: 0,ncl[0],ncl[1],ncl[2],ncl[3],avg_cs[0],avg_cs[1],avg_cs[2],avg_cs[3],ndof,MatchedHit_TYPE[0],...,cl_diff_el^2[3],R[3],D3^2,D2^2,D1^2,alpha_30_xy,alpha_30_x,alpha_30_y,PN,PT/P
0,47,31,13,15,2.0,1.580645,1.153846,1.133333,8,2,...,1456185000.0,4.0,131185.03125,396076.25,132652.28125,974.040405,-924.090027,327.719513,156827024.0,0.139555
1,92,19,11,26,2.75,2.789474,1.363636,1.230769,8,2,...,3378341000.0,4.0,12057.380859,4563225.0,12211.955078,548.41333,19.17,564.381104,795129472.0,0.157977


# For C++

In [32]:
# CatBoost dataset over the full training matrix; the sample weights are the
# absolute values of the `weight` column.
cat_train_cpp = catboost.Pool(
    data=train_concat,
    label=train['label'],
    weight=train['weight'].abs(),
)

In [31]:
# CatBoost training settings passed to catboost.CatBoost below.
params = dict(
    iterations=700,
    loss_function='Logloss',
    learning_rate=0.1,
    max_depth=8,
    thread_count=80,
    logging_level='Verbose',
)

In [33]:
# Train on the full dataset (no validation split is passed to fit).
# NOTE(review): the log saved with this cell runs past iteration 1000 while
# `params` sets iterations=700, so that output came from a different setting —
# re-run to refresh it, or confirm which value is intended.
cat_cpp = catboost.CatBoost(params)
cat_cpp.fit(cat_train_cpp)

0:	learn: 0.6348357	total: 943ms	remaining: 18m 50s
1:	learn: 0.5931721	total: 2.15s	remaining: 21m 28s
2:	learn: 0.5651379	total: 3.14s	remaining: 20m 52s
3:	learn: 0.5462601	total: 4.54s	remaining: 22m 37s
4:	learn: 0.5308679	total: 5.39s	remaining: 21m 27s
5:	learn: 0.5201343	total: 6.22s	remaining: 20m 38s
6:	learn: 0.5113613	total: 7.41s	remaining: 21m 2s
7:	learn: 0.5048033	total: 8.57s	remaining: 21m 16s
8:	learn: 0.4999846	total: 9.4s	remaining: 20m 43s
9:	learn: 0.4956317	total: 10.6s	remaining: 21m 1s
10:	learn: 0.4922384	total: 11.4s	remaining: 20m 32s
11:	learn: 0.4894429	total: 12.8s	remaining: 21m 3s
12:	learn: 0.4865601	total: 13.6s	remaining: 20m 44s
13:	learn: 0.4843079	total: 14.4s	remaining: 20m 22s
14:	learn: 0.4819072	total: 15.6s	remaining: 20m 34s
15:	learn: 0.4802393	total: 16.4s	remaining: 20m 16s
16:	learn: 0.4782949	total: 17.6s	remaining: 20m 24s
17:	learn: 0.4766064	total: 18.4s	remaining: 20m 6s
18:	learn: 0.4753144	total: 19.1s	remaining: 19m 49s
19:	lear

154:	learn: 0.4305371	total: 2m 41s	remaining: 18m 7s
155:	learn: 0.4302796	total: 2m 42s	remaining: 18m 8s
156:	learn: 0.4300827	total: 2m 43s	remaining: 18m 6s
157:	learn: 0.4299422	total: 2m 44s	remaining: 18m 6s
158:	learn: 0.4296415	total: 2m 45s	remaining: 18m 5s
159:	learn: 0.4294716	total: 2m 47s	remaining: 18m 7s
160:	learn: 0.4292620	total: 2m 48s	remaining: 18m 8s
161:	learn: 0.4290400	total: 2m 49s	remaining: 18m 6s
162:	learn: 0.4287623	total: 2m 50s	remaining: 18m 7s
163:	learn: 0.4285358	total: 2m 51s	remaining: 18m 5s
164:	learn: 0.4283299	total: 2m 53s	remaining: 18m 5s
165:	learn: 0.4281544	total: 2m 54s	remaining: 18m 5s
166:	learn: 0.4279639	total: 2m 55s	remaining: 18m 3s
167:	learn: 0.4278496	total: 2m 56s	remaining: 18m 2s
168:	learn: 0.4276994	total: 2m 57s	remaining: 18m 2s
169:	learn: 0.4275684	total: 2m 58s	remaining: 18m
170:	learn: 0.4274317	total: 2m 59s	remaining: 18m 1s
171:	learn: 0.4272440	total: 3m	remaining: 17m 59s
172:	learn: 0.4271324	total: 3m 1s

305:	learn: 0.4079233	total: 5m 18s	remaining: 15m 30s
306:	learn: 0.4078235	total: 5m 19s	remaining: 15m 29s
307:	learn: 0.4076785	total: 5m 20s	remaining: 15m 27s
308:	learn: 0.4075232	total: 5m 21s	remaining: 15m 26s
309:	learn: 0.4073522	total: 5m 22s	remaining: 15m 25s
310:	learn: 0.4072417	total: 5m 23s	remaining: 15m 23s
311:	learn: 0.4070586	total: 5m 24s	remaining: 15m 22s
312:	learn: 0.4069350	total: 5m 24s	remaining: 15m 20s
313:	learn: 0.4067882	total: 5m 26s	remaining: 15m 20s
314:	learn: 0.4066958	total: 5m 26s	remaining: 15m 18s
315:	learn: 0.4065320	total: 5m 27s	remaining: 15m 17s
316:	learn: 0.4064366	total: 5m 28s	remaining: 15m 16s
317:	learn: 0.4062291	total: 5m 30s	remaining: 15m 15s
318:	learn: 0.4061169	total: 5m 31s	remaining: 15m 14s
319:	learn: 0.4060214	total: 5m 32s	remaining: 15m 13s
320:	learn: 0.4058700	total: 5m 32s	remaining: 15m 11s
321:	learn: 0.4057366	total: 5m 33s	remaining: 15m 9s
322:	learn: 0.4056327	total: 5m 34s	remaining: 15m 8s
323:	learn: 

455:	learn: 0.3904247	total: 7m 55s	remaining: 12m 55s
456:	learn: 0.3903442	total: 7m 56s	remaining: 12m 54s
457:	learn: 0.3902501	total: 7m 57s	remaining: 12m 53s
458:	learn: 0.3901600	total: 7m 58s	remaining: 12m 52s
459:	learn: 0.3900759	total: 7m 59s	remaining: 12m 51s
460:	learn: 0.3898954	total: 8m	remaining: 12m 50s
461:	learn: 0.3897701	total: 8m 1s	remaining: 12m 49s
462:	learn: 0.3896780	total: 8m 2s	remaining: 12m 48s
463:	learn: 0.3895971	total: 8m 3s	remaining: 12m 46s
464:	learn: 0.3894572	total: 8m 4s	remaining: 12m 45s
465:	learn: 0.3893367	total: 8m 5s	remaining: 12m 44s
466:	learn: 0.3892131	total: 8m 6s	remaining: 12m 43s
467:	learn: 0.3891410	total: 8m 7s	remaining: 12m 42s
468:	learn: 0.3890729	total: 8m 8s	remaining: 12m 41s
469:	learn: 0.3889913	total: 8m 9s	remaining: 12m 40s
470:	learn: 0.3889096	total: 8m 10s	remaining: 12m 38s
471:	learn: 0.3888140	total: 8m 11s	remaining: 12m 37s
472:	learn: 0.3887027	total: 8m 12s	remaining: 12m 36s
473:	learn: 0.3885915	t

605:	learn: 0.3758132	total: 10m 22s	remaining: 10m 10s
606:	learn: 0.3757503	total: 10m 23s	remaining: 10m 9s
607:	learn: 0.3756542	total: 10m 24s	remaining: 10m 8s
608:	learn: 0.3755649	total: 10m 26s	remaining: 10m 7s
609:	learn: 0.3755070	total: 10m 26s	remaining: 10m 6s
610:	learn: 0.3754066	total: 10m 27s	remaining: 10m 5s
611:	learn: 0.3753101	total: 10m 28s	remaining: 10m 4s
612:	learn: 0.3752364	total: 10m 29s	remaining: 10m 2s
613:	learn: 0.3751024	total: 10m 30s	remaining: 10m 1s
614:	learn: 0.3750032	total: 10m 31s	remaining: 10m
615:	learn: 0.3748822	total: 10m 33s	remaining: 10m
616:	learn: 0.3748039	total: 10m 33s	remaining: 9m 58s
617:	learn: 0.3746991	total: 10m 34s	remaining: 9m 57s
618:	learn: 0.3745862	total: 10m 35s	remaining: 9m 56s
619:	learn: 0.3744869	total: 10m 36s	remaining: 9m 55s
620:	learn: 0.3744345	total: 10m 37s	remaining: 9m 54s
621:	learn: 0.3743314	total: 10m 38s	remaining: 9m 53s
622:	learn: 0.3742151	total: 10m 40s	remaining: 9m 52s
623:	learn: 0.3

755:	learn: 0.3625208	total: 12m 50s	remaining: 7m 32s
756:	learn: 0.3624648	total: 12m 50s	remaining: 7m 31s
757:	learn: 0.3623847	total: 12m 52s	remaining: 7m 30s
758:	learn: 0.3623158	total: 12m 52s	remaining: 7m 29s
759:	learn: 0.3622560	total: 12m 53s	remaining: 7m 28s
760:	learn: 0.3621719	total: 12m 54s	remaining: 7m 26s
761:	learn: 0.3621029	total: 12m 56s	remaining: 7m 26s
762:	learn: 0.3620315	total: 12m 56s	remaining: 7m 24s
763:	learn: 0.3619411	total: 12m 57s	remaining: 7m 23s
764:	learn: 0.3618577	total: 12m 58s	remaining: 7m 22s
765:	learn: 0.3617687	total: 12m 59s	remaining: 7m 21s
766:	learn: 0.3616793	total: 13m	remaining: 7m 20s
767:	learn: 0.3616078	total: 13m 1s	remaining: 7m 19s
768:	learn: 0.3615370	total: 13m 2s	remaining: 7m 18s
769:	learn: 0.3614603	total: 13m 3s	remaining: 7m 17s
770:	learn: 0.3613813	total: 13m 4s	remaining: 7m 16s
771:	learn: 0.3613184	total: 13m 5s	remaining: 7m 15s
772:	learn: 0.3612348	total: 13m 6s	remaining: 7m 14s
773:	learn: 0.361140

906:	learn: 0.3509162	total: 15m 16s	remaining: 4m 55s
907:	learn: 0.3508538	total: 15m 16s	remaining: 4m 54s
908:	learn: 0.3507652	total: 15m 18s	remaining: 4m 53s
909:	learn: 0.3506982	total: 15m 19s	remaining: 4m 53s
910:	learn: 0.3506472	total: 15m 20s	remaining: 4m 51s
911:	learn: 0.3505618	total: 15m 21s	remaining: 4m 50s
912:	learn: 0.3504837	total: 15m 22s	remaining: 4m 49s
913:	learn: 0.3504210	total: 15m 23s	remaining: 4m 48s
914:	learn: 0.3503280	total: 15m 24s	remaining: 4m 47s
915:	learn: 0.3502572	total: 15m 25s	remaining: 4m 46s
916:	learn: 0.3502046	total: 15m 26s	remaining: 4m 45s
917:	learn: 0.3501259	total: 15m 27s	remaining: 4m 44s
918:	learn: 0.3500477	total: 15m 28s	remaining: 4m 43s
919:	learn: 0.3499829	total: 15m 29s	remaining: 4m 42s
920:	learn: 0.3499126	total: 15m 30s	remaining: 4m 41s
921:	learn: 0.3498440	total: 15m 31s	remaining: 4m 40s
922:	learn: 0.3497774	total: 15m 31s	remaining: 4m 39s
923:	learn: 0.3497136	total: 15m 32s	remaining: 4m 38s
924:	learn

1055:	learn: 0.3405662	total: 17m 41s	remaining: 2m 24s
1056:	learn: 0.3404954	total: 17m 43s	remaining: 2m 23s
1057:	learn: 0.3404270	total: 17m 43s	remaining: 2m 22s
1058:	learn: 0.3403823	total: 17m 45s	remaining: 2m 21s
1059:	learn: 0.3403005	total: 17m 45s	remaining: 2m 20s
1060:	learn: 0.3402423	total: 17m 46s	remaining: 2m 19s
1061:	learn: 0.3401787	total: 17m 47s	remaining: 2m 18s
1062:	learn: 0.3401443	total: 17m 48s	remaining: 2m 17s
1063:	learn: 0.3401022	total: 17m 49s	remaining: 2m 16s
1064:	learn: 0.3400393	total: 17m 50s	remaining: 2m 15s
1065:	learn: 0.3399769	total: 17m 51s	remaining: 2m 14s
1066:	learn: 0.3399421	total: 17m 52s	remaining: 2m 13s
1067:	learn: 0.3398736	total: 17m 53s	remaining: 2m 12s
1068:	learn: 0.3398038	total: 17m 54s	remaining: 2m 11s
1069:	learn: 0.3397293	total: 17m 54s	remaining: 2m 10s
1070:	learn: 0.3396835	total: 17m 55s	remaining: 2m 9s
1071:	learn: 0.3396220	total: 17m 56s	remaining: 2m 8s
1072:	learn: 0.3395468	total: 17m 57s	remaining: 2

<catboost.core.CatBoost at 0x7fe5cd4062b0>

In [34]:
# Persist the trained model in CatBoost's binary format.
cat_cpp.save_model('models/cat_full_cpp.cbm', format='cbm')