# calculate features

In [1]:
import pandas as pd
import numpy as np
import gc
import os

In [4]:
# Root folder of the PLAsTiCC-2018 data. Set the PLASTICC_DATA_DIR environment variable
# to point the notebook at another location without editing this cell.
# The fallback is a raw string, so every `\\` below is two literal backslash characters.
# NOTE(review): presumably tolerated as repeated separators on Windows -- confirm.
shared_data_path = os.environ.get(
    'PLASTICC_DATA_DIR',
    r'C:\\Users\\CWinsor\\Documents\\code_kaggle_plasticc___shared_data\\PLAsTiCC-2018')

In [5]:
# Load the training light curves and the per-object training metadata.
# The compact dtypes below are applied to the light-curve table only;
# the metadata table is read with pandas' inferred dtypes.
col_dict = {
    'mjd': np.float64,
    'flux': np.float32,
    'flux_err': np.float32,
    'object_id': np.int32,
    'passband': np.int8,
    'detected': np.int8,
}
train_meta = pd.read_csv(
    os.path.join(shared_data_path, 'training_set_metadata.csv'),
)
train = pd.read_csv(
    os.path.join(shared_data_path, 'training_set.csv'),
    dtype=col_dict,
)

In [6]:
def calc_aggs(all_data, exact):
    """Compute per-object flux summary features, one column per statistic and passband.

    For every (object_id, passband) pair the flux mean, standard deviation,
    maximum and minimum are computed, together with approximate 0.25 and 0.75
    quantiles. A quantile is the flux value found at sorted position
    ``int(group_size * q)`` inside the group (no interpolation).

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain the columns 'object_id', 'passband' and 'flux'.
        The frame is not modified.
    exact : bool
        Currently ignored: nothing in this function reads it, so every value
        gives the same result. Kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by object_id, with columns named '<stat>_<passband>' followed by
        'q_<q>_<passband>_quantile'.
    """
    group_keys = ['object_id', 'passband']

    # aggregate features
    band_aggs = all_data.groupby(group_keys)['flux'].agg(['mean', 'std', 'max', 'min']).unstack(-1)
    # Label each column from its own (statistic, passband) tuple so the name always
    # matches the data underneath it, whatever order the MultiIndex levels are stored in.
    band_aggs.columns = ['{}_{}'.format(stat, band) for stat, band in band_aggs.columns]

    # Work on a sorted three-column copy; the caller's frame keeps its row order and columns.
    ranked = all_data[group_keys + ['flux']].sort_values(group_keys + ['flux'])
    by_band = ranked.groupby(group_keys)
    # this way of calculating quantiles is faster than using the pandas quantile builtin on the groupby object
    rank_in_group = by_band.cumcount()
    group_size = by_band['flux'].transform('size')

    q_list = [0.25, 0.75]
    q_cols = ['q_' + str(q) for q in q_list]
    for q, q_col in zip(q_list, q_cols):
        # Positional boolean mask: True on the single row per group whose rank is int(size * q).
        hit = ((group_size * q).astype(int) == rank_in_group).values
        # Keep flux on the selected row and NaN elsewhere; the group max below then returns it.
        ranked[q_col] = ranked['flux'].where(hit)

    quantiles = ranked.groupby(group_keys)[q_cols].max().unstack(-1)
    quantiles.columns = ['{}_{}_quantile'.format(name, band) for name, band in quantiles.columns]

    return pd.concat([band_aggs, quantiles], axis=1)

In [9]:
# calculate features
# First pass over the training light curves. A copy is handed over so that
# `train` keeps its original row order and columns.
new_data_exact = calc_aggs(all_data=train.copy(), exact=True)

In [11]:
# Inspect the column dtypes and memory footprint of the loaded training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421705 entries, 0 to 1421704
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   object_id  1421705 non-null  int32  
 1   mjd        1421705 non-null  float64
 2   passband   1421705 non-null  int8   
 3   flux       1421705 non-null  float32
 4   flux_err   1421705 non-null  float32
 5   detected   1421705 non-null  int8   
dtypes: float32(2), float64(1), int32(1), int8(2)
memory usage: 29.8 MB


In [10]:
# Peek at the first four light-curve rows.
train.head(4)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1


In [12]:
# Preview the feature rows of the first two objects.
new_data_exact.head(2)

Unnamed: 0_level_0,mean_0,mean_1,mean_2,mean_3,mean_4,mean_5,std_0,std_1,std_2,std_3,...,q_0.25_2_quantile,q_0.25_3_quantile,q_0.25_4_quantile,q_0.25_5_quantile,q_0.75_0_quantile,q_0.75_1_quantile,q_0.75_2_quantile,q_0.75_3_quantile,q_0.75_4_quantile,q_0.75_5_quantile
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,-3.254555,-385.699921,-134.146561,-121.1035,-55.954594,-47.449848,83.944735,601.787302,455.121346,335.425053,...,-555.853943,-455.588196,-347.090027,-354.07428,82.168922,129.541901,341.057709,208.770279,235.489929,256.966217
713,-2.720398,-1.019804,-0.794238,-0.986966,-0.900262,-1.794175,7.113509,5.712334,5.770738,6.450413,...,-5.760825,-6.185159,-6.208874,-7.436915,2.671449,3.302556,3.984369,4.768611,4.485665,3.404367


In [8]:
# List the generated feature column names.
new_data_exact.columns

Index(['mean_0', 'mean_1', 'mean_2', 'mean_3', 'mean_4', 'mean_5', 'std_0',
       'std_1', 'std_2', 'std_3', 'std_4', 'std_5', 'max_0', 'max_1', 'max_2',
       'max_3', 'max_4', 'max_5', 'min_0', 'min_1', 'min_2', 'min_3', 'min_4',
       'min_5', 'q_0.25_0_quantile', 'q_0.25_1_quantile', 'q_0.25_2_quantile',
       'q_0.25_3_quantile', 'q_0.25_4_quantile', 'q_0.25_5_quantile',
       'q_0.75_0_quantile', 'q_0.75_1_quantile', 'q_0.75_2_quantile',
       'q_0.75_3_quantile', 'q_0.75_4_quantile', 'q_0.75_5_quantile'],
      dtype='object')

In [9]:
# Show the index of the feature table (named object_id).
new_data_exact.index

Int64Index([      615,       713,       730,       745,      1124,      1227,
                 1598,      1632,      1920,      1926,
            ...
            130684460, 130695262, 130698059, 130716752, 130727624, 130739978,
            130755807, 130762946, 130772921, 130779836],
           dtype='int64', name='object_id', length=7848)

In [10]:
# get the metadata
test_meta = pd.read_csv(os.path.join(shared_data_path, 'test_set_metadata.csv'))
# Stack train and test metadata into one frame. ignore_index=True already gives the
# result a fresh 0..n-1 index, so no reset_index()/drop('index') round trip is needed.
# sort=True orders the columns alphabetically.
all_meta = pd.concat([train_meta, test_meta], axis=0, ignore_index=True, sort=True)
# Number of test-set chunk files; also used in the name of the folder they are read from.
n_chunks = 100


In [11]:
# calculate features
# NOTE: the two calc_aggs calls differ only in `exact`, and calc_aggs as defined in this
# notebook never reads that argument, so both feature tables come out identical.
new_data_exact = calc_aggs(all_data=train.copy(), exact=True)
new_data_approx = calc_aggs(all_data=train.copy(), exact=False)
# Left joins keep every metadata row; features are matched by comparing the
# metadata's object_id column with the feature table's index.
train_meta_exact = train_meta.merge(
    new_data_exact, how='left', left_on='object_id', right_index=True)
train_meta_approx = train_meta.merge(
    new_data_approx, how='left', left_on='object_id', right_index=True)


In [12]:
# Display the training feature table (the notebook renders a truncated view).
new_data_exact

Unnamed: 0_level_0,mean_0,mean_1,mean_2,mean_3,mean_4,mean_5,std_0,std_1,std_2,std_3,...,q_0.25_2_quantile,q_0.25_3_quantile,q_0.25_4_quantile,q_0.25_5_quantile,q_0.75_0_quantile,q_0.75_1_quantile,q_0.75_2_quantile,q_0.75_3_quantile,q_0.75_4_quantile,q_0.75_5_quantile
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,-3.254555,-385.699921,-134.146561,-121.103500,-55.954594,-47.449848,83.944733,601.787292,455.121338,335.425049,...,-555.853943,-455.588196,-347.090027,-354.074280,82.168922,129.541901,341.057709,208.770279,235.489929,256.966217
713,-2.720398,-1.019804,-0.794238,-0.986966,-0.900262,-1.794175,7.113509,5.712334,5.770738,6.450413,...,-5.760825,-6.185159,-6.208874,-7.436915,2.671449,3.302556,3.984369,4.768611,4.485665,3.404367
730,-0.048080,0.141057,2.400870,3.236164,4.308728,4.539396,1.828872,1.807229,5.559483,8.191987,...,-0.465229,-0.846692,-1.036243,-3.150459,1.228119,0.973025,2.330264,3.270966,4.244992,8.474236
745,1.797523,5.717394,9.711532,14.412925,13.134436,10.746138,4.374445,25.964659,31.957998,34.967697,...,-0.589270,-0.343889,-0.577881,-2.357999,2.605217,3.482585,8.645923,14.269365,13.113527,12.491076
1124,0.660948,4.634637,10.243968,11.086555,9.906102,6.896741,2.360085,8.107525,21.319853,26.270649,...,0.056974,-0.260505,-0.013201,-3.416412,2.230999,6.302905,12.010501,8.794269,7.323390,6.814576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130739978,1.286655,11.604012,1.487066,4.696556,-1.326906,20.073462,15.998517,42.103638,9.781001,9.132244,...,-3.305224,-0.348405,-10.791011,-22.310572,6.918293,1.702036,3.295825,8.434893,9.429575,33.716682
130755807,24.635242,6.965825,-0.779640,27.147503,40.758274,3.528390,79.713936,16.268583,4.606559,88.177979,...,-4.309475,-1.818439,-3.739179,-23.535288,20.451754,6.636871,3.179184,16.398903,18.921553,19.697828
130762946,-0.282914,-20.094296,-28.388796,-13.723449,-15.201844,-9.838346,46.605186,23.572412,29.855715,31.491709,...,-41.514149,-32.555824,-30.323326,-37.571419,21.228121,-7.865635,-1.860571,3.131885,2.509389,7.717197
130772921,3.374208,28.342249,0.618994,-0.357920,-1.135067,7.671389,14.420201,97.513710,6.550608,7.667433,...,-2.612158,-3.832796,-8.072168,-13.994596,8.315109,0.980240,2.096250,2.246015,7.432199,23.434837


In [50]:
# Display the training metadata with the feature columns joined on.
train_meta_exact

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,q_0.25_2_quantile,q_0.25_3_quantile,q_0.25_4_quantile,q_0.25_5_quantile,q_0.75_0_quantile,q_0.75_1_quantile,q_0.75_2_quantile,q_0.75_3_quantile,q_0.75_4_quantile,q_0.75_5_quantile
0,615,349.046051,-61.943836,320.796530,-51.753706,1,0.0000,0.0000,0.0000,,...,-555.853943,-455.588196,-347.090027,-354.074280,82.168922,129.541901,341.057709,208.770279,235.489929,256.966217
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,-5.760825,-6.185159,-6.208874,-7.436915,2.671449,3.302556,3.984369,4.768611,4.485665,3.404367
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.2320,0.2262,0.0157,40.2561,...,-0.465229,-0.846692,-1.036243,-3.150459,1.228119,0.973025,2.330264,3.270966,4.244992,8.474236
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,-0.589270,-0.343889,-0.577881,-2.357999,2.605217,3.482585,8.645923,14.269365,13.113527,12.491076
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,0.056974,-0.260505,-0.013201,-3.416412,2.230999,6.302905,12.010501,8.794269,7.323390,6.814576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7843,130739978,26.718750,-14.940303,172.342697,-72.255675,0,0.0000,0.0000,0.0000,,...,-3.305224,-0.348405,-10.791011,-22.310572,6.918293,1.702036,3.295825,8.434893,9.429575,33.716682
7844,130755807,120.101349,-62.696659,275.742955,-16.509746,0,0.1725,2.5606,1.1146,46.6108,...,-4.309475,-1.818439,-3.739179,-23.535288,20.451754,6.636871,3.179184,16.398903,18.921553,19.697828
7845,130762946,203.108109,-55.682144,308.728904,6.727511,0,0.0000,0.0000,0.0000,,...,-41.514149,-32.555824,-30.323326,-37.571419,21.228121,-7.865635,-1.860571,3.131885,2.509389,7.717197
7846,130772921,79.101562,-35.501846,239.172243,-33.827844,0,0.0000,0.0000,0.0000,,...,-2.612158,-3.832796,-8.072168,-13.994596,8.315109,0.980240,2.096250,2.246015,7.432199,23.434837


In [53]:
# process training set first
# NOTE(review): the original note said this entry is "not actually used, just to get right
# shape of dataframe", but it is part of new_data_arr and therefore of the concat below.
new_data_arr = [calc_aggs(train.copy(), True)]
# process test set, one chunk file at a time
chunk_dir = os.path.join(shared_data_path, 'split_{}'.format(n_chunks))
for i in range(n_chunks):
    df = pd.read_hdf(os.path.join(chunk_dir, 'chunk_{}.hdf5'.format(i)), key='file0')
    # drop() returns a new frame, so no extra copy is needed before computing features
    df = df.drop('index', axis=1)
    print('Read chunk {}'.format(i))
    new_data_arr.append(calc_aggs(df, True))
    print('Calculated features for chunk {}'.format(i))
    # Release the chunk before the next one is read; doing it here also avoids a
    # NameError on `del df` when n_chunks is 0 and the loop body never runs.
    del df
gc.collect()
# sort=True orders the feature columns alphabetically in the stacked result
new_data = pd.concat(new_data_arr, axis=0, sort=True)


Read chunk 0
Calculated features for chunk 0
Read chunk 1
Calculated features for chunk 1
Read chunk 2
Calculated features for chunk 2
Read chunk 3
Calculated features for chunk 3
Read chunk 4
Calculated features for chunk 4
Read chunk 5
Calculated features for chunk 5
Read chunk 6
Calculated features for chunk 6
Read chunk 7
Calculated features for chunk 7
Read chunk 8
Calculated features for chunk 8
Read chunk 9
Calculated features for chunk 9
Read chunk 10
Calculated features for chunk 10
Read chunk 11
Calculated features for chunk 11
Read chunk 12
Calculated features for chunk 12
Read chunk 13
Calculated features for chunk 13
Read chunk 14
Calculated features for chunk 14
Read chunk 15
Calculated features for chunk 15
Read chunk 16
Calculated features for chunk 16
Read chunk 17
Calculated features for chunk 17
Read chunk 18
Calculated features for chunk 18
Read chunk 19
Calculated features for chunk 19
Read chunk 20
Calculated features for chunk 20
Read chunk 21
Calculated features

In [54]:
# Display the list of feature tables (training set first, then one per chunk).
new_data_arr

[               mean_0      mean_1       mean_2      mean_3       mean_4  \
 object_id                                                                 
 615         -3.254555 -385.699921  -134.146561 -121.103500   -55.954594   
 713         -2.720398   -1.019804    -0.794238   -0.986966    -0.900262   
 730         -0.048080    0.141057     2.400870    3.236164     4.308728   
 745          1.797523    5.717394     9.711532   14.412925    13.134436   
 1124         0.660948    4.634637    10.243968   11.086555     9.906102   
 ...               ...         ...          ...         ...          ...   
 130739978    1.286655   11.604012     1.487066    4.696556    -1.326906   
 130755807   24.635242    6.965825    -0.779640   27.147503    40.758274   
 130762946   -0.282914  -20.094296   -28.388796  -13.723449   -15.201844   
 130772921    3.374208   28.342249     0.618994   -0.357920    -1.135067   
 130779836  812.700928  725.169861  2690.360352  755.993896  3489.185303   
 
          

In [55]:
# Display the concatenated feature table.
new_data

Unnamed: 0_level_0,max_0,max_1,max_2,max_3,max_4,max_5,mean_0,mean_1,mean_2,mean_3,...,q_0.75_2_quantile,q_0.75_3_quantile,q_0.75_4_quantile,q_0.75_5_quantile,std_0,std_1,std_2,std_3,std_4,std_5
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,125.182808,660.626343,611.984558,445.737061,381.953735,378.188141,-3.254555,-385.699921,-134.146561,-121.103500,...,341.057709,208.770279,235.489929,256.966217,83.944733,601.787292,455.121338,335.425049,291.803436,294.779510
713,14.509829,9.129021,10.529041,11.330316,9.827934,14.770886,-2.720398,-1.019804,-0.794238,-0.986966,...,3.984369,4.768611,4.485665,3.404367,7.113509,5.712334,5.770738,6.450413,6.406989,7.094073
730,5.942166,5.693109,20.994711,33.572102,41.159981,47.310059,-0.048080,0.141057,2.400870,3.236164,...,2.330264,3.270966,4.244992,8.474236,1.828872,1.807229,5.559483,8.191987,10.710344,13.332758
745,18.014029,192.244293,220.795212,203.250702,183.633118,141.513290,1.797523,5.717394,9.711532,14.412925,...,8.645923,14.269365,13.113527,12.491076,4.374445,25.964659,31.957998,34.967697,33.069054,26.060129
1124,5.330927,37.170177,106.671692,139.818405,143.600189,109.157585,0.660948,4.634637,10.243968,11.086555,...,12.010501,8.794269,7.323390,6.814576,2.360085,8.107525,21.319853,26.270649,26.865913,21.434628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32267,7.360654,4.601959,7.677400,7.962985,9.225177,17.060766,0.597122,0.246621,0.990978,0.879636,...,1.625979,3.302037,3.392185,4.677769,2.485040,1.896895,2.112185,3.118261,3.830864,7.055592
32269,5.101140,2.842719,5.854305,14.855160,13.826154,14.226738,-0.006226,-0.200515,0.505341,1.953696,...,1.400674,3.383695,4.235683,6.760887,2.187066,1.325965,1.785630,3.554424,3.976609,6.299774
32284,5.162250,3.702271,4.810881,3.743448,5.997475,13.626250,-0.215683,0.085841,0.314429,-0.155031,...,0.766787,1.416236,1.404186,2.279009,2.018510,1.392899,1.328564,1.794698,1.977529,4.783540
32293,25.503233,82.557831,121.372177,110.651031,107.269577,90.963181,1.584891,4.560369,9.978071,11.740754,...,5.060884,9.078282,14.541861,14.206085,6.142158,15.125955,27.793428,25.644709,26.822813,25.282307


In [56]:
# merge
# Attach the features to the combined metadata with a left join: metadata rows are all
# kept and matched by object_id against the feature table's index.
# NOTE: the result is bound back to all_meta, so running this cell a second time merges
# the features again onto a frame that already has them.
all_meta = all_meta.merge(new_data, how='left', left_on='object_id', right_index=True)


In [57]:
# Display the combined metadata with the feature columns joined on.
all_meta

Unnamed: 0,ddf,decl,distmod,gal_b,gal_l,hostgal_photoz,hostgal_photoz_err,hostgal_specz,mwebv,object_id,...,q_0.75_2_quantile,q_0.75_3_quantile,q_0.75_4_quantile,q_0.75_5_quantile,std_0,std_1,std_2,std_3,std_4,std_5
0,1,-61.943836,,-51.753706,320.796530,0.0000,0.0000,0.0000,0.017,615,...,341.057709,208.770279,235.489929,256.966217,83.944733,601.787292,455.121338,335.425049,291.803436,294.779510
1,1,-27.784405,45.4063,-54.460748,223.525509,1.6267,0.2552,1.8181,0.007,713,...,3.984369,4.768611,4.485665,3.404367,7.113509,5.712334,5.770738,6.450413,6.406989,7.094073
2,1,-6.579593,40.2561,-61.548219,170.455585,0.2262,0.0157,0.2320,0.021,730,...,2.330264,3.270966,4.244992,8.474236,1.828872,1.807229,5.559483,8.191987,10.710344,13.332758
3,1,-45.586655,40.7951,-68.969298,328.254458,0.2813,1.1523,0.3037,0.007,745,...,8.645923,14.269365,13.113527,12.491076,4.374445,25.964659,31.957998,34.967697,33.069054,26.060129
4,1,-63.823658,40.4166,-51.059403,316.922299,0.2415,0.0176,0.1934,0.024,1124,...,12.010501,8.794269,7.323390,6.814576,2.360085,8.107525,21.319853,26.270649,26.865913,21.434628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3500733,0,-23.806295,41.9836,-40.940434,221.904509,0.4493,0.9954,,0.036,130787966,...,,,,,,,,,,
3500734,0,-32.974850,43.0419,-17.528223,241.585054,0.6729,0.0614,,0.083,130787971,...,,,,,,,,,,
3500735,0,-21.542267,38.7604,15.069447,247.349359,0.1211,0.0093,,0.136,130787974,...,,,,,,,,,,
3500736,0,-0.895283,41.8625,61.327851,316.152852,0.4287,0.2616,,0.028,130788053,...,,,,,,,,,,


In [60]:
# write output
dir_name = 'features'
out_dir = os.path.join(shared_data_path, dir_name)
# exist_ok=True makes this a no-op when the folder is already there, replacing the
# separate exists()/mkdir() check; makedirs also creates missing parent folders.
os.makedirs(out_dir, exist_ok=True)
# Each frame goes to its own HDF5 file under the same key.
all_meta.to_hdf(os.path.join(out_dir, 'all_data.hdf5'), key='file0')
train_meta_exact.to_hdf(os.path.join(out_dir, 'train_meta_exact.hdf5'), key='file0')
train_meta_approx.to_hdf(os.path.join(out_dir, 'train_meta_approx.hdf5'), key='file0')
