# Review B.Trotta's "calc_aggs"
calc_aggs is the function that generates features from the timeseries data.  This is the "feature engineering" step and the core of the submission.

This notebook investigates each step in "calc_aggs" method.  It is mostly a copy/paste from the btrotta source tree https://github.com/btrotta/kaggle-plasticc.git file calculate_features.py 

In [29]:
import pandas as pd
import numpy as np
import gc
import os

In [30]:
col_dict = {'mjd': np.float64, 'flux': np.float32, 'flux_err': np.float32, 'object_id': np.int32, 'passband': np.int8,
            'detected': np.int8}

In [31]:
path = '/mnt/c/Users/CWinsor/Documents/code_kaggle_plasticc___shared_data/PLAsTiCC-2018'
train_meta = pd.read_csv(os.path.join(path, 'training_set_metadata.csv'))

In [32]:
train_meta.head(5)

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [33]:
train = pd.read_csv(os.path.join(path, 'training_set.csv'), dtype=col_dict)

In [34]:
train.head(10)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1
5,615,59752.4147,1,-1061.457031,6.472994,1
6,615,59752.4224,3,-524.95459,3.552751,1
7,615,59752.4334,4,-393.480225,3.599346,1
8,615,59752.4435,5,-355.88678,10.421921,1
9,615,59767.2968,2,-548.01355,3.462291,1


In [35]:
train.describe()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
count,1421705.0,1421705.0,1421705.0,1421705.0,1421705.0,1421705.0
mean,33926080.0,60179.21,2.783108,24.68856,25.48932,0.1666168
std,42584390.0,309.2379,1.703946,3684.506,3814.491,0.3726336
min,615.0,59580.03,0.0,-1149388.0,0.463753,0.0
25%,184340.0,59899.05,1.0,-2.789418,2.11867,0.0
50%,4548783.0,60193.31,3.0,0.733199,4.708105,0.0
75%,67962800.0,60487.22,4.0,8.830932,12.97371,0.0
max,130779800.0,60674.36,5.0,2432809.0,2234069.0,1.0


In [38]:
all_meta = pd.concat([train_meta], axis=0, ignore_index=True, sort=True).reset_index()
all_meta.drop('index', axis=1, inplace=True)

In [39]:
all_meta.head(5)

Unnamed: 0,ddf,decl,distmod,gal_b,gal_l,hostgal_photoz,hostgal_photoz_err,hostgal_specz,mwebv,object_id,ra,target
0,1,-61.943836,,-51.753706,320.79653,0.0,0.0,0.0,0.017,615,349.046051,92
1,1,-27.784405,45.4063,-54.460748,223.525509,1.6267,0.2552,1.8181,0.007,713,53.085938,88
2,1,-6.579593,40.2561,-61.548219,170.455585,0.2262,0.0157,0.232,0.021,730,33.574219,42
3,1,-45.586655,40.7951,-68.969298,328.254458,0.2813,1.1523,0.3037,0.007,745,0.189873,90
4,1,-63.823658,40.4166,-51.059403,316.922299,0.2415,0.0176,0.1934,0.024,1124,352.711273,90


# Prepare to emulate call to calc_aggs()

In [108]:
# prepare to emulate call to calc_aggs()
all_data = train.copy()
exact = True

In [109]:
all_data.head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [110]:
#starting out...
all_data.shape

(1421705, 6)

# calc_aggs ...

In [111]:
all_data[all_data["object_id"]==730].head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
702,730,59798.3205,2,1.177371,1.3643,0
703,730,59798.3281,1,2.320849,1.159247,0
704,730,59798.3357,3,2.939447,1.771328,0
705,730,59798.3466,4,2.128097,2.610659,0
706,730,59798.3576,5,-12.809639,5.380097,0


### Normalise the flux, following the Bayesian approach

In [112]:
prior_mean = all_data.groupby(['object_id', 'passband'])['flux'].transform('mean')
prior_std = all_data.groupby(['object_id', 'passband'])['flux'].transform('std')
prior_std.loc[prior_std.isnull()] = all_data.loc[prior_std.isnull(), 'flux_err']
obs_std = all_data['flux_err']  # since the above kernel tells us that the flux error is the 68% confidence interval
all_data['bayes_flux'] = (all_data['flux'] / obs_std**2 + prior_mean / prior_std**2) \
                             / (1 / obs_std**2 + 1 / prior_std**2)
all_data.loc[all_data['bayes_flux'].notnull(), 'flux'] \
        = all_data.loc[all_data['bayes_flux'].notnull(), 'bayes_flux']

In [113]:
all_data[all_data["object_id"]==730].head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,bayes_flux
702,730,59798.3205,2,1.246867,1.3643,0,1.246867
703,730,59798.3281,1,1.685412,1.159247,0,1.685412
704,730,59798.3357,3,2.9527,1.771328,0,2.9527
705,730,59798.3466,4,2.250392,2.610659,0,2.250392
706,730,59798.3576,5,-10.380242,5.380097,0,-10.380242


### Estimate the flux at source, using the fact that light is proportional

In [114]:
redshift = all_meta.set_index('object_id')[['hostgal_specz', 'hostgal_photoz']]
if exact:
    redshift['redshift'] = redshift['hostgal_specz']
    redshift.loc[redshift['redshift'].isnull(), 'redshift'] \
        = redshift.loc[redshift['redshift'].isnull(), 'hostgal_photoz']
else:
    redshift['redshift'] = redshift['hostgal_photoz']
all_data = pd.merge(all_data, redshift, 'left', 'object_id')
nonzero_redshift = all_data['redshift'] > 0
all_data.loc[nonzero_redshift, 'flux'] = all_data.loc[nonzero_redshift, 'flux'] \
                                         * all_data.loc[nonzero_redshift, 'redshift']**2

In [115]:
all_data[all_data["object_id"]==730].head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,bayes_flux,hostgal_specz,hostgal_photoz,redshift
702,730,59798.3205,2,0.067111,1.3643,0,1.246867,0.232,0.2262,0.232
703,730,59798.3281,1,0.090716,1.159247,0,1.685412,0.232,0.2262,0.232
704,730,59798.3357,3,0.158926,1.771328,0,2.9527,0.232,0.2262,0.232
705,730,59798.3466,4,0.121125,2.610659,0,2.250392,0.232,0.2262,0.232
706,730,59798.3576,5,-0.558706,5.380097,0,-10.380242,0.232,0.2262,0.232


# aggregate features

In [86]:
band_aggs = all_data.groupby(['object_id', 'passband'])['flux'].agg(['mean', 'std', 'max', 'min']).unstack(-1)
band_aggs.columns = [x + '_' + str(y) for x in band_aggs.columns.levels[0]
                      for y in band_aggs.columns.levels[1]]
all_data.sort_values(['object_id', 'passband', 'flux'], inplace=True)
# this way of calculating quantiles is faster than using the pandas quantile builtin on the groupby object
all_data['group_count'] = all_data.groupby(['object_id', 'passband']).cumcount()
all_data['group_size'] = all_data.groupby(['object_id', 'passband'])['flux'].transform('size')
q_list = [0.25, 0.75]
for q in q_list:
    all_data['q_' + str(q)] = all_data.loc[
        (all_data['group_size'] * q).astype(int) == all_data['group_count'], 'flux']
quantiles = all_data.groupby(['object_id', 'passband'])[['q_' + str(q) for q in q_list]].max().unstack(-1)
quantiles.columns = [str(x) + '_' + str(y) + '_quantile' for x in quantiles.columns.levels[0]
                     for y in quantiles.columns.levels[1]]

In [87]:
all_data[all_data["object_id"]==730]

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,bayes_flux,hostgal_specz,hostgal_photoz,redshift,group_count,group_size,q_0.25,q_0.75
1021,730,60643.0521,0,-0.095862,1.682890,0,-1.781029,0.232,0.2262,0.232,0,72,,
919,730,60290.0761,0,-0.077771,1.929932,0,-1.444906,0.232,0.2262,0.232,1,72,,
822,730,59938.0647,0,-0.076970,2.015928,0,-1.430028,0.232,0.2262,0.232,2,72,,
943,730,60558.2332,0,-0.066238,2.511074,0,-1.230635,0.232,0.2262,0.232,3,72,,
712,730,59818.2740,0,-0.065273,1.801066,0,-1.212713,0.232,0.2262,0.232,4,72,,
942,730,60557.2322,0,-0.059896,1.848830,0,-1.112806,0.232,0.2262,0.232,5,72,,
714,730,59820.2522,0,-0.058615,1.915426,0,-1.089018,0.232,0.2262,0.232,6,72,,
756,730,59876.0980,0,-0.058106,2.450620,0,-1.079552,0.232,0.2262,0.232,7,72,,
992,730,60616.0769,0,-0.055775,2.024976,0,-1.036256,0.232,0.2262,0.232,8,72,,
892,730,60264.0559,0,-0.054466,2.192123,0,-1.011920,0.232,0.2262,0.232,9,72,,


### another approach - use the .quantile operator on the .groupBy
as referenced in the comments

DataFrame.quantile(self, q=0.5, axis=0, numeric_only=True, interpolation='linear')

this *does* take a long time

In [34]:
temporary = all_data.groupby(['object_id', 'passband'])['flux'].quantile(q=[.25,.75])

In [38]:
temporary.unstack().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,0.25,0.75
object_id,passband,Unnamed: 2_level_1,Unnamed: 3_level_1
615,0,-87.899025,81.896961
615,1,-936.821533,114.756821
615,2,-555.584244,312.449711
615,3,-454.085464,208.036484
615,4,-343.492065,227.865406
615,5,-353.825928,256.820892
713,0,-28.223888,7.317062
713,1,-19.103429,9.737729
713,2,-18.076024,12.579229
713,3,-19.864004,13.305155


### max detected flux
a reduction - get the maximum flux value for the object (independent of passband)

note it is only performed on "detected==1" ... not sure why

In [88]:
max_detected = all_data.loc[all_data['detected'] == 1].groupby('object_id')['flux'].max().to_frame('max_detected')

In [40]:
max_detected.head(5)

Unnamed: 0_level_0,max_detected
object_id,Unnamed: 1_level_1
615,660.555237
713,33.711967
730,2.192224
745,20.322405
1124,5.305183


## most of above is more easily performed by just describe()
however it is really slow

In [41]:
train.head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [42]:
# warning slow... train.groupby(['object_id', 'passband'])['flux'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
object_id,passband,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
615,0,63.0,-3.254554,83.944733,-116.913223,-88.070869,-10.015225,82.123554,125.182808
615,1,58.0,-385.699921,601.787292,-1100.440063,-936.876556,-488.057968,114.780052,660.626343
615,2,58.0,-134.146576,455.121338,-681.858887,-555.616257,-265.686005,312.467346,611.984558
615,3,58.0,-121.103500,335.425049,-530.644592,-454.119835,-162.170944,208.062096,445.737061
615,4,58.0,-55.954590,291.803436,-422.184509,-343.538879,-103.541367,227.912670,381.953735
615,5,57.0,-47.449844,294.779510,-422.815094,-354.074280,-85.524307,256.966217,378.188141
713,0,70.0,-2.720398,7.113509,-14.735178,-9.162396,-3.096804,2.649697,14.509829
713,1,56.0,-1.019804,5.712334,-11.715749,-6.318706,-0.561735,3.173910,9.129021
713,2,56.0,-0.794238,5.770738,-10.067919,-5.784492,-0.117977,3.955531,10.529041
713,3,56.0,-0.986966,6.450413,-12.394593,-6.364463,-0.073897,4.217097,11.330316


### "most extreme"
find the "most extreme" time for each object, and for each band, retrieve the k data points on either side, k points before

procedure is
* for each passband - translate to it's median
* (no scaling is done)
* find the date of the peak (largest value)
* for each sample identify the number of days to/from the peak
* sort by days before/after in order to find the n preceding, and n following


In [62]:
def most_extreme(df_in, k, positive=True, suffix='', include_max=True, include_dur=True, include_interval=False):
        # find the "most extreme" time for each object, and for each band, retrieve the k data points on either side
        # k points before
        df = df_in.copy()
        df['object_passband_mean'] = df.groupby(['object_id', 'passband'])['flux'].transform('median')
        if positive:
            df['dist_from_mean'] = (df['flux'] - df['object_passband_mean'])
        else:
            df['dist_from_mean'] = -(df['flux'] - df['object_passband_mean'])

        max_time = df.loc[df['detected'] == 1].groupby('object_id')['dist_from_mean'].idxmax().to_frame(
            'max_ind')
        max_time['mjd_max' + suffix] = df.loc[max_time['max_ind'].values, 'mjd'].values
        df = pd.merge(df, max_time[['mjd_max' + suffix]], 'left', left_on=['object_id'], right_index=True)
        df['time_after_mjd_max'] = df['mjd'] - df['mjd_max' + suffix]
        df['time_before_mjd_max'] = -df['time_after_mjd_max']

        # first k after event
        df.sort_values(['object_id', 'passband', 'time_after_mjd_max'], inplace=True)
        df['row_num_after'] = df.loc[df['time_after_mjd_max'] >= 0].groupby(
            ['object_id', 'passband']).cumcount()
        first_k_after = df.loc[(df['row_num_after'] < k) & (df['time_after_mjd_max'] <= 50),
                              ['object_id', 'passband', 'flux', 'row_num_after']]
        first_k_after.set_index(['object_id', 'passband', 'row_num_after'], inplace=True)
        first_k_after = first_k_after.unstack(level=-1).unstack(level=-1)
        first_k_after.columns = [str(x) + '_' + str(y) + '_after' for x in first_k_after.columns.levels[1]
                                 for y in first_k_after.columns.levels[2]]
        extreme_data = first_k_after
        time_bands = [[-50, -20], [-20, -10], [-10, 0], [0, 10], [10, 20], [20, 50], [50, 100], [100, 200], [200, 500]]
        if include_interval:
            interval_arr = []
            for start, end in time_bands:
                band_data = df.loc[(start <= df['time_after_mjd_max']) & (df['time_after_mjd_max'] <= end)]
                interval_agg = band_data.groupby(['object_id', 'passband'])['flux'].mean().unstack(-1)
                interval_agg.columns = ['{}_start_{}_end_{}'.format(c, start, end) for c in interval_agg.columns]
                interval_arr.append(interval_agg)
            interval_data = pd.concat(interval_arr, axis=1)
            extreme_data = pd.concat([extreme_data, interval_data], axis=1)
        if include_dur:
            # detection duration in each passband after event
            duration_after = df.loc[(df['time_after_mjd_max'] >= 0) & (df['detected'] == 0)] \
                .groupby(['object_id', 'passband'])['time_after_mjd_max'].first().unstack(-1)
            duration_after.columns = ['dur_after_' + str(c) for c in range(6)]
            extreme_data = pd.concat([extreme_data, duration_after], axis=1)

        # last k before event
        df.sort_values(['object_id', 'passband', 'time_before_mjd_max'], inplace=True)
        df['row_num_before'] = df.loc[df['time_before_mjd_max'] >= 0].groupby(
            ['object_id', 'passband']).cumcount()
        first_k_before = df.loc[(df['row_num_before'] < k) & (df['time_after_mjd_max'] <= 50),
                                ['object_id', 'passband', 'flux', 'row_num_before']]
        first_k_before.set_index(['object_id', 'passband', 'row_num_before'], inplace=True)
        first_k_before = first_k_before.unstack(level=-1).unstack(level=-1)
        first_k_before.columns = [str(x) + '_' + str(y) + '_before' for x in first_k_before.columns.levels[1]
                                  for y in first_k_before.columns.levels[2]]
        extreme_data = pd.concat([extreme_data, first_k_before], axis=1)
        if include_dur:
            # detection duration in each passband before event
            duration_before = df.loc[(df['time_before_mjd_max'] >= 0) & (df['detected'] == 0)] \
                .groupby(['object_id', 'passband'])['time_before_mjd_max'].first().unstack(-1)
            duration_before.columns = ['dur_before_' + str(c) for c in range(6)]
            extreme_data = pd.concat([extreme_data, duration_before], axis=1)

        if include_max:
            # passband with maximum detected flux for each object
            max_pb = df.loc[max_time['max_ind'].values].groupby('object_id')['passband'].max().to_frame(
                'max_passband')
            # time of max in each passband, relative to extreme max
            band_max_ind = df.groupby(['object_id', 'passband'])['flux'].idxmax()
            band_mjd_max = df.loc[band_max_ind.values].groupby(['object_id', 'passband'])['mjd'].max().unstack(-1)
            cols = ['max_time_' + str(i) for i in range(6)]
            band_mjd_max.columns = cols
            band_mjd_max = pd.merge(band_mjd_max, max_time, 'left', 'object_id')
            for c in cols:
                band_mjd_max[c] -= band_mjd_max['mjd_max' + suffix]
            band_mjd_max.drop(['mjd_max' + suffix, 'max_ind'], axis=1, inplace=True)
            extreme_data = pd.concat([extreme_data, max_pb, band_mjd_max], axis=1)

        extreme_data.columns = [c + suffix for c in extreme_data.columns]
        return extreme_data

### apply above

In [63]:
extreme_max = most_extreme(all_data, 1, positive=True, suffix='', include_max=True, include_dur=True, include_interval=True)

In [64]:
extreme_max.head(10)

Unnamed: 0_level_0,0.0_0_after,0.0_1_after,0.0_2_after,0.0_3_after,0.0_4_after,0.0_5_after,0_start_-50_end_-20,1_start_-50_end_-20,2_start_-50_end_-20,3_start_-50_end_-20,...,dur_before_3,dur_before_4,dur_before_5,max_passband,max_time_0,max_time_1,max_time_2,max_time_3,max_time_4,max_time_5
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,107.724236,660.555237,,386.264587,325.347717,280.555481,27.010132,-369.583008,-56.053688,-30.622488,...,,,21.9888,1,11.9946,0.0,-0.0077,330.9835,-25.001,354.0169
713,8.214751,21.860806,17.686045,33.711967,24.936552,-1.505679,7.235832,6.216793,10.324715,6.959814,...,2.9591,2.9482,2.9372,3,2.987,14.0023,13.9946,0.0,22.0253,321.1721
730,0.002485,0.081253,1.062404,1.54356,2.127217,2.192224,,,,,...,245.2082,245.1973,245.1864,4,-710.0849,-734.0001,-0.0263,5.9698,0.0,0.011
745,1.42862,17.679188,20.322405,18.697315,16.823721,11.039402,,,,,...,223.3026,223.2916,223.2807,2,14.8309,0.0077,0.0,0.0154,0.0263,0.0354
1124,0.099942,1.210217,3.462885,4.406872,5.305183,3.692601,,,,,...,241.2421,241.2311,241.2201,4,8.9554,-0.0188,-0.0266,-0.0111,0.0,0.0111
1227,66.739166,-0.379217,0.474166,1.155594,1.404833,0.433763,0.591041,-0.529568,-1.02423,0.521934,...,4.9858,4.9748,4.9639,0,0.0,-691.9633,-389.7432,-83.7596,-334.9484,-389.7061
1598,,26.449911,,18.311235,13.461679,11.876847,-0.004076,-0.012961,-0.004523,0.003063,...,18.1457,15.1326,18.1235,1,-7.1541,0.0,-0.0076,0.0077,0.0186,0.0296
1632,,0.732351,0.399769,1.277339,2.649178,6.772305,,,,,...,0.0221,14.9009,14.8908,5,814.7497,809.692,472.688,-16.896,-0.011,42.8079
1920,1.943407,18.57402,22.088634,20.062527,17.617647,14.012241,0.052922,0.068802,-0.042402,0.183571,...,17.913,17.902,14.9189,2,8.0002,0.0076,0.0,12.1312,12.1422,0.0372
1926,0.17501,0.362255,29.288733,-1.498832,-4.315869,4.157547,,-0.695243,-0.722745,0.983133,...,10.9773,10.9663,10.9554,2,80.8079,17.9358,0.0,-354.9767,71.8523,-294.1424


In [65]:
extreme_min = most_extreme(all_data, 1, positive=False, suffix='_min', include_max=False, include_dur=True)

# attempt to identify periodicity:

In [76]:
# find the dates of min and max flux
# (interestingly... only done for intra-galaxy...)
time_between_detections = all_data.loc[all_data['detected'] == 1].groupby('object_id')['mjd'].agg(['max', 'min'])

In [77]:
time_between_detections.head()

Unnamed: 0_level_0,max,min
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1
615,60624.2132,59750.4229
713,60672.0693,59825.2676
730,60611.0756,60532.3019
745,60249.0966,60125.4094
1124,60624.176,60490.2647


In [90]:
# calculate the 
time_between_detections['det_period'] = time_between_detections['max'] - time_between_detections['min']

In [91]:
time_between_detections.head(10)

Unnamed: 0_level_0,max,min,det_period
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
615,60614.0276,59819.1532,794.8744
713,60674.0798,59825.26,848.8198
730,60652.166,59798.3205,853.8455
745,60624.0722,59770.3662,853.706
1124,60624.2132,59750.4229,873.7903
1227,60652.166,59798.3205,853.8455
1598,60609.051,59750.4229,858.6281
1632,60624.2132,59750.4229,873.7903
1920,60436.984,59582.3282,854.6558
1926,60436.984,59582.3282,854.6558


### same feature but grouped by passband

### similar feature based on high values

### aggregate values of the features during the detection period

In [92]:
all_data = pd.merge(all_data, time_between_detections, 'left', 'object_id')

In [93]:
all_data.head(10)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,bayes_flux,hostgal_specz,hostgal_photoz,redshift,group_count,group_size,q_0.25,q_0.75,max_x,min_x,max_y,min_y,det_period
0,615,59877.0238,0,-116.758644,3.097836,1,-116.758644,0.0,0.0,0.0,0,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
1,615,60560.1065,0,-114.520103,4.013463,1,-114.520103,0.0,0.0,0.0,1,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
2,615,60588.0131,0,-113.424164,3.241369,1,-113.424164,0.0,0.0,0.0,2,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
3,615,59823.1505,0,-113.09108,4.069051,1,-113.09108,0.0,0.0,0.0,3,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
4,615,60587.0127,0,-110.785774,3.555624,1,-110.785774,0.0,0.0,0.0,4,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
5,615,60612.0266,0,-110.526726,2.8442,1,-110.526726,0.0,0.0,0.0,5,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
6,615,60559.1097,0,-110.509773,3.426444,1,-110.509773,0.0,0.0,0.0,6,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
7,615,59876.0231,0,-108.494843,3.449714,1,-108.494843,0.0,0.0,0.0,7,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
8,615,60118.4163,0,-106.938904,3.102513,1,-106.938904,0.0,0.0,0.0,8,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744
9,615,59878.0246,0,-102.630241,3.135772,1,-102.630241,0.0,0.0,0.0,9,63,,,60614.0276,59819.1532,60614.0276,59819.1532,794.8744


### time distribution of detections in each band

In [94]:
# get the standard deviation of dates for each (object/passband) group
# this is a measure of how widely spread the sample dates are... by object/passband
detection_time_dist = all_data.loc[all_data['detected'] == 1].groupby(['object_id', 'passband'])['mjd'].std().unstack(-1)

In [95]:
detection_time_dist.head(5)

passband,0,1,2,3,4,5
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
615,274.702133,319.286828,317.107039,315.965966,314.734274,319.622294
713,34.651867,347.562533,293.15078,338.971978,35.866638,
730,,,31.068126,29.716185,21.654423,16.205342
745,2.819164,30.492986,34.654006,34.653997,30.411028,23.573502
1124,,45.982778,42.129072,38.609425,32.002089,25.281898


In [96]:
# this is just a renaming of the columns
detection_time_dist.columns = ['time_dist_' + str(i) for i in range(6)]

In [97]:
detection_time_dist.head(5)

Unnamed: 0_level_0,time_dist_0,time_dist_1,time_dist_2,time_dist_3,time_dist_4,time_dist_5
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
615,274.702133,319.286828,317.107039,315.965966,314.734274,319.622294
713,34.651867,347.562533,293.15078,338.971978,35.866638,
730,,,31.068126,29.716185,21.654423,16.205342
745,2.819164,30.492986,34.654006,34.653997,30.411028,23.573502
1124,,45.982778,42.129072,38.609425,32.002089,25.281898


In [103]:
# this is std just not by passband (std value by object_id - all passbands)
detection_time_dist_all = all_data.loc[all_data['detected'] == 1].groupby(['object_id'])['mjd'].std().to_frame('time_dist')

In [104]:
detection_time_dist_all.head(5)

Unnamed: 0_level_0,time_dist
object_id,Unnamed: 1_level_1
615,310.975974
713,290.559994
730,27.845135
745,33.849254
1124,41.140559


In [None]:
type()