In [2]:
import pandas as pd
import numpy as np
import gc # Garbage collection
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Explicitly turn on automatic garbage collection before loading the CSVs.
gc.enable()

# Load the observation table and the per-object metadata table.
# Both reads use pandas' pure-Python parser engine.
train = pd.read_csv(
    '../data/training_set.csv',
    engine='python',
)
train_meta = pd.read_csv(
    '../data/training_set_metadata.csv',
    engine='python',
)

In [4]:
# Preview the first five metadata rows to see which columns are available.
train_meta.head(n=5)

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


flux_ratio_sq = flux divided by flux error, squared:

$$ \left(\frac{{F}}{d{F}}\right)^2 $$

`flux_by_flux_ratio_sq` is then the product of the flux and `flux_ratio_sq`:

$$ F \left(\frac{{F}}{d{F}}\right)^2 $$

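I am not entirely sure what these features are supposed to give us: `flux_ratio_sq` is fine (it is a squared signal-to-noise ratio), but the purpose of the other, `flux_by_flux_ratio_sq`, is less clear ...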

In [5]:
# Restrict the metadata to the handful of columns we want to carry forward,
# then preview the result.
cols_to_keep = [
    'gal_l',
    'gal_b',
    'hostgal_photoz',
    'hostgal_photoz_err',
    'mwebv',
]
meta_kept = train_meta.loc[:, cols_to_keep]
meta_kept.head(n=5)

Unnamed: 0,gal_l,gal_b,hostgal_photoz,hostgal_photoz_err,mwebv
0,320.79653,-51.753706,0.0,0.0,0.017
1,223.525509,-54.460748,1.6267,0.2552,0.007
2,170.455585,-61.548219,0.2262,0.0157,0.021
3,328.254458,-68.969298,0.2813,1.1523,0.007
4,316.922299,-51.059403,0.2415,0.0176,0.024


In [6]:
# Two per-observation features derived from the flux and its error,
# written onto `train` as new columns:
#   flux_ratio_sq         = (flux / flux_err) ** 2
#   flux_by_flux_ratio_sq = flux * flux_ratio_sq
train['flux_ratio_sq'] = np.power(train['flux'].div(train['flux_err']), 2.0)
train['flux_by_flux_ratio_sq'] = train['flux'].mul(train['flux_ratio_sq'])
train.head(n=5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,flux_ratio_sq,flux_by_flux_ratio_sq
0,615,59750.4229,2,-544.810303,3.622952,1,22613.379109,-12320000.0
1,615,59750.4306,1,-816.434326,5.55337,1,21613.708602,-17646170.0
2,615,59750.4383,3,-471.385529,3.801213,1,15378.2912,-7249104.0
3,615,59750.445,4,-388.984985,11.395031,1,1165.291701,-453281.0
4,615,59752.407,2,-681.858887,4.041204,1,28468.688609,-19411630.0


In [7]:
# Count observations by the value of the `detected` flag (0 vs. 1) by
# summing the boolean masks.
print('number of observations NOT detected = {}'.format((train['detected'] == 0).sum()))
print('number of observations detected = {}'.format((train['detected'] == 1).sum()))

number of observations NOT detected = 1184825
number of observations detected = 236880


In [8]:
# Aggregation spec: column name -> list of summary statistics to compute.
aggs = dict(
    mjd=['min', 'max', 'size'],
    flux=['min', 'max', 'mean', 'median', 'std', 'skew'],
    flux_err=['min', 'max', 'mean', 'median', 'std', 'skew'],
    detected=['mean'],
    flux_ratio_sq=['sum', 'skew'],
    flux_by_flux_ratio_sq=['sum', 'skew'],
)

# Group the observations per object and per passband.
# NOTE: only the grouping is set up here; `aggs` is not applied in this cell,
# so `agg_train` is a GroupBy object rather than an aggregated frame.
agg_train = train.groupby(by=['object_id', 'passband'])

We'll want to keep the passband broken out. Right now, we have the fluxes, the flux errors, and the ratio of flux to flux error, so we have no real sense of the time dependence of each object. What should we do to capture the time dependence? Perhaps create a GAN whose goal is to re-create the time-series data. We would need to mask the gaps, because the loss there would be poor, but that is probably already taken care of because we simply don't have samples in those regions.



In [9]:
# Preview the first five observation rows, including the derived flux columns.
train.head(n=5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,flux_ratio_sq,flux_by_flux_ratio_sq
0,615,59750.4229,2,-544.810303,3.622952,1,22613.379109,-12320000.0
1,615,59750.4306,1,-816.434326,5.55337,1,21613.708602,-17646170.0
2,615,59750.4383,3,-471.385529,3.801213,1,15378.2912,-7249104.0
3,615,59750.445,4,-388.984985,11.395031,1,1165.291701,-453281.0
4,615,59752.407,2,-681.858887,4.041204,1,28468.688609,-19411630.0


In [11]:
# Preview the first five metadata rows again, this time to look at `target`.
train_meta.head(n=5)

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [13]:
# object_id values of every metadata row whose `target` equals 92.
obj_ids = train_meta.loc[train_meta['target'] == 92, 'object_id']

In [18]:
# Show only the first ten rows instead of displaying the whole observation
# table as cell output; this keeps the saved notebook small and readable.
train.head(10)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,flux_ratio_sq,flux_by_flux_ratio_sq
0,615,59750.4229,2,-544.810303,3.622952,1,22613.379109,-1.232000e+07
1,615,59750.4306,1,-816.434326,5.553370,1,21613.708602,-1.764617e+07
2,615,59750.4383,3,-471.385529,3.801213,1,15378.291200,-7.249104e+06
3,615,59750.4450,4,-388.984985,11.395031,1,1165.291701,-4.532810e+05
4,615,59752.4070,2,-681.858887,4.041204,1,28468.688609,-1.941163e+07
5,615,59752.4147,1,-1061.457031,6.472994,1,26890.224454,-2.854282e+07
6,615,59752.4224,3,-524.954590,3.552751,1,21833.026104,-1.146135e+07
7,615,59752.4334,4,-393.480225,3.599346,1,11950.845409,-4.702421e+06
8,615,59752.4435,5,-355.886780,10.421921,1,1166.079493,-4.149923e+05
9,615,59767.2968,2,-548.013550,3.462291,1,25052.753196,-1.372925e+07
