In [19]:
import pandas as pd
import numpy as np
import gc # Garbage collection
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
gc.enable()

train = pd.read_csv('../data/training_set.csv', engine='python')
train_meta = pd.read_csv('../data/training_set_metadata.csv', engine='python')

In [18]:
train_meta.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


flux_ratio_sq = flux divided by flux error, squared:

$$ \left(\frac{{F}}{d{F}}\right)^2 $$

And then the flux-by-flux ratio sq is the product of the flux and flux_ratio_sq:

$$ F  \left(\frac{{F}}{d{F}}\right)^2 $$.

I am not entirely sure what these are supposed to get us; the flux_ratio is fine. The other ...

In [55]:
# We only want:
cols_to_keep = ['gal_l', 'gal_b', 'hostgal_photoz', 'hostgal_photoz_err', 'mwebv']
meta_kept = train_meta[cols_to_keep]
meta_kept.head()

Unnamed: 0,gal_l,gal_b,hostgal_photoz,hostgal_photoz_err,mwebv
0,320.79653,-51.753706,0.0,0.0,0.017
1,223.525509,-54.460748,1.6267,0.2552,0.007
2,170.455585,-61.548219,0.2262,0.0157,0.021
3,328.254458,-68.969298,0.2813,1.1523,0.007
4,316.922299,-51.059403,0.2415,0.0176,0.024


In [16]:
# What's happening here??
# (f/df)
train['flux_ratio_sq'] = np.power(train['flux'] / train['flux_err'], 2.0)
train['flux_by_flux_ratio_sq'] = train['flux'] * train['flux_ratio_sq']
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,flux_ratio_sq,flux_by_flux_ratio_sq
0,615,59750.4229,2,-544.810303,3.622952,1,22613.379109,-12320000.0
1,615,59750.4306,1,-816.434326,5.55337,1,21613.708602,-17646170.0
2,615,59750.4383,3,-471.385529,3.801213,1,15378.2912,-7249104.0
3,615,59750.445,4,-388.984985,11.395031,1,1165.291701,-453281.0
4,615,59752.407,2,-681.858887,4.041204,1,28468.688609,-19411630.0


In [10]:
print('number of observations NOT detected = {}'.format(len(train['detected'].loc[train['detected'] == 0])))
print('number of observations detected = {}'.format(len(train['detected'].loc[train['detected'] == 1])))

number of observations NOT detected = 1184825
number of observations detected = 236880


In [58]:
aggs = {
    'mjd': ['min', 'max', 'size'],
    'flux': ['min', 'max', 'mean', 'median', 'std','skew'],
    'flux_err': ['min', 'max', 'mean', 'median', 'std','skew'],
    'detected': ['mean'],
    'flux_ratio_sq':['sum','skew'],
    'flux_by_flux_ratio_sq':['sum','skew'],
}
agg_train = train.groupby(['object_id', 'passband'])
# agg_train.head()

We'll want to keep the passband broken out. Right now, we have the fluxes and flux errors and the ratio of flux and flux errors. So, basically we have no real sense of the time-dependence of the object. What should we do to get the time dependence? Perhaps create a GAN, who's goal is to re-create the time-series data. We'll need to mask the gaps, because the loss in there will suck, but that's probably taken care of because we simply don't have samples in those regions.



In [32]:
train.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [61]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
import keras

import pydot
from keras.utils import plot_model
from keras_tqdm import TQDMNotebookCallback
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

In [34]:
# Testing out the code to see what the heck they're doing

In [62]:
print(keras.__version__)
tf.__version__

2.1.2


'1.4.0'

In [67]:
%matplotlib inline
plt.style.use('seaborn-notebook')
sns.set(style="white", color_codes=True)
sns.set_context("paper", rc={"font.size":14,"axes.titlesize":15,"axes.labelsize":20,
                             'xtick.labelsize':14, 'ytick.labelsize':14})
np.random.seed(123)

In [65]:
# Function for reparameterization trick to make model differentiable
def sampling(args):
    
    import tensorflow as tf
    # Function with args required for Keras Lambda function
    z_mean, z_log_var = args

    # Draw epsilon of the same shape from a standard normal distribution
    epsilon = K.random_normal(shape=tf.shape(z_mean), mean=0.,
                              stddev=epsilon_std)
    
    # The latent vector is non-deterministic and differentiable
    # in respect to z_mean and z_log_var
    z = z_mean + K.exp(z_log_var / 2) * epsilon
    return z


class CustomVariationalLayer(Layer):
    """
    Define a custom layer that learns and performs the training
    This function is borrowed from:
    https://github.com/fchollet/keras/blob/master/examples/variational_autoencoder.py
    """
    def __init__(self, **kwargs):
        # https://keras.io/layers/writing-your-own-keras-layers/
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x_input, x_decoded):
        reconstruction_loss = original_dim * metrics.binary_crossentropy(x_input, x_decoded)
        kl_loss = - 0.5 * K.sum(1 + z_log_var_encoded - K.square(z_mean_encoded) - 
                                K.exp(z_log_var_encoded), axis=-1)
        return K.mean(reconstruction_loss + (K.get_value(beta) * kl_loss))

    def call(self, inputs):
        x = inputs[0]
        x_decoded = inputs[1]
        loss = self.vae_loss(x, x_decoded)
        self.add_loss(loss, inputs=inputs)
        # We won't actually use the output.
        return x

In [66]:
class WarmUpCallback(Callback):
    def __init__(self, beta, kappa):
        self.beta = beta
        self.kappa = kappa
    # Behavior on each epoch
    def on_epoch_end(self, epoch, logs={}):
        if K.get_value(self.beta) <= 1:
            K.set_value(self.beta, K.get_value(self.beta) + self.kappa)

In [69]:
test_set_percent = 0.2
train_df = train.sample(frac=test_set_percent)

In [72]:
train_df.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
1365503,120948964,60077.4059,2,257.873444,6.718282,1
1023362,60351317,60222.0753,3,-8.022947,4.729793,0
325212,168967,60593.0713,1,-0.354021,1.144274,0
149568,77391,59800.3168,2,-6.033435,2.299204,0
605185,301819,60118.4163,0,-1.165072,1.57126,0


# Initialize Variables and Hyperparameters

In [80]:
# Set hyper parameters
original_dim = train_df.shape[1]
latent_dim = 100

batch_size = 50
epochs = 50
learning_rate = 0.0005

epsilon_std = 1.0
beta = K.variable(0)
kappa = 1

In [82]:
train.sample(15)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
565102,280437,60164.1619,2,3.217951,1.419879,0
720561,6040409,60641.1528,3,6.554013,5.378167,0
1050621,65015360,60087.1864,0,12.535067,14.10082,0
1270236,104381461,60068.3381,5,-10.821926,13.784325,0
1388270,125276761,59652.0477,5,-59.547398,53.621174,0
1147824,82678948,60313.3152,2,17.801054,48.507767,0
322521,167436,60609.04,3,2.331788,1.222446,0
603362,300864,60546.3406,2,0.185006,1.904506,0
549411,272839,59902.1287,5,-0.816815,8.03186,0
1117905,77355092,60548.9899,4,-0.012486,15.446695,0


In [92]:
# df = pd.pivot_table(train, index=['object_id','passband'], columns=['mjd'])
# df.head(10)

# df = pd.pivot_table(train, index=['object_id','passband'], columns=df.groupby(['object_id', 'passband']).cumcount().add(1), values=['flux', 'flux_err'], aggfunc='sum')
# df

In [110]:
train_df = train.groupby(['object_id', 'passband']).apply(lambda x: pd.Series(
                               {
                                **{t: t for i,t in enumerate(x.mjd)}
                               }
                             )).reset_index()
train_df

Unnamed: 0,object_id,passband,level_2,0
0,615,0,59819.1532,59819.1532
1,615,0,59820.1047,59820.1047
2,615,0,59821.1026,59821.1026
3,615,0,59822.1105,59822.1105
4,615,0,59823.1505,59823.1505
5,615,0,59851.1114,59851.1114
6,615,0,59874.0599,59874.0599
7,615,0,59875.0311,59875.0311
8,615,0,59876.0231,59876.0231
9,615,0,59877.0238,59877.0238


In [108]:
train_df = train_df.drop(0)
train_df

ValueError: labels [0] not contained in axis