# Tutorial: Creating TF Records

Last updated: 07-Oct-2021 by
Cristobal Donoso

In [48]:
cd /tf/astromer

/tf/astromer


In [49]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import dask.dataframe as dd
import os

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import create_records

# NOTE(review): the original note said this tutorial runs only in eager ("early") mode,
# yet the call below passes False, i.e. it does not force eager execution of
# tf.function code — confirm which behaviour is intended.
# For the optimized code use train.py. Otherwise, you can comment out the next line
# and restart the kernel before the finetuning.
tf.config.run_functions_eagerly(False)

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
# Survey identifier; every path below is derived from it.
name = 'ogle'

# Output folder: the train/valid/test object lists (CSV) are written here.
# NOTE(review): the create_records cells further down write to
# './data/records/<name>/...' rather than to this folder — confirm which is intended.
target = './data/records/small_{}/'.format(name)

# Metadata file
metadata = './data/raw_data/{}/{}_dataset.csv'.format(name, name)

# Photometry shards folder
photshards = './data/raw_data/{}/phot/'.format(name)

### Metadata frame sample

In [72]:
# Load the metadata table and keep only the objects usable for this tutorial.
meta = pd.read_csv(metadata)
meta = meta[meta['class']!='UNK'] # we remove unknown objects
meta = meta[meta['nobs']>=20]     # keep rows with nobs >= 20 (presumably the number of observations)

# Draw the tutorial subset of at most 20000 objects.
# - min(...) avoids the ValueError pandas raises when fewer rows than requested remain.
# - random_state pins the draw so a "Restart & Run All" selects the same objects.
meta = meta.sample(n=min(20000, len(meta)), random_state=42)
print(meta.shape)
meta['class'].value_counts()

(20000, 4)


OSARG    11815
SRV       1815
std       1784
RRab      1284
ED        1160
ESD        515
cep        419
RRc        410
EC         351
Mira       305
dsct       142
Name: class, dtype: int64

### Training, validation and testing split

We take 100 samples per class for the testing dataset. The rest is used for training and validation. At the beginning of training, the record loader function will take a random fraction of samples for validation.

In [73]:
# Per-class split of `meta` into training, validation and testing subframes.
# Despite their names, the three lists collect DataFrames (one per class), not bare oids.
training_oids = []
testing_oids  = []
valids_oids   = []

# Fixed seed so that re-running the cell reproduces exactly the same split.
split_seed = 42

for _, subframe in meta.groupby('class'):
    subframe = subframe.sample(frac=1, random_state=split_seed) # Shuffling

    test_part  = subframe.iloc[:100] # up to 100 objects per class for testing
    rest       = subframe.iloc[100:] # the rest for training and validation
    valid_part = rest.sample(frac=0.25, random_state=split_seed) # 25% of the rest for validation
    # Named `train_part` (not `train`) so the `train` function imported from
    # core.astromer is not overwritten in the notebook namespace.
    train_part = rest[~rest['oid'].isin(valid_part['oid'])]

    training_oids.append(train_part)
    valids_oids.append(valid_part)
    testing_oids.append(test_part)

# One metadata frame per subset, all classes stacked together.
train_meta = pd.concat(training_oids)
valid_meta = pd.concat(valids_oids)
test_meta  = pd.concat(testing_oids)

We can save the subframes to recover the oids later.

In [74]:
# Persist each subset's metadata next to each other inside `target`
# (the folder is created on first use; existing folders are left alone).
os.makedirs(target, exist_ok=True)
for csv_name, subset_frame in (('train_objs.csv', train_meta),
                               ('valid_objs.csv', valid_meta),
                               ('test_objs.csv', test_meta)):
    subset_frame.to_csv(os.path.join(target, csv_name), index=False)

### Saving Records

Creating records requires tables with `['oid', 'mjd', 'mag', 'std', 'band']` columns stored as `.csv` files. You can also split your entire dataset into shards to optimize reading.

In [75]:
# Load the photometry shards using DASK: every .csv file inside `photshards` is matched.
shards_pattern = os.path.join(photshards, '*.csv')
observations = dd.read_csv(shards_pattern)

In [None]:
%%time
create_records(observations, train_meta, dest='./data/records/{}/train'.format(name))

In [None]:
%%time
create_records(observations, valid_meta, dest='./data/records/{}/val'.format(name), njobs=4)

In [None]:
%%time
create_records(observations, test_meta, dest='./data/records/{}/test'.format(name), njobs=4)

# Loading Data

In [35]:
cd /tf/astromer

/tf/astromer


In [36]:
from core.data import load_records
import os
import tensorflow as tf
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading data for pretraining and finetuning

In [37]:
# Loader settings shared by the cells below: `batch_size` is passed positionally to
# load_records and `max_obs` through its keyword of the same name.
batch_size, max_obs = 10, 50

In [39]:
# Both loaders share the same settings; note that the validation set is also
# read with is_train=True.
pretrain_args = dict(max_obs=max_obs, repeat=1, is_train=True)

train_batches = load_records('./data/records/alcock/train', batch_size, **pretrain_args)
val_batches = load_records('./data/records/alcock/val', batch_size, **pretrain_args)

Training Mode
Training Mode


In [40]:
# Count samples by adding up the leading dimension of `input` over all batches
# (assumes axis 0 of `input` indexes the samples in a batch — TODO confirm).
n_train = tf.reduce_sum([batch['input'].shape[0] for batch in train_batches]).numpy()
n_val = tf.reduce_sum([batch['input'].shape[0] for batch in val_batches]).numpy()
print('Train: {}\nVal: {}'.format(n_train, n_val))

Train: 19400
Val: 4849


In [41]:
# The testing loader keeps load_records' defaults for `repeat` and `is_train`.
test_batches = load_records('./data/records/alcock/test', batch_size, max_obs=max_obs)

# Same counting scheme as above: leading dimension of `input`, summed over batches.
n_test = tf.reduce_sum([batch['input'].shape[0] for batch in test_batches]).numpy()
print('test: ', n_test)

Testing mode
test:  600


### Loading data for classification

In [46]:
# For classification every masking-related fraction is set to zero.
no_masking = dict(msk_frac=0., rnd_frac=0., same_frac=0.)

# Note: this rebinds `train_batches`, here with windows of 10 observations.
train_batches = load_records('./data/records/alcock/train',
                             batch_size,
                             max_obs=10,
                             repeat=1,
                             is_train=True,
                             **no_masking)

Training Mode


In [47]:
# Add up `mask_in` inside each batch first, then across all batches.
per_batch_masked = [tf.reduce_sum(batch['mask_in']) for batch in train_batches]
masked_values = tf.reduce_sum(per_batch_masked)
print(masked_values)

tf.Tensor(0.0, shape=(), dtype=float32)


A zero value for short light curves is OK: it means no magnitude was masked.