# Tutorial: Creating TF Records

Last updated: 07-Oct-2021 by
Cristobal Donoso

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import dask.dataframe as dd
import os

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import create_records, split_and_create_records

# This tutorial runs only in eager mode. For the optimized code use train.py.
# Otherwise, you can comment out the next line and restart the kernel before the finetuning.
# NOTE(review): the call below passes False, whereas enabling eager execution of
# tf.function code would be run_functions_eagerly(True) — confirm which one is intended.
tf.config.run_functions_eagerly(False)

# Enable IPython's autoreload extension in mode 2 for this kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Dataset identifier; every path below is derived from it.
name = 'alcock'

# Folder holding the photometry shards.
photshards = f'./data/raw_data/{name}/phot/'
# Metadata CSV file.
metadata = f'./data/raw_data/{name}/{name}_dataset.csv'
# Destination folder where the records will be stored.
target = f'./data/records_v2/{name}/'

### Metadata frame sample

In [4]:
# Read the metadata table from the CSV configured above.
meta = pd.read_csv(metadata)

# Keep rows whose class is not 'UNK' (unknown objects) and whose `nobs` is at least 100.
is_labelled = meta['class'] != 'UNK'
has_enough_obs = meta['nobs'] >= 100
meta = meta[is_labelled & has_enough_obs]
# meta = meta.sample(n=20000)

# Show the filtered table size, then the number of objects per class.
print(meta.shape)
meta['class'].value_counts()

(20444, 4)


RRab     7157
EC       6681
LPV      3017
RRc      1734
Cep_0    1174
Cep_1     681
Name: class, dtype: int64

### Training, validation and testing split

For each class, we take 50% of the objects for training, 25% for validation and the remaining 25% for testing. In case you already have a testing set, it can be loaded instead, so that only the training and validation samples change.

In [5]:
# test_saved = pd.read_csv('./data/records/asas/test_objs.csv')

In [6]:
# Per-class split of the metadata table: 50% training, 25% validation, 25% testing.
# Each list collects one sub-table per class; they are concatenated after the loop.
training_oids = []
testing_oids  = []
valids_oids   = []
for cls, subframe in meta.groupby('class'):
    subframe = subframe.sample(frac=1) # Shuffling

    # The loop locals are deliberately NOT called `train`/`test`/`valid`: `train` is a
    # function imported from core.astromer at the top of the notebook, and rebinding
    # the name here would silently shadow it for every later cell.
    train_part = subframe.sample(frac=0.5)

    # Everything not drawn for training is divided evenly between testing and validation.
    rest       = subframe[~subframe['oid'].isin(train_part['oid'])]
    test_part  = rest.sample(frac=0.5)
    valid_part = rest[~rest['oid'].isin(test_part['oid'])]

    training_oids.append(train_part)
    valids_oids.append(valid_part)
    testing_oids.append(test_part)

# NOTE: no random_state is passed to sample(), so the split differs between runs.
train_meta = pd.concat(training_oids)
valid_meta = pd.concat(valids_oids)
test_meta  = pd.concat(testing_oids)

In [7]:
# Row counts, one per line: training, validation, testing, then the full filtered table.
for frame in (train_meta, valid_meta, test_meta, meta):
    print(frame.shape[0])

10220
5112
5112
20444


We can save the subframes so that the object IDs (`oid`) of each split can be recovered later.

In [8]:
# Make sure the destination folder exists before writing into it.
os.makedirs(target, exist_ok=True)

# One CSV per split, written without the index column.
split_files = {
    'train_objs.csv': train_meta,
    'val_objs.csv': valid_meta,
    'test_objs.csv': test_meta,
}
for filename, frame in split_files.items():
    frame.to_csv(os.path.join(target, filename), index=False)

### Saving Records

Creating records requires tables with `['oid', 'mjd', 'mag', 'std', 'band']` columns, stored as `.csv` files. You can also split your entire dataset into shards to optimize reading.

In [16]:
from multiprocessing import cpu_count

In [9]:
# Load the photometry shards using Dask: every .csv file inside the shards folder.
shards_pattern = os.path.join(photshards, '*.csv')
observations = dd.read_csv(shards_pattern)

In [17]:
%%time
split_and_create_records(observations, train_meta, dest='{}/train'.format(target), njobs=cpu_count())

[INFO] Processing Cep_0 class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing Cep_1 class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing EC class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing LPV class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing RRab class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing RRc class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Records succefully created. Have a good training
CPU times: user 2min 3s, sys: 15.6 s, total: 2min 18s
Wall time: 1min 47s


In [18]:
%%time
split_and_create_records(observations, valid_meta, dest='{}/val'.format(target), njobs=cpu_count())

[INFO] Processing Cep_0 class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing Cep_1 class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing EC class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing LPV class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing RRab class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing RRc class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Records succefully created. Have a good training
CPU times: user 1min 23s, sys: 11.1 s, total: 1min 34s
Wall time: 1min 3s


In [19]:
%%time
split_and_create_records(observations, test_meta, dest='{}/test'.format(target), njobs=cpu_count())

[INFO] Processing Cep_0 class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing Cep_1 class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing EC class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing LPV class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing RRab class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing RRc class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Records succefully created. Have a good training
CPU times: user 1min 27s, sys: 11.5 s, total: 1min 39s
Wall time: 1min 6s


# Loading Data

In [20]:
cd /tf/astromer

/tf/astromer


In [22]:
from core.data import load_records, load_records_v3
import os
import tensorflow as tf
# Autoreload setup for this section. If the extension is already loaded in the running
# kernel, %load_ext only reports that fact (and suggests %reload_ext) instead of failing.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading data for pretraining and finetuning

In [62]:
# Settings shared by the loader calls below: `batch_size` is passed positionally to
# load_records_v3 and `max_obs` through its keyword of the same name.
batch_size, max_obs = 10, 200

In [63]:
# Both loaders take the same keyword options, so they are collected once.
# NOTE(review): the validation set is also loaded with is_train=True — confirm this is intended.
shared_options = dict(max_obs=max_obs, is_train=True)

train_batches = load_records_v3('./data/records_v2/alcock/train', batch_size, **shared_options)
val_batches = load_records_v3('./data/records_v2/alcock/val', batch_size, **shared_options)

Training mode
Training mode


In [64]:
# Total number of samples per set: sum of the first dimension of every batch's 'input' entry.
n_train = tf.reduce_sum([inputs['input'].shape[0] for inputs, _ in train_batches]).numpy()
n_val = tf.reduce_sum([inputs['input'].shape[0] for inputs, _ in val_batches]).numpy()
print('Train: {}\nVal: {}'.format(n_train, n_val))

Train: 38999
Val: 19351


In [67]:
# Load the testing records with is_train=False.
test_batches = load_records_v3(
    './data/records_v2/alcock/test',
    batch_size,
    max_obs=max_obs,
    is_train=False,
)

# Total number of testing samples: sum of the first dimension of every batch's 'input' entry.
n_test = tf.reduce_sum([inputs['input'].shape[0] for inputs, _ in test_batches]).numpy()
print('test: ', n_test)

Testing mode
test:  19435


### Loading data for classification

In [21]:
# All three fraction options are set to zero for classification.
masking_off = dict(msk_frac=0., rnd_frac=0., same_frac=0.)

# NOTE(review): this path points to ./data/records/ while the records created in this
# notebook are stored under ./data/records_v2/ — confirm which folder is intended.
train_batches = load_records_v3(
    './data/records/alcock/train',
    batch_size,
    max_obs=10,
    repeat=1,
    is_train=True,
    **masking_off,
)

NameError: name 'load_records_v3' is not defined

In [47]:
# Sum the 'mask_in' entry of each batch, then add up the per-batch totals.
per_batch_totals = [tf.reduce_sum(batch['mask_in']) for batch in train_batches]
masked_values = tf.reduce_sum(per_batch_totals)
print(masked_values)

tf.Tensor(0.0, shape=(), dtype=float32)


A value of zero is fine for short light curves: it means that no magnitude was masked.