# Tutorial: Creating TF Records

Last updated: 07-Oct-2021 by
Cristobal Donoso

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import dask.dataframe as dd
import os

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import create_records

# The original note says this tutorial runs only in eager mode and that the optimized
# code lives in train.py; otherwise you can comment out the next line and restart the
# kernel before the finetuning.
# NOTE(review): the call below passes False, which does not force tf.functions to run
# eagerly -- confirm whether True was intended here.
tf.config.run_functions_eagerly(False)

# Load the autoreload extension and set it to mode 2 so edited project modules are re-imported
%load_ext autoreload
%autoreload 2

In [3]:
# Dataset identifier; every path below is derived from it
name = 'alcock'

# Folder holding the photometry shards
photshards = f'./data/raw_data/{name}/phot/'
# Metadata table of the dataset
metadata = f'./data/raw_data/{name}/{name}_dataset.csv'
# Destination folder for the generated records
target = f'./data/records/{name}/'

### Metadata frame sample

In [4]:
meta = pd.read_csv(metadata)
print(meta.shape)

# Keep only labelled objects ('UNK' marks unknown ones) with at least 20 observations
is_labelled = meta['class'] != 'UNK'
has_enough_obs = meta['nobs'] >= 20
meta = meta[is_labelled & has_enough_obs]
print(meta.shape)

# Number of remaining objects per class
meta['class'].value_counts()

(21444, 3)
(20867, 3)


RRab     7381
EC       6814
LPV      3046
RRc      1761
Cep_0    1182
Cep_1     683
Name: class, dtype: int64

### Training, validation and testing split

We take 100 samples per class for the testing dataset. The rest is used for training and validation. At the beginning of training, the record loader function will take a random fraction of the samples for validation.

In [5]:
TEST_PER_CLASS = 100  # objects held out for testing in every class
SPLIT_SEED = 42       # fixed seed so the split is the same on every re-run

training_oids = []
testing_oids = []
for _, class_frame in meta.groupby('class'):
    # Shuffle the rows of this class reproducibly
    shuffled = class_frame.sample(frac=1, random_state=SPLIT_SEED)

    # The splits are appended directly instead of being bound to names such as
    # `train`, which would overwrite the `train` function imported from core.astromer.
    testing_oids.append(shuffled.iloc[:TEST_PER_CLASS])   # first rows go to testing
    training_oids.append(shuffled.iloc[TEST_PER_CLASS:])  # the rest: training and validation

# One metadata frame per split, with all classes stacked together
train_meta = pd.concat(training_oids)
test_meta = pd.concat(testing_oids)

We can save the subframes to recover the object IDs (oids) of each split later.

In [6]:
# to_csv does not create missing folders, so make sure the records folder exists
# before writing (this cell runs before any record has been created).
os.makedirs(target, exist_ok=True)

# Persist the metadata of each split so the object IDs can be recovered later
train_meta.to_csv(os.path.join(target, 'train_objs.csv'), index=False)
test_meta.to_csv(os.path.join(target, 'test_objs.csv'), index=False)

### Saving Records

Creating records requires tables with `['oid', 'mjd', 'mag', 'std', 'band']` columns stored as `.csv` files. You can also split your entire dataset into shards to optimize reading.

In [7]:
# Load the photometry shards with Dask; the glob pattern matches every .csv file
# found in the shards folder.
observations = dd.read_csv(os.path.join(photshards,'*.csv'))

In [8]:
%%time
create_records(observations, train_meta, dest='./data/records/{}/train'.format(name))

[INFO] Records succefully created. Have a good training
CPU times: user 2min 23s, sys: 17.5 s, total: 2min 40s
Wall time: 1min 50s


In [9]:
%%time
create_records(observations, test_meta, dest='./data/records/{}/test'.format(name), njobs=4)

[INFO] Records succefully created. Have a good training
CPU times: user 45.8 s, sys: 4.39 s, total: 50.2 s
Wall time: 18.9 s


# Loading Data

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
from core.data import load_records
import os
import tensorflow as tf
%load_ext autoreload
%autoreload 2

### Loading data for pretraining and finetuning

In [8]:
# Loader settings shared by the cells below:
#   batch_size -> second positional argument of the record loaders
#   max_obs    -> forwarded as the loaders' `max_obs` keyword
batch_size, max_obs = 10, 50

In [9]:
# Training and validation batches read from the training records
train_batches, val_batches = load_records(
    './data/records/alcock/train',
    batch_size,
    max_obs=max_obs,
    val_data=10,       # either fraction (0, 1) or number of samples per class
    no_shuffle=False,
    repeat=1,
)

In [84]:
# Count samples by adding up the leading (batch) dimension of every batch
n_train = tf.reduce_sum([b['input'].shape[0] for b in train_batches]).numpy()
n_val = tf.reduce_sum([b['input'].shape[0] for b in val_batches]).numpy()
print('Train: {}\nVal: {}'.format(n_train, n_val))

Train: 60
Val: 60


In [85]:
# `pretraining_records` is not imported by this notebook's import cell, so on a fresh
# kernel this cell raised NameError.
# NOTE(review): assumed to be defined in core.data next to load_records -- confirm.
from core.data import pretraining_records

test_batches = pretraining_records('./data/records/alcock/test', batch_size, max_obs=max_obs)

# Count samples by adding up the leading (batch) dimension of every batch
n_test = tf.reduce_sum([b['input'].shape[0] for b in test_batches]).numpy()
print('test: ', n_test)

test:  600


### Loading data for classification

In [6]:
# Same loader call as for pretraining, but with every masking fraction set to zero
train_batches, val_batches = load_records(
    './data/records/alcock/train',
    batch_size,
    max_obs=max_obs,
    val_data=10,       # either fraction (0, 1) or number of samples per class
    no_shuffle=False,
    msk_frac=0.,
    rnd_frac=0.,
    same_frac=0.,
    repeat=1,
)

In [14]:
# Sum mask_in inside each batch first, then across all training batches
per_batch_totals = [tf.reduce_sum(batch['mask_in']) for batch in train_batches]
masked_values = tf.reduce_sum(per_batch_totals)
print(masked_values)

tf.Tensor(0.0, shape=(), dtype=float32)


A total of 0 is expected when loading data for a specific task such as classification: it means that no magnitude was masked.

### Loading data for getting embeddings

In [3]:
import pandas as pd
from core.pretrained import ASTROMER_v1

In [4]:
# Metadata of the training objects, read from the train_objs.csv file in the records folder
objects = pd.read_csv('./data/records/alcock/train_objs.csv')

In [13]:
# Batch size used for the embedding extraction below
batch_size = 300

# Number of complete batches that fit in the object table
objects.shape[0] // batch_size

67

In [14]:
astromer_size = 200  # length of each window passed to the encoder

# Longest light curve in the object table
longest = objects['nobs'].max()

# Ceiling division gives the number of windows needed to cover the longest light curve.
# The previous `maxobs + astromer_size - rest` added a whole extra all-padding window
# whenever the length was already an exact multiple of astromer_size.
# max(1, ...) keeps at least one window, as before, for a zero-length input.
n_windows = max(1, -(-longest // astromer_size))

# Padded length requested from the loader: an exact multiple of the window size,
# so that the later split along axis 1 into n_windows parts is even.
maxobs = n_windows * astromer_size

In [15]:
# Single set of batches (val_data=0) with no masking, padded to the window-aligned length
train_batches = load_records(
    './data/records/alcock/train',
    batch_size,
    max_obs=maxobs,
    val_data=0,        # either fraction (0, 1) or number of samples per class
    no_shuffle=False,
    msk_frac=0.,
    rnd_frac=0.,
    same_frac=0.,
    repeat=1,
)

In [16]:
# Instantiate the ASTROMER_v1 wrapper imported from core.pretrained
model = ASTROMER_v1()
# Fetch the layer named 'encoder' from the wrapped model; it is the callable used
# for inference in the cells below.
encoder = model.model.get_layer('encoder')

In [23]:
import time  # used to time each batch in the embedding loop below
n_windows  # display the number of windows each batch is split into

9

In [24]:
def batch_inference(x, t, m, encoder):
    """Run `encoder` on one window and return its output as a NumPy value.

    Args:
        x: value passed to the encoder under the 'input' key.
        t: value passed to the encoder under the 'times' key.
        m: value passed to the encoder under the 'mask_in' key.
        encoder: callable taking a dict with those three keys; its result must
            expose a `.numpy()` method.

    Returns:
        Whatever `encoder(...).numpy()` returns.
    """
    return encoder({'input': x, 'times': t, 'mask_in': m}).numpy()

In [25]:
%%time
embeddings = []
for batch in train_batches:
    start = time.time()
    w_inp  = tf.split(batch['input'], n_windows, axis=1)
    w_time = tf.split(batch['times'], n_windows, axis=1)
    w_mask = tf.split(batch['mask_in'], n_windows, axis=1)
    
    batch_windows = []
    for x,t,m in zip(w_inp, w_time, w_mask):
        embs = batch_inference(x,t,m,encoder)
        batch_windows.append(embs)
        
    batch_emb = tf.concat(batch_windows, 1)
    bool_mask = tf.logical_not(tf.cast(tf.squeeze(batch['mask_in']), tf.bool))
    valid = tf.ragged.boolean_mask(batch_emb, bool_mask)
    embeddings.append(valid)
    
    end = time.time() 
    
    print('{:.2f}'.format(end-start))

13.59
14.10
15.90
16.71
19.25
20.78
19.78
19.96
21.32
20.26
20.04
18.94
18.80
18.69
18.96
18.85
19.54
19.70
19.14
18.67
18.23
18.45
18.35
18.67
18.31
18.09
18.62
18.84
18.65
18.41
18.56
18.00
17.76
18.39
18.62
18.56
21.46
20.54
20.53
19.97
18.61


KeyboardInterrupt: 