# Tutorial: Creating TF Records

Last updated: 07-Oct-2021 by
Cristobal Donoso

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import dask.dataframe as dd
import os

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import create_records, split_and_create_records

# This tutorial is meant to run in eager mode only. For the optimized code use train.py.
# Otherwise, you can comment out the next line and restart the kernel before the finetuning.
# NOTE(review): the comment above talks about eager mode, yet the flag passed below is
# False — confirm which value is actually intended.
tf.config.run_functions_eagerly(False)

%load_ext autoreload
%autoreload 2

In [6]:
# Dataset identifier; every location below is derived from it.
name = 'ogle'
photshards = f'./data/raw_data/{name}/phot/'  # folder with the photometry shards
metadata = f'./data/raw_data/{name}/{name}_dataset.csv'  # metadata table
target = f'./data/records/{name}/'  # destination folder for the records

### Metadata frame sample

In [7]:
# Load the metadata table and keep only labelled objects with enough observations.
meta = pd.read_csv(metadata)
meta = meta[meta['class'] != 'UNK']  # drop objects whose class is 'UNK'
meta = meta[meta['nobs'] >= 20]      # keep rows with at least 20 in the 'nobs' column
# Balance the classes by drawing 2800 rows from each one.
# `axis` is passed by keyword: pandas 2.0 made every `concat` argument after `objs`
# keyword-only, so the positional form `pd.concat(frames, 0)` raises a TypeError there.
meta = pd.concat([group.sample(n=2800) for _, group in meta.groupby('class')], axis=0)
# Display the per-class counts and the resulting frame shape.
meta['class'].value_counts(), meta.shape

(ED       2800
 std      2800
 RRc      2800
 ESD      2800
 cep      2800
 SRV      2800
 dsct     2800
 EC       2800
 Mira     2800
 RRab     2800
 OSARG    2800
 Name: class, dtype: int64,
 (30800, 4))

### Training, validation and testing split

Within each class, 50% of the objects are used for training and the remaining half is split evenly between validation and testing (25% each). If you already have a testing set, it can be loaded instead so that only the training samples change.

In [8]:
# test_saved = pd.read_csv('./data/records/asas/test_objs.csv')

In [9]:
# Stratified split: inside every class, half of the objects go to training and the
# other half is divided evenly between testing and validation.
training_oids = []
testing_oids  = []
valids_oids   = []
for class_name, subframe in meta.groupby('class'):
    subframe = subframe.sample(frac=1) # Shuffling

    # The per-class pieces use a `_part` suffix so the `train` function imported
    # from core.astromer is not overwritten by a DataFrame.
    train_part = subframe.sample(frac=0.5)

    # Whatever was not drawn for training is split between test and validation,
    # using the 'oid' column to identify the rows already taken.
    rest       = subframe[~subframe['oid'].isin(train_part['oid'])]
    test_part  = rest.sample(frac=0.5)
    valid_part = rest[~rest['oid'].isin(test_part['oid'])]

    training_oids.append(train_part)
    valids_oids.append(valid_part)
    testing_oids.append(test_part)

# Stack the per-class pieces into one frame per split.
train_meta = pd.concat(training_oids)
valid_meta = pd.concat(valids_oids)
test_meta  = pd.concat(testing_oids)

In [10]:
# Row counts, one per line: training, validation, testing, then the full metadata table.
print(*(frame.shape[0] for frame in (train_meta, valid_meta, test_meta, meta)), sep='\n')

15400
7700
7700
30800


We can save the subframes so that the object IDs (`oid`) of each split can be recovered later.

In [11]:
# Make sure the destination folder exists, then write one CSV per split (no index column).
os.makedirs(target, exist_ok=True)
for filename, frame in (('train_objs.csv', train_meta),
                        ('val_objs.csv', valid_meta),
                        ('test_objs.csv', test_meta)):
    frame.to_csv(os.path.join(target, filename), index=False)

### Saving Records

Creating records requires tables with `['oid', 'mjd', 'mag', 'std', 'band']` columns stored as `.csv` files. You can also split your entire dataset into shards to optimize reading.

In [12]:
from multiprocessing import cpu_count

In [13]:
%%time
# Load photometries shards using DASK (THIS COULD TAKE A LOT OF MEMORY DEPENDING ON YOUR DATASET!!!)
observations = dd.read_csv(os.path.join(photshards,'*.csv')) 
observations = observations[observations['oid'].isin(meta['oid'])].compute()

CPU times: user 2min 45s, sys: 12.7 s, total: 2min 58s
Wall time: 55.2 s


In [None]:
%%time
split_and_create_records(observations, train_meta, dest='{}/train'.format(target), njobs=cpu_count())

[INFO] Processing EC class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing ED class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing ESD class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing Mira class...
[INFO] Cooking windows
[INFO] Writting records
[INFO] Processing OSARG class...


In [None]:
%%time
split_and_create_records(observations, valid_meta, dest='{}/val'.format(target), njobs=cpu_count())

In [None]:
%%time
split_and_create_records(observations, test_meta, dest='{}/test'.format(target), njobs=cpu_count())

# Loading Data

In [31]:
cd /tf/astromer

/tf/astromer


In [32]:
from core.data import load_records, load_records_v3
import os
import tensorflow as tf
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading data for pretraining and finetuning

In [33]:
# Settings passed to the record loaders below: batch size and the `max_obs` argument.
batch_size, max_obs = 10, 200

In [43]:
# Both loaders share the same keyword arguments; only the records folder differs.
# NOTE(review): the validation loader is also created with is_train=True — confirm this is intended.
loader_kwargs = dict(max_obs=max_obs, is_train=True)
train_batches = load_records('./data/records/asas/train', batch_size, **loader_kwargs)
val_batches = load_records('./data/records/asas/val', batch_size, **loader_kwargs)

Training Mode
Training Mode


In [45]:
# Sum the leading dimension of 'input' over all batches to get the sample count of each set.
n_train = tf.reduce_sum([batch['input'].shape[0] for batch in train_batches]).numpy()
n_val = tf.reduce_sum([batch['input'].shape[0] for batch in val_batches]).numpy()
print('Train: {}\nVal: {}'.format(n_train, n_val))

Train: 4426
Val: 2223


In [37]:
# Testing records are read with the v3 loader, which yields (inputs, second element) pairs;
# only the inputs are needed to count the samples.
test_batches = load_records_v3(
    './data/records/asas/test',
    batch_size,
    max_obs=max_obs,
    is_train=False,
)
n_test = tf.reduce_sum([inputs['input'].shape[0] for inputs, _ in test_batches]).numpy()
print('test: ', n_test)

Testing mode
test:  2219


### Loading data for classification

In [40]:
# Load the training records with the v3 loader for classification.
# `max_obs` comes from the shared setting defined earlier instead of a repeated literal 200,
# so every loader in this notebook stays in sync when that value is changed.
train_batches = load_records_v3('./data/records/asas/train',
                              batch_size,
                              max_obs=max_obs,
                              is_train=True)

Training mode


In [42]:
# Add up the 'mask_in' entries of every training batch and show the grand total.
per_batch_totals = [tf.reduce_sum(inputs['mask_in']) for inputs, _ in train_batches]
masked_values = tf.reduce_sum(per_batch_totals)
print(masked_values)

tf.Tensor(154024.0, shape=(), dtype=float32)


A value of zero for short light curves is OK: it means that no magnitude was masked.