# Tutorial: Creating TF Records

Last updated: 09-sep-2021 by
Cristobal Donoso

In [1]:
cd /home/users/cdonoso/astromer/ASTROMER/

/home/users/cdonoso/astromer/ASTROMER


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
import sys,os
sys.path.append(os.path.realpath('/home/users/cdonoso/astromer/ASTROMER'))

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import (create_dataset, 
                        pretraining_records)

# This tutorial is meant to run in eager mode. For the optimized code use train.py.
# Otherwise you can comment out the next line and restart the kernel before the finetuning.
# NOTE(review): the flag below is passed as False, which does not force eager execution
# of tf.function code -- confirm whether True was intended here.
tf.config.run_functions_eagerly(False)

# Load the autoreload extension and set it to mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

[H[2J

In [97]:
# Folder that holds the raw light-curve files.
source = './data/raw_data/ogle3/LCs/'
# Metadata file describing those light curves.
metadata = './data/raw_data/ogle3/ogle_dataset.dat'
# Output folder where the generated records will be stored.
target = './data/records/ogle_20'

### Metadata frame sample

In [98]:
pwd

'/home/users/cdonoso/astromer/ASTROMER'

In [99]:
# Build a small, class-balanced metadata frame for the tutorial.
MIN_OBS = 20         # light curves whose 'N' value is below this are discarded
LCS_PER_CLASS = 20   # upper bound on the number of light curves kept per class
SAMPLE_SEED = 42     # fixed seed so Restart & Run All selects the same objects

meta = pd.read_csv(metadata)
# Drop the 'UNK' and 'std' classes together with the light curves that are too short.
meta = meta[~meta['Class'].isin(['UNK', 'std']) & (meta['N'] >= MIN_OBS)]
# .assign works on a copy, so the new column is never written into a filtered view.
meta = meta.assign(Band=tf.ones(meta.shape[0]))  # constant value 1.0 for every row
meta = meta.rename(columns={'Unnamed: 0':'ID', 'Path_R':'Path'})
# Sample per class. Capping n at the class size avoids the ValueError that
# DataFrame.sample raises when n is larger than the number of available rows.
meta = pd.concat([frame.sample(n=min(LCS_PER_CLASS, len(frame)),
                               random_state=SAMPLE_SEED)
                  for _, frame in meta.groupby('Class')])

In [100]:
# Display one randomly drawn row next to the frame's (n_rows, n_columns) shape.
meta.sample(), meta.shape

(                         ID Class  \
 48859  OGLE-LMC-RRLYR-13140  RRab   
 
                                               Path    N  Band  
 48859  ./lmc/rrlyr/phot/I/OGLE-LMC-RRLYR-13140.dat  664   1.0  ,
 (200, 5))

### Lightcurve frame sample

In [101]:
# Read one randomly chosen light curve listed in the metadata frame.
# Only the file name of the metadata path is kept; it is looked up directly inside `source`.
lc_name = meta['Path'].sample(1).values[0].split('/')[-1]
# `delim_whitespace=False` was dropped: it is read_csv's default, and the keyword is
# deprecated in recent pandas releases, so passing it only produced a warning.
lc_df = pd.read_csv(os.path.join(source, lc_name))
lc_df.head()

Unnamed: 0,mjd,mag,errmag
0,2165.90781,17.642,0.027
1,2166.82639,17.595,0.017
2,2172.87516,17.617,0.018
3,2189.83242,17.669,0.024
4,2191.86676,17.582,0.016


### From .csv to .record 

In [102]:
%%time
create_dataset(meta, source, target, max_lcs_per_record=20000, 
               n_jobs=7, subsets_frac=(0.6, 0.2))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 11.60it/s]

CPU times: user 674 ms, sys: 31.9 ms, total: 706 ms
Wall time: 867 ms





### Loading Data

In [14]:
# Loader settings handed to pretraining_records in the next cell.
batch_size, max_obs = 10, 50

In [15]:
from core.data import pretraining_records

# Open the 'train', 'val' and 'test' record folders under `target` with identical
# loader settings; the generator is consumed in that order by the unpacking.
train_batches, valid_batches, test_batches = (
    pretraining_records(os.path.join(target, subset), batch_size, max_obs=max_obs)
    for subset in ('train', 'val', 'test')
)

In [16]:
# Total number of rows over the training batches (first dimension of each batch's 'input').
sum(batch['input'].shape[0] for batch in train_batches)

12534

In [46]:
# Total number of rows over the validation batches (first dimension of each batch's 'input').
sum(batch['input'].shape[0] for batch in valid_batches)

32150

In [47]:
# Total number of rows over the test batches (first dimension of each batch's 'input').
sum(batch['input'].shape[0] for batch in test_batches)

10750