# Tutorial: Creating TF Records

Last updated: 09-sep-2021 by
Cristobal Donoso

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
import sys,os
# NOTE(review): hard-coded, user-specific absolute path appended to the import search
# path — presumably so the `core` package imported below can be found; confirm it is
# still needed, since the notebook already changes into /tf/astromer in its first cell.
sys.path.append(os.path.realpath('/home/users/cdonoso/astromer/workshop/'))

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import (create_dataset, 
                        pretraining_records)

# Execution-mode switch for tf.function-decorated code.
# Original note: this tutorial is meant to run in eager mode; for the optimized code
# use train.py. Otherwise, comment out the next line and restart the kernel before
# the fine-tuning.
# NOTE(review): the flag below is False, which keeps tf.functions running as graphs
# (TensorFlow's default) and contradicts the note above — confirm whether True was intended.
tf.config.run_functions_eagerly(False)

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [17]:
# Input/output locations, relative to the current working directory.
source = './data/raw_data/ogle/LCs/' # folder holding the light-curve files
metadata = './data/raw_data/ogle/OGLE_dataset.dat' # metadata file, read with pd.read_csv
target = './data/records/ogle_100' # where records will be stored (train/val/test are read back from here)

### Metadata frame sample

In [18]:
# Build the metadata frame that drives the record creation.
SAMPLES_PER_CLASS = 100  # rows kept per class (subsampling for this experiment only)
SAMPLING_SEED = 42       # fixed seed so the same objects are drawn on every run

meta = (
    pd.read_csv(metadata)
    # Discard the 'UNK' and 'std' classes and every row whose N is below 20.
    .loc[lambda df: ~df['Class'].isin(['UNK', 'std']) & (df['N'] >= 20)]
    .rename(columns={'Unnamed: 0': 'ID', 'Path_R': 'Path'})
    # Constant 'Band' column of ones, kept as float32 (the dtype of the tf.ones
    # tensor this replaces). Built with assign() on the chained result, so no
    # column is written into a filtered slice.
    .assign(Band=lambda df: pd.Series(1.0, index=df.index, dtype='float32'))
)

# Per-class subsample. Skip this statement to keep the whole dataset.
# n is capped at the class size so a class with fewer than SAMPLES_PER_CLASS rows
# is kept entirely instead of raising ValueError.
meta = pd.concat([
    frame.sample(n=min(len(frame), SAMPLES_PER_CLASS), random_state=SAMPLING_SEED)
    for _, frame in meta.groupby('Class')
])

In [19]:
# Show one randomly drawn row together with the frame's (rows, columns) shape.
meta.sample(), meta.shape



(                       ID Class                                    Path    N  \
 176345  OGLE-SMC-ECL-1635    ED  ./smc/ecl/phot/I/OGLE-SMC-ECL-1635.dat  713   
 
         Band  
 176345   1.0  ,
 (1000, 5))

### Sampling from each class 
This is for a specific experiment only! If you want to train on your whole dataset, skip the per-class sampling step (the `pd.concat` over `meta.groupby('Class')` in the metadata cell above).

In [20]:
# Display a single randomly drawn metadata row.
meta.sample(1)

Unnamed: 0,ID,Class,Path,N,Band
5214,OGLE-LMC-ECL-14768,ED,./lmc/ecl/phot/I/OGLE-LMC-ECL-14768.dat,539,1.0


### Lightcurve frame sample

In [21]:
# Load the light curve of one randomly chosen object as a quick sanity check.
# Only the file name of the 'Path' entry is used: the file is looked up directly
# inside `source`, ignoring the directory part stored in the metadata.
sample_path = meta['Path'].sample(1).values[0]
lc_file = os.path.join(source, sample_path.split('/')[-1])

# Whitespace-separated columns without a header row. sep=r'\s+' is the documented
# equivalent of delim_whitespace=True, which is deprecated in recent pandas releases.
lc_df = pd.read_csv(lc_file, sep=r'\s+', names=['mjd', 'mag', 'std'])
lc_df.head()

Unnamed: 0,mjd,mag,std
0,2166.86329,21.116,0.38
1,2189.86887,21.285,0.57
2,2192.77176,20.97,0.289
3,2201.78397,20.698,0.207
4,2212.81622,20.36,0.208


### From .csv to .record 

In [22]:
%%time
create_dataset(meta, source, target, max_lcs_per_record=20000, 
               n_jobs=7, subsets_frac=(0.6, 0.2))

100%|██████████| 10/10 [00:04<00:00,  2.29it/s]

CPU times: user 4.09 s, sys: 78.5 ms, total: 4.17 s
Wall time: 4.37 s





### Loading Data

In [43]:
# Settings handed to pretraining_records when the record subsets are loaded.
batch_size = 10  # passed as the second positional argument
max_obs = 50     # passed as max_obs=...; presumably the number of observations per sample — TODO confirm

In [44]:
from core.data import pretraining_records

# Open the 'train', 'val' and 'test' sub-folders of `target`, in that order,
# with the same batch size and max_obs for each.
train_batches, valid_batches, test_batches = [
    pretraining_records(os.path.join(target, subset), batch_size, max_obs=max_obs)
    for subset in ('train', 'val', 'test')
]

[INFO] Shuffling
[INFO] Shuffling
[INFO] Shuffling


In [45]:
# Total number of training samples: add up the leading dimension of every batch's 'input' entry.
sum(batch['input'].shape[0] for batch in train_batches)

2021-11-04 09:08:10.522343: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


64320

In [46]:
# Total number of validation samples: add up the leading dimension of every batch's 'input' entry.
sum(batch['input'].shape[0] for batch in valid_batches)

32150

In [47]:
# Total number of test samples: add up the leading dimension of every batch's 'input' entry.
sum(batch['input'].shape[0] for batch in test_batches)

10750