# Tutorial: Creating TF Records

Last updated: 09-sep-2021 by
Cristobal Donoso

In [6]:
cd /home

/home


In [7]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
import sys,os  # NOTE(review): `os` is already imported on the previous line
# Extend the module search path before the `core` imports below are executed
# (presumably the `core` package lives in this directory — confirm).
sys.path.append(os.path.realpath('/home/users/cdonoso/astromer/workshop/'))

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import (create_dataset, 
                        pretraining_records)

# This tutorial is meant to run in eager mode. For the optimized code use train.py.
# Otherwise, comment out the next line and restart the kernel before finetuning.
# NOTE(review): the call below passes False, so it does NOT force eager execution
# of tf.function code — confirm whether True was intended.
tf.config.run_functions_eagerly(False)

# Reload imported modules automatically before each cell is executed, so edits
# to the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Locations used by the rest of the tutorial (relative to the working directory).
source, metadata, target = (
    './data/raw_data/alcock/LCs/',                # folder with the light-curve files
    './data/raw_data/alcock/alcock_dataset.dat',  # metadata file
    './data/records/alcock',                      # folder where the records are stored
)

### Metadata frame sample

In [9]:
# Load the metadata table, drop the 'UNK' and 'std' classes, expose the
# 'Unnamed: 0' and 'Path_R' columns as 'ID' and 'Path', and append a constant
# 'Band' column filled with ones.
meta = (
    pd.read_csv(metadata)
    .loc[lambda frame: ~frame['Class'].isin(['UNK', 'std'])]
    .rename(columns={'Unnamed: 0': 'ID', 'Path_R': 'Path'})
    .assign(Band=lambda frame: tf.ones(frame.shape[0]))
)
# Optional steps, disabled by default:
# meta = meta[meta['N'] >= 20]
# meta = pd.concat([frame.sample(n=100) for c, frame in meta.groupby('Class')])

In [10]:
# Show one randomly drawn metadata row together with the frame's shape.
(meta.sample(n=1), meta.shape)

(                ID Class     rPer     bPer                   Path_B  \
 13512  7.7777.2818    EC  1.24443  1.27064  ./LCs/B/7.7777.2818.dat   
 
                           Path  Band  
 13512  ./LCs/R/7.7777.2818.dat   1.0  ,
 (20894, 7))

### Lightcurve frame sample

In [11]:
# Preview one randomly chosen light curve listed in the metadata.
# Only the file name of the sampled 'Path' entry is used; the file itself is
# looked up inside `source`.
lc_path = os.path.join(source, os.path.basename(meta['Path'].sample(1).values[0]))

# The former `delim_whitespace=False` argument was dropped: False is the
# default, and the keyword is deprecated since pandas 2.2 (passing it emits a
# FutureWarning). The file is still parsed as comma-separated.
lc_df = pd.read_csv(lc_path)
lc_df.head()

Unnamed: 0,mjd,mag,err
0,48884.70703,-5.563,0.057
1,48885.67969,-5.432,0.058
2,48888.77734,-5.5,0.043
3,48894.78516,-5.291,0.092
4,48916.75391,-5.559,0.054


### From .csv to .record 

In [14]:
%%time
create_dataset(meta, source, target, max_lcs_per_record=20000, 
               n_jobs=7, subsets_frac=(0.6, 0.2))

100%|███████████████████████████████████████████| 6/6 [00:28<00:00,  4.77s/it]

CPU times: user 18.7 s, sys: 711 ms, total: 19.4 s
Wall time: 28.7 s





### Loading Data

In [14]:
# Settings handed to pretraining_records when the record files are loaded.
batch_size, max_obs = 10, 50

In [15]:
from core.data import pretraining_records

# Build one batched dataset per subset folder ('train', 'val', 'test') found
# under `target`; all three share the same batch size and `max_obs` value.
train_batches, valid_batches, test_batches = (
    pretraining_records(os.path.join(target, subset), batch_size, max_obs=max_obs)
    for subset in ('train', 'val', 'test')
)

In [16]:
# Size of the training split: add up the leading dimension of every batch's 'input'.
sum(batch['input'].shape[0] for batch in train_batches)

12534

In [46]:
# Size of the validation split: add up the leading dimension of every batch's 'input'.
sum(batch['input'].shape[0] for batch in valid_batches)

32150

In [47]:
# Size of the test split: add up the leading dimension of every batch's 'input'.
sum(batch['input'].shape[0] for batch in test_batches)

10750