# Tutorial: Creating TF Records

Last updated: 09-sep-2021 by
Cristobal Donoso

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import (create_dataset, 
                        pretraining_records)

# This tutorial runs only in eager mode. For the optimized code use train.py
# OTHERWISE you can comment out the next line and reset the kernel before the finetuning
# NOTE(review): the call below passes False, which leaves tf.function-decorated
# code running as compiled graphs rather than eagerly — confirm whether True was intended.
tf.config.run_functions_eagerly(False)

%load_ext autoreload
%autoreload 2

In [5]:
# Folder holding one light-curve file per object
source = './data/raw_data/alcock/LCs/'
# Metadata table describing every light curve
metadata = './data/raw_data/alcock/alcock_dataset.dat'
# Output folder where the generated records will be stored
target = './data/records/alcock_500'

### Metadata frame sample

In [6]:
# Build the metadata frame in a single chain so that no column is ever written
# into a filtered slice (the pattern that triggers SettingWithCopyWarning).
meta = (
    pd.read_csv(metadata)
    # Keep only rows whose N column exceeds 20
    # (N is presumably the number of observations per light curve — confirm).
    .loc[lambda frame: frame['N'] > 20]
    # Constant band flag for every row. A plain scalar replaces the TensorFlow
    # tensor; the cast keeps the float32 dtype that tf.ones produced by default.
    .assign(Band=1.0)
    .astype({'Band': 'float32'})
    # If the file carries an unnamed first column, expose it as the ID column.
    .rename(columns={'Unnamed: 0': 'ID'})
)
meta

Unnamed: 0,ID,Class,Path,Field,Band,N
0,1.3319.10,LPV,1.3319.10.dat,1.0,1.0,613
1,1.3441.15,Cep_0,1.3441.15.dat,1.0,1.0,725
2,1.3441.25,LPV,1.3441.25.dat,1.0,1.0,676
3,1.3441.45,Cep_0,1.3441.45.dat,1.0,1.0,706
4,1.3441.1031,RRab,1.3441.1031.dat,1.0,1.0,302
...,...,...,...,...,...,...
21439,9.5608.870,RRab,9.5608.870.dat,1.0,1.0,308
21440,9.5608.946,RRab,9.5608.946.dat,1.0,1.0,350
21441,9.5609.22,EC,9.5609.22.dat,1.0,1.0,190
21442,9.5609.790,RRab,9.5609.790.dat,1.0,1.0,238


### Sampling from each class 
This is for a specific experiment only! If you want to train on your whole dataset, please skip this cell (and use `meta` wherever `meta_v2` appears in the cells below).

In [7]:
# Draw a class-balanced subset of the metadata.
SAMPLES_PER_CLASS = 500  # upper bound of light curves kept per class
SAMPLING_SEED = 42       # fixed seed so a fresh "Restart & Run All" yields the same subset

class_groups = meta.groupby('Class')
subsets = [
    # Cap the request at the group size: asking for more rows than a class has
    # raises ValueError when sampling without replacement.
    group.sample(n=min(len(group), SAMPLES_PER_CLASS), random_state=SAMPLING_SEED)
    for label, group in class_groups
    if label != 'std'  # the 'std' class is excluded from the experiment
]
meta_v2 = pd.concat(subsets)

### Lightcurve frame sample

In [8]:
# Preview one randomly chosen light curve from the sampled metadata
random_path = meta_v2['Path'].sample(1).values[0]
file_name = random_path.split('/')[-1]  # drop any directory prefix, keep the file name only
lc_df = pd.read_csv(os.path.join(source, file_name))
lc_df.head()

Unnamed: 0,mjd,mag,err
0,48823.75781,-8.734,0.006
1,48824.78125,-8.755,0.006
2,48825.78516,-8.743,0.005
3,48828.74609,-8.754,0.005
4,48829.72656,-8.752,0.004


### From .csv to .record 

In [9]:
# Convert the light curves listed in `meta_v2` (read from `source`) into
# record files written under `target`.
create_dataset(
    meta_v2,
    source,
    target,
    max_lcs_per_record=20000,
    n_jobs=7,                  # presumably the number of parallel workers — confirm in core.data
    subsets_frac=(0.6, 0.2),   # presumably the train / validation fractions — confirm in core.data
)

  0%|          | 0/7 [00:00<?, ?it/s]

1.046903133392334


 14%|█▍        | 1/7 [00:01<00:09,  1.64s/it]

0.4184262752532959
0.17228364944458008
0.7055926322937012


 29%|██▊       | 2/7 [00:02<00:07,  1.41s/it]

0.37753915786743164
0.16222643852233887
0.6253464221954346


 43%|████▎     | 3/7 [00:04<00:05,  1.28s/it]

0.3583824634552002
0.14584922790527344
0.6548886299133301


 57%|█████▋    | 4/7 [00:05<00:03,  1.25s/it]

0.3924129009246826
0.16194772720336914
0.6047682762145996


 71%|███████▏  | 5/7 [00:06<00:02,  1.21s/it]

0.3891594409942627
0.1432650089263916
0.5869824886322021


 86%|████████▌ | 6/7 [00:07<00:01,  1.17s/it]

0.3522148132324219
0.1454932689666748
0.5790619850158691


100%|██████████| 7/7 [00:08<00:00,  1.22s/it]

0.37667369842529297
0.1392984390258789





### Loading Data

In [52]:
# Loader settings forwarded to `pretraining_records` in the next cell
# (max_obs is presumably the number of observations kept per sample — confirm in core.data)
batch_size, max_obs = 10, 50

In [53]:
# Build one batched dataset per split from the records written under `target`.
# `pretraining_records` is already imported in the first code cell, so it is not
# re-imported here; the three loaders share identical settings and differ only
# in the sub-folder they read.
train_batches, valid_batches, test_batches = (
    pretraining_records(os.path.join(target, subset), batch_size, max_obs=max_obs)
    for subset in ('train', 'val', 'test')
)

[INFO] Shuffling
[INFO] Shuffling
[INFO] Shuffling


In [54]:
# Total samples in the training split: add up the leading dimension of every batch's 'input'
sum(batch['input'].shape[0] for batch in train_batches)

2100

In [17]:
# Total samples in the validation split: add up the leading dimension of every batch's 'input'
sum(batch['input'].shape[0] for batch in valid_batches)

37500

In [21]:
# Total samples in the test split: add up the leading dimension of every batch's 'input'
sum(batch['input'].shape[0] for batch in test_batches)

12500