# Tutorial: Creating TF Records

Last updated: 09-sep-2021 by
Cristobal Donoso

In [1]:
cd /tf/astromer

/tf/astromer


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import (create_dataset, 
                        pretraining_records)

# This tutorial is written to run in eager mode. For the optimized code use train.py.
# Otherwise, you can comment out the next line and restart the kernel before the finetuning.
# NOTE(review): the call below passes False, i.e. it does NOT force tf.functions to run
# eagerly, which does not match the "eager mode" statement above -- confirm the intended value.
tf.config.run_functions_eagerly(False)

# Load the autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
# Lightcurves folder (input).
source = './data/raw_data/ogle/OGLE/LCs/'

# Metadata file (input).
metadata = './data/raw_data/ogle/OGLE/OGLE_dataset.dat'

# Where the records will be stored (output).
target = './data/records/ogle_2500'

### Metadata frame sample

In [19]:
# Load the metadata table and keep only the rows whose `N` value is greater than 100.
# The explicit `.copy()` makes the filtered frame independent of the one returned by
# `read_csv`, so adding a column below does not trigger pandas' SettingWithCopyWarning.
meta = pd.read_csv(metadata)
meta = meta.loc[meta['N'] > 100].copy()

# Add a constant `Band` column filled with ones (one value per remaining row).
meta['Band'] = tf.ones(meta.shape[0])

### Sampling from each class 
This is for a specific experiment only! If you want to train on your whole dataset, please skip this cell (the cells below use `meta_v2`, so set `meta_v2 = meta` in that case).

In [10]:
# Build a class-balanced subset: up to SAMPLES_PER_CLASS random rows from every class.
SAMPLES_PER_CLASS = 2500
SEED = 42  # fixed seed so the sampled subset is the same on every run

# Drop the 'std' class before sampling; its rows are excluded from the result anyway.
class_groups = meta[meta['Class'] != 'std'].groupby('Class')

subsets = [
    # Cap the request at the group size: DataFrame.sample raises ValueError when asked
    # for more rows than the group holds (sampling is without replacement by default).
    group.sample(min(len(group), SAMPLES_PER_CLASS), random_state=SEED)
    for _, group in class_groups
]
meta_v2 = pd.concat(subsets)

### Lightcurve frame sample

In [11]:
# Pick one random entry of the `Path` column and keep only its last '/'-separated part.
sampled_path = meta_v2['Path'].sample(1).values[0]
lc_filename = sampled_path.split('/')[-1]

# Read that file from the `source` folder and preview its first rows.
lc_df = pd.read_csv(os.path.join(source, lc_filename))
lc_df.head()

Unnamed: 0,mjd,mag,errmag
0,2128.69818,14.074,0.006
1,2131.61604,14.098,0.007
2,2135.58611,14.102,0.006
3,2140.56664,14.098,0.005
4,2143.6735,14.089,0.005


### From .csv to .record 

In [12]:
# Convert the lightcurves listed in `meta_v2` (read from `source`) into records
# stored under `target`.
# NOTE(review): the keyword meanings below are inferred from their names only --
# confirm against core.data.create_dataset.
create_dataset(
    meta_v2,
    source,
    target,
    max_lcs_per_record=20000,
    n_jobs=7,
    subsets_frac=(0.6, 0.2),
)

  0%|          | 0/10 [00:00<?, ?it/s]

2.8510899543762207
1.5990266799926758


 10%|█         | 1/10 [00:05<00:45,  5.10s/it]

0.6447122097015381
1.9480879306793213
1.156299352645874


 20%|██        | 2/10 [00:08<00:33,  4.17s/it]

0.4200010299682617
2.2376112937927246
1.324131727218628


 30%|███       | 3/10 [00:12<00:28,  4.13s/it]

0.5059020519256592
2.2663683891296387
1.4645977020263672


 40%|████      | 4/10 [00:16<00:25,  4.17s/it]

0.5032973289489746
2.371983766555786
1.3563110828399658


 50%|█████     | 5/10 [00:21<00:20,  4.20s/it]

0.5095806121826172
2.044661045074463
1.1806573867797852


 60%|██████    | 6/10 [00:24<00:16,  4.03s/it]

0.47254037857055664
2.1118643283843994
1.1577091217041016


 70%|███████   | 7/10 [00:28<00:11,  3.93s/it]

0.45895838737487793
2.400801658630371
1.2899196147918701


 80%|████████  | 8/10 [00:32<00:08,  4.03s/it]

0.5530643463134766
2.3066084384918213
1.2485730648040771


 90%|█████████ | 9/10 [00:36<00:04,  4.04s/it]

0.5063972473144531
2.0714221000671387
1.2193186283111572


100%|██████████| 10/10 [00:40<00:00,  4.07s/it]

0.4824635982513428





### Loading Data

In [13]:
# Parameters handed to `pretraining_records` when the splits are loaded.
batch_size = 10  # passed as the batch-size argument
max_obs = 50     # forwarded through the `max_obs` keyword

In [20]:
# Build one batched dataset per split from the sub-folders of `target`.
# `pretraining_records` is already imported in the imports cell at the top of the
# notebook, so it is not imported a second time here.
train_batches = pretraining_records(os.path.join(target, 'train'), batch_size, max_obs=max_obs)
valid_batches = pretraining_records(os.path.join(target, 'val'), batch_size, max_obs=max_obs)
test_batches = pretraining_records(os.path.join(target, 'test'), batch_size, max_obs=max_obs)

[INFO] Shuffling
[INFO] Shuffling
[INFO] Shuffling


In [16]:
# Size of the training split: first-axis length of `input`, summed over all batches.
sum(batch['input'].shape[0] for batch in train_batches)

75000

In [17]:
# Size of the validation split: first-axis length of `input`, summed over all batches.
sum(batch['input'].shape[0] for batch in valid_batches)

37500

In [21]:
# Size of the test split: first-axis length of `input`, summed over all batches.
sum(batch['input'].shape[0] for batch in test_batches)

12500