# Tutorial: Creating TF Records

Last updated: 09-sep-2021 by
Cristobal Donoso

In [1]:
cd ../..

/home/users/cdonoso/astromer/ASTROMER


In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
import sys,os

# Expose no CUDA device to this process ("-1"), so the notebook runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from core.masking import get_padding_mask
from core.astromer import get_ASTROMER, train
from core.data  import (create_dataset, 
                        pretraining_records)

# This tutorial runs only in eager mode. For the optimized code use train.py
# OTHERWISE you can comment the next line and reset the kernel before the finetuning
# NOTE(review): the flag passed below is False, which does not force
# tf.function code to run eagerly -- confirm whether True was intended.
tf.config.run_functions_eagerly(False)

# Load the autoreload extension; mode 2 re-imports edited modules before
# each cell is executed, so changes to the project code are picked up live.
%load_ext autoreload
%autoreload 2

[H[2J

In [3]:
# Dataset identifier; reused further down to build the output record paths.
name = 'naul_macho'
# Metadata file (CSV) of the dataset.
metadata = './data/raw_data/naul_macho/metadata.csv'
# Folder that holds the light-curve files.
source = './data/raw_data/naul_macho/LCs/'

### Metadata frame sample

In [4]:
# Read the metadata table and report its size before any column is added.
meta = pd.read_csv(metadata)
print(meta.shape)

# Optional filters, currently disabled: drop the classes 'UNK', 'std' and
# 'Dubious', and keep only rows with N >= 20.

# Constant 'Band' column: one value of 1 per metadata row.
meta['Band'] = tf.ones(len(meta))

# Map the dataset-specific column names onto the 'ID' / 'Path' names that
# the following cells rely on.
meta = meta.rename(
    columns={
        'objID': 'ID',
        'Unnamed: 0': 'ID',
        'Path_R': 'Path',
    }
)
print(meta.shape)

# Datasets whose name contains 'atlas' get their file name built from the ID.
if 'atlas' in name:
    meta['Path'] = meta['ID'].astype(str) + '.dat'

(21470, 4)
(21470, 5)


2022-02-02 11:10:38.584902: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-02-02 11:10:38.584948: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: pececillo
2022-02-02 11:10:38.584956: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: pececillo
2022-02-02 11:10:38.585064: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.82.0
2022-02-02 11:10:38.585092: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.82.0
2022-02-02 11:10:38.585099: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.82.0
2022-02-02 11:10:38.585472: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions i

In [5]:
# Collapse the four 'LPV Wood*' sub-classes into the single label 'LPV'.
change_labels = dict.fromkeys(
    ['LPV WoodA', 'LPV WoodB', 'LPV WoodC', 'LPV WoodD'],
    'LPV',
)
meta['Class'] = meta['Class'].replace(change_labels)

In [6]:
# Number of objects per class after merging the LPV sub-classes.
meta['Class'].value_counts()

RRL AB       7403
EB           6833
LPV          3049
RRL C        1765
Ceph Fund    1185
Ceph 1st      683
RRL E         315
RRL + GB      237
Name: Class, dtype: int64

### Lightcurve frame sample

In [7]:
# Pick one random metadata row and keep only the file name of its path
# (everything after the last '/').
sample_path = meta['Path'].sample(1).values[0]
sample_file = sample_path.split('/')[-1]

# Read that light curve from the light-curves folder and preview it.
lc_df = pd.read_csv(os.path.join(source, sample_file))
lc_df.head()

Unnamed: 0,mjd,mag,std
0,48823.6441,-9.481,0.005
1,48823.6537,-9.456,0.005
2,48823.6634,-9.466,0.006
3,48824.6249,-9.466,0.002
4,48824.6344,-9.46,0.002


### Creating training partitions from .csv to .record 

In [16]:
# Size of the metadata table that is about to be split into folds.
meta.shape

(21470, 5)

In [13]:
%%time

for fold_n in range(3): 
    test_meta  = pd.concat([frame.sample(n=100) for g, frame in meta.groupby('Class')])
    train_meta = meta[~meta['ID'].isin(test_meta['ID'])]
    print(test_meta.shape, train_meta.shape)
        
    for nsamples in [0]:  
        if nsamples == 0:
            partial_meta = train_meta
            target = './data/records/{}/fold_{}/{}'.format(name, fold_n, name)
        else:
            partial_meta = pd.concat([frame.sample(n=nsamples) for c, frame in train_meta.groupby('Class')])
            target = './data/records/{}/fold_{}/{}_{}'.format(name, fold_n, name, nsamples)

        create_dataset(partial_meta, source, target, max_lcs_per_record=20000, 
                       n_jobs=7, subsets_frac=(0.8, 0.2), test_subset=test_meta,  
                       names=['mjd', 'mag', 'errmag'],
                       delim_whitespace=True)

(1000, 5) (357288, 5)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [06:39<00:00, 39.94s/it]


(1000, 5) (357288, 5)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [06:37<00:00, 39.75s/it]


(1000, 5) (357288, 5)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [06:37<00:00, 39.78s/it]

CPU times: user 16min 54s, sys: 20.2 s, total: 17min 15s
Wall time: 19min 55s





### JUST ONE DATASET

In [17]:
# Show which dataset identifier the next cell will write records for.
name

'naul_macho'

In [19]:
# Write three folds using a plain random 80/20 split of the metadata
# (no per-class balancing).
# NOTE(review): the target path pattern is the same one used by the
# stratified cell above (nsamples == 0), so both cells write to the same
# location -- run only one of them per dataset.
for fold_n in range(3):
    target = './data/records/{}/fold_{}/{}'.format(name, fold_n, name)

    # 20% of the objects are held out for testing. Seeding with the fold
    # index keeps each fold reproducible across kernel restarts while the
    # folds still differ from each other.
    test_meta = meta.sample(frac=0.2, random_state=fold_n)
    train_meta = meta[~meta['ID'].isin(test_meta['ID'])]

    create_dataset(train_meta, source, target, max_lcs_per_record=20000,
                   n_jobs=7, subsets_frac=(0.8, 0.2),
                   test_subset=test_meta)

100%|████████████████████████████████████████████████████████████████████| 8/8 [01:02<00:00,  7.83s/it]
100%|████████████████████████████████████████████████████████████████████| 8/8 [01:02<00:00,  7.85s/it]
100%|████████████████████████████████████████████████████████████████████| 8/8 [01:02<00:00,  7.76s/it]


In [17]:
# Strip the 'new_' marker from every entry inside each fold directory.
# NOTE(review): root points at 'new_ogle' while the rest of the notebook
# works on name = 'naul_macho' -- confirm this cell is still needed.
root = './data/records/new_ogle/'
for fold_n in range(3):
    fold_f = os.path.join(root, 'fold_{}'.format(fold_n))
    # Dedicated loop names are used on purpose: iterating with `source` /
    # `target` would overwrite the notebook-level `source` (light-curves
    # folder) that the record-creation cells pass to create_dataset.
    for old_entry in os.listdir(fold_f):
        new_entry = old_entry.replace('new_', '')
        os.rename(os.path.join(fold_f, old_entry), os.path.join(fold_f, new_entry))