In [4]:
from pathlib import Path
import light_curve as licu
import dask.dataframe as dd
from tape import Ensemble, ColumnMapper
import numpy as np

# Where the uncompressed plasticc CSV files live, relative to this notebook.
DATA_DIR = Path("../data/plasticc/uncompressed")

# Test-set inputs: one metadata CSV plus eleven light-curve CSVs whose
# names carry a zero-padded index running from 01 to 11.
META_FILENAME = "plasticc_test_metadata.csv"
LC_FILENAMES = list(
    f"plasticc_test_lightcurves_{i:02d}.csv" for i in range(1, 12)
)
# Training-set alternative: swap in for the two assignments above.
# META_FILENAME = "plasticc_train_metadata.csv.gz"
# LC_FILENAMES = ['plasticc_train_lightcurves.csv.gz']

# Equal to the number of light-curve files listed above.
N_PARTITIONS = len(LC_FILENAMES)

# NOTE(review): not referenced in the visible cells; presumably a worker
# count used further down the notebook. Confirm before changing.
N_PROCESSORS = 3

In [5]:
# In TAPE's (and LSST's) terminology, sources are individual detections,
# and objects are the underlying astrophysical objects.

# Chunk size, in bytes (100 MB), that dask uses when splitting the CSV files.
# Kept as an integer because dask documents `blocksize` as str, int or None,
# and defined once so both reads below use the same chunk size.
CSV_BLOCKSIZE = 100_000_000

# We load the object table first, from the metadata file.
print("Loading object table...")
object_table = dd.read_csv(
    DATA_DIR / META_FILENAME,
    # Read data chunk by chunk, to avoid loading the whole file into memory.
    blocksize=CSV_BLOCKSIZE,
)

# Then we load the sources, passing every light-curve file in a single call
# so they end up in one dask dataframe.
print("Loading source tables...")
source_table = dd.read_csv(
    [DATA_DIR / filename for filename in LC_FILENAMES],
    blocksize=CSV_BLOCKSIZE,
)

Loading object table...
Loading source tables...


In [8]:
# Preview the first five rows of the object (metadata) table.
object_table.head(n=5)

Unnamed: 0,object_id,ra,decl,ddf_bool,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target,...,true_rv,true_av,true_peakmjd,libid_cadence,tflux_u,tflux_g,tflux_r,tflux_i,tflux_z,tflux_y
0,13,34.4531,-5.2295,1,0.305,0.319,0.054,41.112,0.019,0,...,0.0,0.0,60499.461,124,0.0,0.0,0.0,0.0,0.0,0.0
1,14,33.3984,-4.3311,1,-9.0,0.632,0.018,42.877,0.018,0,...,0.0,0.0,59792.121,120,0.0,0.0,0.0,0.0,0.0,0.0
2,17,348.5294,-61.7554,1,-9.0,0.83,0.06,43.6,0.016,0,...,0.0,0.0,60543.566,85,0.0,0.0,0.0,0.0,0.0,0.0
3,23,34.8047,-5.8292,1,-9.0,0.653,0.148,42.964,0.023,0,...,0.0,0.0,60137.48,97,0.0,0.0,0.0,0.0,0.0,0.0
4,34,351.3214,-64.1987,1,0.456,0.462,0.012,42.054,0.023,0,...,0.0,0.0,60245.078,68,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Persist the source (light-curve) table as a parquet dataset, leaving the
# dataframe index out of the written files.
source_table.to_parquet(
    "../data/plasticc/parquet/source/",
    write_index=False,
)

In [9]:
# Persist the object (metadata) table as a parquet dataset, leaving the
# dataframe index out of the written files.
object_table.to_parquet(
    "../data/plasticc/parquet/object/",
    write_index=False,
)