In this notebook, we will try to create a function to read a parquet file (or multiple ones) and filter it by IDs.

In [1]:
import pandas as pd
import numpy as np
from os.path import join
import polars as pl
import os

In [2]:
# Root folder of the raw dataset; the other cells build the metadata CSV path,
# the 'LCs/' light-curve paths and the 'parquets/' output folder from it.
local_path = '../../data/raw_data/alcock/'

Read the metadata and save it into a single parquet file.

In [3]:
# Load the metadata table stored next to the light curves.
metadata_csv = join(local_path, 'metadata.csv')
metadata = pd.read_csv(metadata_csv)

In [4]:
# Full path of every light-curve file: '<local_path>LCs/<Path column>'.
lc_prefix = local_path + 'LCs/'
path = (lc_prefix + metadata['Path']).tolist()

# Original identifiers, in the same row order as `path`.
IDs = metadata['ID'].tolist()

# Add a `newID` column holding the values of the metadata index.
metadata = metadata.assign(newID=metadata.index.to_numpy())

In [5]:
# Output folder for the parquet shards.
path_parquets = join(local_path, 'parquets/')
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates
# missing parent folders and does not raise if the folder already exists.
os.makedirs(path_parquets, exist_ok=True)

In [6]:
# Read every light-curve CSV, tag each of its rows with the light curve's
# newID, and stack everything into one DataFrame indexed by newID.
frames = []
for file, new_id in zip(path, metadata['newID'].to_numpy()):
    lightcurve = pd.read_csv(file, engine='c', na_filter=False)
    # One int64 newID value per row of this light curve.
    lightcurve['newID'] = np.full(lightcurve.shape[0], new_id, dtype=np.int64)
    frames.append(lightcurve)
dfs = pd.concat(frames).set_index('newID')

Save the light curves in parquet shards of 1000 light curves each.


In [7]:
# Write the light curves to parquet shards, N_batch light curves per shard.
N_batch = 1000

# `dfs` is indexed by newID and all rows of one light curve share that newID,
# so shards must be cut on the unique IDs. Slicing rows positionally
# (dfs.iloc[n0:n1]) would put N_batch *rows* in each shard and silently drop
# every row beyond batches * N_batch.
unique_ids = dfs.index.unique()
batches = int(np.ceil(unique_ids.shape[0] / N_batch))
for batch in range(batches):
    n0 = N_batch*batch
    n1 = N_batch*(batch+1)
    # IDs of the light curves that belong to this shard.
    batch_ids = unique_ids[n0:n1]
    # Keep every row whose newID is in the shard, preserving row order.
    df_sel = dfs[dfs.index.isin(batch_ids)]
    # Zero-padded shard number, e.g. 'shard_007.parquet'.
    n = str(batch).rjust(3, '0')
    df_sel.to_parquet(os.path.join(path_parquets, 'shard_'+n+'.parquet'))

In [8]:
# Turn the class names into a categorical and expose its integer codes as the
# numeric `Label` column.
class_categorical = pd.Categorical(metadata['Class'])
metadata['Class'] = class_categorical
metadata['Label'] = class_categorical.codes

In [9]:
# Persist the enriched metadata (without the DataFrame index) as parquet.
new_metadata_path = join(local_path, 'new_metadata.parquet')
metadata.to_parquet(new_metadata_path, index=False)