In [1]:
cd /home

/home


In this notebook, we will try to create a function to read a a parquet file (or multiple ones) and filter by IDs.

In [2]:
import pandas as pd
import numpy as np
from os.path import join
import polars as pl
import os

In [3]:
pwd

'/home'

In [5]:
local_path = './data/raw_data/alcock/'

Read the metadata and save it into one parquet.

In [7]:
metadata = pd.read_csv(join(local_path, 'train_metadata.csv'))
metadata['sset'] = ['train']*metadata.shape[0]
metadata

Unnamed: 0,ID,Class,Path,Band,sset
0,1.3319.10,LPV,1.3319.10.dat,1.0,train
1,1.3441.15,Cep_0,1.3441.15.dat,1.0,train
2,1.3441.25,LPV,1.3441.25.dat,1.0,train
3,1.3441.45,Cep_0,1.3441.45.dat,1.0,train
4,1.3441.1031,RRab,1.3441.1031.dat,1.0,train
...,...,...,...,...,...
21439,9.5608.870,RRab,9.5608.870.dat,1.0,train
21440,9.5608.946,RRab,9.5608.946.dat,1.0,train
21441,9.5609.22,EC,9.5609.22.dat,1.0,train
21442,9.5609.790,RRab,9.5609.790.dat,1.0,train


In [6]:
testmetadata = pd.read_csv(join(local_path, 'test_meta.csv'))
testmetadata['sset'] = ['test']*testmetadata.shape[0]
testmetadata

Unnamed: 0,ID,Path,N,Band,Class,sset
0,F_10.4278.2380,F_10.4278.2380.dat,835,R,NonVar,test
1,F_10.4278.2381,F_10.4278.2381.dat,834,R,NonVar,test
2,F_10.4278.2382,F_10.4278.2382.dat,846,R,NonVar,test
3,F_10.4278.2383,F_10.4278.2383.dat,826,R,NonVar,test
4,F_10.4278.2384,F_10.4278.2384.dat,848,R,NonVar,test
...,...,...,...,...,...,...
74589,F_10.4277.5224,F_10.4277.5224.dat,780,R,NonVar,test
74590,F_10.4277.5225,F_10.4277.5225.dat,778,R,NonVar,test
74591,F_10.4277.5231,F_10.4277.5231.dat,790,R,NonVar,test
74592,F_10.4277.5240,F_10.4277.5240.dat,798,R,NonVar,test


In [7]:
metadata = pd.concat([metadata, testmetadata])

In [8]:
path = (local_path + 'LCs/'+ metadata.Path).to_list()
IDs = metadata.ID.to_list()
metadata = metadata.assign(newID=metadata.index.values)

In [9]:
path_parquets = join(local_path, 'parquets/')
if not os.path.exists(path_parquets):
    os.mkdir(path_parquets)

In [10]:
dfs = []
for cont, (file, id_) in enumerate(zip(path, IDs)):
    # Read the file
    df = pd.read_csv(file, engine='c', na_filter=False)
    df['newID'] = metadata.newID.iloc[cont]*np.ones(df.shape[0]).astype(np.int64)
    dfs.append(df)
dfs = pd.concat(dfs) 
dfs = dfs.set_index('newID')

In [11]:
dfs

Unnamed: 0_level_0,observation_date,red magnitude,red error
newID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,49075.7738,-6.422,0.035
0,49078.7742,-6.450,0.028
0,49080.8128,-6.441,0.032
0,49082.7526,-6.424,0.041
0,49091.8035,-6.416,0.053
...,...,...,...
74593,51505.7017,-2.658,0.504
74593,51510.6614,-2.935,0.204
74593,51513.5102,-2.810,0.231
74593,51514.6641,-2.988,0.273


Save every 1000 lightcurves


In [12]:
# dfs.iloc[100000:101000]

In [14]:
nsamples = 1000
for batch, begin in enumerate(np.arange(0, dfs.shape[0], nsamples)):
    df_sel = dfs.iloc[begin:begin+nsamples]    
    n = str(batch).rjust(3, '0')
    df_sel.to_parquet(os.path.join(path_parquets, 
                                   'shard_'+n+'.parquet'))

In [9]:
metadata['Label'] = pd.Categorical(metadata['Class']).codes

In [10]:
metadata.to_parquet(join(local_path, 'new_metadata.parquet'), index=False)

In [23]:
import polars as pl

In [24]:
path_parquets = './data/raw_data/alcock/parquets/*'
# path_parquets = [os.path.join(root, x) for x in os.listdir(root)]
paths = os.path.join(path_parquets)
scan = pl.scan_parquet(paths)

In [25]:
# scan.collect(streaming=False)['newID'].unique()

In [26]:
import pandas as pd

In [27]:
root = './data/raw_data/alcock/parquets/'
path_parquets =[os.path.join(root, x) for x in os.listdir(root)]

dfs = []
for file in path_parquets:
    df = pd.read_parquet(file)
    dfs.append(df)

In [28]:
pd.concat(dfs).reset_index()['newID'].unique().shape

(21444,)

In [29]:
pd.concat(dfs).shape

(13818054, 3)