In [41]:
import json
import os
from datetime import datetime
from importlib.metadata import version

import numpy as np
import pandas as pd
import pyarrow.dataset as pds
import pyarrow.parquet as pq

from hipscat.io import paths

# File-name constants for catalog-level metadata files.
# PROVENANCE_INFO_FILENAME is not referenced by any cell visible here.
PROVENANCE_INFO_FILENAME = "provenance_info.json"
# The two names below are joined onto the catalog base path inside
# write_parquet_metadata to form the targets of the (currently disabled)
# pq.write_metadata calls.
PARQUET_METADATA_FILENAME = "_metadata"
PARQUET_COMMON_METADATA_FILENAME = "_common_metadata"


def write_parquet_metadata(catalog_path):
    """Compare the columns of a catalog's parquet files with its dataset schema.

    Opens ``catalog_path`` as a hive-partitioned parquet dataset, reads the
    parquet metadata of every file whose name ends in "parquet", and prints
    three sets, in this order:

    1. every column name found in the individual files,
    2. names found in the files but absent from the dataset schema,
    3. names present only in the dataset schema that are not one of the
       expected partition names.

    A copy of the dataset schema with the expected partition fields removed
    and the ``_metadata`` / ``_common_metadata`` target paths are also
    prepared, but the ``pq.write_metadata`` calls that would consume them are
    commented out, so nothing is written to disk.

    Args:
        catalog_path (str): base path for the catalog
    """
    catalog_dataset = pds.dataset(
        catalog_path,
        format="parquet",
        partitioning="hive",
        exclude_invalid_files=True,
    )

    metadata_collector = []
    file_columns = set()
    for file_path in catalog_dataset.files:
        # Only files whose name ends in "parquet" contribute metadata.
        if file_path.endswith("parquet"):
            file_metadata = pq.read_metadata(file_path)
            metadata_collector.append(file_metadata)
            file_columns.update(file_metadata.schema.to_arrow_schema().names)
    print(file_columns)

    schema_columns = set(catalog_dataset.schema.names)
    print(file_columns - schema_columns)

    # Names that are expected to appear only in the dataset-level schema.
    expected_partition_names = (
        paths.ORDER_DIRECTORY_PREFIX,
        paths.DIR_DIRECTORY_PREFIX,
        "parts",
    )
    schema_only_columns = schema_columns - file_columns
    print(schema_only_columns - set(expected_partition_names))

    ## Trim hive fields from final schema, otherwise there will be a mismatch.
    subschema = catalog_dataset.schema
    for partition_name in expected_partition_names:
        position = subschema.get_field_index(partition_name)
        if position == -1:
            # This partition field is not part of the schema; nothing to drop.
            continue
        subschema = subschema.remove(position)

    metadata_path = os.path.join(catalog_path, PARQUET_METADATA_FILENAME)
    common_metadata_path = os.path.join(catalog_path, PARQUET_COMMON_METADATA_FILENAME)

    # Writing is disabled while the schemas are being inspected:
    # pq.write_metadata(subschema, metadata_path, metadata_collector=metadata_collector)
    # pq.write_metadata(subschema, common_metadata_path)

In [42]:
# Other ztf_mar16 tables this check can be pointed at (currently disabled):
#
# for table in ["object", "source", "object_to_source"]:
#     write_parquet_metadata(f"/data3/epyc/data3/hipscat/catalogs/ztf_mar16/{table}/")
#
# for table in ["object_index", "source_index"]:
#     write_parquet_metadata(f"/data3/epyc/data3/hipscat/catalogs/ztf_mar16/{table}/")

# Only the "source" table is inspected in this run.
write_parquet_metadata("/data3/epyc/data3/hipscat/catalogs/ztf_mar16/source/")

{'catflags', 'band', 'fieldID', '_hipscat_id', 'rcID', 'mjd', 'index', 'ps1_objid', 'mag', '__index_level_0__', 'dec', 'ra', 'maggerr'}
set()
set()


In [13]:
# Open the object_index table as a hive-partitioned parquet dataset.
dataset = pds.dataset(
    "/data3/epyc/data3/hipscat/catalogs/ztf_mar16/object_index/",
    format="parquet",
    partitioning="hive",
    exclude_invalid_files=True,
)

In [27]:
# Read the parquet metadata of a single file and list its column names.
# Alternative file (object_index table), currently disabled:
# md = pq.read_metadata("/data3/epyc/data3/hipscat/catalogs/ztf_mar16/object_index/parts=0/part_000_of_001.parquet")
md = pq.read_metadata(
    "/data3/epyc/data3/hipscat/catalogs/ztf_mar16/object/Norder=1/Dir=0/Npix=33.parquet"
)
md.schema.to_arrow_schema().names

['index',
 '_hipscat_id',
 'ps1_objid',
 'ra',
 'dec',
 'ps1_gMeanPSFMag',
 'ps1_rMeanPSFMag',
 'ps1_iMeanPSFMag',
 'nobs_g',
 'nobs_r',
 'nobs_i',
 'mean_mag_g',
 'mean_mag_r',
 'mean_mag_i',
 '__index_level_0__']