# Comparative Storage Evaluation of Parquet and DGGS-(UB)JSON Formats

Using the Kontur Population Data in H3 DGGS as per the Canada Population notebook.

In [2]:
import geopandas as gpd
import pandas as pd
import h3
import os
import copy
import ubjson

In [6]:
kontur_gdf = gpd.read_file('kontur_population_20231101.gpkg', layer='population')

In [8]:
kontur_gdf.shape

(32957699, 3)

In [9]:
kontur_gdf.head(5)

Unnamed: 0,h3,population,geometry
0,88f3a6db3bfffff,1.0,"POLYGON ((18603209.76 -14292976.866, 18601045...."
1,88f3a6db17fffff,1.0,"POLYGON ((18606931.526 -14291748.854, 18604767..."
2,88f2a40257fffff,1.0,"POLYGON ((-18642103.219 -18440071.317, -186432..."
3,88f1b4575dfffff,1.0,"POLYGON ((13731045.703 -12978244.014, 13729880..."
4,88f1b45755fffff,1.0,"POLYGON ((13727729.912 -12976687.302, 13726565..."


In [10]:
h3_kontur_df = kontur_gdf.drop(columns=['geometry'], errors='ignore')
h3_kontur_df['population'] = h3_kontur_df['population'].astype('int32')
h3_kontur_df['h3'] = h3_kontur_df['h3'].apply(lambda zone_id: int(zone_id, 16))
h3_kontur_df.set_index('h3', inplace=True)

In [11]:
h3_kontur_df.head(5)

Unnamed: 0_level_0,population
h3,Unnamed: 1_level_1
616775916814598143,1
616775916776849407,1
616758128968466431,1
616741659117355007,1
616741659108966399,1


In [5]:
h3_kontur_df.to_parquet('kontur_population_20231101_h3-int64.parquet')

In [6]:
print("Kontur Original Size:", os.stat('kontur_population_20231101.gpkg').st_size)
print("Kontur H3 Int64 Size:", os.stat('kontur_population_20231101_h3-int64.parquet').st_size)

Kontur Original Size: 6711951360
Kontur H3 Int64 Size: 196460119


In [7]:
h3_zones_L0 = h3.get_res0_cells()
len(h3_zones_L0)

122

In [8]:
ubjson_base_template = {
  "$schema": "https://schemas.opengis.net/ogcapi/dggs/1.0/core/schemas/dggs-json/dggs-json.json",
  "dggrs": "https://www.opengis.net/def/dggrs/OGC/1.0/H3",
  "zoneId": None,  # to be filled
  "depths": [8],
  "schema": {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {"population": {"type": "number", "format": "int32"}}
  },
  "values": {
    "population": [{
      "depth": 8,
      "shape": {"count": None, "subZones": None},  # to be filled
      "data": []  # to be filled
    }]
  }
}

In [25]:
os.makedirs('h3_population_ubjson', exist_ok=True)
os.makedirs('h3_population_ubjson_data', exist_ok=True)
os.makedirs('h3_population_ubjson_skip', exist_ok=True)
for parent in h3_zones_L0:
    children = h3.cell_to_children(parent, res=8)
    children = pd.Series(children, dtype='string').apply(lambda x: int(x, 16))
    values = h3_kontur_df.reindex(children)['population']
    data = values.tolist()

    ubjson_data = copy.deepcopy(ubjson_base_template)
    ubjson_data['zoneId'] = parent
    ubjson_data['values']['population'][0]['shape']['count'] = children.size
    ubjson_data['values']['population'][0]['shape']['subZones'] = children.size
    ubjson_data['values']['population'][0]['data'] = data

    with open(f'h3_population_ubjson/{parent}.ubjson', 'wb') as f:
        ubjson.dump(ubjson_data, f)
    with open(f'h3_population_ubjson_data/{parent}.ubjson', 'wb') as f:
        ubjson.dump(data, f)
    if not values.isna().all():
        with open(f'h3_population_ubjson_skip/{parent}.ubjson', 'wb') as f:
            ubjson.dump(ubjson_data, f)

In [26]:
def get_dir_size_mb(directory):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.isfile(fp):
                total_size += os.path.getsize(fp)
    print(f"Total size: {total_size / (1024 * 1024):.2f} MB")


get_dir_size_mb('h3_population_ubjson')
get_dir_size_mb('h3_population_ubjson_data')
get_dir_size_mb('h3_population_ubjson_skip')

Total size: 911.22 MB
Total size: 911.18 MB
Total size: 829.67 MB
