### Load Catalog

 - Connect to catalog (postgres) and warehouse (s3 - data & metadata layer)
 - Verify Namespace and Table


 *NOTE* This example uses the `conformance` namespace (set via `NAMESPACE` in the first code cell); the catalog also contains a second namespace, `development`.


In [1]:
import logging

# Turn on INFO-level logging before the connection helper is imported and used.
logging.basicConfig(level=logging.INFO)

from connection import connect_to_catalog

catalog = connect_to_catalog()

# Namespace whose tables are listed here and loaded in the cells below.
NAMESPACE = "conformance"

# Keep only the first element of each namespace entry for display.
name_spaces = [namespace[0] for namespace in catalog.list_namespaces()]
print(f"Existing namespaces: {name_spaces}")

# Report every table registered under the chosen namespace.
for table_identifier in catalog.list_tables(NAMESPACE):
    print(f"Table found: {table_identifier}")

INFO:root:Connected to Iceberg catalog: `trinity`


Existing namespaces: ['conformance', 'development']
Table found: ('conformance', 'hydraulics')
Table found: ('conformance', 'hydrology')


#### Starting with the hydraulics table:

 - Load sample data from Trinity models into dataframes 
 - Prepare for appending to Iceberg table

In [2]:
from example_data_utils import SAMPLE_EVENTS, SAMPLE_RAS_MODELS, load_sample_ras_data

# Use the first sample RAS model and the first sample event.
first_ras_model = SAMPLE_RAS_MODELS[0]
first_event = SAMPLE_EVENTS[0]

print(f"Loading RAS data for event {first_event} and model {first_ras_model}")
ras_df = load_sample_ras_data(first_event, first_ras_model)
print(f"RAS data shape: {ras_df.shape}")

# Preview the first rows as the cell output.
ras_df.head()

INFO:root:Loading data from s3://trinity-pilot/dev/conformance/simulations/event-data/4/hydraulics/blw-bear/flow_timeseries.pq


Loading RAS data for event 4 and model blw-bear


INFO:root:Data loaded with shape (841, 25)


RAS data shape: (21025, 7)


Unnamed: 0,sim_time,realization_id,model_id,site_id,event_id,run_version,flow
0,1992-11-22 00:00:00,1,blw-bear,bc_big-bear-ck_s010_base,4,v1,0.0
1,1992-11-22 01:00:00,1,blw-bear,bc_big-bear-ck_s010_base,4,v1,0.0
2,1992-11-22 02:00:00,1,blw-bear,bc_big-bear-ck_s010_base,4,v1,0.0
3,1992-11-22 03:00:00,1,blw-bear,bc_big-bear-ck_s010_base,4,v1,0.0
4,1992-11-22 04:00:00,1,blw-bear,bc_big-bear-ck_s010_base,4,v1,0.0


In [3]:
# The sample RAS data carries a flow column but no stage column.
# Add an all-null stage column so the frame has every column of the hydraulics
# schema; the stage values are meant to be filled in by a later update to the table.
import pandas as pd

ras_df = ras_df.assign(stage=pd.NA)

#### Convert from pandas to arrow tables with compatible iceberg schema

In [4]:
import pyarrow as pa
from table_utils import json_to_arrow

# Build the Arrow schema from the hydraulics JSON schema file.
arrow_schema = json_to_arrow("schemas/hydraulics.json")

# Convert the frame using that schema; the pandas index is not carried over.
arrow_table = pa.Table.from_pandas(
    ras_df,
    schema=arrow_schema,
    preserve_index=False,
)

# Show only the first row to check the resulting column types.
arrow_table.slice(0, 1)

pyarrow.Table
sim_time: timestamp[us] not null
realization_id: int32 not null
model_id: string not null
site_id: string not null
event_id: int32 not null
run_version: string not null
flow: double
stage: double
----
sim_time: [[1992-11-22 00:00:00.000000]]
realization_id: [[1]]
model_id: [["blw-bear"]]
site_id: [["bc_big-bear-ck_s010_base"]]
event_id: [[4]]
run_version: [["v1"]]
flow: [[0]]
stage: [[null]]

#### Load the existing (empty) table from the Iceberg catalog


In [5]:
# Load the hydraulics table from the selected namespace.
table_identifier = f"{NAMESPACE}.hydraulics"
table = catalog.load_table(table_identifier)
schema = table.schema()

print(schema)
print("identifier_field_ids:", schema.identifier_field_ids)

# Map each identifier field id to the name of the schema field it refers to.
identifier_ids = schema.identifier_field_ids
id_to_name = {fid: schema.find_field(fid).name for fid in identifier_ids}

print("identifier fields:", id_to_name)

INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO
INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO


table {
  1: sim_time: required timestamp (Simulation timestamp from HEC-RAS model [UTC])
  2: realization_id: required int (Unique identifier for each model realization)
  3: model_id: required string (Identifier for the HEC-RAS model)
  4: site_id: required string (Identifier for the measurement site)
  5: event_id: required int (Identifier for the simulated event)
  6: run_version: required string (Version of the model run)
  7: flow: optional double (Discharge at the site [cfs])
  8: stage: optional double (Stage at the site [ft])
}
identifier_field_ids: [1, 2, 3, 4, 5, 6]
identifier fields: {1: 'sim_time', 2: 'realization_id', 3: 'model_id', 4: 'site_id', 5: 'event_id', 6: 'run_version'}


#### Append data and show the resulting snapshot

In [None]:
# Append the hydraulics Arrow table to the Iceberg table.
# NOTE: `arrow_table` is rebound to the hydrology data in a later cell of this
# notebook, so this cell must run before that one. Every run of this cell
# appends the rows again (presumably without de-duplication — confirm before re-running).
table.append(arrow_table)

# Show the snapshot created by this append. Indexing `snapshots()[0]` returns the
# first entry of the snapshot list, which no longer reflects the latest append once
# the table holds more than one snapshot; `current_snapshot()` always does.
print(f"Snapshots: {table.current_snapshot()}")

Snapshots: Operation.APPEND: id=6458660265846564039, schema_id=0


![](imgs/hydra-1.png)

#### Append flow data as new rows to the existing hydrology table

In [11]:
from example_data_utils import SAMPLE_EVENTS, SAMPLE_HMS_MODELS, load_sample_hms_data

# Use the first sample HMS model and the first sample event.
first_hms_model = SAMPLE_HMS_MODELS[0]
first_event = SAMPLE_EVENTS[0]

print(f"Loading HMS data for event {first_event} and model {first_hms_model}")
hms_df = load_sample_hms_data(first_event, first_hms_model)
print(f"HMS data shape: {hms_df.shape}")

# Preview the first rows as the cell output.
hms_df.head()

INFO:root:Loading data from s3://trinity-pilot/stac/prod-support/conformance/event_id=4/hms_model=trinity/FLOW.pq


Loading HMS data for event 4 and model trinity


INFO:root:Data loaded with shape (841, 782)


HMS data shape: (657662, 7)


Unnamed: 0,sim_time,realization_id,model_id,site_id,event_id,run_version,flow
0,1995-10-11 00:00:00,1,trinity,amon-g-carter_s010,4,v1,0.0
1,1995-10-11 01:00:00,1,trinity,amon-g-carter_s010,4,v1,0.0
2,1995-10-11 02:00:00,1,trinity,amon-g-carter_s010,4,v1,0.0
3,1995-10-11 03:00:00,1,trinity,amon-g-carter_s010,4,v1,0.0
4,1995-10-11 04:00:00,1,trinity,amon-g-carter_s010,4,v1,0.0


In [13]:
# The sample HMS data has a flow column but no base_flow column; add an all-null
# base_flow column so the frame has every column named in schemas/hydrology.json.
import pandas as pd

hms_df = hms_df.assign(base_flow=pd.NA)


In [14]:
import pyarrow as pa
from table_utils import json_to_arrow

# Build the Arrow schema from the hydrology JSON schema file.
# Note: this rebinds `arrow_schema` and `arrow_table`, which held the hydraulics data earlier.
arrow_schema = json_to_arrow("schemas/hydrology.json")

# Convert the frame using that schema; the pandas index is not carried over.
arrow_table = pa.Table.from_pandas(
    hms_df,
    schema=arrow_schema,
    preserve_index=False,
)

# Show only the first row to check the resulting column types.
arrow_table.slice(0, 1)

pyarrow.Table
sim_time: timestamp[us] not null
realization_id: int32 not null
model_id: string not null
site_id: string not null
event_id: int32 not null
run_version: string not null
flow: double
base_flow: double
----
sim_time: [[1995-10-11 00:00:00.000000]]
realization_id: [[1]]
model_id: [["trinity"]]
site_id: [["amon-g-carter_s010"]]
event_id: [[4]]
run_version: [["v1"]]
flow: [[0]]
base_flow: [[null]]

In [17]:
# Load the hydrology table from the selected namespace.
hydro_table_identifier = f"{NAMESPACE}.hydrology"
hydro_table = catalog.load_table(hydro_table_identifier)

# Append the hydrology Arrow table. Every run of this cell appends the rows again
# (presumably without de-duplication — confirm before re-running).
hydro_table.append(arrow_table)

# Show the snapshot created by this append. Indexing `snapshots()[0]` returns the
# first entry of the snapshot list, which no longer reflects the latest append once
# the table holds more than one snapshot; `current_snapshot()` always does.
print(f"Snapshots: {hydro_table.current_snapshot()}")

INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO
INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO
INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO
INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO
INFO:pyiceberg.io:Loaded FileIO: pyiceberg.io.pyarrow.PyArrowFileIO


Snapshots: Operation.APPEND: id=5693197842489282781, schema_id=0


![](imgs/hydro-1.png)