# data_d25a.ipynb
1. For GMSL and RSL at gauges, save fusion, high-end, low-end, and central projections for 2020–2100, and also gauge info.
2. For cities near a gauge, save city info, gauge info, high-end, low-end, and central projections for 2100.

Author: Benjamin S. Grandey.

In [1]:
import d25a
import datetime
import numpy as np
import pandas as pd

In [2]:
# Record the start datetime; the final cell subtracts it from the end datetime to report wall time
start_dt = datetime.datetime.now()

In [3]:
# Print the watermark provided by d25a (Python/package versions, conda environment, machine details)
print(d25a.get_watermark())

Python implementation: CPython
Python version       : 3.10.16
IPython version      : 8.31.0

matplotlib: 3.10.0
numpy     : 2.2.2
pandas    : 2.2.3
seaborn   : 0.13.2
xarray    : 2025.1.1

conda environment: d25a-rsl-fusion

Compiler    : Clang 18.1.8 
OS          : Darwin
Release     : 22.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [4]:
# Make output directories if they don't exist.
# exist_ok=True replaces the separate exists() check (no gap between checking and creating),
# and parents=True also creates any missing ancestor directories of DATA_DIR.
for data_dir in (d25a.DATA_DIR, d25a.DATA_DIR / 'gmsl', d25a.DATA_DIR / 'gauges', d25a.DATA_DIR / 'cities'):
    data_dir.mkdir(parents=True, exist_ok=True)

## 1. GMSL and RSL at gauges

### 1a. Identify gauges with missing RSL data
These gauges will be dropped.

In [5]:
# Load the fusion RSL projection for a single scenario (SSP5-8.5); this is used to detect gaps
qfs_da = d25a.get_sl_qfs(workflow='fusion_1e+2e', gmsl_rsl_novlm='rsl', scenario='ssp585').copy()
# Mask everything except the NaN entries and drop labels that have none;
# the location labels that remain are the gauges with missing data
nan_only_da = qfs_da.where(qfs_da.isnull(), drop=True)
missing_gauges = nan_only_da.locations.data
# Report the number of affected gauges, followed by the ID, name, and country of each
print(f'{len(missing_gauges)} gauges have missing RSL data:')
for gauge_id in missing_gauges:
    gauge_info = d25a.get_gauge_info(gauge=gauge_id)
    print(f"{gauge_id}, {gauge_info['gauge_name']}, {gauge_info['country']}")

14 gauges have missing RSL data:
126, TROIS-RIVIERES, CANADA
137, PORT-SAINT-FRANCOIS, CANADA
144, BATISCAN, CANADA
173, QUEBEC, CANADA
192, NEUVILLE, CANADA
201, DESCHAILLONS, CANADA
387, GRONDINES, CANADA
951, PORTNEUF, CANADA
999, ST-FRANCOIS, CANADA
1005, CHAMPLAIN, CANADA
1219, TADOUSSAC, CANADA
1244, ST-JOSEPH-DE-LA-RIVE, CANADA
1392, PORT-ALFRED, CANADA
1798, BECANCOUR, CANADA


### 1b. Save fusion, high-end, and low-end projections

In [6]:
# Loop over GMSL, RSL, and RSL without VLM component
for gmsl_rsl_novlm in ('gmsl', 'rsl', 'novlm'):
    # Output directory: GMSL has its own directory; both gauge-based variables share 'gauges'
    if gmsl_rsl_novlm == 'gmsl':
        out_dir = d25a.DATA_DIR / 'gmsl'
    else:
        out_dir = d25a.DATA_DIR / 'gauges'
    # Loop over two scenarios: ssp585 provides the high-end projection, ssp126 the low-end projection
    for scenario in ['ssp585', 'ssp126']:
        # Derive fusion projection
        qfs_da = d25a.get_sl_qfs(workflow='fusion_1e+2e', gmsl_rsl_novlm=gmsl_rsl_novlm, scenario=scenario).copy()
        # Drop gauges with missing RSL data
        if gmsl_rsl_novlm != 'gmsl':
            # Remove the gauges identified from the RSL data explicitly by label, so that the same
            # gauges are also removed from the novlm data. This replaces writing NaN through
            # .sel(...).data[:], which only takes effect when .sel() returns a view of the array.
            qfs_da = qfs_da.drop_sel(locations=missing_gauges)
            # Also drop any remaining location that contains NaN
            qfs_da = qfs_da.dropna(dim='locations')
        # Save fusion projection
        out_fn = out_dir / f'{gmsl_rsl_novlm}_fusion_{scenario}_d25a.nc'
        if gmsl_rsl_novlm == 'gmsl':
            print(f'Writing {out_fn.name}')
        else:
            print(f'Writing {out_fn.name} ({len(qfs_da.locations)} gauges)')
        qfs_da.to_netcdf(out_fn)
        # Derive and save high-end or low-end projection, depending on scenario
        if scenario == 'ssp585':
            # High-end: 0.95 quantile of the ssp585 fusion
            high_da = qfs_da.sel(quantiles=0.95).squeeze()
            out_fn = out_dir / f'{gmsl_rsl_novlm}_high_d25a.nc'
            print(f'Writing {out_fn.name}')
            high_da.to_netcdf(out_fn)
        elif scenario == 'ssp126':
            # Low-end: 0.05 quantile of the ssp126 fusion
            low_da = qfs_da.sel(quantiles=0.05).squeeze()
            out_fn = out_dir / f'{gmsl_rsl_novlm}_low_d25a.nc'
            print(f'Writing {out_fn.name}')
            low_da.to_netcdf(out_fn)

Writing gmsl_fusion_ssp585_d25a.nc
Writing gmsl_high_d25a.nc
Writing gmsl_fusion_ssp126_d25a.nc
Writing gmsl_low_d25a.nc
Writing rsl_fusion_ssp585_d25a.nc (1016 gauges)
Writing rsl_high_d25a.nc
Writing rsl_fusion_ssp126_d25a.nc (1016 gauges)
Writing rsl_low_d25a.nc
Writing novlm_fusion_ssp585_d25a.nc (1016 gauges)
Writing novlm_high_d25a.nc
Writing novlm_fusion_ssp126_d25a.nc (1016 gauges)
Writing novlm_low_d25a.nc


### 1c. Save central projection
Defined as the median of the medium-confidence mean under SSP2-4.5.

In [7]:
# Loop over GMSL, RSL, and RSL without VLM component (single scenario: SSP2-4.5)
for gmsl_rsl_novlm in ('gmsl', 'rsl', 'novlm'):
    # Output directory: GMSL has its own directory; both gauge-based variables share 'gauges'
    if gmsl_rsl_novlm == 'gmsl':
        out_dir = d25a.DATA_DIR / 'gmsl'
    else:
        out_dir = d25a.DATA_DIR / 'gauges'
    # Derive medium confidence mean under SSP2-4.5
    qfs_da = d25a.get_sl_qfs(workflow='mean_1e+2e', gmsl_rsl_novlm=gmsl_rsl_novlm, scenario='ssp245').copy()
    # Drop gauges with missing RSL data
    if gmsl_rsl_novlm != 'gmsl':
        # Remove the gauges identified from the RSL data explicitly by label, so that the same
        # gauges are also removed from the novlm data. This replaces writing NaN through
        # .sel(...).data[:], which only takes effect when .sel() returns a view of the array.
        qfs_da = qfs_da.drop_sel(locations=missing_gauges)
        # Also drop any remaining location that contains NaN
        qfs_da = qfs_da.dropna(dim='locations')
    # Derive and save central projection (0.5 quantile)
    central_da = qfs_da.sel(quantiles=0.5).squeeze()
    out_fn = out_dir / f'{gmsl_rsl_novlm}_central_d25a.nc'
    print(f'Writing {out_fn.name}')
    central_da.to_netcdf(out_fn)

Writing gmsl_central_d25a.nc
Writing rsl_central_d25a.nc
Writing novlm_central_d25a.nc


### 1d. Save gauge information

In [8]:
# Columns of the gauge information table
gauge_info_cols = ['gauge_id', 'gauge_name', 'lat', 'lon', 'country']
# Collect information for every location with projections, skipping gauges with missing RSL data
qfs_da = d25a.get_sl_qfs().copy()
gauge_info_list = [d25a.get_gauge_info(location)
                   for location in qfs_da.locations.data
                   if location not in missing_gauges]
# Build the DataFrame in one step instead of appending one row at a time.
# NOTE(review): assumes get_gauge_info returns a mapping keyed by the column names above
# (it is indexed by 'gauge_name' and 'country' where the missing gauges are listed) - confirm.
gauge_info_df = pd.DataFrame(gauge_info_list, columns=gauge_info_cols)
# Index by gauge_id
gauge_info_df = gauge_info_df.set_index('gauge_id')
# Save to CSV
out_fn = d25a.DATA_DIR / 'gauges' / 'gauge_info_d25a.csv'
print(f'Writing {out_fn.name} ({len(gauge_info_df)} gauges)')
gauge_info_df.to_csv(out_fn)

Writing gauge_info_d25a.csv (1016 gauges)


## 2. Identify large cities with a tide gauge nearby
1. Urban agglomeration has a population of at least 10 million in 2025.
2. Select nearest tide gauge within a maximum distance of 100 km.

In [9]:
# NOTE(review): every line of this cell is commented out, so nothing below runs and
# cities_d25a.csv is not written by this notebook - confirm whether to re-enable or remove it.
# NOTE(review): as written, the code below records the nearest gauge and its distance but applies no
# maximum-distance filter, and it writes to DATA_DIR rather than DATA_DIR / 'cities' - confirm intent.
# # Read World Urbanisation Prospects 2018 data
# cities_df = pd.read_excel('data_wup18/WUP2018-F12-Cities_Over_300K.xls', header=16, usecols='C:E,G,H,X', index_col=None)
# cities_df = cities_df.rename(columns={'Country or area': 'city_country', 'City Code': 'city_code',
#                                       'Urban Agglomeration': 'city_name', 'Latitude': 'city_lat', 'Longitude': 'city_lon',
#                                       2025: 'population_2025'})
# cities_df = cities_df.set_index('city_code')
# # Select cities with population > 10 million (ie > 10,000 thousand) in 2025
# cities_df = cities_df.where(cities_df['population_2025'] > 10000).dropna().sort_values(by='population_2025', ascending=False)
# print(f'{len(cities_df)} cities have a population > 10 million in 2025.')
# # Loop over these cities and find nearest tide gauge and distance
# for index, row_ser in cities_df.iterrows():
#     lat0 = row_ser['city_lat']  # latitude of city
#     lon0 = row_ser['city_lon']  # longitude of city
#     temp_df = gauge_info_df.copy()  # copy tide gauge data
#     temp_df['distance'] = 6378 * np.arccos(  # calculate great-circle distance between city and all available gauges
#         np.sin(np.radians(lat0)) * np.sin(np.radians(temp_df['lat'])) +
#         np.cos(np.radians(lat0)) * np.cos(np.radians(temp_df['lat'])) * np.cos(np.radians(temp_df['lon'] - lon0)))
#     temp_df['distance'] = temp_df['distance'].round(0).astype(int)  # round to nearest km
#     temp_df = temp_df.sort_values(by=['distance']).reset_index()  # sort by distance
#     temp_df = temp_df.rename(columns={'lat': 'gauge_lat', 'lon': 'gauge_lon'})
#     for col in ['gauge_id', 'gauge_name', 'gauge_lat', 'gauge_lon', 'distance']:
#         cities_df.loc[index, col] = temp_df.loc[0, col]  # save gauge info to cities_df
# # Identify shorter name for cities with a long name
# for index, row_ser in cities_df.iterrows():
#     short_name = row_ser['city_name']  # use full name by default
#     if short_name.split(' (')[0] in ['Mumbai', 'Kolkata']:  # cases to use name outside parentheses
#         short_name = short_name.split(' (')[0]
#     elif '(' in short_name:  # cases to use name within parentheses
#         short_name = short_name.split(' (')[-1].rstrip(')')
#     elif ',' in short_name:  # cases to use name before comma
#         short_name = short_name.split(',')[0]
#     elif '-' in short_name:  # cases to use name before hyphen
#         short_name = short_name.split('-')[0]
#     cities_df.loc[index, 'city_short'] = short_name
# # Save to CSV
# out_fn = d25a.DATA_DIR / f'cities_d25a.csv'
# print(f'Writing {out_fn.name}')
# cities_df.to_csv(out_fn)

In [10]:
# Get end datetime
end_dt = datetime.datetime.now()
# Calculate run timedelta
run_td = end_dt - start_dt
# Print timing information
print(f"Start:     {start_dt.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"End:       {end_dt.strftime('%Y-%m-%d %H:%M:%S')}")
# total_seconds() covers the whole duration; the .seconds attribute excludes the days component,
# so it would under-report any run lasting a day or longer. int() keeps whole-second output.
print(f"Wall time: {int(run_td.total_seconds())} s")

Start:     2025-02-05 17:08:47
End:       2025-02-05 17:09:16
Wall time: 29 s
