In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import dask
import gev_bayes_utils as gevbu
import gev_city_utils as gevcu
import sa_city_utils as sacu
import trend_utils as tu

from utils import city_list, gev_metric_ids, trend_metric_ids, tgw_scenarios
from utils import roar_data_path as project_data_path
from utils import roar_code_path as project_code_path



In [2]:
# #######################
# ### Dask
# #######################
# from dask_jobqueue import SLURMCluster

# cluster = SLURMCluster(
#     # account="open",
#     account="pches_cr_default",
#     queue='basic',
#     cores=1,
#     processes=1,
#     memory="5GiB",
#     walltime="02:00:00",
# )

# cluster.scale(10)

# from dask.distributed import Client
# client = Client(cluster)
# client

## Get city timeseries

In [5]:
# Extremes: call sacu.get_city_timeseries_all once per (city, GEV metric) pair
extreme_jobs = [
    (city, metric_id)
    for city in city_list.keys()
    for metric_id in gev_metric_ids
]
for city, metric_id in extreme_jobs:
    sacu.get_city_timeseries_all(city, metric_id)

In [7]:
# Extremes with neighbors: same (city, GEV metric) sweep, passing include_neighbors=True
neighbor_jobs = [
    (city, metric_id)
    for city in city_list.keys()
    for metric_id in gev_metric_ids
]
for city, metric_id in neighbor_jobs:
    sacu.get_city_timeseries_all(city, metric_id, include_neighbors=True)

In [6]:
# Trends: call sacu.get_city_timeseries_all once per (city, trend metric) pair
trend_series_jobs = [
    (city, metric_id)
    for city in city_list.keys()
    for metric_id in trend_metric_ids
]
for city, metric_id in trend_series_jobs:
    sacu.get_city_timeseries_all(city, metric_id)

## Trends with bootstrap

In [2]:
# Bootstrap trend fits: every city x trend metric, at bootstrap sizes 250 and 1000
trend_fit_jobs = [
    (metric_id, city, n_boot)
    for city in city_list
    for metric_id in trend_metric_ids
    for n_boot in [250, 1000]
]
for metric_id, city, n_boot in trend_fit_jobs:
    tu.trend_fit_city(metric_id, city, n_boot=n_boot)

## Stationary GEV with bootstrap

### Main

In [5]:
# Main ensemble: stationary GEV fit settings
stationary = True
fit_method = "lmom"
periods_for_level = [10, 25, 50, 100]
hist_slice = [1950, 2014]
proj_slice = [2050, 2100]

# Keyword arguments common to every task; only city, metric and
# projection bootstrap size vary between tasks.
shared_kwargs = dict(
    stationary=stationary,
    fit_method=fit_method,
    hist_slice=hist_slice,
    proj_slice=proj_slice,
    periods_for_level=periods_for_level,
    return_samples=True,
)

# Build one lazy task per (city, metric, n_boot_proj) combination
lazy_fit = dask.delayed(gevcu.fit_ensemble_gev_city)
delayed = [
    lazy_fit(
        city=city,
        metric_id=metric_id,
        n_boot_proj=n_boot_proj,
        **shared_kwargs,
    )
    for city in city_list
    for metric_id in gev_metric_ids
    for n_boot_proj in [100, 1000]
]

# Execute all tasks; the returned values are not kept
_ = dask.compute(*delayed)

In [27]:
# TGW: stationary GEV fits, written as one CSV per (city, metric, n_boot_proj)
stationary = True
fit_method = "lmom"
periods_for_level = [10, 25, 50, 100]
hist_slice = [1980, 2019]
proj_slice = [2049, 2099]
return_samples = True
n_boot_hist = 1

# Filename fragments derived from the settings above
stat_str = "stat" if stationary else "nonstat"
sample_str = "_samples" if return_samples else ""

tgw_out_dir = f"{project_data_path}/extreme_value/cities/original_grid/freq"

for city in city_list:
    for metric_id in gev_metric_ids:
        # n_boot_proj is iterated OUTSIDE the scenario loop so that the bootstrap
        # size written into the file name matches the rows stored in that file
        # (one file per bootstrap size, each holding all scenarios).
        for n_boot_proj in [100, 1000]:
            ssp_frames = []
            # The first entry of tgw_scenarios is skipped
            # (presumably the historical run -- TODO confirm against utils).
            for ssp in tgw_scenarios[1:]:
                ssp_frames.append(
                    gevcu.fit_gev_city(
                        city=city,
                        metric_id=metric_id,
                        ensemble='TGW',
                        gcm='none',
                        ssp=ssp,
                        member='none',
                        fit_method=fit_method,
                        hist_slice=hist_slice,
                        proj_slice=proj_slice,
                        stationary=stationary,
                        n_boot_proj=n_boot_proj,
                        n_boot_hist=n_boot_hist,
                        periods_for_level=periods_for_level,
                        return_samples=return_samples,
                    )
                )

            # Store city/metric/bootstrap-size results across all scenarios
            file_name = (
                f"TGW_{city}_{metric_id}"
                f"_{hist_slice[0]}-{hist_slice[1]}_{proj_slice[0]}-{proj_slice[1]}"
                f"_{fit_method}_{stat_str}"
                f"_nbootproj{n_boot_proj}_nboothist{n_boot_hist}{sample_str}.csv"
            )
            tgw_results = pd.concat(ssp_frames, ignore_index=True)
            tgw_results.to_csv(f"{tgw_out_dir}/{file_name}", index=False)

### Naive pooling

In [7]:
# Main ensemble, naive pooling: stationary GEV fit settings
stationary = True
fit_method = "lmom"
periods_for_level = [10, 25, 50, 100]
hist_slice = [1950, 2014]
proj_slice = [2050, 2100]

# Keyword arguments common to every task; include_neighbors=True is what
# distinguishes this sweep from the non-pooled main-ensemble sweep.
pooled_kwargs = dict(
    stationary=stationary,
    fit_method=fit_method,
    hist_slice=hist_slice,
    proj_slice=proj_slice,
    periods_for_level=periods_for_level,
    return_samples=True,
    include_neighbors=True,
)

# Build one lazy task per (city, metric, n_boot_proj) combination
lazy_pooled_fit = dask.delayed(gevcu.fit_ensemble_gev_city)
delayed = [
    lazy_pooled_fit(
        city=city,
        metric_id=metric_id,
        n_boot_proj=n_boot_proj,
        **pooled_kwargs,
    )
    for city in city_list
    for metric_id in gev_metric_ids
    for n_boot_proj in [100, 1000]
]

# Execute all tasks; the returned values are not kept
_ = dask.compute(*delayed)

In [10]:
# TGW, naive pooling: stationary GEV fits with neighbors,
# written as one CSV per (city, metric, n_boot_proj)
stationary = True
fit_method = "lmom"
periods_for_level = [10, 25, 50, 100]
hist_slice = [1980, 2019]
proj_slice = [2049, 2099]
return_samples = True
include_neighbors = True
n_boot_hist = 1

# Filename fragments derived from the settings above
stat_str = "stat" if stationary else "nonstat"
sample_str = "_samples" if return_samples else ""
neighbor_str = "_neighbors" if include_neighbors else ""

tgw_out_dir = f"{project_data_path}/extreme_value/cities/original_grid/freq"

for city in city_list:
    for metric_id in gev_metric_ids:
        # n_boot_proj is iterated OUTSIDE the scenario loop so that the bootstrap
        # size written into the file name matches the rows stored in that file
        # (one file per bootstrap size, each holding all scenarios).
        for n_boot_proj in [100, 1000]:
            ssp_frames = []
            # The first entry of tgw_scenarios is skipped
            # (presumably the historical run -- TODO confirm against utils).
            for ssp in tgw_scenarios[1:]:
                ssp_frames.append(
                    gevcu.fit_gev_city(
                        city=city,
                        metric_id=metric_id,
                        ensemble='TGW',
                        gcm='none',
                        ssp=ssp,
                        member='none',
                        fit_method=fit_method,
                        hist_slice=hist_slice,
                        proj_slice=proj_slice,
                        stationary=stationary,
                        n_boot_proj=n_boot_proj,
                        n_boot_hist=n_boot_hist,
                        periods_for_level=periods_for_level,
                        return_samples=return_samples,
                        include_neighbors=include_neighbors,
                    )
                )

            # Store city/metric/bootstrap-size results across all scenarios
            file_name = (
                f"TGW_{city}_{metric_id}"
                f"_{hist_slice[0]}-{hist_slice[1]}_{proj_slice[0]}-{proj_slice[1]}"
                f"_{fit_method}_{stat_str}"
                f"_nbootproj{n_boot_proj}_nboothist{n_boot_hist}{sample_str}{neighbor_str}.csv"
            )
            tgw_results = pd.concat(ssp_frames, ignore_index=True)
            tgw_results.to_csv(f"{tgw_out_dir}/{file_name}", index=False)

## Non-stationary GEV with bootstrap

### Location trend only

In [6]:
# Main ensemble: non-stationary GEV via MLE, location trend only
fit_method = 'mle'
stationary = False
# n_boots = [100, 1000]
n_boots = [100]

# Build one lazy task per (city, metric, bootstrap size) combination
lazy_nonstat_fit = dask.delayed(gevcu.fit_ensemble_gev_city)
delayed = [
    lazy_nonstat_fit(
        city=city,
        metric_id=metric_id,
        stationary=stationary,
        fit_method=fit_method,
        n_boot_proj=n_boot,
        return_samples=True,
    )
    for city in city_list
    for metric_id in gev_metric_ids
    for n_boot in n_boots
]

# Execute all tasks; the returned values are not kept
_ = dask.compute(*delayed)

In [29]:
# TGW: non-stationary GEV fits (MLE), written as one CSV per (city, metric, n_boot_proj)
stationary = False
fit_method = "mle"
periods_for_level = [10, 25, 50, 100]
years = [1980, 2099]
return_samples = True
# NOTE(review): n_boot_hist is assigned here but is not passed to
# gevcu.fit_gev_city in this cell -- confirm whether that is intended.
n_boot_hist = 1

# Filename fragments derived from the settings above
stat_str = "stat" if stationary else "nonstat"
sample_str = "_samples" if return_samples else ""

tgw_out_dir = f"{project_data_path}/extreme_value/cities/original_grid/freq"

for city in city_list:
    for metric_id in gev_metric_ids:
        # n_boot_proj is iterated OUTSIDE the scenario loop so that the bootstrap
        # size written into the file name matches the rows stored in that file
        # (one file per bootstrap size, each holding all scenarios).
        for n_boot_proj in [100, 1000]:
            ssp_frames = []
            # The first entry of tgw_scenarios is skipped
            # (presumably the historical run -- TODO confirm against utils).
            for ssp in tgw_scenarios[1:]:
                ssp_frames.append(
                    gevcu.fit_gev_city(
                        city=city,
                        metric_id=metric_id,
                        ensemble='TGW',
                        gcm='none',
                        ssp=ssp,
                        member='none',
                        fit_method=fit_method,
                        years=years,
                        stationary=stationary,
                        n_boot_proj=n_boot_proj,
                        periods_for_level=periods_for_level,
                        return_samples=return_samples,
                    )
                )

            # Store city/metric/bootstrap-size results across all scenarios
            file_name = (
                f"TGW_{city}_{metric_id}_{years[0]}-{years[1]}"
                f"_{fit_method}_{stat_str}_nboot{n_boot_proj}{sample_str}.csv"
            )
            tgw_results = pd.concat(ssp_frames, ignore_index=True)
            tgw_results.to_csv(f"{tgw_out_dir}/{file_name}", index=False)

### Location & scale trend

In [3]:
# Main ensemble: non-stationary GEV via MLE, with nonstationary_scale enabled
fit_method = 'mle'
stationary = False
# n_boots = [100, 1000]
n_boots = [100]
nonstationary_scale = True

# Build one lazy task per (city, metric, bootstrap size) combination
lazy_scale_fit = dask.delayed(gevcu.fit_ensemble_gev_city)
delayed = [
    lazy_scale_fit(
        city=city,
        metric_id=metric_id,
        stationary=stationary,
        fit_method=fit_method,
        n_boot_proj=n_boot,
        return_samples=True,
        nonstationary_scale=nonstationary_scale,
    )
    for city in city_list
    for metric_id in gev_metric_ids
    for n_boot in n_boots
]

# Execute all tasks; the returned values are not kept
_ = dask.compute(*delayed)

In [None]:
# TGW: non-stationary GEV fits (MLE) with nonstationary_scale enabled,
# written as one CSV per (city, metric, n_boot_proj)
stationary = False
nonstationary_scale = True
fit_method = "mle"
periods_for_level = [10, 25, 50, 100]
years = [1980, 2099]
return_samples = True
# NOTE(review): n_boot_hist is assigned here but is not passed to
# gevcu.fit_gev_city in this cell -- confirm whether that is intended.
n_boot_hist = 1

# Filename fragments derived from the settings above
stat_str = "stat" if stationary else "nonstat"
sample_str = "_samples" if return_samples else ""
scale_str = "_scale" if nonstationary_scale else ""

tgw_out_dir = f"{project_data_path}/extreme_value/cities/original_grid/freq"

for city in city_list:
    for metric_id in gev_metric_ids:
        # n_boot_proj is iterated OUTSIDE the scenario loop so that the bootstrap
        # size written into the file name matches the rows stored in that file
        # (one file per bootstrap size, each holding all scenarios).
        for n_boot_proj in [100, 1000]:
            ssp_frames = []
            # The first entry of tgw_scenarios is skipped
            # (presumably the historical run -- TODO confirm against utils).
            for ssp in tgw_scenarios[1:]:
                ssp_frames.append(
                    gevcu.fit_gev_city(
                        city=city,
                        metric_id=metric_id,
                        ensemble='TGW',
                        gcm='none',
                        ssp=ssp,
                        member='none',
                        fit_method=fit_method,
                        years=years,
                        stationary=stationary,
                        n_boot_proj=n_boot_proj,
                        periods_for_level=periods_for_level,
                        nonstationary_scale=nonstationary_scale,
                        return_samples=return_samples,
                    )
                )

            # Store city/metric/bootstrap-size results across all scenarios
            file_name = (
                f"TGW_{city}_{metric_id}_{years[0]}-{years[1]}"
                f"_{fit_method}_{stat_str}_nboot{n_boot_proj}{sample_str}{scale_str}.csv"
            )
            tgw_results = pd.concat(ssp_frames, ignore_index=True)
            tgw_results.to_csv(f"{tgw_out_dir}/{file_name}", index=False)

In [20]:
# Check: one non-stationary MLE fit for a single LOCA2 run (Boston, max_pr)
check_kwargs = dict(
    city='boston',
    metric_id='max_pr',
    ensemble='LOCA2',
    gcm='CanESM5',
    ssp='ssp585',
    member='r1i1p1f1',
    fit_method='mle',
    stationary=False,
    nonstationary_scale=True,  # NEW: whether to fit non-stationary scale parameter
    bootstrap=True,
    n_boot_hist=1,
    n_boot_proj=100,
)
# Bare last expression so the returned table is displayed as the cell output
gevcu.fit_gev_city(**check_kwargs)

Unnamed: 0,quantile,ensemble,gcm,member,ssp,loc_intcp,loc_trend,log_scale_intcp,log_scale_trend,shape,...,100yr_return_level_2075,100yr_return_level_2100,10yr_return_level_diff_2075-1975,25yr_return_level_diff_2075-1975,50yr_return_level_diff_2075-1975,100yr_return_level_diff_2075-1975,10yr_return_level_chfc_2075-1975,25yr_return_level_chfc_2075-1975,50yr_return_level_chfc_2075-1975,100yr_return_level_chfc_2075-1975
0,main,LOCA2,CanESM5,r1i1p1f1,ssp585,57.742605,0.091815,2.893627,0.001578,-0.226628,...,247.453835,256.920751,18.60724,24.262965,29.317828,35.198942,1.161544,1.163637,1.164849,1.165833
1,q025,LOCA2,CanESM5,r1i1p1f1,ssp585,52.820643,0.015332,2.542673,-0.001818,-0.386558,...,190.025593,189.973425,-8.186143,-14.267087,-19.828523,-25.490894,0.934852,0.914075,0.902574,0.893399
2,q975,LOCA2,CanESM5,r1i1p1f1,ssp585,63.460247,0.165027,3.173491,0.00479,-0.125098,...,374.709258,417.682979,48.105552,69.34474,91.475299,120.859143,1.433129,1.475024,1.499492,1.519377


In [17]:
# Load the stored CSV for Boston max_pr (file name: mle, nonstat, nboot100, samples, scale).
# NOTE(review): hard-coded absolute path; other cells build paths from project_data_path.
samples_path = '/storage/group/pches/default/users/dcl5300/conus_comparison_lafferty-etal-2024/extreme_value/cities/original_grid/freq/boston_max_pr_1950-2100_mle_nonstat_nboot100_samples_scale.csv'
df = pd.read_csv(samples_path)

In [21]:
# 97.5% quantile of the 100-yr return level change factor (2075 vs 1975)
# over the rows belonging to one LOCA2 / CanESM5 / ssp585 / r1i1p1f1 run
is_target_run = (
    (df['ensemble'] == 'LOCA2')
    & (df['gcm'] == 'CanESM5')
    & (df['ssp'] == 'ssp585')
    & (df['member'] == 'r1i1p1f1')
)
df.loc[is_target_run, '100yr_return_level_chfc_2075-1975'].quantile(0.975)

1.5038977538541212

## Bayesian GEV (old!)

In [None]:
# # For Bayes
# ############
# ### Dask ###
# ############
# from dask_jobqueue import SLURMCluster

# cluster = SLURMCluster(
#     # account="pches",
#     account="open",
#     cores=3,
#     processes=1,
#     job_cpu=3,
#     memory="3GiB",
#     walltime="12:00:00",
#     job_script_prologue=[
#         f"export PYTHONPATH={project_code_path}/.venv/lib/python3.12/site-packages:$PYTHONPATH",  # Put venv first
#         "export JAX_PLATFORM_NAME=cpu",
#         "export XLA_FLAGS='--xla_force_host_platform_device_count=1'",
#         # Force PyTensor to not use caching at all
#         "export PYTENSOR_FLAGS='cxx=,",
#         "mode=FAST_COMPILE,",  # Less aggressive optimization but more stable
#         "allow_gc=True,",
#         "cache_size=0'"        # Disable caching completely
#     ],
#     death_timeout=60,
#     local_directory="/tmp"
# )

# cluster.scale(5)

# from dask.distributed import Client
# client = Client(cluster)
# client

### Fit across ensemble

In [None]:
%%time
# Fit info: non-stationary
future_years = [2015, 2100]
stationary = False
return_periods = [100]

# Parallelize with dask delayed: one lazy task per (city, metric) combination
lazy_bayes_fit = dask.delayed(gevbu.fit_bayesian_gev_ensemble)
delayed = [
    lazy_bayes_fit(
        city=city,
        metric_id=metric_id,
        years=future_years,
        stationary=stationary,
        shape_sigma=0.2,
        prior_identifier='shape_sigma_02',
        return_periods=return_periods,
    )
    for city in city_list.keys()
    for metric_id in gev_metric_ids
]

# Execute all tasks; the returned values are not kept
_ = dask.compute(*delayed)

In [2]:
# %%time
# # Fit info: stationary
# hist_years = [1950,2014]
# future_years = [2050,2100]
# stationary = True
# return_periods = [100]

# # Parallelize with dask delayed
# delayed = []

# # Loop through all combos
# for city in city_list.keys():
#     for metric_id in gev_metric_ids:
#         for years in [hist_years, future_years]:
#             tmp = dask.delayed(gevbu.fit_bayesian_gev_ensemble)(
#                     city=city,
#                     metric_id=metric_id,
#                     years=years,
#                     stationary=stationary,
#                     return_periods=return_periods,
#                     dask=False
#             )
#             delayed.append(tmp)

# _ = dask.compute(*delayed)

### Gather results

In [3]:
%%time
# Loop through all combos, gather the Bayesian GEV results and store them as CSV
store_path = f"{project_data_path}/extreme_value/cities/original_grid/bayes_combined/"

return_periods = [100]

prior_identifier = "shape_sigma_02"

stationary = False
stationary_string = "stat" if stationary else "nonstat"

# Subset handled by this cell; the commented-out alternatives in the original
# were city_list.keys() and gev_metric_ids.
cities_to_gather = ['nyc', 'chicago', 'denver']
metrics_to_gather = ['max_tasmax', 'max_pr', 'min_tasmin']
year_options = [None, [2015, 2100]]

for city in cities_to_gather:
    for metric_id in metrics_to_gather:
        for years in year_options:
            # A year range adds a "_change_<start>-<end>" suffix to the file name
            change_identifier = f"_change_{years[0]}-{years[1]}" if years is not None else ""
            file_path = f"{store_path}/{city}_{metric_id}_{stationary_string}_{prior_identifier}{change_identifier}.csv"
            # Only gather and write when the output file does not exist yet
            if not os.path.exists(file_path):
                gathered = gevbu.gather_bayesian_gev_results_all(
                    city=city,
                    metric_id=metric_id,
                    return_periods=return_periods,
                    stationary=stationary,
                    prior_identifier=prior_identifier,
                    years=years,
                )
                gathered.to_csv(file_path, index=False)

CPU times: user 2min 39s, sys: 11.7 s, total: 2min 51s
Wall time: 15min 58s
