# Export basin-aggregated data
To compile drought indices at the river-basin scale, we need to aggregate the glacial runoff for each basin and export it.  We produce one CSV file for each combination of GCM, SSP, and basin.  The columns of each CSV file correspond to the three glacier models compared here (GloGEM, PyGEM, and OGGM).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from datetime import date
import collections
import datetime
import itertools
import json
import os
import glob

In [None]:
## SSP scenario labels, as they appear in the model output paths
SSPpaths = [
    'ssp126',
    'ssp245',
    'ssp370',
    'ssp585',
]

In [None]:
## Names of the GCMs considered, one per line for easy editing
modelnames = [
    'BCC-CSM2-MR',
    'CAMS-CSM1-0',
    'CESM2',
    'CESM2-WACCM',
    'EC-Earth3',
    'EC-Earth3-Veg',
    'FGOALS-f3-L',
    'GFDL-ESM4',
    'INM-CM4-8',
    'INM-CM5-0',
    'MPI-ESM1-2-HR',
    'MRI-ESM2-0',
]

## EDIT THESE to reflect your use case (fpath and which basins)

In [None]:
## Path to each model's data folder.  These are machine-specific: edit them for your setup.
## fpath_glogem is joined to sub-folders by plain string concatenation further down,
## so keep the trailing '/'.
fpath_glogem = '/Volumes/GoogleDrive/My Drive/Runoff-intercomparison/GloGEM-output/'
fpath_pygem = '/Volumes/GoogleDrive/My Drive/Runoff-intercomparison/PyGEM/'
fpath_oggm = '/Volumes/GoogleDrive/My Drive/Runoff-intercomparison/OGGM/lschuster/runs_2023.3/output/basins/gcm_from_2000_bc_2000_2019/'

## Path where the processed output will go.  The output filename is appended directly
## to this string, so keep the trailing '/' here too.
out_fpath = '/Users/lizz/Documents/Research/Runoff-intercomparison/basin_aggregated/'

In [None]:
## Fill with the basins you want: basin name (all caps) -> GRDC basin code (as a string)
which_basins = {
    'RHONE': '6243',
}

### Make a list of the relevant glaciers

In [None]:
def select_glaciers_json(basin='all', fpath=None):
    '''
    Select glaciers within a basin by MRBID from a json-file,
    which is stored in the data directory.

    Args:
    -----
    basin: str
        String of MRBID or 'all' (the check for 'all' is case-insensitive).
        Non-string codes (e.g. the int 6243) are converted with str(),
        because keys loaded from a json-file are always strings.
    fpath: str or path-like, optional
        Location of the json-file mapping each MRBID to its list of glacier
        IDs.  Default None uses the hard-coded local copy of
        rgi_ids_per_basin.json, as before.

    Returns:
    --------
    If basin is 'all' a list of all relevant glaciers is returned, for
    initiating glacier simulations. If basin is a MRBID the list of glaciers
    within that basin is returned.

    Raises:
    -------
    KeyError
        If basin is not 'all' and is not a key of the json-file.

    Copy of a function written by Erik Holmgren (2022) in holmgren_gha.utils
    '''
    if fpath is None:
        fpath = '/Users/lizz/Documents/Research/Runoff-intercomparison/msc_thesis-multi_gcm/code/data/rgi_ids_per_basin.json' ## correct for local run

    with open(fpath, encoding='utf-8') as f:
        basin_dict = json.load(f)

    # json keys are strings, so normalise e.g. an integer basin code
    basin = str(basin)

    if basin.lower() == 'all':
        # Flatten the per-basin lists into a single list of glaciers
        return list(itertools.chain.from_iterable(basin_dict.values()))

    return basin_dict[basin]

In [None]:
## Look up the glaciers belonging to every requested basin, keyed by basin name
basin_gls = {}
for basin in which_basins:
    code = which_basins[basin]
    basin_gls[basin] = select_glaciers_json(code)

### Process GloGEM data

In [None]:
def sum_basin_glogem(basin_RGI_list, runoff_data, transpose=False):
    """
    Sum GloGEM-derived runoff data from a basin, given a list of RGI IDs within
    the basin.  Note this works differently if using for a single basin, GCM, SSP
    (as in the Export-BasinData notebook) versus in a loop with many basins, GCMs,
    SSPs (as in CentralEurope-3ModelAgg).

    Inputs:
        basin_RGI_list: list, output from select_glaciers_json
            Each entry is converted to an integer glacier number by taking
            everything after the last '.' (e.g. 'RGI60-11.01234' -> 1234).
        runoff_data : df or Series, per-glacier runoff data
            Must be indexed by integer glacier number (after the optional
            transpose).
        transpose: bool, optional
            Default False reflects single-case usage as in Export-BasinData
            Recover earlier utility, as in CentralEurope-3ModelAgg, by
            setting to True.

    Output:
        The result of summing the selected rows: a Series (one value per
        column) when runoff_data is a DataFrame, a scalar when it is a Series.
        Glaciers in basin_RGI_list that are absent from runoff_data are
        skipped silently.
    """
    # Convert RGI IDs to the integer glacier numbers used as the runoff index.
    # Take the whole part after the last '.', not a fixed 4 characters, so that
    # glacier numbers >= 10000 are not truncated onto a different glacier.
    glacier_numbers = [int(str(rgi_id).rpartition('.')[2]) for rgi_id in basin_RGI_list]

    if transpose:
        runoff_data = runoff_data.transpose()

    # Keep only the glaciers actually present in the data, to avoid a KeyError
    present = [num for num in glacier_numbers if num in runoff_data.index]

    # Extract those glaciers and sum them
    summed_basin_runoff = runoff_data.loc[present].sum()

    return summed_basin_runoff

In [None]:
def glogem_agg_runoff(basin_name, region_folder, region_fname, GCM, SSP, debug=False):
    """Compile a monthly time series of runoff, summed over
    all glaciers in a given river basin.

    Reads the GloGEM '<region_fname>_Discharge_r1.dat' and
    '<region_fname>_Area_r1.dat' files for the requested GCM and SSP from
    under the module-level fpath_glogem, multiplies each glacier's discharge
    by its initial (1980) area, and sums over the glaciers of the basin.

    Inputs:
        basin_name : str
            The name of the basin, in all caps as in the List_MultiRegionBasins dict.
            Must be a key of the module-level which_basins dict.
        region_folder: str
            The part of the filepath that identifies the RGI region
        region_fname: str
            The part of the input filename that identifies the regional discharge data
        GCM : str
            The part of the filepath that names the GCM considered.
            All valid choices are in modelnames list
        SSP: str
            Which SSP scenario we consider.
            Valid choices are 'ssp126','ssp245','ssp370','ssp585'
        debug: bool, optional, default False
            Whether to return intermediate output for examination
            during debugging.  Default behavior is to return only
            the aggregated runoff series.

    Output:
        rs: pandas Series
            Monthly runoff from all glaciers in this basin added together.
        If debug is True, a tuple (rs, runoff, df_area_init, discharge_df)
        is returned instead, where runoff is the per-glacier runoff,
        df_area_init the per-glacier 1980 area in m2, and discharge_df the
        discharge table as read from file (with monthly column labels).
    """
    # Both input files share the same folder and filename stem
    run_prefix = fpath_glogem + region_folder + GCM + '/' + SSP + '/' + region_fname

    # Raw string for the regex separator: '\s' is not a valid Python string escape
    discharge_df = pd.read_csv(run_prefix + '_Discharge_r1.dat',
                               sep=r'\s+', header=None, skiprows=1, index_col=0)
    area_df = pd.read_csv(run_prefix + '_Area_r1.dat',
                          sep=r'\s+', index_col="ID")

    # Monthly (month-start) labels covering January 1980 through December 2100
    start_date = datetime.date(1980, 1, 1)
    end_date = datetime.date(2100, 12, 1)
    month_index = pd.date_range(start_date, end_date, freq='MS')

    discharge_df.columns = month_index
    # Repeat each area column 12 times so the area table lines up with the
    # monthly labels (assumes one area column per year -- TODO confirm)
    area_df = area_df[area_df.columns.repeat(12)]
    area_df.columns = month_index

    df_area_init = area_df.loc[:, '1980-01-01'].mul(1e6) ## convert from km2 to m2

    # Scale every glacier's discharge row by that glacier's initial area
    runoff = discharge_df.mul(df_area_init, axis=0)

    ## Aggregate using sum_basin, and convert from m3 to km3
    rsum = sum_basin_glogem(select_glaciers_json(which_basins[basin_name]), runoff) * 1e-9

    if debug:
        return rsum, runoff, df_area_init, discharge_df
    return rsum

In [None]:
## Trial run for the Rhone with the first GCM and first SSP, keeping the intermediate output
rs, rhone_tester, da, dd = glogem_agg_runoff(
    basin_name='RHONE',
    region_folder='RGI11-CentralEurope/files/',
    region_fname='centraleurope',
    GCM=modelnames[0],
    SSP=SSPpaths[0],
    debug=True,
)

In [None]:
## Display the basin-summed monthly runoff series from the trial run
rs

In [None]:
## Add up the monthly values within each year and plot the annual totals.
## NOTE(review): recent pandas versions deprecate the 'A' alias in favour of 'YE' --
## confirm against the installed pandas before changing it.
rs.resample('A').sum().plot()

### Process PyGEM data
Read in the PyGEM output as usual, then select the column corresponding to the desired GCM and SSP.

### Process OGGM data
Read in the OGGM output as usual, then select the column corresponding to the desired GCM and SSP.

### Write the output

In [None]:
## Build the output filename from the GCM, SSP, and basin being written out
## (GCM, ssp, and basin_name must already be defined, e.g. by an enclosing nested loop)
out_fname = f'{out_fpath}runoff_{GCM}_{ssp}_{basin_name}.csv'