# Train CMIP6 large ensemble regional sea ice data with modes of variability from CVDP

In [1]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import scipy.signal as sig
from sklearn import linear_model
from sklearn import model_selection
import datetime
import warnings
import dask

In [34]:
#for running on Casper
from dask_jobqueue import PBSCluster
from dask.distributed import Client

# Job specification handed to dask-jobqueue for the 'casper' queue, scaled to
# 12 workers; the client attaches this notebook to that cluster.
cluster = PBSCluster(
    cores=1, memory='2GB', queue='casper', walltime='00:30:00',
)
cluster.scale(12)

client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36235 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://10.12.206.49:38194  Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/cwpowell/proxy/36235/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [2]:
#list of model names with CVDP >=30 members
model_names = [
    'CanESM5',
    'MIROC6',
    'GISS-E2-1-G',
    'IPSL-CM6A-LR',
    'CNRM-CM6-1',
    'NorCPM1',
]

#modeling center (institution ID) for each model
model_centers = {
    'CanESM5': 'CCCma',
    'MIROC6': 'MIROC',
    'GISS-E2-1-G': 'NASA-GISS',
    'IPSL-CM6A-LR': 'IPSL',
    'CNRM-CM6-1': 'CNRM-CERFACS',
    'NorCPM1': 'NCC',
}

#location of the ocean grid cell area (areacello) file for each model
areacello_paths = {
    'CanESM5': (
        '/glade/collections/cmip/CMIP6/ScenarioMIP/CCCma/CanESM5/'
        'ssp585/r1i1p1f1/Ofx/areacello/gn/v20190429/areacello/'
        'areacello_Ofx_CanESM5_ssp585_r1i1p1f1_gn.nc'
    ),
    'MIROC6': (
        '/glade/collections/cmip/CMIP6/CMIP/MIROC/MIROC6/historical/'
        'r1i1p1f1/Ofx/areacello/gn/v20190311/areacello/'
        'areacello_Ofx_MIROC6_historical_r1i1p1f1_gn.nc'
    ),
    'GISS-E2-1-G': (
        '/glade/collections/cmip/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/'
        'piControl/r1i1p1f1/Ofx/areacello/gn/v20180824/areacello/'
        'areacello_Ofx_GISS-E2-1-G_piControl_r1i1p1f1_gn.nc'
    ),
    'IPSL-CM6A-LR': (
        '/glade/collections/cmip/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/'
        'historical/r1i1p1f1/Ofx/areacello/gn/v20180803/areacello/'
        'areacello_Ofx_IPSL-CM6A-LR_historical_r1i1p1f1_gn.nc'
    ),
    'CNRM-CM6-1': (
        '/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFACS/CNRM-CM6-1/'
        'historical/r1i1p1f2/Ofx/areacello/gn/v20180917/areacello/'
        'areacello_Ofx_CNRM-CM6-1_historical_r1i1p1f2_gn.nc'
    ),
    'NorCPM1': (
        '/glade/work/cwpowell/low-frequency-variability/raw_data/'
        'masie_masks/areacello_Ofx_NorCPM1_piControl_r1i1p1f1_gn.nc'
    ),
}

#name of the latitude variable used by each model
lat_names = {
    'CanESM5': 'latitude',
    'MIROC6': 'latitude',
    'GISS-E2-1-G': 'lat',
    'IPSL-CM6A-LR': 'nav_lat',
    'CNRM-CM6-1': 'lat',
    'NorCPM1': 'latitude',
}

#names of the two horizontal grid dimensions used by each model
x_y_names = {
    'CanESM5': ['i', 'j'],
    'MIROC6': ['x', 'y'],
    'GISS-E2-1-G': ['lat', 'lon'],
    'IPSL-CM6A-LR': ['x', 'y'],
    'CNRM-CM6-1': ['x', 'y'],
    'NorCPM1': ['i', 'j'],
}

#doi of the reference publication for each model
doi_dict = {
    'CanESM5': '10.5194/gmd-12-4823-2019',
    'MIROC6': '10.5194/gmd-12-2727-2019',
    'GISS-E2-1-G': '10.1029/2019MS002025',
    'IPSL-CM6A-LR': '10.1029/2019MS002010',
    'CNRM-CM6-1': '10.1029/2019MS001683',
    'NorCPM1': '10.5194/gmd-12-343-2019',
}

#split train:~75%, test:~15%, validation:~10% (not used in this analysis)
mem_split = {
    'CanESM5': [47, 55, 64],
    'MIROC6': [36, 43, 49],
    'GISS-E2-1-G': [31, 37, 42],
    'IPSL-CM6A-LR': [23, 27, 31],
    'CNRM-CM6-1': [14, 17, 20],
    'NorCPM1': [21, 25, 29],
}

## Train the Lasso model on absolute SIC % anomaly values

In [3]:
def train_model_month(sea_ice_data, variability_data, month_, year_lags, 
                      train_test, start_end_yr, max_iteration, tolorence,
                      model_sel, alphas_list):
    '''
    Train and test Lasso regressions for a single month of regional sea ice
    data, using every calendar month of every CVDP mode of variability
    (except AMOC) as predictors. One regression is fitted for each
    combination of alpha, lag and region (regions 1 to 16).
        
    Input:
        sea_ice_data: xarray dataarray  
            Sea ice data e.g. lowpass filtered SIC or SIT from a given model,
            with member, year, month and region dimensions
        variability_data: xarray dataset
            Climate variable data for all members, one data variable per mode
            of variability, with member and (monthly) time dimensions. Must
            contain an 'AMOC' variable, which is dropped. The time axis is
            relabelled as the years 1920-2014
        month_: int
            The month of the sea ice data, e.g. 1 for January
        year_lags: list of ints
            List of number of years lagged over which to run the Lasso model
        train_test: list of ints
            Three member positions [x, y, z], applied after sorting by
            member. Training uses positions 0 to x-1 and testing uses
            positions y to z-1 (Python slice convention, end excluded), 
            e.g. [47,55,64] for a 65 member ensemble
        start_end_yr: list of ints
            List of the starting and ending years of analysis e.g. [1950,2014]
            Note that the starting year is that of the sea ice data which does
            not change with lag, CVDP data for a 10 year lag would use 1940 to
            2004 data
        max_iteration: int
            The maximum number of iterations used to fit the Lasso model.
            Integer valued floats such as 1e4 are accepted and cast to int
        tolorence: float
            The tolorence of the Lasso iterations
        model_sel: str
            The coefficient selection of the Lasso model, 'cyclic' or 'random'
        alphas_list: list of floats
            The values of alpha used in the Lasso model e.g. [1.0,1.5,2.0]
        
    Returns:
        tuple of two xarray dataarrays: the Lasso regression coefficients 
        with dimensions (alpha, lag, region, var_month), and the scores from 
        Lasso.score with dimensions (alpha, lag, region, train_test)
    '''
    months  = np.arange(1,13)
    regions = np.arange(1,17)
    
    ##################### reorganize sea ice and CVDP data #####################
    #the sea ice data is expected to already have year and month dimensions.
    #Members are sorted once here so the positional splits are reproducible
    target_data = sea_ice_data.sortby('member')

    #prepare CVDP data for analysis by creating year and month dimensions
    #out of the time dimension
    CVDP_year_month = variability_data.to_array('variable').sortby('time')

    month_seperate = []
    for i in months:
        temp_data = CVDP_year_month.sel(
            time=CVDP_year_month['time.month']==i)
        #N.B. assumes each month has exactly the 95 years 1920-2014
        temp_data['time'] = np.arange(1920,2015)
        month_seperate.append(temp_data)

    CVDP_data = xr.concat((month_seperate), dim='month')
    CVDP_data['month'] = months
    CVDP_data = CVDP_data.rename({'time':'year'})
        
    #remove AMOC as only some models contain that data, members of the sea ice
    #and CVDP data are assumed to match (they are paired by sorted position)
    CVDP_data = CVDP_data.drop_sel(variable='AMOC').sortby('member')
    
    #member and year selections which do not change inside the loops
    #NOTE(review): the year slices use strings as in the original analysis,
    #confirm this against the dtype of the year coordinate
    train_members = slice(0, train_test[0])
    test_members  = slice(train_test[1], train_test[2])
    target_years  = slice(str(start_end_yr[0]), str(start_end_yr[1]))
    
    #predictor names in the same (variable, month) order as the stacked data
    var_month_coords = [
        str(var_name_)+'_'+str(i).zfill(2)
        for var_name_ in CVDP_data['variable'].values for i in months
    ]
    
    def select_target(member_slice, region_):
        #sea ice targets for one region, one element per (member, year) pair
        return target_data.isel(member=member_slice).sel(month=month_).sel(
            year=target_years).sel(region=region_).stack(
            member_time=('member','year'))
    
    def stack_predictors(CVDP_subset):
        #reshape into (member_time, var_month): samples by predictors
        return CVDP_subset.stack(member_time=('member','year')).stack(
            var_month=('variable','month'))
    
    ######################### begin the lasso training #########################
    all_alphas_coefs_train = []
    all_alphas_score = []
    for alpha_val in alphas_list:

        all_lags_coefs_train = []
        all_lags_score = []
        for lag in year_lags:
            
            #the predictors only depend on the lag, so build them once here
            #rather than once for every region
            CVDP_lagged = CVDP_data.sel(
                year=slice(str(start_end_yr[0]-lag), 
                           str(start_end_yr[1]-lag)))
            
            CVDP_train_stacked = stack_predictors(
                CVDP_lagged.isel(member=train_members))
            CVDP_test_stacked  = stack_predictors(
                CVDP_lagged.isel(member=test_members))

            all_regions_coefs_train  = []
            all_regions_score = []
            for region_ in regions:

                target_train = select_target(train_members, region_)
                target_test  = select_target(test_members, region_)
                
                #run the lasso model, scikit-learn requires an integer
                #max_iter so values such as 1e4 are cast
                lasso_model = linear_model.Lasso(alpha=alpha_val, 
                                                 max_iter=int(max_iteration), 
                                                 tol=tolorence, 
                                                 selection=model_sel)
                
                lasso_fit = lasso_model.fit(X=CVDP_train_stacked, 
                                            y=target_train.T)

                #save the trained coefficients and the scores
                all_regions_coefs_train.append(xr.DataArray(
                    data=lasso_fit.coef_, 
                    coords={'var_month':var_month_coords}, 
                    dims=['var_month']))

                all_regions_score.append(xr.DataArray(
                    data=[lasso_model.score(X=CVDP_train_stacked, 
                                            y=target_train.T), 
                          lasso_model.score(X=CVDP_test_stacked, 
                                            y=target_test.T)],
                    coords={'train_test':['train','test']}, 
                    dims=['train_test'])
                )

            all_lags_coefs_train.append(xr.concat((all_regions_coefs_train),
                                                  dim='region'))
            all_lags_score.append(xr.concat((all_regions_score), dim='region'))

        all_alphas_coefs_train.append(xr.concat((all_lags_coefs_train),
                                                dim='lag'))
        all_alphas_score.append(xr.concat((all_lags_score), dim='lag'))
        
    coefs_xr = xr.concat((all_alphas_coefs_train), dim='alpha')
    score_xr = xr.concat((all_alphas_score), dim='alpha')
    
    #label the dimensions created by the concatenations
    for labelled in (coefs_xr, score_xr):
        labelled['region'] = regions
        labelled['lag'] = year_lags
        labelled['alpha'] = alphas_list
    
    return coefs_xr, score_xr

In [49]:
#ignore warnings of Lasso not converging, which occur even for high iterations
#and high tolorences
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore') 
###############################################################################
#train one Lasso model per calendar month for each GCM, computing the 12 months
#in parallel with dask, then save the coefficients and scores with metadata
for model_name in ['MIROC6', 'IPSL-CM6A-LR',
                'CNRM-CM6-1', 'NorCPM1']:#model_names[1:]:
    print(datetime.datetime.now(), model_name)
    
    CMIP6_data = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/input_data/Regional_'\
        +'SIC_SIT_detrended_lowpass_{}_1920_2014.nc'.format(model_name)
    )
    
    CVDP_data = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/input_data/'\
        +'CVDP_standardized_1920_2014_historical_{}.nc'.format(model_name)
    )    
    
    #max_iteration is an int as scikit-learn rejects a float max_iter
    lasso_compute_list = []
    for month_ in np.arange(1,13):
        lasso_compute_list.append(dask.delayed(train_model_month)(
            sea_ice_data = CMIP6_data['SIT'], variability_data = CVDP_data,
            month_ = month_, year_lags = np.arange(2,21), 
            train_test = mem_split[model_name], start_end_yr = [1941,2014], 
            max_iteration = 10000, tolorence = 1e-3, 
            model_sel = 'random', alphas_list = [1.5,2.0,2.5])
                                 )

    #do the simultaneous computation on all months 
    lasso_computed = dask.compute(*lasso_compute_list)

    #each result is a (coefficients, scores) tuple, in month order
    coefs  = [month_result[0] for month_result in lasso_computed]
    scores = [month_result[1] for month_result in lasso_computed]

    coefs_xr = xr.concat((coefs), dim='month')
    scores_xr = xr.concat((scores), dim='month')

    coefs_xr['month'] = np.arange(1,13)
    scores_xr['month'] = np.arange(1,13)

    coefs_attrs = {
        'Description': 'Multiple regression coefficients using the Lasso '\
            +'method, trained on average regional sea ice thickness (SIT) '\
            +'and modes of climate variability for the model '\
            +'{}. Regions as defined for NSIDC MASIE-NH '.format(model_name)\
            +'Version 1, modes of variability are obtained from the Climate '\
            +'Variability Diagnostics Package (CVDP). Training on the first '\
            +'75% of members for each region and month of SIT data for '\
            +'1941-2014 using historical CMIP6 forcing with CVDP data lagged '\
            +'between 2 and 20 years for each month of the mode of '\
            +'variability. Hyperparameters: alpha=[1.5,2.0,2.5], random '\
            +'rather than cyclic Lasso model training, maximum iteration of '\
            +'1e4 and a tolorence of 1e-3.', 
        'Timestamp'  : str(datetime.datetime.utcnow().strftime(
            "%H:%M UTC %a %Y-%m-%d")),
        'Data source': '{}, doi:{} . '.format(model_name, doi_dict[model_name])\
            +'Climate Variability Diagnostics Package, '\
            +'doi:10.1002/2014EO490002. NSIDC MASIE-NH Regions, '\
            +'doi:10.7265/N5GT5K3K.', 
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            +'blob/main/lasso/Train_CMIP6_CVDP.ipynb'
    }

    coefs_xr.attrs = coefs_attrs

    coefs_xr.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +'lasso_coefs_scores/SIT_CVDP_Lasso_Coefs_{}_'.format(model_name)\
        +'1941_2014_lag_2_20.nc')

    #the scores share all metadata with the coefficients except the description
    #N.B. no trailing comma after the string, that would store a tuple
    scores_attrs = coefs_attrs.copy()
    scores_attrs['Description'] = 'Multiple regression scores using the Lasso '\
            +'method, trained and tested on average regional sea ice '\
            +'thickness (SIT) and modes of climate variability for the'\
            +' model {}. Regions as defined for NSIDC '.format(model_name)\
            +'MASIE-NH Version 1, modes of variability are obtained from the '\
            +'Climate Variability Diagnostics Package (CVDP). Training on the '\
            +'first 75% of members and testing on the final 15% of member for '\
            +'each region and month of SIT data for 1941-2014 using '\
            +'historical CMIP6 forcing with CVDP data lagged between 2 and 20 '\
            +'years for each month of the mode of variability. '\
            +'Hyperparameters: alpha=[1.5,2.0,2.5], random rather than cyclic '\
            +'Lasso model training, maximum iteration of 1e4 and a tolorence '\
            +'of 1e-3.'
    scores_xr.attrs = scores_attrs

    scores_xr.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +'lasso_coefs_scores/SIT_CVDP_Lasso_Scores_{}_'.format(model_name)\
        +'1941_2014_lag_2_20.nc')

2022-07-24 22:35:38.114850 MIROC6
2022-07-24 22:48:10.117707 IPSL-CM6A-LR
2022-07-24 22:52:53.416385 CNRM-CM6-1
2022-07-24 22:55:50.307458 NorCPM1


## Train the Lasso model on relative SIC % point anomalies from initialization

In [164]:
def train_model_month_relative(
    sea_ice_data, variability_data, month_, year_lags, train_test, 
    start_end_yr, max_iteration, tolorence, model_sel, alphas_list, relative=False
):
    '''
    Train and test Lasso regressions for a single month of regional sea ice
    data, or of its change over the lag period, using every calendar month of
    every CVDP mode of variability (except AMOC) as predictors. One 
    regression is fitted for each combination of alpha, lag and region, for 
    the regions 2, 3, 4, 5 and 11 only. Only members present in both the sea 
    ice and CVDP data are used.
        
    Input:
        sea_ice_data: xarray dataarray  
            Sea ice data e.g. lowpass filtered SIC or SIT from a given model,
            with member, year, month and region dimensions
        variability_data: xarray dataset
            Climate variable data for all members, one data variable per mode
            of variability, with member and (monthly) time dimensions. Must
            contain an 'AMOC' variable, which is dropped. The time axis is
            relabelled as the years 1920-2014
        month_: int
            The month of the sea ice data, e.g. 1 for January
        year_lags: list of ints
            List of number of years lagged over which to run the Lasso model
        train_test: list of ints
            Three member positions [x, y, z], applied to the sorted common
            members. Training uses positions 0 to x-1 and testing uses
            positions y to z-1 (Python slice convention, end excluded), 
            e.g. [47,55,64] for a 65 member ensemble
        start_end_yr: list of ints
            List of the starting and ending years of analysis e.g. [1950,2014]
            Note that the starting year is that of the sea ice data which does
            not change with lag, CVDP data for a 10 year lag would use 1940 to
            2004 data
        max_iteration: int
            The maximum number of iterations used to fit the Lasso model.
            Integer valued floats such as 1e4 are accepted and cast to int
        tolorence: float
            The tolorence of the Lasso iterations
        model_sel: str
            The coefficient selection of the Lasso model, 'cyclic' or 'random'
        alphas_list: list of floats
            The values of alpha used in the Lasso model e.g. [1.0,1.5,2.0]
        relative: bool
            If True the target is the sea ice value minus its value lag years
            earlier, otherwise (default) the sea ice value itself is used
        
    Returns:
        tuple of two xarray dataarrays: the Lasso regression coefficients 
        with dimensions (alpha, lag, region, var_month), and the scores from 
        Lasso.score with dimensions (alpha, lag, region, train_test)
    '''
    months  = np.arange(1,13)
    regions = [2,3,4,5,11] #subset of np.arange(1,17)
    
    ##################### reorganize sea ice and CVDP data #####################
    #prepare CVDP data for analysis by creating year and month dimensions
    #out of the time dimension, the sea ice data is expected to have them
    CVDP_year_month = variability_data.to_array('variable').sortby('time')

    month_seperate = []
    for i in months:
        temp_data = CVDP_year_month.sel(
            time=CVDP_year_month['time.month']==i)
        #N.B. assumes each month has exactly the 95 years 1920-2014
        temp_data['time'] = np.arange(1920,2015)
        month_seperate.append(temp_data)

    CVDP_data = xr.concat((month_seperate), dim='month')
    CVDP_data['month'] = months
    CVDP_data = CVDP_data.rename({'time':'year'})

    #make subsets of the datasets with only members in both sea ice and CVDP,
    #sorted so that the positional member splits below are reproducible
    common_mem = np.intersect1d(sea_ice_data['member'], CVDP_data['member'])

    target_data = sea_ice_data.sel(member=common_mem).sortby('member')
    CVDP_data = CVDP_data.sel(member=common_mem).sortby('member')
    
    #remove AMOC as only CanESM5 and GISS-E2-1-G contain that data
    CVDP_data = CVDP_data.drop_sel(variable='AMOC')
    
    #member and year selections which do not change inside the loops
    #NOTE(review): the year slices use strings as in the original analysis,
    #confirm this against the dtype of the year coordinate
    train_members = slice(0, train_test[0])
    test_members  = slice(train_test[1], train_test[2])
    target_years  = slice(str(start_end_yr[0]), str(start_end_yr[1]))
    
    #predictor names in the same (variable, month) order as the stacked data
    var_month_coords = [
        str(var_name_)+'_'+str(i).zfill(2)
        for var_name_ in CVDP_data['variable'].values for i in months
    ]
    
    def stack_predictors(CVDP_subset):
        #reshape into (member_time, var_month): samples by predictors
        return CVDP_subset.stack(member_time=('member','year')).stack(
            var_month=('variable','month'))
    
    ######################### begin the lasso training #########################
    all_alphas_coefs_train = []
    all_alphas_score = []
    for alpha_val in alphas_list:

        all_lags_coefs_train = []
        all_lags_score = []
        for lag in year_lags:
            
            lagged_years = slice(str(start_end_yr[0]-lag), 
                                 str(start_end_yr[1]-lag))
            
            #targets for this lag: either the sea ice values, or their change
            #since lag years earlier (.values so years are not aligned)
            target_lag = target_data.sel(year=target_years)
            if relative:
                target_lag = target_lag \
                    - target_data.sel(year=lagged_years).values
            
            #the predictors only depend on the lag, so build them once here
            #rather than once for every region
            CVDP_lagged = CVDP_data.sel(year=lagged_years)
            
            CVDP_train_stacked = stack_predictors(
                CVDP_lagged.isel(member=train_members))
            CVDP_test_stacked  = stack_predictors(
                CVDP_lagged.isel(member=test_members))

            all_regions_coefs_train  = []
            all_regions_score = []
            for region_ in regions:

                #one target element per (member, year) pair
                target_train = target_lag.isel(member=train_members).sel(
                    month=month_).sel(region=region_).stack(
                    member_time=('member','year'))
                
                target_test  = target_lag.isel(member=test_members).sel(
                    month=month_).sel(region=region_).stack(
                    member_time=('member','year'))

                #run the lasso model, scikit-learn requires an integer
                #max_iter so values such as 1e4 are cast
                lasso_model = linear_model.Lasso(alpha=alpha_val, 
                                                 max_iter=int(max_iteration), 
                                                 tol=tolorence, 
                                                 selection=model_sel)
                
                lasso_fit = lasso_model.fit(X=CVDP_train_stacked, 
                                            y=target_train.T)

                #save the trained coefficients and the scores
                all_regions_coefs_train.append(xr.DataArray(
                    data=lasso_fit.coef_, 
                    coords={'var_month':var_month_coords}, 
                    dims=['var_month']))

                all_regions_score.append(xr.DataArray(
                    data=[lasso_model.score(X=CVDP_train_stacked, 
                                            y=target_train.T), 
                          lasso_model.score(X=CVDP_test_stacked, 
                                            y=target_test.T)],
                    coords={'train_test':['train','test']}, 
                    dims=['train_test'])
                )

            all_lags_coefs_train.append(xr.concat((all_regions_coefs_train),
                                                  dim='region'))
            all_lags_score.append(xr.concat((all_regions_score), dim='region'))

        all_alphas_coefs_train.append(xr.concat((all_lags_coefs_train),
                                                dim='lag'))
        all_alphas_score.append(xr.concat((all_lags_score), dim='lag'))
        
    coefs_xr = xr.concat((all_alphas_coefs_train), dim='alpha')
    score_xr = xr.concat((all_alphas_score), dim='alpha')
    
    #label the dimensions created by the concatenations
    for labelled in (coefs_xr, score_xr):
        labelled['region'] = regions
        labelled['lag'] = year_lags
        labelled['alpha'] = alphas_list
    
    return coefs_xr, score_xr

In [6]:
#train the October SIC model on the subset of regions for each GCM, keeping the
#coefficients and scores in dictionaries keyed by model name
region_coefs = {}
region_scores = {}
for model_name in ['CanESM5', 'MIROC6', 'IPSL-CM6A-LR', 'GISS-E2-1-G',
                   'CNRM-CM6-1', 'NorCPM1']:
    
    print(datetime.datetime.now(), model_name)
    
    CMIP6_data = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/input_data/Regional_'\
        +'SIC_SIT_detrended_lowpass_{}_1920_2014.nc'.format(model_name)
    )
    
    CVDP_data = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/input_data/'\
        +'CVDP_standardized_1920_2014_historical_{}.nc'.format(model_name)
    )    

    #N.B. replaced mem_split[model_name] with [14,17,20] to do an initial 
    #subsampling test. max_iteration is an int as scikit-learn rejects a 
    #float max_iter
    region_model = train_model_month_relative(
        sea_ice_data = CMIP6_data['SIC'], variability_data = CVDP_data,
        month_ = 10, year_lags = np.arange(2,21), 
        train_test = [14,17,20], start_end_yr = [1941,2014], 
        max_iteration = 10000, tolorence = 1e-3, 
        model_sel = 'random', alphas_list = [0.10,0.50,0.75,1.00],
        relative=False)
    
    #region_model is a (coefficients, scores) tuple
    region_coefs[model_name]= region_model[0]
    region_scores[model_name]= region_model[1]

2022-08-01 15:56:50.277699 CanESM5
2022-08-01 15:57:30.318027 MIROC6
2022-08-01 15:58:04.216938 IPSL-CM6A-LR
2022-08-01 15:58:33.688323 GISS-E2-1-G
2022-08-01 15:59:04.909930 CNRM-CM6-1
2022-08-01 15:59:29.016283 NorCPM1


In [8]:
#save the subset region results, each dictionary key (model name) becomes a
#data variable in the saved file
xr.Dataset(region_coefs).to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/lasso_coefs_scores/'
    'Subset_regions_SIC_coefs_October_subsamp_20.nc'
)

xr.Dataset(region_scores).to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/lasso_coefs_scores/'
    'Subset_regions_SIC_scores_October_subsamp_20.nc'
)

# Train model on first 75% of members of all models

In [4]:
#CMIP6 metadata file, its 'model' coordinate supplies the list of model names
#which are combined further down
CMIP6_info = xr.open_dataset(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_info/'
    'CMIP6_modeling_center_members_doi.nc'
)

In [5]:
#number of [training, testing, validation] members for each ensemble size,
#keyed by the ensemble size as a string (the three counts sum to that size)
split_dict = {
    str(sum(counts)): list(counts)
    for counts in (
        (1, 1, 0),
        (2, 1, 0),
        (2, 1, 1),
        (3, 1, 1),
        (4, 1, 1),
        (7, 2, 1),
        (8, 2, 1),
        (12, 2, 2),
        (15, 3, 2),
        (18, 3, 2),
        (22, 5, 3),
        (24, 5, 3),
        (32, 7, 4),
        (38, 7, 5),
        (49, 10, 6),
    )
}

In [7]:
#first make a dataset of all CMIP6 and CVDP data
#then rename the member coordinate to be an integer ID which encodes the split:
#1000 onwards for training, 2000 onwards for testing and 3000 onwards for 
#validation members, so that sorting by member groups the three splits
#NOTE(review): single member models are now rotated through train, test and
#validation (previously the test branch could never be reached), so any 
#hard-coded positional split of the combined members must be re-derived

var_ = 'SIC'

all_CMIP6_data = []
all_CVDP_data  = []

#next unused member ID for each split
train_iter = 1000
test_iter  = 2000
valid_iter = 3000

single_mem_count = 0

for model_name in CMIP6_info['model'].drop_sel(model='CAS-ESM2-0').values:
    #models without both input files are reported and skipped
    try:
        CMIP6_data = xr.open_dataset(
            '/glade/work/cwpowell/low-frequency-variability/input_data/'\
            +f'Regional_SIC_detrended_lowpass_{model_name}_1920_2014.nc'
        )      
        
        CVDP_data = xr.open_dataset(
            '/glade/work/cwpowell/low-frequency-variability/input_data/'\
            +f'CVDP_standardized_1920_2014_historical_{model_name}.nc'
        )  
    except (FileNotFoundError):
        print(model_name, 'File Not Found')
        continue
        
    #restrict these two models to an explicit set of members
    if model_name == 'CNRM-ESM2-1':
        CMIP6_data = CMIP6_data.sel(member=['r11i1p1f2', 'r1i1p1f2', 
            'r2i1p1f2', 'r3i1p1f2', 'r4i1p1f2', 'r5i1p1f2'])
        
        CVDP_data = CVDP_data.sel(member=['r11i1p1f2', 'r1i1p1f2', 
            'r2i1p1f2', 'r3i1p1f2', 'r4i1p1f2', 'r5i1p1f2'])
    
    elif model_name == 'UKESM1-0-LL':
        CMIP6_data = CMIP6_data.drop_sel(member=['r13i1p1f2', 'r14i1p1f2'])
        CVDP_data = CVDP_data.drop_sel(member=['r13i1p1f2', 'r14i1p1f2'])
        
    #sort both by member so the same new IDs pair up the same members
    CMIP6_data = CMIP6_data[var_].sortby('member')
    CVDP_data = CVDP_data.sortby('member')
    
    #get number of members
    num_mem = len(CMIP6_data['member'].values)
    
    assert num_mem == len(CVDP_data['member'].values), \
        f'{model_name}: sea ice and CVDP member counts differ'
    
    #number of [training, testing, validation] members for this model
    if num_mem == 1:
        #rotate single member models through train, test, validation.
        #N.B. the brackets are needed as % binds tighter than +
        single_mem_count += 1
        
        if (single_mem_count+1) % 3 == 0:
            mem_counts = [0, 1, 0]
        elif single_mem_count % 3 == 0:
            mem_counts = [0, 0, 1]
        else:
            mem_counts = [1, 0, 0]
               
    else:
        if str(num_mem) not in split_dict:
            raise KeyError(
                f'no split defined in split_dict for {model_name} '\
                +f'with {num_mem} members')
        mem_counts = split_dict[str(num_mem)]
    
    #take the next unused IDs from each split, all models use and then advance
    #the counters the same way so that no ID is assigned twice
    mem_list = np.concatenate(
        (np.arange(train_iter, train_iter+mem_counts[0]),
         np.arange(test_iter, test_iter+mem_counts[1]),
         np.arange(valid_iter, valid_iter+mem_counts[2])
        )
    )

    train_iter = train_iter + mem_counts[0]
    test_iter = test_iter + mem_counts[1]
    valid_iter = valid_iter + mem_counts[2]
    
    CMIP6_data['member'] = mem_list
    CVDP_data['member'] = mem_list
        
    all_CMIP6_data.append(CMIP6_data)
    all_CVDP_data.append(CVDP_data)
        
    

AWI-CM-1-1-MR File Not Found
AWI-ESM-1-1-LR File Not Found
CNRM-CM6-1 File Not Found
CNRM-CM6-1-HR File Not Found
E3SM-1-0 File Not Found
GFDL-CM4 File Not Found
GISS-E2-1-G-CC File Not Found
HadGEM3-GC31-MM File Not Found
IITM-ESM File Not Found
KACE-1-0-G File Not Found
KIOST-ESM File Not Found
MCM-UA-1-0 File Not Found
NorESM2-LM File Not Found


In [8]:
#combine every model along the member dimension, ordered by the member ID,
#and save the combined sea ice and CVDP data
CMIP6_all_mem = xr.concat(all_CMIP6_data, dim='member').sortby('member')
CVDP_all_mem  = xr.concat(all_CVDP_data, dim='member').sortby('member')

CMIP6_all_mem.to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/'
    'input_data/Regional_SIC_detrended_lowpass_all_CMIP6_1920_2014.nc'
)

CVDP_all_mem.to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/'
    'input_data/CVDP_standardized_1920_2014_historical_all_CMIP6_1920_2014.nc'
)

In [48]:
#N.B. takes ~10 minutes to run all members with 2GB x 12 dask workers

# Queue one lazy training task per calendar month (1-12); nothing is
# evaluated until dask.compute is called below.
lasso_compute_list = []
for month_ in np.arange(1, 13):
    monthly_task = dask.delayed(train_model_month)(
        sea_ice_data=CMIP6_all_mem,
        variability_data=CVDP_all_mem,
        month_=month_,
        year_lags=np.arange(2, 21),
        train_test=[306, 307, 381],
        start_end_yr=[1941, 2014],
        max_iteration=1e5,
        tolorence=1e-3,
        model_sel='random',
        alphas_list=[0.001],
    )
    lasso_compute_list.append(monthly_task)

# Evaluate all twelve monthly tasks in a single dask.compute call.
lasso_computed = dask.compute(*lasso_compute_list)

# Each monthly result is indexed as [0] -> coefficients, [1] -> scores.
coefs, scores = [], []
for month_ in np.arange(1, 13):
    month_result = lasso_computed[month_ - 1]
    coefs.append(month_result[0])
    scores.append(month_result[1])

# Join the monthly pieces along a new 'month' dimension ...
coefs_xr = xr.concat(coefs, dim='month')
scores_xr = xr.concat(scores, dim='month')

# ... and label that dimension with the calendar month numbers 1-12.
coefs_xr['month'] = np.arange(1, 13)
scores_xr['month'] = np.arange(1, 13)

# Provenance metadata stored as global attributes on the coefficient file.
# NOTE(review): the "first 75% of members" wording below is free text and is
# not derived from the train_test indices used for training -- confirm that
# the two still agree.
coefs_attrs = {
    'Description': 'Multiple regression coefficients using the Lasso '\
        +'method, trained on average regional sea ice concentration (SIC) '\
        +'and modes of climate variability for all availible CMIP6 models. '\
        +'Regions as defined for NSIDC MASIE-NH '\
        +'Version 1, modes of variability are obtained from the Climate '\
        +'Variability Diagnostics Package (CVDP). Training on the first '\
        +'75% of members for each region and month of SIC data for '\
        +'1941-2014 using historical CMIP6 forcing with CVDP data lagged '\
        +'between 2 and 20 years for each month of the mode of '\
        +'variability. Hyperparameters: alpha=[0.001], '\
        +'random rather than cyclic Lasso model training, maximum iteration '\
        +'of 1e5 and a tolorence of 1e-3.', 
    # Timezone-aware replacement for datetime.utcnow(), which is deprecated
    # since Python 3.12; the formatted text is unchanged. strftime already
    # returns a str, so no extra str() wrapper is needed.
    'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
        "%H:%M UTC %a %Y-%m-%d"),
    'Data source': 'CMIP6 historical model output, '\
        +'Climate Variability Diagnostics Package, '\
        +'doi:10.1002/2014EO490002. NSIDC MASIE-NH Regions, '\
        +'doi:10.7265/N5GT5K3K.', 
    'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
        +'blob/main/lasso/Train_CMIP6_CVDP.ipynb'
}

coefs_xr.attrs = coefs_attrs

# Write the monthly Lasso coefficients, with their metadata, to disk.
coefs_xr.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
    +'lasso_coefs_scores/SIC_CVDP_Lasso_Coefs_all_CMIP6_models_'\
    +'1941_2014_lag_2_20_very_low_alpha_higher_it.nc')

# Reuse the coefficient metadata (timestamp, data source, analysis link) and
# replace only the description for the scores file.
scores_attrs = coefs_attrs.copy()

# The description is wrapped in parentheses with no trailing comma: a trailing
# comma after the last fragment would make this a 1-tuple rather than a str,
# so the attribute would not be stored as plain text. The variable named here
# is sea ice concentration (SIC), matching the data and the output file name.
scores_attrs['Description'] = (
    'Multiple regression scores using the Lasso '
    +'method, trained and tested on average regional sea ice '
    +'concentration (SIC) and modes of climate variability for all availible '
    +'CMIP6 models. Regions as defined for NSIDC '
    +'MASIE-NH Version 1, modes of variability are obtained from the '
    +'Climate Variability Diagnostics Package (CVDP). Training on the '
    +'first 75% of members and testing on the final 15% of member for '
    +'each region and month of SIC data for 1941-2014 using '
    +'historical CMIP6 forcing with CVDP data lagged between 2 and 20 '
    +'years for each month of the mode of variability. '
    +'Hyperparameters: alpha=[0.001], random rather than '
    +'cyclic Lasso model training, maximum iteration of 1e5 and a '
    +'tolorence of 1e-3.'
)
scores_xr.attrs = scores_attrs

# Write the monthly Lasso scores, with their metadata, to disk.
# NOTE(review): this file name has no '_higher_it' suffix, unlike the
# coefficient file written in the same cell -- confirm that is intentional.
scores_xr.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
    +'lasso_coefs_scores/SIC_CVDP_Lasso_Scores_all_CMIP6_models_'\
    +'1941_2014_lag_2_20_very_low_alpha.nc')