In [None]:
import os

from dask.distributed import Client

# Connect to an already-running Dask scheduler.
# The address below is specific to the session this notebook was last run in,
# so allow it to be overridden with the DASK_SCHEDULER_ADDRESS environment
# variable; the original hard-coded address is kept as the fallback.
scheduler_address = os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://127.0.0.1:33240")
client = Client(scheduler_address)
client

# Calculate BIC curves

This notebook creates the file BICs2-30.obj, which is required to reproduce Figure 2 from *Heuristic Methods for Determining the Number of Classes in Unsupervised Classification of Climate Models*, E. Boland et al. 2022 (doi to follow). It requires cluster_utils.py and input data files, which are accessed via the googleapi CMIP6 store (see cluster_utils.py for more info).

Please attribute any plots or code from this notebook using the DOI from Zenodo: to come

Updated Nov 2022
E Atkinson & E Boland [emmomp@bas.ac.uk](mailto:emmomp@bas.ac.uk)

In [1]:
import os

from dask.distributed import Client

# Connect to an already-running Dask scheduler.
# The address below is specific to the session this notebook was last run in,
# so allow it to be overridden with the DASK_SCHEDULER_ADDRESS environment
# variable; the original hard-coded address is kept as the fallback.
scheduler_address = os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://127.0.0.1:34573")
client = Client(scheduler_address)
client

KeyboardInterrupt: 

In [None]:
import numpy as np
import os
import pickle
import cluster_utils as flt

Uncomment the following two lines if you need to generate mask.npy:

In [None]:
# Uncomment the next two lines to (re)generate data/mask.npy before loading it.
#data = flt.retrieve_profiles(timeRange = slice('1995-01', '1995-02'))
#np.save('data/mask', data['n'])

# Load the previously saved mask; it is passed to flt.generate_trainingset below.
mask_file = 'data/mask.npy'
mask = np.load(mask_file, allow_pickle=True)

### User options
Leave as is to recreate the paper

In [None]:
# Ensemble members to use (one set of models is fitted per member)
ids = [
    'r1i1p1f2',
    'r2i1p1f2',
    'r3i1p1f2',
]

# Folder that all fitted models and BIC curves are written to
model_folder = 'model'

# Time range handed to flt.generate_trainingset
tslice = slice('1965-01', '1994-12')

# Number of PCA components
npca = 3

# Number of profiles per month to use in the training dataset
ntrain = 7000

### Fit 2-30 class models for each ensemble member
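Saves each PCA model to \[model_folder\]/\[member id\]/pca.obj, and each GMM model and BIC value to \[model_folder\]/\[member id\]/\[number of classes\]/gmm.obj and bic.obj. Note that the loop runs over `range(2, 30)`, i.e. 2 to 29 classes inclusive.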

Saves all BIC curves to \[model_folder\]/BICs2-30.obj

In [10]:
# Fit one GMM per (ensemble member, number of classes) and collect the BIC values.
#
# Results are cached on disk per member and per class count, so the cell can be
# interrupted and re-run: anything with an existing bic.obj is loaded, not refitted.
#
# NOTE(review): range(N_CLASSES_START, N_CLASSES_STOP) covers 2..29 inclusive
# (the stop value 30 is excluded), although the output file is named "2-30".
# The range and array length are kept as they were so BICs2-30.obj is unchanged.
N_CLASSES_START = 2
N_CLASSES_STOP = 30

BICs = {}
for m_id in ids:
    path_id = '{}/{}'.format(model_folder, m_id)
    os.makedirs(path_id, exist_ok=True)
    print('Starting {}'.format(m_id))
    options = {'memberId' : m_id}

    # bic is indexed directly by the number of classes, so entries 0 and 1
    # are never written and stay at 0.
    bic = np.zeros(N_CLASSES_STOP)

    # Pass 1: pick up BIC values cached by an earlier run and note what is missing.
    pending = []
    for n_classes in range(N_CLASSES_START, N_CLASSES_STOP):
        bic_file = '{}/{}/bic.obj'.format(path_id, n_classes)
        if os.path.exists(bic_file):
            with open(bic_file, 'rb') as file:
                bic[n_classes] = pickle.load(file)
        else:
            pending.append(n_classes)

    # Only build the training set when something still has to be fitted or the
    # PCA model has not been saved yet. This avoids repeating the data load for
    # members that are already complete, and leaves their existing pca.obj
    # untouched instead of replacing it with a newly fitted one.
    # NOTE(review): whether generate_trainingset returns the same PCA on every
    # call is not visible from this notebook.
    pca_file = '{}/pca.obj'.format(path_id)
    if pending or not os.path.exists(pca_file):
        # Load training set
        [data, pca] = flt.generate_trainingset(timeRange = tslice, mask=mask, options=options,N=ntrain,n_components=npca)

        with open(pca_file, 'wb') as file:
            pickle.dump(pca, file)

    print('Finished setup for {}'.format(m_id))

    # Pass 2: fit the models that are not cached yet.
    for n_classes in pending:
        path_n = '{}/{}'.format(path_id, n_classes)
        os.makedirs(path_n, exist_ok=True)

        gmm = flt.train_gmm(data, n_classes)
        bic[n_classes] = gmm.bic(data)

        with open('{}/gmm.obj'.format(path_n), 'wb') as file:
            pickle.dump(gmm, file)

        # bic.obj is written last: its presence marks this class count as done.
        with open('{}/bic.obj'.format(path_n), 'wb') as file:
            pickle.dump(bic[n_classes], file)

        print('Finished {} with {} classes'.format(m_id, n_classes))

    BICs[m_id] = bic

# Save the BIC curves of all members together in a single file.
with open('{}/BICs2-30.obj'.format(model_folder), 'wb') as file:
    pickle.dump(BICs, file)

print('Done!')

Starting r1i1p1f2


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


Finished setup for r1i1p1f2
Starting r2i1p1f2


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


KeyboardInterrupt: 