# Calculate BIC curves

This notebook will create the file BICs2-30.obj, required to reproduce Figure 2 from *Heuristic Methods for Determining the Number of Classes in Unsupervised Classification of Climate Models*, E. Boland et al. 2022 (doi to follow). Running it requires cluster_utils.py and input data files accessed via the googleapi CMIP6 store (see cluster_utils.py for more info).

Please attribute any plots or code from this notebook using the DOI from Zenodo: to come

Updated Nov 2022
E Atkinson & E Boland [emmomp@bas.ac.uk](mailto:emmomp@bas.ac.uk)

In [1]:
from dask.distributed import Client

# Attach to an already-running Dask scheduler at a fixed local address.
# NOTE(review): the port looks session-specific — update it to match your own scheduler.
scheduler_address = "tcp://127.0.0.1:46506"
client = Client(scheduler_address)
client

0,1
Connection method: Direct,
Dashboard: http://127.0.0.1:8787/status,

0,1
Comm: tcp://127.0.0.1:46506,Workers: 6
Dashboard: http://127.0.0.1:8787/status,Total threads: 6
Started: 3 minutes ago,Total memory: 48.00 GiB

0,1
Comm: tcp://127.0.0.1:40196,Total threads: 1
Dashboard: http://127.0.0.1:39078/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:37996,
Local directory: /tmp/dask-worker-space/worker-usl3n7ug,Local directory: /tmp/dask-worker-space/worker-usl3n7ug
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 132.91 MiB,Spilled bytes: 0 B
Read bytes: 89.36 kiB,Write bytes: 100.74 kiB

0,1
Comm: tcp://127.0.0.1:37973,Total threads: 1
Dashboard: http://127.0.0.1:42412/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:36890,
Local directory: /tmp/dask-worker-space/worker-g4arku4g,Local directory: /tmp/dask-worker-space/worker-g4arku4g
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 131.02 MiB,Spilled bytes: 0 B
Read bytes: 126.74 kiB,Write bytes: 148.72 kiB

0,1
Comm: tcp://127.0.0.1:38656,Total threads: 1
Dashboard: http://127.0.0.1:45120/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:34565,
Local directory: /tmp/dask-worker-space/worker-hot49vjx,Local directory: /tmp/dask-worker-space/worker-hot49vjx
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 132.19 MiB,Spilled bytes: 0 B
Read bytes: 115.10 kiB,Write bytes: 143.01 kiB

0,1
Comm: tcp://127.0.0.1:45384,Total threads: 1
Dashboard: http://127.0.0.1:41222/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:37018,
Local directory: /tmp/dask-worker-space/worker-8f6he80y,Local directory: /tmp/dask-worker-space/worker-8f6he80y
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 4.0%,Last seen: Just now
Memory usage: 132.16 MiB,Spilled bytes: 0 B
Read bytes: 115.48 kiB,Write bytes: 143.47 kiB

0,1
Comm: tcp://127.0.0.1:34904,Total threads: 1
Dashboard: http://127.0.0.1:37587/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:43946,
Local directory: /tmp/dask-worker-space/worker-n3ati3gi,Local directory: /tmp/dask-worker-space/worker-n3ati3gi
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 225.93 MiB,Spilled bytes: 0 B
Read bytes: 39.71 kiB,Write bytes: 34.11 kiB

0,1
Comm: tcp://127.0.0.1:45047,Total threads: 1
Dashboard: http://127.0.0.1:42456/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:43613,
Local directory: /tmp/dask-worker-space/worker-wki49_z_,Local directory: /tmp/dask-worker-space/worker-wki49_z_
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 132.90 MiB,Spilled bytes: 0 B
Read bytes: 118.73 kiB,Write bytes: 144.75 kiB


In [2]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

import os
import pickle
import cluster_utils as flt

### Load/create mask
Uncomment the first two lines of the cell below to generate the file data/mask.npy if required.

In [4]:
#data = flt.retrieve_profiles(timeRange = slice('1995-01', '1995-02'))
#np.save('data/mask', data['n'])
# Load the previously saved mask. allow_pickle=True lets numpy unpickle object arrays,
# which can execute arbitrary code, so only load a mask file you generated or trust.
mask = np.load('data/mask.npy', allow_pickle=True)

In [1]:
# Run configuration used by the fitting loop below.
ids = ['r1i1p1f2', 'r2i1p1f2', 'r3i1p1f2'] # Ensemble members to use
BICs = {} # Maps each ensemble member id to its array of BIC values (filled by the fitting loop)
model_folder = 'model' # Root output directory for the pickled PCA/GMM models and BIC values

### Fit 2-30 class models for each ensemble member
Saves each individual PCA model, GMM model and BIC curve to \[model_folder\]

Saves all BIC curves to \[model_folder\]/BICs2-30.obj

In [None]:
# For every ensemble member: load the profiles, draw a training subsample, fit a PCA,
# then fit one GMM per class count and record its BIC on the PCA-transformed training set.
# Per-class results are cached on disk, so an interrupted run can resume without refitting.
#
# NOTE(review): range(2, 30) fits 2..29 classes, although the output file is named
# "BICs2-30" — confirm whether 30 classes should be included (that would also need
# np.zeros(31)); left unchanged here so the saved array shape stays the same.
# NOTE(review): the PCA is refit (and pca.obj overwritten) on every run, while cached BIC
# values from an earlier run are reused — confirm this is acceptable for reproducibility.
for m_id in ids:
    path_id = '{}/{}'.format(model_folder, m_id)
    os.makedirs(path_id, exist_ok=True)
    print('Starting {}'.format(m_id))
    options = {'memberId' : m_id}

    # Load all 30 years of data
    data = flt.retrieve_profiles(timeRange = slice('1965-01', '1994-12'), mask=mask, options=options)

    # Create training dataset by taking random subsample of 7000 Southern Ocean profiles per month
    data_sampled = flt.random_sample(data, 7000).compute()
    data_sampled = flt.normalise_data(data_sampled, 'N') #Normalise

    pca = flt.train_pca(data_sampled, 3) #Fit PCA model
    data_trans = flt.pca_transform(data_sampled, pca).compute() # Transform training set to PCA space

    # Indexed directly by class count; entries 0 and 1 are never fitted and stay 0.
    bic = np.zeros(30)

    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open('{}/pca.obj'.format(path_id), 'wb') as file:
        pickle.dump(pca, file)

    print('Finished setup for {}'.format(m_id))

    for n_classes in range(2, 30):

        path_n = '{}/{}/{}'.format(model_folder, m_id, n_classes)
        bic_path = '{}/bic.obj'.format(path_n)
        os.makedirs(path_n, exist_ok=True)

        # Reuse a BIC value cached by a previous run instead of refitting the GMM.
        # These pickles are written by this notebook; do not point model_folder at untrusted files.
        if os.path.exists(bic_path):
            with open(bic_path, 'rb') as file:
                bic[n_classes] = pickle.load(file)
            continue

        gmm = flt.train_gmm(data_trans, n_classes)
        bic[n_classes] = gmm.bic(data_trans)

        with open('{}/gmm.obj'.format(path_n), 'wb') as file:
            pickle.dump(gmm, file)

        # Written last, so the presence of bic.obj marks this class count as complete.
        with open(bic_path, 'wb') as file:
            pickle.dump(bic[n_classes], file)

        print('Finished {} with {} classes'.format(m_id, n_classes))

    BICs[m_id] = bic

# Make sure the output folder exists even if no ensemble member was processed.
os.makedirs(model_folder, exist_ok=True)
with open('{}/BICs2-30.obj'.format(model_folder), 'wb') as file:
    pickle.dump(BICs, file)

print('Done!')

Starting r1i1p1f2
Finished setup for r1i1p1f2
Finished r1i1p1f2 with 2 classes
Finished r1i1p1f2 with 3 classes
Finished r1i1p1f2 with 4 classes
Finished r1i1p1f2 with 5 classes
Finished r1i1p1f2 with 6 classes
Finished r1i1p1f2 with 7 classes
Finished r1i1p1f2 with 8 classes
