# Fit 7 & 8 class GMMs and calculate average profiles for profile matching

This notebook will create the average profiles for 7 and 8 class GMMs fit to two ensembles from the UK-ESM historical simulations. These files are required to reproduce Figures 3 and 4 from *Heuristic Methods for Determining the Number of Classes in Unsupervised Classification of Climate Models*, E. Boland et al. 2022 (doi to follow). This requires cluster_utils.py and input datafiles via the googleapi CMIP6 store (see cluster_utils.py for more info)
Outputs stored in model/\[ensemble\]/\[nclasses\]/avg.obj

Please attribute any plots or code from this notebook using the DOI from Zenodo: to come

Updated Nov 2022
E Atkinson & E Boland [emmomp@bas.ac.uk](email:emmomp@bas.ac.uk)

In [4]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:44138")
client

0,1
Connection method: Direct,
Dashboard: http://127.0.0.1:8787/status,

0,1
Comm: tcp://127.0.0.1:44138,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 64.00 GiB

0,1
Comm: tcp://127.0.0.1:35337,Total threads: 1
Dashboard: http://127.0.0.1:40715/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:45456,
Local directory: /tmp/dask-worker-space/worker-gxvz9v95,Local directory: /tmp/dask-worker-space/worker-gxvz9v95
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 140.61 MiB,Spilled bytes: 0 B
Read bytes: 34.91 kiB,Write bytes: 47.99 kiB

0,1
Comm: tcp://127.0.0.1:36362,Total threads: 1
Dashboard: http://127.0.0.1:44742/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:45728,
Local directory: /tmp/dask-worker-space/worker-uiyri130,Local directory: /tmp/dask-worker-space/worker-uiyri130
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 144.09 MiB,Spilled bytes: 0 B
Read bytes: 34.08 kiB,Write bytes: 47.19 kiB

0,1
Comm: tcp://127.0.0.1:41186,Total threads: 1
Dashboard: http://127.0.0.1:45396/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:34703,
Local directory: /tmp/dask-worker-space/worker-isrkcweo,Local directory: /tmp/dask-worker-space/worker-isrkcweo
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 140.85 MiB,Spilled bytes: 0 B
Read bytes: 33.36 kiB,Write bytes: 46.43 kiB

0,1
Comm: tcp://127.0.0.1:38567,Total threads: 1
Dashboard: http://127.0.0.1:39165/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:42647,
Local directory: /tmp/dask-worker-space/worker-xwd9yagv,Local directory: /tmp/dask-worker-space/worker-xwd9yagv
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 142.83 MiB,Spilled bytes: 0 B
Read bytes: 32.88 kiB,Write bytes: 46.01 kiB

0,1
Comm: tcp://127.0.0.1:34078,Total threads: 1
Dashboard: http://127.0.0.1:43698/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:36304,
Local directory: /tmp/dask-worker-space/worker-5spl6avv,Local directory: /tmp/dask-worker-space/worker-5spl6avv
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 144.11 MiB,Spilled bytes: 0 B
Read bytes: 34.08 kiB,Write bytes: 47.18 kiB

0,1
Comm: tcp://127.0.0.1:45880,Total threads: 1
Dashboard: http://127.0.0.1:36230/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:35321,
Local directory: /tmp/dask-worker-space/worker-mj_ljvo_,Local directory: /tmp/dask-worker-space/worker-mj_ljvo_
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 144.11 MiB,Spilled bytes: 0 B
Read bytes: 36.76 kiB,Write bytes: 52.00 kiB

0,1
Comm: tcp://127.0.0.1:39302,Total threads: 1
Dashboard: http://127.0.0.1:46631/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:42776,
Local directory: /tmp/dask-worker-space/worker-u7asy9o8,Local directory: /tmp/dask-worker-space/worker-u7asy9o8
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 142.85 MiB,Spilled bytes: 0 B
Read bytes: 34.04 kiB,Write bytes: 47.13 kiB

0,1
Comm: tcp://127.0.0.1:44097,Total threads: 1
Dashboard: http://127.0.0.1:42925/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:41248,
Local directory: /tmp/dask-worker-space/worker-ma3qpucg,Local directory: /tmp/dask-worker-space/worker-ma3qpucg
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 143.89 MiB,Spilled bytes: 0 B
Read bytes: 34.17 kiB,Write bytes: 47.31 kiB


In [1]:
from dask_gateway import Gateway
gateway = Gateway()
from dask.distributed import Client

'''properly shutdown any previous clusters'''
clusters=gateway.list_clusters()
if clusters != []:
    print(f'found {len(clusters)} clusters')
    for cluster in clusters:
        cluster = gateway.connect(cluster.name)
        client=Client(cluster)
        client.close()
        cluster.shutdown()

In [1]:
client.close()

NameError: name 'client' is not defined

In [5]:
import numpy as np
import xarray as xr

import os
import pickle

import cluster_utils as flt

### User options
Leave as is to recreate the paper

In [6]:
# Number of classes 
classes = [7,8]
model='model_19651995'
#Time range
tslice=slice('1965-01', '1994-12') 
['r1i1p1f2', 'r2i1p1f2'] 
npca=3 #number of PCA components
ntrain=7000 #number of profiles per month to use in training dataset

Uncomment the following two lines if you need to generate mask.npy:

In [7]:
#data = flt.retrieve_profiles(timeRange = slice('1995-01', '1995-02'))
#np.save('data/mask', data['n'])
mask = np.load('data/mask.npy', allow_pickle=True)

### Train models and generate average profiles for chosen ensemble members and classes 

In [22]:
for m_id in ids:
    
    #Check if data already exists
    tests=[]
    for nn,n_classes in enumerate(classes):   
        path_data = '{}/{}/{}'.format(model,m_id, n_classes)
        tests.append(os.path.isfile('{}/avg_prof.obj'.format(path_data)))
    if np.all(tests):
        print('Found avg files for {}, skipping'.format(m_id))
        continue
    else:
        
        print('Starting {}'.format(m_id))
        options = {'memberId' : m_id}
        path_id = '{}/{}'.format(model,m_id)
        # Check if models already created
        tests=[]
        for nn,n_classes in enumerate(classes):   
            path_data = '{}/{}/{}'.format(model,m_id, n_classes)
            tests.append(os.path.isfile('{}/gmm.obj'.format(path_data)))
        if np.all(tests): # All models trained, no need to load training set       
            with open('{}/pca.obj'.format(path_id),'rb') as file:
                pca=pickle.load(file)         
        else:     # Load training set, generate PCA model
            print('No models found, generating training set')
            [data_train,pca] = flt.generate_trainingset(timeRange = tslice, mask=mask, options=options,n_components=npca,N=ntrain)
            if not os.path.exists(path_id):
                os.makedirs(path_id)
            with open('{}/pca.obj'.format(path_id), 'wb') as file:
                pickle.dump(pca, file)               
                        
        
        # Retrieve all Southern Ocean data
        options = {'memberId' : m_id}
        data = flt.retrieve_profiles(timeRange=tslice,mask=mask,options=options)
        data = data.chunk({'time': data.sizes['time'], 'n': 1024})
        # Normalise the samples
        data_norm = flt.normalise_data(data, ('n', 'time')) 
        # Transform to PCA space
        data_trans = flt.pca_transform(data_norm, pca)
        print('Finished setup for {}'.format(m_id))      

        for nn,n_classes in enumerate(classes):  
            path_n = '{}/{}/{}'.format(model,m_id, n_classes)
            #Check if model already created
            if os.path.isfile('{}/gmm.obj'.format(path_n)):
                with open('{}/gmm.obj'.format(path_n),'rb') as file:
                    gmm=pickle.load(file)                
            else:            
                print('Training {} class model'.format(n_classes))
                if not os.path.exists(path_n):
                    os.makedirs(path_data)            
                gmm = flt.train_gmm(data_train, n_classes)
                with open('{}/gmm.obj'.format(path_n), 'wb') as file:
                    pickle.dump(gmm, file)
 
            print('Classifying full dataset into {} classes'.format(n_classes))
            # Classify full dataset
            data_classes = flt.gmm_classify(data_trans, gmm)
            avg_class = data_classes.mean('time')
            print('Time Average Classification calculated, writing to file'.format(n_classes))
            with open('{}/avg_class.obj'.format(path_n), 'wb') as file:
                pickle.dump(avg_class.data, file)         
            # Calculate average profiles for each clasee
            avg_prof = flt.avg_profiles(data, data_classes, n_classes)
            print('Average profiles calculated, writing to file'.format(n_classes))
            with open('{}/avg_prof.obj'.format(path_data), 'wb') as file:
                pickle.dump(avg_prof, file)
            print('Done with {} classes'.format(n_classes))
    
print('Done!')

Starting r3i1p1f2
Finished setup for r3i1p1f2
Classifying full dataset into 7 classes
Average profiles calculated, writing to file
Done with 7 classes
Classifying full dataset into 8 classes
Average profiles calculated, writing to file
Done with 8 classes
Starting r4i1p1f2


FileNotFoundError: [Errno 2] No such file or directory: 'model/r4i1p1f2/pca.obj'