# Calculate Average Profiles for 7 & 8 classes

This notebook will create the average profiles for 7 and 8 class GMMs fit to UK-ESM historical data. These files are required to reproduce Figures 3, 4 and YY from *Heuristic Methods for Determining the Number of Classes in Unsupervised Classification of Climate Models*, E. Boland et al. 2022 (doi to follow). This requires cluster_utils.py and input datafiles via the googleapi CMIP6 store (see cluster_utils.py for more info)

There are two options
- calculate from already trained models (uses data from model/)
- recreate from scratch (train models from scratch)

Outputs stored in model/\[ensemble\]/\[nclasses\]/avg.obj

Please attribute any plots or code from this notebook using the DOI from Zenodo: to come

Updated Nov 2022
E Atkinson & E Boland [emmomp@bas.ac.uk](email:emmomp@bas.ac.uk)

In [26]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:39989")
client

0,1
Connection method: Direct,
Dashboard: http://127.0.0.1:8787/status,

0,1
Comm: tcp://127.0.0.1:39989,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 1
Started: Just now,Total memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:39865,Total threads: 1
Dashboard: http://127.0.0.1:32785/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:37426,
Local directory: /tmp/dask-worker-space/worker-0b84jw_p,Local directory: /tmp/dask-worker-space/worker-0b84jw_p
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 140.50 MiB,Spilled bytes: 0 B
Read bytes: 15.98 kiB,Write bytes: 19.20 kiB


In [25]:
client.close()

In [27]:
import numpy as np
import xarray as xr

import os
import pickle

import cluster_utils as flt

### User options
Leave as is to recreate the paper

In [28]:
# Number of classes 
classes = [7,8]
#Time range
tslice=slice('1965-01', '1994-12') 
#['r1i1p1f2', 'r2i1p1f2'] You only need these two for Figs 3& 4
ids = ['r1i1p1f2', 'r2i1p1f2', 'r3i1p1f2', 'r4i1p1f2', 'r5i1p1f3', 'r6i1p1f3', 'r7i1p1f3', 'r8i1p1f2', 'r9i1p1f2', 'r10i1p1f2'] # You need all of these for figs YY
npca=3 #number of PCA components
ntrain=7000 #number of profiles per month to use in training dataset

Uncomment the following two lines if you need to generate mask.npy:

In [29]:
#data = flt.retrieve_profiles(timeRange = slice('1995-01', '1995-02'))
#np.save('data/mask', data['n'])
mask = np.load('data/mask.npy', allow_pickle=True)

### Option 1: Generate average profiles for chosen ensemble members and classes if models already trained 

In [22]:
for m_id in ids:
    
    #Check if data already exists
    tests=[]
    for nn,n_classes in enumerate(classes):   
        path_data = 'data/{}/{}'.format(m_id, n_classes)
        tests.append(os.path.isfile('{}/avg.obj'.format(path_data)))
    if np.all(tests):
        print('Found avg files for {}, skipping'.format(m_id))
        continue
    else:
    
        print('Starting {}'.format(m_id))
        path_id = 'model/{}'.format(m_id)
        # Load PCA for given model
        with open('{}/pca.obj'.format(path_id), 'rb') as file:
            pca = pickle.load(file)
        # Retrieve all Southern Ocean data
        options = {'memberId' : m_id}
        data = flt.retrieve_profiles(timeRange=tslice,mask=mask,options=options)
        data = data.chunk({'time': data.sizes['time'], 'n': 1024})
        # Normalise the samples
        data_norm = flt.normalise_data(data, ('n', 'time')) 
        # Transform to PCA space
        data_trans = flt.pca_transform(data_norm, pca)
        print('Finished setup for {}'.format(m_id))      

        for nn,n_classes in enumerate(classes):       
            print('Classifying full dataset into {} classes'.format(n_classes))
            path_n = 'model/{}/{}'.format(m_id, n_classes)
            path_data = 'data/{}/{}'.format(m_id, n_classes)
            if not os.path.exists(path_data):
                os.makedirs(path_data)
            # Open GMM model generated from training set
            with open('{}/gmm.obj'.format(path_n), 'rb') as file:
                gmm = pickle.load(file)
            # Classify full dataset
            data_classes = flt.gmm_classify(data_trans, gmm)
            # Calculate average profiles for each clasee
            avg_prof = flt.avg_profiles(data, data_classes, n_classes)
            print('Average profiles calculated, writing to file'.format(n_classes))

            with open('{}/avg.obj'.format(path_data), 'wb') as file:
                pickle.dump(avg_prof, file)
            print('Done with {} classes'.format(n_classes))
    
print('Done!')

Starting r3i1p1f2
Finished setup for r3i1p1f2
Classifying full dataset into 7 classes
Average profiles calculated, writing to file
Done with 7 classes
Classifying full dataset into 8 classes
Average profiles calculated, writing to file
Done with 8 classes
Starting r4i1p1f2


FileNotFoundError: [Errno 2] No such file or directory: 'model/r4i1p1f2/pca.obj'

### Option 2: Generate average profiles for chosen ensemble members and classes, training models from scratch

In [None]:
avg_profiles = {}
for m_id in ids:
    
    #Check if data already exists
    tests=[]
    for nn,n_classes in enumerate(classes):   
        path_data = 'data/{}/{}'.format(m_id, n_classes)
        tests.append(os.path.isfile('{}/avg.obj'.format(path_data)))
    if np.all(tests):
        print('Found avg files for {}, skipping'.format(m_id))
        continue
    else:
        
        print('Starting {}'.format(m_id))
        path_id = 'model/{}'.format(m_id)
        #Generate training set and PCA model
        [data_train,pca] = flt.generate_trainingset(timeRange = tslice, mask=mask, options=options,n_components=npca,N=ntrain)
        if not os.path.exists(path_id):
            os.makedirs(path_id)
        with open('{}/pca.obj'.format(path_id), 'wb') as file:
            pickle.dump(pca, file)
        #Load full Southern Ocean data to fit
        data = flt.retrieve_profiles(timeRange=tslice,mask=mask,options=options)
        data = data.chunk({'time': data.sizes['time'], 'n': 1024})
        # Normalise the samples
        data_norm = flt.normalise_data(data, ('n', 'time')) 
        # Transform to PCA space
        data_trans = flt.pca_transform(data_norm, pca)
        print('Finished setup for {}'.format(m_id))      

        for nn,n_classes in enumerate(classes):        
            print('Classifying full dataset into {} classes'.format(n_classes))
            path_n = 'model/{}/{}'.format(m_id, n_classes)
            if not os.path.exists(path_n):
                os.makedirs(path_n)            
            path_data = 'data/{}/{}'.format(m_id, n_classes)
            if not os.path.exists(path_data):
                os.makedirs(path_data)
                
            # Generate GMM model generated from training set
            gmm = flt.train_gmm(data_train, n_classes)
            with open('{}/gmm.obj'.format(path_n), 'wb') as file:
                pickle.dump(gmm, file)
            # Classify full dataset
            data_classes = flt.gmm_classify(data_trans, gmm)
            # Calculate average profiles for each clasee
            avg_prof = flt.avg_profiles(data, data_classes, n_classes)
            print('Average profiles calculated, writing to file'.format(n_classes))

            with open('{}/avg.obj'.format(path_data), 'wb') as file:
                pickle.dump(avg_prof, file)      
            print('Done with {} classes'.format(n_classes))      
    
print('Done!')

Found avg files for r1i1p1f2, skipping
Found avg files for r2i1p1f2, skipping
Found avg files for r3i1p1f2, skipping
Starting r4i1p1f2
Finished setup for r4i1p1f2
Classifying full dataset into 7 classes
