In [None]:
import sys
import os
import numpy as np
import xarray as xr
import pandas as pf
from pathlib import Path
from hdbscan import HDBSCAN

sys.path.append(os.path.expanduser('~/Documents/Weave'))
from Weave.inputoutput import Reader
from Weave.clustering import Clustering, Latlons

In [None]:
# Path to the t850_nhblock.7 correlation file used for tuning the clustering parameters.
# current params: ms 2000, mcs 4000, allow True, epsilon = 0.0
# (ms = min_samples, mcs = min_cluster_size, allow = allow_single_cluster, epsilon = cluster_selection_epsilon)
# NOTE(review): these values differ from the clusterkwargs defined in this notebook;
# presumably they record the settings in use before this tuning session - confirm.
corrpath = Path('/nobackup_1/users/straaten/correlation_cv_spearmanpar_varalpha_strict/t850_nhblock.7.corr.nc')

In [None]:
# Open the t850 correlation file as an xarray Dataset. decode_times=False asks xarray
# not to convert time-encoded variables into datetime objects.
ds = xr.open_dataset(corrpath, decode_times = False)

In [None]:
# Two t850 test fields taken from the 'correlation' variable:
# one where the patches should stay separate and one with a wave-like pattern.
fieldseparate = ds['correlation'].sel({'lag': -10, 'fold': 1})
fieldwave = ds['correlation'].sel({'lag': -7, 'fold': 0})

# HDBSCAN settings under test for t850.
clusterkwargs = {
    'min_cluster_size': 2000,  # original note: 400
    'min_samples': 1000,  # 1000 gives good noise reduction
    'allow_single_cluster': True,
    # Original note: 0.15. Distance in radians, i.e. in units of the earth radius
    # (a full circle of length 2*pi*r corresponds to 2*pi radians).
    'cluster_selection_epsilon': 0.17,
    'metric': 'haversine',
    'core_dist_n_jobs': 7,
}

In [None]:
# Plot the correlation field selected at lag = -10, fold = 1 (fieldseparate).
fieldseparate.plot()

In [None]:
# Plot the correlation field selected at lag = -7, fold = 0 (fieldwave).
fieldwave.plot()

In [None]:
# Run the same three-step pipeline on both t850 fields:
# reshape with a not-null mask, prepare lat/lon coordinates (to_radians=True,
# presumably because the haversine metric expects radians), then cluster with HDBSCAN.

# Field with separate patches.
cl = Clustering()
cl.reshape_and_drop_obs(
    array=fieldseparate.expand_dims('lag', axis=0),  # 'lag' becomes a leading length-1 dimension
    mask=fieldseparate.notnull(),  # True where a correlation value is present
    min_samples=700,
)
cl.prepare_for_distance_algorithm(manipulator=Latlons, kwargs=dict(to_radians=True))
clusters = cl.clustering(clusterclass=HDBSCAN, kwargs=clusterkwargs)

# Wave-like field.
clw = Clustering()
clw.reshape_and_drop_obs(
    array=fieldwave.expand_dims('lag', axis=0),
    mask=fieldwave.notnull(),
    min_samples=700,
)
clw.prepare_for_distance_algorithm(manipulator=Latlons, kwargs=dict(to_radians=True))
clustersw = clw.clustering(clusterclass=HDBSCAN, kwargs=clusterkwargs)

In [None]:
# Plot the clustering result for fieldseparate. squeeze() is applied first, presumably
# to drop the length-1 'lag' dimension added before clustering - confirm.
clusters.squeeze().plot()

In [None]:
# Plot the clustering result for fieldwave. squeeze() is applied first, presumably
# to drop the length-1 'lag' dimension added before clustering - confirm.
clustersw.squeeze().plot()

A min_samples (ms) of 1000 gives a good first noise reduction.
At a min_cluster_size (mcs) of 2000 the clustering for the sparse field seems OK (3 separate duos, not joined together). However, the wave field is still subdivided (nothing is joined together).
At 2500 (in combination with a reasonable epsilon) a blob from too far away sometimes joins.
At 3000 mcs parts of the wave disappear from the wave field, and it is still not joined together.
At 4000 and 5000 it is still not joined together.
At 6000 another part disappears.
An epsilon of 0.15 works for connecting the wave.
0.1 works only for three parts.
0.175 works to connect part of another wave.
Idea: ms 1000, mcs 2000, epsilon 0.17 (as the bare minimum to connect in these examples).

### Let's move on to SST

In [None]:
# Path to the sst_nhplus.7 correlation file used for tuning the clustering parameters.
# current params: ms 2000, mcs 4000, allow True, epsilon = 0.0
# (ms = min_samples, mcs = min_cluster_size, allow = allow_single_cluster, epsilon = cluster_selection_epsilon)
# NOTE(review): these values differ from the clusterkwargs defined for SST in this notebook;
# presumably they record the settings in use before this tuning session - confirm.
corrpath = Path('/nobackup_1/users/straaten/correlation_cv_spearmanpar_varalpha_strict/sst_nhplus.7.corr.nc')

In [None]:
# Rebind ds to the SST correlation dataset (no explicit close() of the previously opened
# dataset is visible in this notebook). decode_times=False asks xarray not to convert
# time-encoded variables into datetime objects.
ds = xr.open_dataset(corrpath, decode_times = False)

In [None]:
# Two SST test fields taken from the 'correlation' variable; judging by the names,
# one is noisier and one has a clearer signal.
fieldmorenoise = ds['correlation'].sel({'lag': -28, 'fold': 0})
fieldsignal = ds['correlation'].sel({'lag': -12, 'fold': 1})

# HDBSCAN settings under test for SST.
clusterkwargs = {
    'min_cluster_size': 1000,  # 300 worked well without epsilon
    'min_samples': 300,  # 250 gives good noise reduction without any epsilon, 300 with epsilon
    'allow_single_cluster': True,
    # Original note: 0.15. Distance in radians, i.e. in units of the earth radius
    # (a full circle of length 2*pi*r corresponds to 2*pi radians).
    'cluster_selection_epsilon': 0.22,
    'metric': 'haversine',
    'core_dist_n_jobs': 7,
}

In [None]:
# Plot the SST correlation field selected at lag = -12, fold = 1 (fieldsignal).
fieldsignal.plot()

In [None]:
# Plot the SST correlation field selected at lag = -28, fold = 0 (fieldmorenoise).
fieldmorenoise.plot()

In [None]:
# Run the same three-step pipeline on both SST fields:
# reshape with a not-null mask, prepare lat/lon coordinates (to_radians=True,
# presumably because the haversine metric expects radians), then cluster with HDBSCAN.

# Field with the clearer signal.
cl = Clustering()
cl.reshape_and_drop_obs(
    array=fieldsignal.expand_dims('lag', axis=0),  # 'lag' becomes a leading length-1 dimension
    mask=fieldsignal.notnull(),  # True where a correlation value is present
    min_samples=700,
)
cl.prepare_for_distance_algorithm(manipulator=Latlons, kwargs=dict(to_radians=True))
clusters = cl.clustering(clusterclass=HDBSCAN, kwargs=clusterkwargs)

# Noisier field.
cln = Clustering()
cln.reshape_and_drop_obs(
    array=fieldmorenoise.expand_dims('lag', axis=0),
    mask=fieldmorenoise.notnull(),
    min_samples=700,
)
cln.prepare_for_distance_algorithm(manipulator=Latlons, kwargs=dict(to_radians=True))
clustersn = cln.clustering(clusterclass=HDBSCAN, kwargs=clusterkwargs)

In [None]:
# Plot the clustering result for fieldsignal. squeeze() is applied first, presumably
# to drop the length-1 'lag' dimension added before clustering - confirm.
clusters.squeeze().plot()

In [None]:
# Plot the clustering result for fieldmorenoise. squeeze() is applied first, presumably
# to drop the length-1 'lag' dimension added before clustering - confirm.
clustersn.squeeze().plot()

In [None]:
# Plot fieldmorenoise again (same call as an earlier cell), presumably to compare the
# input field with its cluster map - confirm.
fieldmorenoise.plot()

In [None]:
# NOTE(review): identical to the preceding cell; presumably a leftover from re-running
# with different settings - consider removing.
fieldmorenoise.plot()

SST will get a lower min_samples (ms), simply because the spatial coherence of its anomalies is much lower.
Currently ms = 300.
Did not make mcs too high.
An epsilon of 0.2 seems to work out well (6 clusters). But when the merge happens, suddenly a bit more noisy stuff is included. Does ms help? -> Well, it removes some of the connecting elements, so some clusters that were joined by epsilon split off again. This can in turn be counteracted by increasing mcs a bit.
Working with very spotted fields: ms 400, mcs 500, epsilon 0.2.
This does not work so well for a less 'spotted' field,
therefore a slightly higher epsilon again: 0.22,
and ms = 300 and mcs = 1000.

### Onto a next challenging variable?

In [None]:
# Variables with a different resolution, e.g. snowcover or swvl.
# Perhaps a snowcover field with 2-3 big regions and one with only a single region.

In [None]:
# Path to the snowc_nhmin.21 correlation file used for tuning the clustering parameters.
# current params: ms 2000, mcs 4000, allow True, epsilon = 0.0
# (ms = min_samples, mcs = min_cluster_size, allow = allow_single_cluster, epsilon = cluster_selection_epsilon)
# NOTE(review): these values differ from the clusterkwargs defined for snow cover in this notebook;
# presumably they record the settings in use before this tuning session - confirm.
corrpath = Path('/nobackup_1/users/straaten/correlation_cv_spearmanpar_varalpha_strict/snowc_nhmin.21.corr.nc')

In [None]:
# Rebind ds to the snow cover correlation dataset (no explicit close() of the previously
# opened dataset is visible in this notebook). decode_times=False asks xarray not to
# convert time-encoded variables into datetime objects.
ds = xr.open_dataset(corrpath, decode_times = False)

In [None]:
# Two snow cover test fields taken from the 'correlation' variable; judging by the names,
# one is intended to show a single region and one several regions.
fieldsingle = ds['correlation'].sel({'lag': -21, 'fold': 0})  # alternative noted: lag = -52, fold = 3
fieldmore = ds['correlation'].sel({'lag': -52, 'fold': 0})

# HDBSCAN settings under test for snow cover.
clusterkwargs = {
    # 1000 already works okay-ish. I believe the single field is not truly single.
    # Only starts to make a difference above 2500.
    'min_cluster_size': 2500,
    # 1000 seems reasonable (for mcs 200 and 1000). Fields are pretty connected with lots of spots.
    'min_samples': 1000,
    'allow_single_cluster': True,
    # Distance in radians, i.e. in units of the earth radius
    # (a full circle of length 2*pi*r corresponds to 2*pi radians).
    'cluster_selection_epsilon': 0.1,
    'metric': 'haversine',
    'core_dist_n_jobs': 7,
}

In [None]:
# Plot the snow cover correlation field selected at lag = -21, fold = 0 (fieldsingle).
fieldsingle.plot()

In [None]:
# Plot the snow cover correlation field selected at lag = -52, fold = 0 (fieldmore).
fieldmore.plot()

In [None]:
# Run the same three-step pipeline on both snow cover fields:
# reshape with a not-null mask, prepare lat/lon coordinates (to_radians=True,
# presumably because the haversine metric expects radians), then cluster with HDBSCAN.

# Field intended to show a single region.
cl = Clustering()
cl.reshape_and_drop_obs(
    array=fieldsingle.expand_dims('lag', axis=0),  # 'lag' becomes a leading length-1 dimension
    mask=fieldsingle.notnull(),  # True where a correlation value is present
    min_samples=700,
)
cl.prepare_for_distance_algorithm(manipulator=Latlons, kwargs=dict(to_radians=True))
clusterss = cl.clustering(clusterclass=HDBSCAN, kwargs=clusterkwargs)

# Field with more regions.
clm = Clustering()
clm.reshape_and_drop_obs(
    array=fieldmore.expand_dims('lag', axis=0),
    mask=fieldmore.notnull(),
    min_samples=700,
)
clm.prepare_for_distance_algorithm(manipulator=Latlons, kwargs=dict(to_radians=True))
clustersm = clm.clustering(clusterclass=HDBSCAN, kwargs=clusterkwargs)

In [None]:
# Plot the clustering result for fieldsingle. squeeze() is applied first, presumably
# to drop the length-1 'lag' dimension added before clustering - confirm.
clusterss.squeeze().plot()

In [None]:
# NOTE(review): identical to the preceding cell; presumably a leftover from re-running
# with different settings - consider removing.
clusterss.squeeze().plot()

In [None]:
# Plot the clustering result for fieldmore. squeeze() is applied first, presumably
# to drop the length-1 'lag' dimension added before clustering - confirm.
clustersm.squeeze().plot()

In [None]:
# NOTE(review): identical to the preceding cell; presumably a leftover from re-running
# with different settings - consider removing.
clustersm.squeeze().plot()

So the epsilon here is slightly smaller, because this variable specifically also focuses on the polar part of the northern hemisphere, where we want to keep things apart (like Siberia from America), and because the fields are pretty dense. As expected, ms and mcs can be a bit larger.