This notebook randomly selects a given number of configurations so that they uniformly span the available values of momentum.

In [None]:
import os
import sys
import json
import numpy as np
from numpy.random import RandomState, SeedSequence, MT19937
import pandas as pd
from statsmodels.nonparametric.bandwidths import bw_scott, bw_silverman

if '../../inertia-terna' not in sys.path:
    sys.path = ['../../inertia-terna'] + sys.path
from utils import cluster_data,plot_clustered_data

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import seaborn as sns

# Global matplotlib appearance: 9-pt Times font, thin axes and tick marks.
fontsize = 9
lw = 0.75
matplotlib.rc('font', family='Times', size=fontsize)
matplotlib.rc('axes', linewidth=0.75, labelsize=fontsize)
# Tick labels and major ticks share the same settings on both axes.
for tick_group in ('xtick', 'ytick'):
    matplotlib.rc(tick_group, labelsize=fontsize)
    matplotlib.rc(f'{tick_group}.major', width=lw, size=3)
# Minor ticks are only configured for the y axis.
matplotlib.rc('ytick.minor', width=lw, size=1.5)

In [None]:
# Load the H, E and M arrays from the archive.
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only use it with a trusted '../HEM.npz'.
data = np.load('../HEM.npz', allow_pickle=True)
# Keep only the entries with a strictly positive H value.
idx = data['H'] > 0
H, E, M = (data[name][idx] for name in ('H', 'E', 'M'))
# 1-based position of every retained entry in the original arrays.
dirs = (np.arange(idx.size) + 1)[idx]

In [None]:
# Root folder that is joined with the zero-padded configuration numbers when
# building the `data_dirs` lists saved to JSON in the cells below.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
base_data_folder = '/dati2tb/daniele/Research/ai-pf/data/Sardinia/SM_configs_from_data'

Select which measure to use:

In [None]:
measure = 'momentum'
# Each supported measure maps to (data array, units string used in axis labels).
measure_table = {
    'inertia': (H, 's'),
    'energy': (E, r'GW$\cdot$s'),
    'momentum': (M, r'GW$\cdot$s$^2$'),
}
if measure not in measure_table:
    raise Exception(f"Unknown measure '{measure}'")
Y, units = measure_table[measure]

Cluster the data using Kernel Density Estimation:

In [None]:
# Bandwidth from Silverman's rule; one third of it is handed to the
# project's `cluster_data` helper.
bw = bw_silverman(Y)
y, scores, breaks, edges, N, IDX = cluster_data(Y, bandwidth=bw / 3)
# presumably N holds one entry per cluster and IDX the member indices — confirm in utils
n_clusters = N.size
print(f'Subdivided data into {n_clusters} clusters.')

How many files per cluster to pick:

In [None]:
# Lower bound on the number of files requested from any cluster.
MIN_N_FILES_PER_BIN = 3
# One file for every 15 units of N (rounded up), never fewer than the minimum.
N_files_per_bin = np.maximum(np.ceil(N / 15).astype(int), MIN_N_FILES_PER_BIN)
total_N_files = N_files_per_bin.sum()
print(f'Total number of files: {total_N_files}')

Pick `N_files_per_bin` files from each cluster in a uniform way:

In [None]:
# From every cluster, take evenly strided members after sorting them by the
# selected measure, keeping at most the requested number of files.
JDX_UNIFORM = []
for cluster_idx, n_files in zip(IDX, N_files_per_bin):
    order = np.argsort(Y[cluster_idx])
    # A cluster with fewer members than requested would give a stride of 0,
    # which makes the slice raise ValueError; fall back to a stride of 1 so
    # that all the available members are taken instead.
    step = max(1, order.size // n_files)
    JDX_UNIFORM.append(cluster_idx[order[::step]][:n_files])
# Flatten the per-cluster picks into one sorted index array.
JDX_UNIFORM = np.sort(np.concatenate(JDX_UNIFORM))

In [None]:
# Build the folder path of every uniformly picked configuration and store the
# list in a JSON file.
data_dirs = [os.path.join(base_data_folder, f"{dirs[j]:03d}") for j in JDX_UNIFORM]
outfile = os.path.join('..','config','Sardinia','data_dirs_uniform.json')
# The context manager closes (and therefore flushes) the file deterministically.
with open(outfile, 'w') as fid:
    json.dump({'data_dirs': data_dirs}, fid, indent=4)
print(f'Saved variable names to file {outfile}.')

In [None]:
# Space-separated, double-quoted, zero-padded numbers of the uniformly picked configurations.
'"{}"'.format('" "'.join(f'{d:03d}' for d in dirs[JDX_UNIFORM]))

Pick `N_files_per_bin` files from each cluster randomly:

In [None]:
# Fixed seed for reproducibility; set it to None to draw a fresh one.
seed = 73409
if seed is None:
    # os.urandom is the portable way to read OS entropy (no dependency on the
    # '/dev/urandom' device file existing).
    seed = int.from_bytes(os.urandom(4), 'little') % 100000
rs = RandomState(MT19937(SeedSequence(seed)))
print(f'Seed = {seed}.')

# From every cluster, keep the first entries of a random permutation of its
# members, as many as requested for that cluster.
JDX_RANDOM = []
for cluster_idx, n_files in zip(IDX, N_files_per_bin):
    shuffled = cluster_idx[rs.permutation(cluster_idx.size)]
    JDX_RANDOM.append(shuffled[:n_files])
# Flatten the per-cluster picks into one sorted index array.
JDX_RANDOM = np.sort(np.concatenate(JDX_RANDOM))

In [None]:
# Build the folder path of every randomly picked configuration and store the
# list in a JSON file whose name carries the seed.
data_dirs = [os.path.join(base_data_folder, f"{dirs[j]:03d}") for j in JDX_RANDOM]
outfile = os.path.join('..','config','Sardinia',f'data_dirs_{seed}.json')
# The context manager closes (and therefore flushes) the file deterministically.
with open(outfile, 'w') as fid:
    json.dump({'data_dirs': data_dirs}, fid, indent=4)
print(f'Saved variable names to file {outfile}.')

In [None]:
# Space-separated, double-quoted, zero-padded numbers of the randomly picked configurations.
'"{}"'.format('" "'.join(f'{d:03d}' for d in dirs[JDX_RANDOM]))

Pick `N_low_high` files from each of the two clusters with the lowest and highest measure values:

In [None]:
# Number of files taken from each of the first and last clusters.
N_low_high = MIN_N_FILES_PER_BIN
JDX_LOW_HIGH = []
for cluster_idx in (IDX[0], IDX[-1]):
    order = np.argsort(Y[cluster_idx])
    # A cluster with fewer than N_low_high members would give a stride of 0,
    # which makes the slice raise ValueError; fall back to a stride of 1.
    step = max(1, order.size // N_low_high)
    JDX_LOW_HIGH.append(cluster_idx[order[::step]][:N_low_high])
JDX_LOW_HIGH = np.sort(np.concatenate(JDX_LOW_HIGH))
data_dirs = [os.path.join(base_data_folder, f"{dirs[j]:03d}") for j in JDX_LOW_HIGH]
outfile = os.path.join('..','config','Sardinia','data_dirs_low_high.json')
# The context manager closes (and therefore flushes) the file deterministically.
with open(outfile, 'w') as fid:
    json.dump({'data_dirs': data_dirs}, fid, indent=4)
print(f'Saved variable names to file {outfile}.')

#### Summary plot

In [None]:
# Vertical extent of the cluster-edge lines: the number of configurations
# rounded up to the next multiple of 50.
ymin = 0
ymax = (dirs.size // 50 + 1) * 50
fig, ax = plt.subplots(1, 1, figsize=(3.5, 2))
# Cluster edges as light dotted lines, all configurations as gray dots.
ax.vlines(edges, ymin, ymax, color=np.full(3, .8), ls=':', lw=0.5)
ax.plot(Y, dirs, 'o', color=np.full(3, .7), markersize=2)
# Highlight the low/high picks (large red squares) and the uniform picks
# (small black squares).
highlights = (
    (JDX_LOW_HIGH, 'tab:red', 6),
    (JDX_UNIFORM, 'k', 3),
)
for picked, colour, msize in highlights:
    ax.plot(Y[picked], dirs[picked], 's', color=colour, markerfacecolor='w',
            markeredgewidth=1.25, markersize=msize)
ax.set_xlabel(f'{measure.capitalize()} [{units}]')
ax.set_ylabel('Configuration #')
# Fixed x ticks from 0.25 in steps of 0.05, with a small margin on both sides.
ticks = np.arange(0.25, 0.61, 0.05)
ax.set_xlim(ticks[0] - 0.01, ticks[-1] + 0.01)
ax.set_xticks(ticks)
sns.despine()
fig.tight_layout()
fig.savefig('M_configurations.pdf')

In [None]:
# Attach to every sample the values stored in the archive for its
# configuration ID.
df = pd.read_parquet(os.path.join('..', 'configuration_IDs.parquet'))
N_samples = df['Configuration_ID'].size
N_IDs = data['H'].size
jdx = df.columns.get_loc('Configuration_ID')  # not used by the cells visible here
# Read every array from the archive exactly once: indexing the object returned
# by np.load for an .npz file loads the array again on each access, so doing
# it inside the loops below would re-read the file N_IDs times per key.
arrays = {key: data[key] for key in data.files}
# New columns start at zero; samples whose ID is outside 0..N_IDs-1 keep that value.
for key in arrays:
    df[key] = np.zeros(N_samples)
config_ids = df['Configuration_ID']
for i in range(N_IDs):
    # One row mask per ID, shared by all the keys.
    idx = config_ids == i
    for key, values in arrays.items():
        df.loc[idx, key] = values[i]
# H == -1 is treated as "no data": blank out the three measures for those rows.
idx = df.loc[:,'H'] == -1
df.loc[idx,['H','E','M']] = np.nan

In [None]:
# Momentum (left axis, black) and inertia (right axis, red) against the
# dataframe index.
fig, ax = plt.subplots(1, 1, figsize=(3.5, 2.25))
twin_ax = ax.twinx()
red = 'tab:red'
df.plot(y='M', color='k', lw=0.75, ax=ax, legend=False)
df.plot(y='H', color=red, lw=0.75, ax=twin_ax, legend=False, alpha=0.75)

# For each axis: (tick range, tick spacing). The visible range is the tick
# range padded by 10% of its width on both sides.
axis_settings = (
    (ax, [0.28, 0.58], 0.1),
    (twin_ax, [4, 5], 0.25),
)
for axis, limits, dy in axis_settings:
    ylim = np.array(limits)
    offset = np.diff(ylim) * 0.1
    axis.set_ylim(ylim + offset * np.array([-1, 1]))
    # Stop half a step past the upper limit so that it gets a tick too.
    axis.set_yticks(np.arange(ylim[0], ylim[1] + dy / 2, dy))

# ax.grid(which='major', axis='y', ls=':', lw=0.5, color=[.6,.6,.6])
ax.set_xlabel('Date')
ax.set_ylabel(r'Momentum [GW$\cdot$s$^2$]')
twin_ax.set_ylabel(r'Inertia [s]', color=red)
twin_ax.tick_params(axis='y', labelcolor=red)
fig.tight_layout()
fig.savefig('MH.pdf')