Copyright 2024 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
#@title License
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# imports and setup

In [None]:
%reload_ext autoreload

import pandas as pd
import numpy as np
import numbers
import xarray as xr
from sklearn import metrics as skl_metrics
import scipy as sp
from scipy import stats
import tensorflow as tf
import tensorflow_probability as tfp
import matplotlib.colors as mpl_colors


#--- for printing formatted text
from IPython.display import display, Markdown
def printmd(string):
    """Render `string` as Markdown in the notebook's output area."""
    rendered = Markdown(string)
    display(rendered)

import os
import datetime
from absl import flags
import time
from matplotlib import pyplot as plt
import matplotlib as mpl
from matplotlib.ticker import FixedLocator
import logging

import gin
gin.enter_interactive_mode()

np.set_printoptions(precision=4, threshold=2500)

from eq_mag_prediction.scripts import calculate_benchmark_gr_properties
from eq_mag_prediction.scripts import magnitude_predictor_trainer   # import unused for gin config
from eq_mag_prediction.forecasting import metrics, training_examples
from eq_mag_prediction.forecasting import encoders
from eq_mag_prediction.forecasting import one_region_model
from eq_mag_prediction.utilities import geometry
from eq_mag_prediction.utilities import statistics_utils as statistics
from eq_mag_prediction.utilities import catalog_analysis
from eq_mag_prediction.utilities import simulate_catalog
from eq_mag_prediction.utilities import data_utils

In [None]:
# Plot styling: discrete color palette shared by the figures in this notebook.

# Hex colors, one entry per plotted model/benchmark (indexed by position).
listed_colors_discrete = [
    '#e41a1c',
    '#377eb8',
    '#4daf4a',
    '#984ea3',
    '#ff7f00',
    '#ffff33',
    '#f0027f',
]

# The same palette wrapped as a matplotlib ListedColormap.
color_list_cmap = mpl_colors.ListedColormap(listed_colors_discrete)

For the notebook to run, it needs the working directory to be the notebooks directory in the repo.

In [None]:
# Relative paths used throughout the notebook ('../results/...') only resolve
# when the kernel is started from the repo's "notebooks" directory.
assert os.path.basename(os.getcwd()) == 'notebooks', (
    'This notebook must be run with the working directory set to the '
    f'"notebooks" directory of the repo (current directory: {os.getcwd()}).'
)

# Catalogs and configurations

Models need two things to work: data (a catalog) and hyperparameters.

### Catalogs
Catalogs need to be in a standard format; converting a raw catalog to that
format is called the ingestion process. Scripts that ingest the catalogs we
worked on and convert them to the standard format are in `./ingestion`. The
ingested catalogs are in `./results/catalogs/ingested`.

For technical reasons, we cannot include the catalogs in this repo. You can
download the catalogs from links provided in the script for each catalog. For
demonstration, we use a mock catalog containing random data, which we call
`mock` catalog.

### Configuration
We use [gin-config](https://github.com/google/gin-config) to define the
hyperparameters. Examples of `gin` files are available in
`results/trained_models/[CATALOG_NAME]`.

The following code:
1. Generates a mock catalog
2. Ingests it to a standard format
3. Generates a gin configuration file for training.

In [None]:
# Name of the catalog/model to work with. It selects the sub-directory of
# results/trained_models that the model, config and scalers are read from.
MODEL_NAME = 'mock'
# MODEL_NAME = 'Hauksson'
# MODEL_NAME = 'JMA'

# The following code generates and ingests a mock catalog.
# Only runs for the mock model; the name comparison is case-insensitive.
if MODEL_NAME.lower() == 'mock':
  simulate_catalog.mock_catalog_and_config_ingestion()

#### Compute features from the catalog, and cache them for training

In [None]:
!python3 ../eq_mag_prediction/scripts/magnitude_prediction_compute_features.py --gin_path='../results/trained_models/mock/config.gin'

#### Train a model

In [None]:
!python3 ../eq_mag_prediction/scripts/magnitude_predictor_trainer.py --gin_config='../results/trained_models/mock/config.gin' --output_dir='../results/trained_models/mock/' --gin_bindings='train_and_evaluate_magnitude_prediction_model.epochs=3'

### Load model

In [None]:
# Directory holding the trained artifacts (model, config.gin, scalers) of the
# selected MODEL_NAME.
experiment_dir = os.path.join(os.getcwd(), '..', 'results/trained_models/', MODEL_NAME)

# Custom callables that the saved model refers to by name; Keras needs them
# supplied explicitly in order to deserialize the model.
custom_objects = {
    '_repeat': encoders._repeat,
}

# compile=False: no optimizer/loss state is restored when loading.
loaded_model = tf.keras.models.load_model(
    os.path.join(experiment_dir, 'model'),
    custom_objects=custom_objects,
    compile=False,
)

In [None]:
# set gin configs
# Read the experiment's config.gin and apply its bindings in this kernel.
# gin.unlock_config() is entered so that parsing is possible even if the config
# was already locked. NOTE(review): skip_unknown=False presumably makes unknown
# configurables an error rather than skipping them -- confirm against gin docs.
with open(os.path.join(experiment_dir, 'config.gin')) as f:
    with gin.unlock_config():
        gin.parse_config(f.read(), skip_unknown=False)

In [None]:
# Catalog domain, labels and encoders; constructed without explicit arguments
# other than `domain` (presumably configured through the gin config parsed
# earlier -- confirm).
domain = training_examples.CatalogDomain()
labels = training_examples.magnitude_prediction_labels(domain)
all_encoders = one_region_model.build_encoders(domain)

# Directory holding the feature scalers of the selected model.
scaler_saving_dir = os.path.join(os.getcwd(), '..', 'results/trained_models', MODEL_NAME, 'scalers')

In [None]:
# Compute the features/scalers for every encoder, or reuse cached ones
# (force_recalculate is False).
one_region_model.compute_and_cache_features_scaler_encoder(
    domain,
    all_encoders,
    force_recalculate = False,
)
# Load the features together with the constructed models, using the scalers
# stored under scaler_saving_dir.
features_and_models = one_region_model.load_features_and_construct_models(
    domain, all_encoders, scaler_saving_dir
)
# Second argument selects the data split: 0 -> train, 1 -> validation, 2 -> test
# (as reflected by the variable names below).
train_features = one_region_model.features_in_order(features_and_models, 0)
validation_features = one_region_model.features_in_order(features_and_models, 1)
test_features = one_region_model.features_in_order(features_and_models, 2)

In [None]:
# Explicit split-name -> features mapping; avoids looking variables up by name
# through locals(), which is fragile and hides the data dependency.
features_per_set = {
    'train': train_features,
    'validation': validation_features,
    'test': test_features,
}

# Model output (parameters of the predicted distribution) for every split.
forecasts = {
    set_name: loaded_model.predict(set_features)
    for set_name, set_features in features_per_set.items()
}

# Analysis and plotting

## Set relevant probability density and other definitions

In [None]:
# set the relevant probability density function
# Called with the model's forecasts to build the predicted distribution whose
# .prob() is evaluated further down the notebook.
probability_density_function = metrics.kumaraswamy_mixture_instance

In [None]:
# Beta estimated from the train-set labels with the 'BPOS' estimator option
# (presumably the Gutenberg-Richter beta -- confirm in catalog_analysis).
BETA_OF_TRAIN_SET = catalog_analysis.estimate_beta(labels.train_labels, None, 'BPOS')
# Magnitude threshold of the domain; also used as the shift of the label rescaling.
MAG_THRESH = domain.magnitude_threshold
# Number of seconds in one day.
DAY_TO_SECONDS = 60*60*24

In [None]:
# Stretch of the pdf support. Taken from the gin config when it is bound there,
# otherwise the default below is used and a visible warning is rendered.
default_stretch = 7
try:
    support_stretch = gin.query_parameter('train_and_evaluate_magnitude_prediction_model.pdf_support_stretch')
except ValueError:
    # gin.query_parameter raises ValueError for an unbound/unknown parameter.
    # Catching only that keeps unrelated failures (and KeyboardInterrupt) visible.
    message = f"<span style='color:red; font-size:25px'>pdf_support_stretch not defined in gin, setting to default: {default_stretch}</span>"
    display(Markdown(message))
    support_stretch = default_stretch



In [None]:
# Create a shift function for labels

# Parameters of the affine map taking magnitudes to the pdf's support.
random_var_shift = MAG_THRESH
random_var_stretch = support_stretch


def costum_shift_stretch(x, random_var_shift=random_var_shift, random_var_stretch=random_var_stretch):
  """Shift `x` by `random_var_shift`, scale by `random_var_stretch`, cap at 1."""
  rescaled = (x - random_var_shift) / random_var_stretch
  return np.minimum(rescaled, 1)


# Alias under which the rescaling is used by the likelihood computation.
shift_strech_input = costum_shift_stretch


In [None]:
# Per-split timestamps of the domain, keyed by 'train' / 'validation' / 'test'.
timestamps_dict = calculate_benchmark_gr_properties.create_timestamps_dict(domain)
test_timestamps = timestamps_dict['test']
validation_timestamps = timestamps_dict['validation']
train_timestamps = timestamps_dict['train']
# All timestamps in split order: train, then validation, then test.
all_timestamps = np.concatenate([train_timestamps, validation_timestamps, test_timestamps])

# Coordinates of the domain, passed to the benchmark computation below
# (presumably keyed per split like timestamps_dict -- confirm).
coordinates_dict = calculate_benchmark_gr_properties.create_coordinates_dict(domain)

## Functions for computing likelihoods and baselines

In [None]:
def likelihood_probability_func(
      labels,
      forecasts,
      shift = random_var_shift,
      stretch = random_var_stretch,
      ):
  """Evaluates the predicted probability density at the true labels.

  Args:
    labels: True magnitudes; flattened to 1D before evaluation.
    forecasts: Model output used to parameterize `probability_density_function`.
      Must expose a `dtype` attribute (labels are cast to it).
    shift: Value subtracted from the labels before evaluating the density.
    stretch: Value the shifted labels are divided by. The density is divided
      by it as well, which expresses the result per unit of magnitude rather
      than per unit of the rescaled variable.

  Returns:
    The density of every label under its corresponding forecast.
  """
  # Create a tfp.distributions.Distribution instance:
  random_variable = probability_density_function(
      tf.convert_to_tensor(forecasts))
  labels_tensor = tf.reshape(tf.convert_to_tensor(labels, dtype=forecasts.dtype), (-1,))
  # Forward shift/stretch explicitly; otherwise the rescaling silently uses the
  # defaults captured by shift_strech_input and ignores the arguments given here.
  rescaled_labels = shift_strech_input(labels_tensor, shift, stretch)
  likelihood = random_variable.prob(rescaled_labels)/stretch
  return likelihood

def split_name_to_model_and_set(name):
  """Splits a result key into its model name and its data-set name.

  The key is split at its last underscore, e.g.
  'model_mock_likelihood_test' -> ('model_mock_likelihood', 'test').

  Args:
    name: Key of the form '<model name>_<set name>'.

  Returns:
    Tuple (model_name, set_name). When `name` contains no underscore the whole
    string is returned as the model name with an empty set name.
  """
  current_model, separator, set_name = name.rpartition('_')
  if not separator:
    # No underscore at all: there is no set suffix to strip.
    return (name, '')
  return (current_model, set_name)

def sort_strings_w_constraint(list_of_strings, start_with_constraint):
  """Sorts strings into prefix groups, then alphabetically inside each group.

  Groups appear in the order of `start_with_constraint`. A string belongs to the
  first prefix it matches, so overlapping prefixes (e.g. 't' and 'train') do not
  emit the same string more than once. Strings matching no prefix are appended
  at the end, de-duplicated and sorted alphabetically.

  Args:
    list_of_strings: Strings to sort.
    start_with_constraint: Prefixes defining the groups, in the desired order.

  Returns:
    A new list; the input is not modified.
  """
  sorted_list = []
  already_placed = set()
  for prefix in start_with_constraint:
    group = sorted(
        s for s in list_of_strings
        if s.startswith(prefix) and s not in already_placed
    )
    sorted_list.extend(group)
    already_placed.update(group)
  sorted_list.extend(sorted(set(list_of_strings) - already_placed))
  return sorted_list

# Compute model's results and baselines

The following command computes benchmark magnitude predictors and caches the results.<br>
In order to perform the calculation, the script will require the *```domain```*,
an instance of ```training_examples.CatalogDomain```. This is defined by the flag:<br>
```--domain_path='path/to/relevant/domain'```


If benchmarks are already cached, they will not be recomputed unless specified explicitly by the flag <br>
```--force_recalculate=True```.

If some benchmarks are to be excluded, specify in a dictionary format, for example:<br>
```--compute_benchmark='n_past_events_kde=False'```

In [None]:
!python3 ../eq_mag_prediction/scripts/calculate_benchmark_gr_properties.py --domain_path='../results/trained_models/mock/domain' --compute_benchmark='n_past_events_kde=False, spatial_gr=False' --force_recalculate=True

### Collect $\beta$ and $m_c$ for GR variation benchmarks

The following cell performs the benchmark calculation, as the command above does, but with an in-notebook workaround.
The function ```calculate_benchmark_gr_properties.compute_and_assign_benchmarks_all_sets``` will either calculate the benchmarks or load them if already calculated and cached.

In [None]:
# Set cache path
GR_PROPERTIES_CACHE = os.path.join(
    os.getcwd(), '..', 'results/cached_benchmarks'
)

# Emulate the command-line flags of the benchmark script inside the notebook.
# absl treats the first element of argv as the program name and parses only
# the remaining elements, so a placeholder name must come first -- otherwise
# the first flag is silently dropped.
custom_args = [
    'calculate_benchmark_gr_properties',
    f"--{calculate_benchmark_gr_properties._CACHE_DIR.name}={GR_PROPERTIES_CACHE}",
    f"--{calculate_benchmark_gr_properties._FORCE_RECALCULATE.name}=False",
]
FLAGS = flags.FLAGS
FLAGS(custom_args)


# show logging info while running
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Load the benchmarks
# LOAD_KDE toggles the 'n_past_events_kde' benchmark; 'spatial_gr' is disabled.
LOAD_KDE = False
gr_models_beta, gr_models_mc = calculate_benchmark_gr_properties.compute_and_assign_benchmarks_all_sets(
    domain,
    timestamps_dict,
    coordinates_dict,
    BETA_OF_TRAIN_SET,
    MAG_THRESH,
    compute_benchmark={'n_past_events_kde':LOAD_KDE, 'spatial_gr':False},
)

In [None]:
# Rename keys starting with 'gr_spatial' so that they use 'spatial_gr' instead,
# in both the beta and the mc dictionaries.
legacy_keys = [key for key in gr_models_beta if key.startswith('gr_spatial')]
for legacy_key in legacy_keys:
  renamed_key = legacy_key.replace('gr_spatial', 'spatial_gr')
  gr_models_beta[renamed_key] = gr_models_beta.pop(legacy_key)
  gr_models_mc[renamed_key] = gr_models_mc.pop(legacy_key)

## Compute likelihoods of models and baselines

### Benchmarks

In [None]:
# Likelihood of the true labels under every benchmark. The data split of a
# benchmark is the last '_'-separated token of its key.
gr_likelihoods_and_baselines = {}
for benchmark_name, benchmark_beta in gr_models_beta.items():
  set_name = benchmark_name.split('_')[-1]
  set_labels = getattr(labels, f'{set_name}_labels')
  if 'events_kde' in benchmark_name:
    # KDE benchmarks hold one callable per label; evaluate each on its label.
    per_label_values = [kde(l) for kde, l in zip(benchmark_beta, set_labels)]
    benchmark_likelihood = np.array(per_label_values).ravel()
  else:
    benchmark_likelihood = metrics.gr_likelihood(
        set_labels,
        benchmark_beta,
        gr_models_mc[benchmark_name],
    )
  gr_likelihoods_and_baselines[benchmark_name] = benchmark_likelihood


### Model's scores

In [None]:
# Likelihood of the true labels under the trained model, one entry per split,
# followed by the benchmark likelihoods.
likelihoods_and_baselines = {
    f'model_{MODEL_NAME}_likelihood_{set_name}': np.array(
        likelihood_probability_func(
            getattr(labels, f'{set_name}_labels'),
            forecasts[set_name],
            MAG_THRESH,
        )
    )
    for set_name in ('train', 'validation', 'test')
}

likelihoods_and_baselines.update(gr_likelihoods_and_baselines)

## Display results

In [None]:
# Select benchmarks to display

# Every trained model that has a test-set likelihood ...
MODELS_TO_PLOT = [
    split_name_to_model_and_set(k)[0]
    for k in likelihoods_and_baselines
    if k.startswith('model_') and k.endswith('_test')
]
# ... plus the benchmarks selected below (uncomment entries to add them).
MODELS_TO_PLOT += [
    'train_gr_likelihood',
    'test_gr_likelihood',
    # 'gr_last_10_days_constant_mc_likelihood',
    'gr_last_100_days_constant_mc_likelihood',
    # 'gr_last_1000_days_constant_mc_likelihood',
    # 'gr_last_10_days_fitted_mc_likelihood',
    # 'gr_last_100_days_fitted_mc_likelihood',
    # 'gr_last_1000_days_fitted_mc_likelihood',
    'n300_past_events_constant_mc',
    # 'n300_present_events_constant_mc',
    # 'n300_past_events_fitted_mc',
    # 'n300_present_events_fitted_mc',
    # 'spatial_gr_on_all_likelihood',
    # 'spatial_gr_on_train_likelihood',
    # 'gr_spatial_on_train_likelihood',
    # 'spatial_gr_on_test_likelihood',
    # 'n300_past_events_kde_constant_mc'
]
MODELS_TO_PLOT = sort_strings_w_constraint(MODELS_TO_PLOT, ['model', 'train', 'test', 'gr', 'n', 'spatial'])

# One color per model. The palette is cycled so that selecting more models than
# there are palette entries does not raise an IndexError.
COLOR_PER_MODEL = {
    m: listed_colors_discrete[i % len(listed_colors_discrete)]
    for i, m in enumerate(MODELS_TO_PLOT)
}

#### display helper-functions

In [None]:
def create_scores_summary_df(
    likelihoods_and_baselines_dictionary,
    per_set_boolean_filter=None,
    exclude_zeros=False,
    drop_nans=True
    ):
  """Summarizes likelihoods as minus mean log-likelihood per model and split.

  Args:
    likelihoods_and_baselines_dictionary: Maps '<model name>_<set name>' to an
      array of per-example likelihoods. Arrays are flattened before use.
    per_set_boolean_filter: Optional mapping from set name to a boolean mask
      selecting the examples to include.
    exclude_zeros: If True, examples with a likelihood of exactly 0 are left
      out (otherwise they yield an infinite score).
    drop_nans: If True, examples with a NaN likelihood are left out.

  Returns:
    A numeric DataFrame indexed by model name with the columns 'train',
    'validation' and 'test', holding -mean(log(likelihood)).
  """
  model_names = {
      split_name_to_model_and_set(k)[0]
      for k in likelihoods_and_baselines_dictionary
  }

  summary_df = pd.DataFrame(
      index=sort_strings_w_constraint(
          list(model_names),
          ['model_', 'train', 'test', 'gr_', 'n_'],
          ),
      columns=['train', 'validation', 'test'],
      )

  for k, raw_likelihood in likelihoods_and_baselines_dictionary.items():
    current_model, set_name = split_name_to_model_and_set(k)
    # Flatten once so the data and every mask share one 1D shape; mixing a
    # flattened mask with e.g. an (N, 1) array would broadcast to (N, N).
    likelihood = np.asarray(raw_likelihood).ravel()

    total_logical = np.ones(likelihood.shape, dtype=bool)
    if per_set_boolean_filter is not None:
      set_filter = np.asarray(per_set_boolean_filter[set_name], dtype=bool).ravel()
      total_logical = total_logical & set_filter
    if exclude_zeros:
      total_logical = total_logical & (likelihood != 0)
    if drop_nans:
      total_logical = total_logical & (~np.isnan(likelihood))

    summary_df.loc[current_model, set_name] = float(-np.log(likelihood[total_logical]).mean())
  return summary_df.apply(pd.to_numeric)



def get_grad_colormap(original_color):
  """Returns a linear colormap running from `original_color` to opaque white."""
  opaque_white = (1, 1, 1, 1)
  gradient_stops = np.array([list(original_color), opaque_white])
  return mpl_colors.LinearSegmentedColormap.from_list('grad_colormap', gradient_stops)


def barplot_scores(scores_summary_df, models_to_plot_list, colors=None, set_name='test'):
  """Draws a bar plot of the scores of the selected models for one data split.

  Infinite scores cannot be drawn as bars; they are replaced by a finite
  height above all other bars, rendered as a gradient fading to white and
  annotated with an infinity sign.

  Args:
    scores_summary_df: DataFrame indexed by model name with one column per
      data split.
    models_to_plot_list: Model names (index entries) to plot, in plotting order.
    colors: Optional bar colors, one per plotted model.
    set_name: Column of `scores_summary_df` to plot.

  Returns:
    Tuple (figure, axes).
  """
  # Use the models requested by the caller, and work on a copy so that the inf
  # replacement below never writes into the caller's DataFrame.
  data_column = scores_summary_df[set_name].loc[models_to_plot_list].copy()
  are_infs = np.isinf(data_column)
  finite_scores = data_column[~are_infs]
  non_inf_max = finite_scores.max()
  margin = (non_inf_max - finite_scores.min())/4
  # Height standing in for "infinity": clearly above every finite bar.
  replace_inf_val = non_inf_max + 2*margin
  data_column[are_infs] = replace_inf_val
  infs_bars = np.where(are_infs)[0]

  f, ax = plt.subplots(1, 1)
  bars_handle = ax.bar(
      models_to_plot_list,
      data_column,
      color=colors
      )
  #-- account for infs:
  if infs_bars.size > 0:
    # imshow changes the axes limits; remember them and restore afterwards.
    lim = ax.get_xlim()+ax.get_ylim()
    for inf_idx in infs_bars:
      bar = bars_handle[inf_idx]
      bar.set_zorder(1)
      original_color = bar.get_facecolor()
      grad_colormap = get_grad_colormap(original_color)
      # Hide the solid bar face and paint a gradient in its place.
      bar.set_facecolor("none")
      x,y = bar.get_xy()
      w, h = bar.get_width(), bar.get_height()
      grad = np.atleast_2d(np.linspace(replace_inf_val, 0, 1000)).T
      normalizer = mpl.colors.PowerNorm(0.8, vmin=replace_inf_val-margin, vmax=replace_inf_val)
      ax.imshow(grad, extent=[x,x+w,y,y+h], aspect="auto", zorder=0, cmap=grad_colormap, norm=normalizer)
      # Raw string: '\i' is not a valid Python escape sequence.
      ax.text(x+w/2, replace_inf_val, r'$\infty$', ha='center', color=original_color)
    ax.axis(lim)

  max_y = data_column.max() + margin
  min_y = data_column.min() - margin
  ax.set_ylim(min_y, max_y)
  for label in ax.get_xticklabels():
    label.set(rotation=-30, horizontalalignment='left')
  ax.set_ylabel(r'$-\langle \log \leftparen \mathtt{likelihood} \rightparen \rangle$')
  return f, ax

## Calculate the minus mean log-likelihood score   $-\langle\mathcal{L}\rangle$
Lower score = better score.

**Notice that with the default settings of the tutorial notebook, random data is used and MAGNET is expected to lose.**

In [None]:
#@title Minus mean log likelihood

# Minus mean log-likelihood per model and split; NaN and exactly-zero
# likelihoods are left out of the mean.
summary_df_mean_ll = create_scores_summary_df(likelihoods_and_baselines, drop_nans=True, exclude_zeros=True)
# Bare last expression: displays the table as the cell output.
summary_df_mean_ll

In [None]:
# Raise matplotlib's log level to WARNING to keep its messages out of the cell
# output (the root logger was set to DEBUG earlier in the notebook).
logging.getLogger('matplotlib').setLevel(logging.WARNING)
# Bar plot of the test-set scores (barplot_scores defaults to set_name='test').
f_meanLL_barplot, ax_meanLL_barplot = barplot_scores(summary_df_mean_ll, MODELS_TO_PLOT, [COLOR_PER_MODEL[m] for m in MODELS_TO_PLOT])

# Plot resulting distributions

Plot the pdf predicted by the model MAGNET.
Colors indicate the label magnitude, values of labels will be marked by stars of the corresponding colors.

In [None]:
#---- setup data
samples_to_plot = 4
plot_above_thresh = MAG_THRESH
m_vec = np.linspace(MAG_THRESH, 9, 500)

# Predicted density of every test example, evaluated on the magnitude grid
# (rows: grid points, columns: test examples).
prob_density_inst = probability_density_function(forecasts['test'])
rescaled_m_vec = (m_vec[:, None] - random_var_shift)/random_var_stretch
prob_vecs = prob_density_inst.prob(rescaled_m_vec)/random_var_stretch

# Keep only the examples whose label reaches the plotting threshold.
above_thresh_mask = labels.test_labels>=plot_above_thresh
test_labels_to_plot_from = labels.test_labels[above_thresh_mask]
prob_vecs_to_plot_from = prob_vecs.numpy()[:, above_thresh_mask]

# Draw the examples to plot without replacement, with sampling weights
# proportional to exp(beta * magnitude); the fixed seed makes the selection
# reproducible.
p_for_mags = np.exp(BETA_OF_TRAIN_SET*test_labels_to_plot_from)
p_for_mags /= p_for_mags.sum()
rnd_seed = np.random.RandomState(seed=1905)
drawn_idxs = rnd_seed.choice(
    prob_vecs_to_plot_from.shape[1],
    samples_to_plot,
    replace=False,
    p=p_for_mags,
)
label_idxs_to_plot = np.sort(drawn_idxs)
labels_to_plot = test_labels_to_plot_from[label_idxs_to_plot]


#--- setup figure
num_mags = 25
min_mag = 2
max_mag = 6.5
m_scale = np.linspace(min_mag-0.01, max_mag, num_mags)
norm_inst = plt.Normalize(min_mag, max_mag)

# Color of each selected example: the colormap entry whose magnitude on
# m_scale is closest to the example's label.
chosen_colormap = mpl.colormaps['coolwarm']
colors = chosen_colormap(np.linspace(0,1,num_mags))
nearest_scale_idx = np.argmin(np.abs(labels_to_plot[:,None] - m_scale[None,:]), axis=1)
colors2plot = colors[nearest_scale_idx]

In [None]:
#---- Plot
f_dist_fig, ax_dist_fig = plt.subplots(1,1,)

# Predicted pdf curve of every selected example, colored by its label.
for curve_color, lbl_index in zip(colors2plot, label_idxs_to_plot):
  ax_dist_fig.plot(
      m_vec,
      prob_vecs_to_plot_from[:, lbl_index],
      alpha=0.4,
      color=curve_color,
      linewidth=4,
  )

# Star marker on each curve at the grid point closest to the true label.
for curve_color, lbl_index, x_mark in zip(colors2plot, label_idxs_to_plot, labels_to_plot):
  nearest_grid_idx = np.argmin(np.abs(m_vec - x_mark))
  y_mark = prob_vecs_to_plot_from[nearest_grid_idx, lbl_index]
  ax_dist_fig.scatter(
      x_mark,
      y_mark,
      s=50,
      marker='*',
      color=curve_color,
      edgecolor='black',
      linewidth=0.5,
      zorder=np.inf,
  )


# plot GR train set
train_gr_curve = metrics.gr_likelihood(m_vec, BETA_OF_TRAIN_SET, MAG_THRESH)
gr_handle = ax_dist_fig.plot(m_vec, train_gr_curve, 'k--', label='train_gr_likelihood', linewidth=3)
ax_dist_fig.legend(handles=gr_handle, frameon=False)

# Colorbar translating curve colors back to label magnitudes.
norm_inst = plt.Normalize(min_mag, max_mag)
sm = plt.cm.ScalarMappable(cmap=chosen_colormap, norm=norm_inst)
cb = plt.colorbar(sm, ax=ax_dist_fig, label='True magnitude (label)')

ax_dist_fig.set_xlabel('magnitude')
ax_dist_fig.set_ylabel('p(magnitude)')
ax_dist_fig.set_xscale('linear')