In [1]:
# Autoreload packages that are modified
%load_ext autoreload
%autoreload 2

from datetime import datetime, timedelta
import glob
import os
import sys
import time

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_probability as tfp
tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels

os.environ["CUDA_VISIBLE_DEVICES"]="3"

cwd = os.getcwd()
sys.path.append(f"{cwd}/forecast_rodeo")
sys.path.append(f"{cwd}/forecast_rodeo/src/experiments")
from experiments_util import get_target_date, month_day_subset
from stepwise_util import default_stepwise_candidate_predictors

In [2]:
# #https://stackoverflow.com/questions/43147983/could-not-create-cudnn-handle-cudnn-status-internal-error
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
# assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# config = tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [3]:
target = "contest_tmp2m" # "contest_precip" or "contest_tmp2m"
target_horizon = "34w" # "34w" or "56w"

data_path = os.path.expanduser("forecast_rodeo/results/regression/shared")
data_matrices_folder = f"{target}_{target_horizon}"
fs = glob.glob(f"{data_path}/{data_matrices_folder}/*.h5")
print(fs)
lat_lon_date_data_file = fs[0]
date_data_file = fs[1]

['forecast_rodeo/results/regression/shared/contest_tmp2m_34w/lat_lon_date_data-contest_tmp2m_34w.h5', 'forecast_rodeo/results/regression/shared/contest_tmp2m_34w/date_data-contest_tmp2m_34w.h5']


## Define dates of interest for prediction

In [4]:
submission_dates = [datetime(y,4,18)+timedelta(14*i) for y in range(2011,2018) for i in range(26)]
submission_dates = ['{}{:02d}{:02d}'.format(date.year, date.month, date.day) for date in submission_dates]
target_date_objs = [get_target_date(submission_date_str, target_horizon) for submission_date_str in submission_dates]
target_dates = ['{}{:02d}{:02d}'.format(date.year, date.month, date.day) for date in target_date_objs]

# for submission_date, target_date in zip(submission_dates, target_dates):
#     print(f"submission date: {submission_date}, target date: {target_date}")
    
submission_date = submission_dates[0]
target_date_obj = target_date_objs[0]
target_date = target_dates[0]
    
print(f"submission date: {submission_date}, target date: {target_date}")

# some vars
gt_col = target.split('_')[-1]  # 'tmp2m'
clim_col = f"{gt_col}_clim"     # 'tmp2m_clim'
anom_col = f"{gt_col}_anom"     # 'tmp2m_anom'
base_col = 'zeros'
group_by_cols = ['lat', 'lon']
first_train_year = 1978 # use 1948 for precip, 1978 for temp
start_delta = 29 # 29 for 34w or 43 for 56w
last_train_date = target_date_obj - timedelta(start_delta)
print(f"submission date: {submission_date}")
print(f"last train date: {last_train_date}")

# get data array names we care about
candidate_x_cols = default_stepwise_candidate_predictors(target, target_horizon, hindcast=False)

relevant_cols = set(candidate_x_cols
                    +[base_col,clim_col,anom_col,'start_date','lat','lon','target','year','ones']
                    +group_by_cols)
print(relevant_cols)
print(candidate_x_cols)
for c in relevant_cols:
    if c not in candidate_x_cols:
        print(c)

submission date: 20110418, target date: 20110502
submission date: 20110418
last train date: 2011-04-03 00:00:00
{'icec_2010_1_shift30', 'target', 'sst_2010_1_shift30', 'tmp2m_shift29_anom', 'nmme0_wo_ccsm3_nasa', 'wind_hgt_10_2010_2_shift30', 'lon', 'rhum_shift30', 'icec_2010_2_shift30', 'wind_hgt_10_2010_1_shift30', 'start_date', 'tmp2m_shift58_anom', 'zeros', 'tmp2m_shift58', 'year', 'icec_2010_3_shift30', 'sst_2010_3_shift30', 'ones', 'tmp2m_clim', 'sst_2010_2_shift30', 'phase_shift17', 'lat', 'mei_shift45', 'tmp2m_shift29', 'pres_shift30', 'nmme_wo_ccsm3_nasa', 'tmp2m_anom'}
['ones', 'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58', 'tmp2m_shift58_anom', 'rhum_shift30', 'pres_shift30', 'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'mei_shift45', 'phase_shift17', 'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30', 'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30', 'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30']
target
lon
start_date
z

## Load the data files

In [5]:
# raw data files
date_data = pd.read_hdf(date_data_file)
lat_lon_date_data = pd.read_hdf(lat_lon_date_data_file)

In [6]:
# filter out data older than "first_train_year" and keep only relevant columns
data = lat_lon_date_data.loc[lat_lon_date_data.start_date.dt.year >= first_train_year,
                             lat_lon_date_data.columns.isin(relevant_cols)]
data = pd.merge(data, date_data.loc[date_data.start_date.dt.year >= first_train_year,
                                    date_data.columns.isin(relevant_cols)],
                on="start_date", how="left")
del lat_lon_date_data
del date_data

print(len(data.columns))
print(data.columns)

23
Index(['lat', 'lon', 'start_date', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'tmp2m_clim', 'tmp2m_anom',
       'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'mei_shift45', 'phase_shift17',
       'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30',
       'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30',
       'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30'],
      dtype='object')


In [7]:
# filter to days within margin around target date
# margin_in_days = 56
# print(f"target date: {target_date_obj}, margin in days: {margin_in_days}")
# sub_data = month_day_subset(data, target_date_obj, margin_in_days).copy()
# del data

In [10]:
print((data.columns))
data['year'] = data.start_date.dt.year
data['ones'] = 1.0
data['zeros'] = 0.0
print((data.columns))

Index(['lat', 'lon', 'start_date', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'tmp2m_clim', 'tmp2m_anom',
       'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'mei_shift45', 'phase_shift17',
       'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30',
       'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30',
       'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30'],
      dtype='object')
Index(['lat', 'lon', 'start_date', 'rhum_shift30', 'pres_shift30',
       'nmme_wo_ccsm3_nasa', 'nmme0_wo_ccsm3_nasa', 'tmp2m_clim', 'tmp2m_anom',
       'tmp2m_shift29', 'tmp2m_shift29_anom', 'tmp2m_shift58',
       'tmp2m_shift58_anom', 'mei_shift45', 'phase_shift17',
       'sst_2010_1_shift30', 'sst_2010_2_shift30', 'sst_2010_3_shift30',
       'icec_2010_1_shift30', 'icec_2010_2_shift30', 'icec_2010_3_shift30',
       'wind_hgt_10_2010_1_shift30', 'wind_hgt_10_2010_2_shift30', 'ye

In [13]:
# this is really tmp2m_clim + tmp2m_anom
data['target'] = data[clim_col] + data[anom_col]

# drop data that doesn't have valid targets
data_valid_targets = data.dropna(subset=candidate_x_cols+['target'])

In [14]:
print(data_valid_targets.head())

       lat    lon start_date  rhum_shift30  pres_shift30  nmme_wo_ccsm3_nasa  \
1096  27.0  261.0 1982-01-01     70.420459  99167.996512           12.467787   
1097  27.0  261.0 1982-01-02     72.955266  99165.462333           12.467787   
1098  27.0  261.0 1982-01-03     74.292511  99175.210100           12.467787   
1099  27.0  261.0 1982-01-04     73.203110  99199.950614           12.467787   
1100  27.0  261.0 1982-01-05     73.615885  99200.662667           12.467787   

      nmme0_wo_ccsm3_nasa  tmp2m_clim  tmp2m_anom  tmp2m_shift29  ...  \
1096            14.397082   13.877076   -2.529009      17.393454  ...   
1097            14.397082   13.817056   -2.386775      16.864281  ...   
1098            14.397082   13.872171   -2.898529      16.131157  ...   
1099            14.397082   13.829688   -3.939882      15.877757  ...   
1100            14.397082   13.792283   -3.644579      16.066349  ...   

      sst_2010_3_shift30  icec_2010_1_shift30  icec_2010_2_shift30  \
1096      

In [15]:
data_grouped_by_latlon = data_valid_targets.loc[:,relevant_cols].groupby(group_by_cols)

In [16]:
latlons = [latlon for latlon, _ in data_grouped_by_latlon]
lat_oi, lon_oi = latlons[0] #(37.0, 238.0)
print(lat_oi, lon_oi)
data_at_lat_lon = data_grouped_by_latlon.get_group((lat_oi, lon_oi))

27.0 261.0


## Do regression
We want to predict the temperature `tmp2m` (stored in `target`) using features from the list `candidate_x_cols`

In [17]:
Y = data_at_lat_lon['target']
X = data_at_lat_lon[candidate_x_cols]
dates = data_at_lat_lon['start_date']

In [18]:
subsample = False
if subsample:
    X = X[::4]
    Y = Y[::4]
    dates = dates[::4]
print(X.shape)
print(Y.shape)
print(dates.shape)

(13283, 19)
(13283,)
(13283,)


In [19]:
import bokeh
import bokeh.io
import bokeh.plotting
import bokeh.models
from IPython.display import display, HTML

from tqdm import tqdm
from itertools import islice

bokeh.io.output_notebook(hide_banner=True)

def month_to_float(month):
    if month in [1, 3, 5, 7, 8, 10, 12]:
        return 31.0
    elif month in [2]:
        return 28.0
    elif month in [4, 6, 9, 11]:
        return 30.0

def dt_to_float(dt):
    year = dt.year
    month = np.sum([month_to_float(mth) for mth in np.arange(1, dt.month)])
    day = dt.day
    val = float(year) + ((month + day) / 365.0) 
    return val

dates_np = np.array([dt_to_float(dt) for dt in dates.iloc[:]])

In [24]:
# Plot data
fig = bokeh.plotting.figure(
    width=800, height=400)
#     x_range=(1982, 2018), y_range=(10, 35))
fig.xaxis.axis_label = 'Date'
fig.yaxis.axis_label = 'CO₂ (ppm)'
fig.add_layout(bokeh.models.Title(
    text='In situ air measurements at Mauna Loa, Observatory, Hawaii',
    text_font_style="italic"), 'above')
fig.add_layout(bokeh.models.Title(
    text='Atmospheric CO₂ concentrations', 
    text_font_size="14pt"), 'above')
fig.line(
    dates_np, Y, legend_label='All data',
    line_width=2, line_color='midnightblue')
fig.legend.location = 'top_left'
fig.toolbar.autohide = True
bokeh.plotting.show(fig)
#

In [25]:
# Split the data into observed and to predict
date_split_predict = 2015
df_observed = Y[dates_np < date_split_predict].values
dates_observed = dates_np[dates_np < date_split_predict]
print('{} measurements in the observed set'.format(len(df_observed)))
df_predict = Y[dates_np >= date_split_predict].values
dates_predict = dates_np[dates_np >= date_split_predict]
print('{} measurements in the test set'.format(len(df_predict)))


12044 measurements in the observed set
1239 measurements in the test set


In [26]:
# Define mean function which is the means of observations
observations_mean = tf.constant(
    [np.mean(df_observed)], dtype=tf.float64)
mean_fn = lambda _: observations_mean
#


In [28]:
# Define the kernel with trainable parameters. 
# Note we transform some of the trainable variables to ensure
#  they stay positive.

# Use float64 because this means that the kernel matrix will have 
#  less numerical issues when computing the Cholesky decomposition

# Constrain to make sure certain parameters are strictly positive
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())

# Smooth kernel hyperparameters
smooth_amplitude = tfp.util.TransformedVariable(
    initial_value=10., bijector=constrain_positive, dtype=np.float64,
    name='smooth_amplitude')
smooth_length_scale = tfp.util.TransformedVariable(
    initial_value=10., bijector=constrain_positive, dtype=np.float64,
    name='smooth_length_scale')

# Smooth kernel
smooth_kernel = tfk.ExponentiatedQuadratic(
    amplitude=smooth_amplitude, 
    length_scale=smooth_length_scale)

# Local periodic kernel hyperparameters
periodic_amplitude = tfp.util.TransformedVariable(
    initial_value=5.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_amplitude')
periodic_length_scale = tfp.util.TransformedVariable(
    initial_value=1.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_length_scale')
periodic_period = tfp.util.TransformedVariable(
    initial_value=1.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_period')
periodic_local_length_scale = tfp.util.TransformedVariable(
    initial_value=1.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_local_length_scale')
# Local periodic kernel
local_periodic_kernel = (
    tfk.ExpSinSquared(
        amplitude=periodic_amplitude, 
        length_scale=periodic_length_scale,
        period=periodic_period) * 
    tfk.ExponentiatedQuadratic(
        length_scale=periodic_local_length_scale))

# # Short-medium term irregularities kernel hyperparameters
# irregular_amplitude = tfp.util.TransformedVariable(
#     initial_value=1., bijector=constrain_positive, dtype=np.float64,
#     name='irregular_amplitude')
# irregular_length_scale = tfp.util.TransformedVariable(
#     initial_value=1., bijector=constrain_positive, dtype=np.float64,
#     name='irregular_length_scale')
# irregular_scale_mixture = tfp.util.TransformedVariable(
#     initial_value=1., bijector=constrain_positive, dtype=np.float64,
#     name='irregular_scale_mixture')
# # Short-medium term irregularities kernel
# irregular_kernel = tfk.RationalQuadratic(
#     amplitude=irregular_amplitude,
#     length_scale=irregular_length_scale,
#     scale_mixture_rate=irregular_scale_mixture)

# Noise variance of observations
# Start out with a medium-to high noise
observation_noise_variance = tfp.util.TransformedVariable(
    initial_value=1, bijector=constrain_positive, dtype=np.float64,
    name='observation_noise_variance')

trainable_variables = [v.variables[0] for v in [
    smooth_amplitude,
    smooth_length_scale,
    periodic_amplitude,
    periodic_length_scale,
    periodic_period,
    periodic_local_length_scale,
#     irregular_amplitude,
#     irregular_length_scale,
#     irregular_scale_mixture,
    observation_noise_variance
]]

# Sum all kernels to single kernel containing all characteristics
kernel = (smooth_kernel + local_periodic_kernel)

In [29]:
# Define mini-batch data iterator
batch_size = 256

batched_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (dates_observed.reshape(-1, 1), df_observed))
    .shuffle(buffer_size=len(df_observed))
    .repeat(count=None)
    .batch(batch_size)
)
#

@tf.function(autograph=False, experimental_compile=False)  # Use tf.function for more effecient function evaluation
def gp_loss_fn(index_points, observations):
    """Gaussian process negative-log-likelihood loss function."""
    gp = tfd.GaussianProcess(
        mean_fn=mean_fn,
        kernel=kernel,
        index_points=index_points,
        observation_noise_variance=observation_noise_variance
    )
    
    negative_log_likelihood = -gp.log_prob(observations)
    return negative_log_likelihood


In [30]:
# Fit hyperparameters
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

# Training loop
batch_nlls = []  # Batch NLL for plotting
full_ll = []  # Full data NLL for plotting
nb_iterations = 10001
for i, (index_points_batch, observations_batch) in tqdm(enumerate(islice(batched_dataset, nb_iterations)), file=sys.stdout):
    # Run optimization for single batch
    with tf.GradientTape() as tape:
        loss = gp_loss_fn(index_points_batch, observations_batch)
    grads = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(grads, trainable_variables))
    batch_nlls.append((i, loss.numpy()))
    # Evaluate on all observations
    if i % 100 == 0:
        # Evaluate on all observed data
        ll = gp_loss_fn(
            index_points=dates_observed.reshape(-1, 1),
            observations=df_observed)
        full_ll.append((i, ll.numpy()))

Instructions for updating:
Do not pass `graph_parents`.  They will  no longer be used.
30001it [12:54, 38.72it/s]


In [31]:
# Plot NLL over iterations
fig = bokeh.plotting.figure(
    width=600, height=400, 
    x_range=(0, nb_iterations))
fig.add_layout(bokeh.models.Title(
    text='Negative Log-Likelihood (NLL) during training', 
    text_font_size="14pt"), 'above')
fig.xaxis.axis_label = 'iteration'
fig.yaxis.axis_label = 'NLL batch'
# First plot
fig.line(
    *zip(*batch_nlls), legend_label='Batch data',
    line_width=2, line_color='midnightblue')
# Seoncd plot
# Setting the second y axis range name and range
fig.extra_y_ranges = {
    'fig1ax2': bokeh.models.Range1d(start=130, end=250)}
fig.line(
    *zip(*full_ll), legend_label='All observed data',
    line_width=2, line_color='red', y_range_name='fig1ax2')
# Adding the second axis to the plot.  
fig.add_layout(bokeh.models.LinearAxis(
    y_range_name='fig1ax2', axis_label='NLL all'), 'right')

fig.legend.location = 'top_right'
fig.toolbar.autohide = True
bokeh.plotting.show(fig)

In [33]:
# Show values of parameters found
variables = [
    smooth_amplitude,
    smooth_length_scale,
    periodic_amplitude,
    periodic_length_scale,
    periodic_period,
    periodic_local_length_scale,
    observation_noise_variance
]

data = list([(var.variables[0].name[:-2], var.numpy()) for var in variables])
df_variables = pd.DataFrame(
    data, columns=['Hyperparameters', 'Value'])
display(HTML(df_variables.to_html(
    index=False, float_format=lambda x: f'{x:.4f}')))

Hyperparameters,Value
smooth_amplitude,0.9098944434898556
smooth_length_scale,26.547901095961887
periodic_amplitude,11.12843509417804
periodic_length_scale,2.3629652336965616
periodic_period,0.9990947381112354
periodic_local_length_scale,317.9467917886616
observation_noise_variance,3.515428251541274


In [39]:
# let's predict up to the year 3000
extra_dates = np.arange(2015.1260, 2020, 1.0/12).reshape(-1, 1)
prediction_dates = dates_observed.reshape(-1, 1)
prediction_dates_extra = np.concatenate((prediction_dates, extra_dates))
print(prediction_dates.shape)
print(extra_dates.shape)
print(prediction_dates_extra.shape)

(12044, 1)
(59, 1)
(12103, 1)


In [40]:
# Posterior GP using fitted kernel and observed data
gp_posterior_predict = tfd.GaussianProcessRegressionModel(
    mean_fn=mean_fn,
    kernel=kernel,
    index_points=prediction_dates_extra,
    observation_index_points=dates_observed.reshape(-1, 1),
    observations=df_observed,
    observation_noise_variance=observation_noise_variance)

# Posterior mean and standard deviation
posterior_mean_predict = gp_posterior_predict.mean()
posterior_std_predict = gp_posterior_predict.stddev()

In [47]:
# Plot posterior predictions
num_to_plot = 365 * 8

# Get posterior predictions
μ = posterior_mean_predict.numpy()
σ = posterior_std_predict.numpy()

# Plot
fig = bokeh.plotting.figure(
    width=800, height=400)
fig.xaxis.axis_label = 'Date'
fig.yaxis.axis_label = 'CO₂ (ppm)'
fig.add_layout(bokeh.models.Title(
    text='Posterior predictions conditioned on observations before 2008.',
    text_font_style="italic"), 'above')
fig.add_layout(bokeh.models.Title(
    text='Atmospheric CO₂ concentrations', 
    text_font_size="14pt"), 'above')
fig.circle(
    dates_np, Y.values, legend_label='True data',
    size=2, line_color='midnightblue')
fig.line(
    prediction_dates_extra.squeeze(), μ, legend_label='μ (predictions)',
    line_width=2, line_color='firebrick')
# Prediction interval
band_x = np.append(
    prediction_dates_extra.squeeze(), prediction_dates_extra.squeeze()[::-1])
band_y = np.append(
    (μ + 2*σ), (μ - 2*σ)[::-1])
fig.patch(
    band_x, band_y, color='firebrick', alpha=0.4, 
    line_color='firebrick', legend_label='2σ')

fig.legend.location = 'top_left'
fig.toolbar.autohide = True
bokeh.plotting.show(fig)
#


In [53]:
# Posterior GP using fitted kernel and observed data
smooth_kernel_gp_posterior_predict = tfd.GaussianProcessRegressionModel(
    mean_fn=mean_fn,
    kernel=smooth_kernel,
    index_points=prediction_dates_extra,
    observation_index_points=dates_observed.reshape(-1, 1),
    observations=df_observed,
    observation_noise_variance=observation_noise_variance)
local_periodic_kernel_gp_posterior_predict = tfd.GaussianProcessRegressionModel(
    mean_fn=mean_fn,
    kernel=local_periodic_kernel,
    index_points=prediction_dates_extra,
    observation_index_points=dates_observed.reshape(-1, 1),
    observations=df_observed,
    observation_noise_variance=observation_noise_variance)

# Posterior mean and standard deviation
smooth_kernel_posterior_mean_predict = smooth_kernel_gp_posterior_predict.mean()
smooth_kernel_posterior_std_predict = smooth_kernel_gp_posterior_predict.stddev()

local_periodic_kernel_posterior_mean_predict = local_periodic_kernel_gp_posterior_predict.mean()
local_periodic_kernel_posterior_std_predict = local_periodic_kernel_gp_posterior_predict.stddev()


# Plot posterior predictions

# Get posterior predictions
smooth_kernel_μ = smooth_kernel_posterior_mean_predict.numpy()
smooth_kernel_σ = smooth_kernel_posterior_std_predict.numpy()

local_periodic_kernel_μ = local_periodic_kernel_posterior_mean_predict.numpy()
local_periodic_kernel_σ = local_periodic_kernel_posterior_std_predict.numpy()

# Plot
fig = bokeh.plotting.figure(
    width=600, height=400)
fig.xaxis.axis_label = 'Date'
fig.yaxis.axis_label = 'CO₂ (ppm)'
# fig.add_layout(bokeh.models.Title(
#     text='Posterior predictions conditioned on observations before 2008.',
#     text_font_style="italic"), 'above')
fig.add_layout(bokeh.models.Title(
    text='Atmospheric CO₂ concentrations', 
    text_font_size="14pt"), 'above')
fig.circle(
    dates_np, Y.values, legend_label='True data',
    size=2, line_color='midnightblue')

# smooth kernel
fig.line(
    prediction_dates_extra.squeeze(), smooth_kernel_μ, legend_label='smooth kernel μ (predictions)',
    line_width=2, line_color='firebrick')
# Prediction interval
band_x = np.append(
    prediction_dates_extra.squeeze(), prediction_dates_extra.squeeze()[::-1])
band_y = np.append(
    (smooth_kernel_μ + 2*smooth_kernel_σ), (smooth_kernel_μ - 2*smooth_kernel_σ)[::-1])
fig.patch(
    band_x, band_y, color='firebrick', alpha=0.4, 
    line_color='firebrick', legend_label='smooth kernel 2σ')


# periodic kernel
fig.line(
    prediction_dates_extra.squeeze(), local_periodic_kernel_μ, legend_label='periodic kernel μ (predictions)',
    line_width=2, line_color='forestgreen')
# Prediction interval
band_x = np.append(
    prediction_dates_extra.squeeze(), prediction_dates_extra.squeeze()[::-1])
band_y = np.append(
    (local_periodic_kernel_μ + 2*local_periodic_kernel_σ), (local_periodic_kernel_μ - 2*local_periodic_kernel_σ)[::-1])
fig.patch(
    band_x, band_y, color='forestgreen', alpha=0.4, 
    line_color='forestgreen', legend_label='periodic kernel 2σ')


fig.legend.location = 'top_left'
fig.toolbar.autohide = True
bokeh.plotting.show(fig)
#


In [None]:
print("--------------------")
print(len(sub_data))
for c in sub_data.columns:
    num_nan = np.sum(np.isnan(sub_data[c].to_numpy()))
    print(f"{c}: {num_nan}")
    
print("--------------------")
tmp = sub_data.dropna()
print(len(tmp))
for c in tmp.columns:
    num_nan = np.sum(np.isnan(tmp[c].to_numpy()))
    print(f"{c}: {num_nan}")

print("--------------------")
tmp2 = sub_data.dropna(subset=['target','sample_weight'])
print(len(tmp2))
for c in tmp.columns:
    num_nan = np.sum(np.isnan(tmp2[c].to_numpy()))
    print(f"{c}: {num_nan}")

    
print(sub_data.icec_2010_1_shift30)
print(tmp.icec_2010_1_shift30)
print(tmp2.icec_2010_1_shift30)

In [None]:
# print(sub_data.loc[(sub_data.lat==27.0) & (sub_data.lon == 261.0)].head())

