
# Computing the 3D PDF from the state matrix and tag pressure information

## 1. **Configure the Notebook:**
Prepare the notebook to compute the 3D PDF.

In this step, we set up the notebook environment for analysis. It includes installing necessary packages, importing required libraries, setting up parameters, and configuring the cluster for distributed computing. It also retrieves the tag data needed for analysis.


In [None]:
# Import necessary libraries and modules.
import xarray as xr
import hvplot.xarray
from pangeo_fish.io import open_tag

In [None]:

# Set up execution parameters for the analysis.
# Note: This cell is tagged as parameters, allowing automatic updates when configuring with papermill.

# `tag_name` corresponds to the name of the biologging tag (DST identification number),
# which is also a path for storing all the information for the specific fish tagged with `tag_name`.
tag_name = "A19124"  

# `tag_root` specifies the root URL for tag data used for this computation.
tag_root = "https://data-taos.ifremer.fr/data_tmp/cleaned/tag/"

# `scratch_root` specifies the root directory for storing output files.
scratch_root = "s3://destine-gfts-data-lake/demo"

# `storage_options` specifies options for the filesystem storing output files.
storage_options = {
    'anon': False, 
    'profile' : "gfts",
    'client_kwargs': {
        "endpoint_url": "https://s3.gra.perf.cloud.ovh.net",
        "region_name": "gra",
    }
}

# If you are using a local file system, activate the following two lines:
# scratch_root = "."
# storage_options = None


# Default chunk value for the time dimension. This value depends on the configuration of your Dask cluster.
chunk_time=24


# `bbox`, the bounding box, defines the latitude and longitude range for the analysis area.
bbox = {"latitude": [46, 51], "longitude": [-8, -1]} 



In [None]:
# Define target root directories for storing analysis results.
target_root = f"{scratch_root}/{tag_name}"

# Define the default chunk size for optimization:
# `chunk_time` steps along time, and -1 for each spatial dimension.
default_chunk = {"time": chunk_time, "lat": -1, "lon": -1}
default_chunk_xy = {"time": chunk_time, "x": -1, "y": -1}


In [None]:
# Set up a local cluster for distributed computing.
from distributed import LocalCluster
cluster = LocalCluster()
# Get a client connected to that cluster; the trailing expression displays it.
client = cluster.get_client()
client

In [None]:

# Open and retrieve the tag data required for the analysis.
tag = open_tag(tag_root, tag_name)
tag


## 2. **Compute pdf_depth**

In this step, we compute the PDF only in depth for each time step used with the model. First, we load the reference model to choose the time bins used for creating the depth PDF (we can use the state matrix for that too).

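**Note:** Here, the maximum depth is fixed at 42 and the interval is set at 2; because the bin edges are built with `np.arange(0, 42, 2)`, the last edge is 40 and samples beyond it are not counted. This can be made 'automatic' or fixed according to the depth levels of our future climate data.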


In [None]:
# Import the helpers used to slice the tag log in time and to bin it.
from pangeo_fish.cf import bounds_to_bins
from pangeo_fish.tags import  reshape_by_bins, to_time_slice

In [None]:

# Drop tag data outside the tagged events interval.
time_slice = to_time_slice(tag["tagging_events/time"])
tag_log = tag["dst"].ds.sel(time=time_slice)


In [None]:
%%time
# Save probability distribution, state matrix.
states=xr.open_zarr(
    f"{target_root}/states.zarr")
states

In [None]:
%%time
# Reshape the tag log so that it bins to the time step of the reference model.
reshaped_tag = reshape_by_bins(
    tag_log,
    dim="time",
    bins=(
        states.cf.add_bounds(["time"], output_dim="bounds")
        .pipe(bounds_to_bins, bounds_dim="bounds")
        .get("time_bins")
    ),
    bin_dim="bincount",
    other_dim="obs",
).chunk({"time": chunk_time})
reshaped_tag

In [None]:
import numpy as np
# Bin edges of the depth histogram: 0 up to (but excluding) `maxdepth`,
# spaced by `interval`.
maxdepth = 42
interval = 2
bins = np.arange(0, maxdepth, interval)


def compute_pdf(data, bins=np.arange(0, maxdepth, interval)):
    """Return the fraction of the non-NaN samples of `data` falling in each bin.

    Parameters
    ----------
    data : array-like
        Samples to histogram. NaN entries are discarded first.
    bins : array-like or int
        Passed to `np.histogram`. Defaults to the edges built from
        `maxdepth` and `interval` when the function is defined.

    Returns
    -------
    numpy.ndarray
        One value per bin. The values sum to 1 when at least one sample
        lies inside the bin range; otherwise every value is NaN.
    """
    # Remove NaN values.
    values = np.asarray(data)
    values = values[~np.isnan(values)]

    # Calculate the histogram (samples outside the bin range are not counted).
    counts, _ = np.histogram(values, bins=bins)

    # Normalize the histogram to ensure the sum of the PDF is 1.
    # Dividing the counts by their total equals density * bin width for every
    # bin, so it stays correct when the bins do not all have the same width.
    total = counts.sum()
    if total == 0:
        # No usable sample: keep the all-NaN result without dividing 0 by 0.
        return np.full(counts.shape, np.nan)
    return counts / total

def compute_pdf_bins(bins=bins):
    """Return the edges and the centers of the histogram bins.

    Parameters
    ----------
    bins : array-like or int
        Bin specification, interpreted as by `np.histogram`. Defaults to
        the module-level `bins` at the time the function is defined.

    Returns
    -------
    tuple of numpy.ndarray
        `(bin_edges, bin_centers)`, where each center is the midpoint of
        two consecutive edges.
    """
    # Only the edges are needed, so a one-sample placeholder replaces the
    # dummy dataset; with explicit edges in `bins` the data is not used.
    bin_edges = np.histogram_bin_edges(np.ones(1), bins=bins)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    return bin_edges, bin_centers
# Bin edges and centers; the centers become the `depth` coordinate below.
bin_edges, bin_centers = compute_pdf_bins()

# Apply `compute_pdf` along the `obs` dimension of the pressure record.
# `obs` is consumed and replaced by a new `depth` dimension with one entry
# per bin.
depth_pdf = xr.apply_ufunc(
    compute_pdf,
    reshaped_tag["pressure"],
    input_core_dims=[["obs"]],
    output_core_dims=[["depth"]],
    exclude_dims={"obs"},
    vectorize=True,
    dask="parallelized",
    output_dtypes=[reshaped_tag["pressure"].dtype],
    dask_gufunc_kwargs={"output_sizes": {"depth": bin_centers.size}},
)
depth_pdf = depth_pdf.assign_attrs({'long_name': 'depth_pdf'})
depth_pdf = depth_pdf.to_dataset(name='depth_pdf')
depth_pdf = depth_pdf.assign_coords(depth=bin_centers)

depth_pdf
depth_pdf["depth_pdf"].plot(x='time', y='depth')


In [None]:
# Display the depth PDF dataset.
depth_pdf

In [None]:
%%time
# Compute the depth PDF and save it as `depth_pdf.zarr` under `target_root`.
depth_pdf.compute().to_zarr(
    f"{target_root}/depth_pdf.zarr", mode="w", consolidated=True,  
        storage_options=storage_options                
)


## 3. **Compute 3D PDF**

Multiply the 2D map with the depth PDF to obtain the 3D PDF. Plot and verify the result.


In [None]:
# Multiply the state matrix by the depth PDF, both chunked identically along
# time, then persist the product and wrap it in a dataset named `3d_pdf`.
ds = (
    (
        states.states.chunk(chunks={"time": 24 * 2})
        * depth_pdf.depth_pdf.chunk(chunks={"time": 24 * 2})
    )
    .persist()
    .assign_attrs({'long_name': '3D_pdf'})
    .to_dataset(name='3d_pdf')
)
ds

In [None]:
%%time
# Save the 3D PDF as `three_d_pdf.zarr` under `target_root`.
(ds

 .to_zarr(
    f"{target_root}/three_d_pdf.zarr", mode="w", consolidated=True,  
        storage_options=storage_options                
)
)

In [None]:
# Plot the 3D PDF as a longitude/latitude map, grouped by `time` and `depth`.
ds.hvplot.quadmesh(x='longitude',y='latitude',groupby=['time','depth'])

In [None]:
# Sum the 3D PDF over the `x` and `y` dimensions and plot the result with
# time on the x-axis and depth on the y-axis.
ds['3d_pdf'].sum(dim=['x','y']).plot(x='time',y='depth')