# Probability grids

This notebook creates the time-dependent probability maps and writes them to file (`.nc` format), using the models trained in the previous notebook (`01a-create_classifiers_global.ipynb`), which must be run before this one.

## Notebook options

These cells set some of the important variables and definitions used throughout the notebook.

In [1]:
# Source of the training data:
#   True  -> data extracted in notebook 00c
#   False -> pre-prepared data downloaded from Zenodo
use_extracted_data = True

# How many processes to use
n_jobs = 4

# Whether existing output files should be overwritten
overwrite = True

# Whether progress messages are printed
verbose = True

These values will usually not need to be modified:

In [2]:
import os

from lib.check_files import check_prepared_data

# Directory holding the input data: either the locally extracted data, or
# whatever directory check_prepared_data returns for the pre-prepared data
data_dir = (
    "extracted_data"
    if use_extracted_data
    else check_prepared_data("prepared_data", verbose=True)
)

# Base directory for the files written by this notebook
output_dir = "outputs"

If any of the following exist as environment variables, they will replace the values defined above:

In [3]:
def _env_flag(name, default):
    """Read a boolean option from the environment.

    Parameters
    ----------
    name : str
        Name of the environment variable.
    default : bool or int
        Value used when the variable is not set.

    Returns
    -------
    bool
        ``bool(int(default))`` if the variable is unset; otherwise the
        variable parsed as an integer (``0`` is False, anything else is
        True) or as one of the words true/false, yes/no, on/off
        (case-insensitive).

    Raises
    ------
    ValueError
        If the variable is set to a value that cannot be interpreted.
    """
    raw = os.environ.get(name)
    if raw is None:
        return bool(int(default))

    text = raw.strip().lower()
    if text in ("true", "yes", "on"):
        return True
    if text in ("false", "no", "off"):
        return False
    try:
        return bool(int(text))
    except ValueError:
        # Re-raise with the variable name so the failure is easy to trace
        raise ValueError(
            f"Invalid value for environment variable {name}: {raw!r} "
            "(expected an integer or one of true/false/yes/no/on/off)"
        ) from None


# Environment variables, when present, take precedence over the values
# defined in the cells above
n_jobs = int(os.environ.get("N_JOBS", n_jobs))
overwrite = _env_flag("OVERWRITE", overwrite)
verbose = _env_flag("VERBOSE", verbose)
output_dir = os.environ.get("OUTPUT_DIR", output_dir)

## Notebook setup

Imports, definitions, etc.

### Imports

In [4]:
import time
from datetime import timedelta

import dask
import dask.dataframe as dd
from dask_ml.wrappers import ParallelPostFit
from joblib import load

from lib.pu import create_probability_grids

We will use `dask-ml` to handle parallelism at this stage:

In [5]:
# Set dask's `num_workers` configuration value to the chosen number of processes
dask.config.set({"num_workers": n_jobs})

<dask.config.set at 0x17ec54050>

### Input and output files

In [6]:
# Input tables: the gridded point data and the global training set
data_filename = os.path.join(data_dir, "grid_data.csv")
training_filename = os.path.join(data_dir, "training_data_global.csv")

# Open the grid data with dask; only the coordinate and age columns are
# computed into memory here, to serve as the base of every output table
coordinate_columns = ["lon", "lat", "present_lon", "present_lat", "age (Ma)"]
point_data = dd.read_csv(data_filename)
df_out = point_data[coordinate_columns].compute()

# Distinct values of the "region" column of the training data
training_data = dd.read_csv(training_filename)
regions = list(training_data["region"].unique())

# Outputs of this notebook go in the "global" subdirectory.
# NOTE: this rebinds output_dir, so re-running this cell on its own
# appends "global" a second time.
output_dir = os.path.join(output_dir, "global")

## Calculate probabilities

Deposit probability will be calculated for the gridded data with each model (global and regional, for both algorithms) and written to one CSV file per model.

In [7]:
def _write_probabilities(model_filename, probabilities_filename, point_data, df_out):
    """Predict a probability for every grid point and write the result to CSV.

    Parameters
    ----------
    model_filename : str
        Path of the fitted classifier, as saved with joblib.
    probabilities_filename : str
        Path of the CSV file to write.
    point_data : dask dataframe
        Gridded data; must contain the columns listed in the model's
        ``feature_names_in_``.
    df_out : pandas.DataFrame
        Columns to carry into the output. A copy of this frame with an
        added ``probability`` column is what gets written.
        Assumes the rows are in the same order as ``point_data``.
    """
    # NOTE: joblib.load unpickles the file, so only load trusted model files
    model = load(model_filename)

    # Set n_jobs=1 on the last step of the model if it accepts that parameter
    # (let dask handle parallelism at this stage)
    try:
        model[-1].set_params(n_jobs=1)
    except ValueError:
        # Best effort: the estimator has no n_jobs parameter
        pass
    model_parallel = ParallelPostFit(model)

    point_x = point_data[model.feature_names_in_]
    # Keep column 1 of predict_proba (presumably the positive class -- confirm)
    p = model_parallel.predict_proba(point_x)[:, 1].ravel().compute()

    probabilities = df_out.copy()
    probabilities["probability"] = p
    probabilities.to_csv(probabilities_filename, index=False)


for algorithm in ("PU", "SVM"):
    subdir = os.path.join(output_dir, algorithm)

    # One task per model: (label, model file, output file, required).
    # The global model must exist; regional models are optional and are
    # skipped when their file is missing.
    tasks = [("global", "classifier.joblib", "grid_probabilities.csv", True)]
    for region in regions:
        region_code = "_".join(region.lower().split())
        tasks.append(
            (
                region,
                f"classifier_{region_code}.joblib",
                f"grid_probabilities_{region_code}.csv",
                False,
            )
        )

    for label, model_basename, output_basename, required in tasks:
        model_filename = os.path.join(subdir, model_basename)
        probabilities_filename = os.path.join(subdir, output_basename)

        if not required and not os.path.isfile(model_filename):
            continue
        # Honour the `overwrite` option: keep existing results when it is False
        if not overwrite and os.path.isfile(probabilities_filename):
            if verbose:
                print(f"Skipping existing file: {probabilities_filename}")
            continue

        t0 = time.time()
        _write_probabilities(
            model_filename,
            probabilities_filename,
            point_data,
            df_out,
        )
        duration = timedelta(seconds=time.time() - t0)
        if verbose:
            print(
                f"Calculating probabilities for {algorithm} model complete",
                f"(region: {label}; duration: {duration})",
            )

del point_data

Calculating probabilities for PU model complete (region: global; duration: 0:02:32.916897)


Calculating probabilities for PU model complete (region: North America; duration: 0:02:20.352261)


Calculating probabilities for PU model complete (region: South America; duration: 0:02:26.709014)


Calculating probabilities for PU model complete (region: Southeast Asia; duration: 0:02:26.215107)


Calculating probabilities for PU model complete (region: Tethys; duration: 0:02:23.249408)


Calculating probabilities for SVM model complete (region: global; duration: 0:01:40.374387)


Calculating probabilities for SVM model complete (region: North America; duration: 0:01:34.602675)


Calculating probabilities for SVM model complete (region: South America; duration: 0:01:41.087536)


## Create probability maps

The probabilities calculated in the previous section will now be written to one netCDF file per time step:

In [8]:
# Extent passed to create_probability_grids for every model
# (presumably lon_min, lon_max, lat_min, lat_max in degrees -- confirm
# against create_probability_grids)
GRID_EXTENT = (-180, 180, -90, 90)

# NOTE(review): the `overwrite` option is not consulted in this cell; whether
# create_probability_grids replaces existing grid files cannot be determined
# from this notebook.
for algorithm in ("PU", "SVM"):
    subdir = os.path.join(output_dir, algorithm)

    # One task per model: (label, probabilities CSV, grid directory, required).
    # The global CSV is always processed; regional CSVs are optional and are
    # skipped when their file is missing.
    tasks = [("global", "grid_probabilities.csv", "probability_grids", True)]
    for region in regions:
        region_code = "_".join(region.lower().split())
        tasks.append(
            (
                region,
                f"grid_probabilities_{region_code}.csv",
                f"probability_grids_{region_code}",
                False,
            )
        )

    for label, csv_basename, grid_dirname, required in tasks:
        probabilities_filename = os.path.join(subdir, csv_basename)
        if not required and not os.path.isfile(probabilities_filename):
            continue

        t0 = time.time()
        grid_output_dir = os.path.join(subdir, grid_dirname)
        os.makedirs(grid_output_dir, exist_ok=True)

        create_probability_grids(
            data=probabilities_filename,
            output_dir=grid_output_dir,
            threads=n_jobs,
            extent=GRID_EXTENT,
        )
        duration = timedelta(seconds=time.time() - t0)
        if verbose:
            print(
                f"Creating grids for {algorithm} model complete",
                f"(region: {label}; duration: {duration})",
            )

Creating grids for PU model complete (region: global; duration: 0:13:51.905821)


Creating grids for PU model complete (region: North America; duration: 0:13:37.224705)


Creating grids for PU model complete (region: South America; duration: 0:13:32.963336)


Creating grids for PU model complete (region: Southeast Asia; duration: 0:13:36.328199)


Creating grids for PU model complete (region: Tethys; duration: 0:13:40.561762)


Creating grids for SVM model complete (region: global; duration: 0:13:30.629772)


Creating grids for SVM model complete (region: North America; duration: 0:13:28.987141)


Creating grids for SVM model complete (region: South America; duration: 0:13:46.512445)
