## Probability grids

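This notebook calculates grid-point probabilities (written to `.csv` files), then creates the time-dependent probability maps from them and writes those to file (`.nc` format). The notebook `01a-create_classifiers_global.ipynb` must be run first, because the classifier files (`classifier*.joblib`) loaded below must already exist.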

In [1]:
# Input selection: when True, input files are read from "extracted_data";
# when False, they are read from "prepared_data" and `check_prepared_data`
# is run on that directory before loading.
use_extracted_data = False

In [2]:
import os
import time
from datetime import timedelta

import dask
import dask.dataframe as dd
from dask_ml.wrappers import ParallelPostFit
from joblib import load

from lib.check_files import check_prepared_data
from lib.pu import create_probability_grids

In [3]:
# Worker count comes from the N_JOBS environment variable (default: 8).
n_jobs = int(os.getenv("N_JOBS", 8))
# Pass the same worker count to dask's configuration.
dask.config.set(num_workers=n_jobs)

<dask.config.set at 0x1c2143d50>

### Load input data from file

In [4]:
# Pick the input directory; the prepared-data directory is checked before use.
data_dir = "extracted_data" if use_extracted_data else "prepared_data"
if not use_extracted_data:
    check_prepared_data(data_dir, verbose=True)

data_filename = os.path.join(data_dir, "grid_data.csv")
training_filename = os.path.join(data_dir, "training_data_global.csv")

# The grid data is kept as a dask dataframe; only the coordinate/age columns
# that accompany each probability in the output are computed up front.
point_data = dd.read_csv(data_filename)
df_out = point_data[["lon", "lat", "age (Ma)"]].compute()

# Distinct region names present in the training data.
training_regions = dd.read_csv(training_filename)["region"]
regions = list(training_regions.unique())

### Calculate probabilities

In [5]:
output_dir = os.path.join("outputs", "global")


def _write_grid_probabilities(
    model_filename,
    probabilities_filename,
    features,
    coordinates,
):
    """Predict probabilities with a saved classifier and write them to CSV.

    Parameters
    ----------
    model_filename : str
        Path of the `.joblib` classifier file to load.
    probabilities_filename : str
        Path of the CSV file to write (without the index column).
    features : dask dataframe
        Grid data; the columns named in the model's `feature_names_in_`
        are selected from it as the prediction input.
    coordinates : pandas.DataFrame
        Columns copied into the output next to the new "probability"
        column. Assumed to be row-aligned with `features`.
    """
    model = load(model_filename)

    # Set model n_jobs if possible
    # (let dask handle parallelism at this stage)
    try:
        model[-1].set_params(n_jobs=1)
    except ValueError:
        # Deliberate best-effort: the final step does not accept `n_jobs`.
        pass
    model_parallel = ParallelPostFit(model)

    point_x = features[model.feature_names_in_]
    probabilities = coordinates.copy()
    # Column 1 of `predict_proba` is written out as the probability.
    probabilities["probability"] = (
        model_parallel.predict_proba(point_x)[:, 1].ravel().compute()
    )
    probabilities.to_csv(probabilities_filename, index=False)
    # The model and the probability table go out of scope on return,
    # so no explicit `del` is needed to release them.


def _report_probabilities_done(algorithm, region, t0):
    """Print the completion message with the time elapsed since `t0`."""
    duration = timedelta(seconds=time.time() - t0)
    print(
        f"Calculating probabilities for {algorithm} model complete",
        f"(region: {region}; duration: {duration})",
    )


for algorithm in ("PU", "SVM"):
    subdir = os.path.join(output_dir, algorithm)

    # Global model: required, so a missing file raises here.
    t0 = time.time()
    _write_grid_probabilities(
        os.path.join(subdir, "classifier.joblib"),
        os.path.join(subdir, "grid_probabilities.csv"),
        point_data,
        df_out,
    )
    _report_probabilities_done(algorithm, "global", t0)

    # Regional models: optional, regions without a saved classifier are skipped.
    for region in regions:
        region_code = "_".join(region.lower().split())
        model_filename = os.path.join(
            subdir,
            f"classifier_{region_code}.joblib",
        )
        if not os.path.isfile(model_filename):
            continue

        t0 = time.time()
        _write_grid_probabilities(
            model_filename,
            os.path.join(subdir, f"grid_probabilities_{region_code}.csv"),
            point_data,
            df_out,
        )
        _report_probabilities_done(algorithm, region, t0)

# The grid data is not needed after this point. Re-running this cell
# requires re-running the data loading cell first.
del point_data


Calculating probabilities for PU model complete (region: global; duration: 0:03:15.843744)


Calculating probabilities for PU model complete (region: North America; duration: 0:02:54.157393)


Calculating probabilities for PU model complete (region: South America; duration: 0:03:06.943133)


Calculating probabilities for PU model complete (region: Southeast Asia; duration: 0:03:10.606686)


Calculating probabilities for PU model complete (region: Tethys; duration: 0:03:10.765249)


Calculating probabilities for SVM model complete (region: global; duration: 0:01:58.452027)


Calculating probabilities for SVM model complete (region: North America; duration: 0:01:33.380360)


Calculating probabilities for SVM model complete (region: South America; duration: 0:01:51.301283)


### Create probability maps

In [6]:
def _make_probability_grids(probabilities_filename, grid_output_dir):
    """Create probability grids from one probabilities CSV file.

    Parameters
    ----------
    probabilities_filename : str
        Path of the CSV file passed to `create_probability_grids` as `data`.
    grid_output_dir : str
        Directory the grids are written to; created if it does not exist.
    """
    os.makedirs(grid_output_dir, exist_ok=True)
    create_probability_grids(
        data=probabilities_filename,
        output_dir=grid_output_dir,
        threads=n_jobs,
        # NOTE(review): presumably (lon_min, lon_max, lat_min, lat_max),
        # i.e. the whole globe - confirm against `create_probability_grids`.
        extent=(-180, 180, -90, 90),
    )


def _report_grids_done(algorithm, region, t0):
    """Print the completion message with the time elapsed since `t0`."""
    duration = timedelta(seconds=time.time() - t0)
    print(
        f"Creating grids for {algorithm} model complete",
        f"(region: {region}; duration: {duration})",
    )


for algorithm in ("PU", "SVM"):
    # Computed once per algorithm; shared by the global and regional grids.
    subdir = os.path.join(output_dir, algorithm)

    # Global grids: always attempted.
    t0 = time.time()
    _make_probability_grids(
        os.path.join(subdir, "grid_probabilities.csv"),
        os.path.join(subdir, "probability_grids"),
    )
    _report_grids_done(algorithm, "global", t0)

    # Regional grids: only for regions that have a probabilities file.
    for region in regions:
        region_code = "_".join(region.lower().split())
        probabilities_filename = os.path.join(
            subdir,
            f"grid_probabilities_{region_code}.csv",
        )
        if not os.path.isfile(probabilities_filename):
            continue

        t0 = time.time()
        _make_probability_grids(
            probabilities_filename,
            os.path.join(subdir, f"probability_grids_{region_code}"),
        )
        _report_grids_done(algorithm, region, t0)

Creating grids for PU model complete (region: global; duration: 0:13:11.713105)


Creating grids for PU model complete (region: North America; duration: 0:13:21.995436)


Creating grids for PU model complete (region: South America; duration: 0:13:25.114693)


Creating grids for PU model complete (region: Southeast Asia; duration: 0:13:07.798919)


Creating grids for PU model complete (region: Tethys; duration: 0:13:06.822083)


Creating grids for SVM model complete (region: global; duration: 0:13:29.665323)


Creating grids for SVM model complete (region: North America; duration: 0:13:23.600845)


Creating grids for SVM model complete (region: South America; duration: 0:13:04.187936)
