# Extract training data

This notebook will extract plate kinematic data from a plate model and other data from the `source_data` directory, writing the resulting dataset to a CSV file which can then be used to train the models in the following notebooks (`01*.ipynb`).

## Notebook options

These cells set some of the important variables and definitions used throughout the notebook.

In [1]:
# YAML file holding the parameters for this notebook (read by `get_params`)
config_file = "notebook_parameters_deformation.yml"

In [2]:
from lib.load_params import get_params

# Load the settings for this notebook from the configuration file
params = get_params(config_file, notebook="00b")

# --- Output location -------------------------------------------------
# The extracted data directory doubles as the output directory
data_dir = params["extracted_data_dir"]
output_dir = data_dir

# --- Execution options -----------------------------------------------
n_jobs = params["n_jobs"]  # number of processes to use
overwrite = params["overwrite_output"]  # overwrite any existing output files
verbose = params["verbose"]  # verbosity level of logging output

# --- Timespan for analysis -------------------------------------------
# `times` covers every integer timestep from min_time to max_time, inclusive
min_time, max_time = params["timespan"]["min"], params["timespan"]["max"]
times = range(min_time, max_time + 1)

# --- Unlabelled points -----------------------------------------------
num_unlabelled = params["num_unlabelled"]  # generated per timestep
random_seed = params["random_seed"]  # for reproducibility

# --- Input files -----------------------------------------------------
# CSV file with known deposits; columns: lon, lat, age (Ma)
deposits_filename = params["deposits_filename"]

# Optional shapefile or GeoJSON of polygons carrying a 'region' attribute,
# used to categorise deposits according to location
regions_filename = params["regions_filename"]

### Select plate model

To use the plate model from the published paper (Alfonso et al., 2024), set `use_provided_plate_model` to `True`. Otherwise, leave `use_provided_plate_model` as `False` and set `plate_model_name` to a valid model name for the [`plate-model-manager`](https://github.com/michaelchin/plate-model-manager/blob/4f66423b53950bf42f5dac1228e61fd1e19fdf6e/models.json) package, or set `plate_model_name` to `None` and place GPlates files in a directory named `plate_model`.

| `use_provided_plate_model` | `plate_model_name` | result |
| - | - | - |
| `True` | Any | Use Alfonso et al., 2024 model |
| `False` | Model name string (e.g. `"muller2022"`) | Use specified plate model |
| `False` | `None` | Use files in `plate_model` directory |

In [3]:
# Both plate model options live under the "plate_model" section of the parameters
plate_model_name, use_provided_plate_model = (
    params["plate_model"][key]
    for key in ("plate_model_name", "use_provided_plate_model")
)

## Notebook setup

Imports, definitions, etc.

### Imports

In [4]:
import os
import warnings
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import pygplates
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    from gplately.tools import plate_isotherm_depth

from lib.assign_regions import assign_regions
from lib.calculate_convergence import run_calculate_convergence
from lib.check_files import (
    check_source_data,
    check_plate_model,
)
from lib.combine_point_data import combine_point_data
from lib.coregister_combined_point_data import run_coregister_combined_point_data
from lib.coregister_crustal_thickness import run_coregister_crustal_thickness
from lib.coregister_ocean_rasters import (
    extract_subducted_thickness,
    run_coregister_ocean_rasters,
)
from lib.create_study_area_polygons import run_create_study_area_polygons
from lib.deformation import extract_strain_and_rate, extract_strain_history
from lib.erodep import calculate_erodep
from lib.generate_unlabelled_points import generate_unlabelled_points
from lib.misc import calculate_slab_flux, calculate_carbon
from lib.plate_models import get_plate_reconstruction
from lib.slab_dip import calculate_slab_dip
from lib.subduction_history import extract_subduction_history
from lib.water import calculate_water_thickness

# Suppress occasional joblib warnings:
# - `%env` sets PYTHONWARNINGS in this kernel's environment (presumably so that
#   worker processes started by joblib apply the same filter — TODO confirm)
# - the `simplefilter` calls silence UserWarning and pandas PerformanceWarning
#   in the notebook process itself
%env PYTHONWARNINGS=ignore::UserWarning
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", pd.errors.PerformanceWarning)



### Input and output files

If necessary, the plate model will be downloaded:

In [5]:
# Directory holding the plate model files when no named model is requested
plate_model_dir = "plate_model"

if use_provided_plate_model:
    # The provided model takes precedence over any configured model name,
    # so the name is cleared once the files have been checked
    check_plate_model(plate_model_dir, verbose=True)
    plate_model_name = None

plate_model = get_plate_reconstruction(
    model_dir=plate_model_dir,
    model_name=plate_model_name,
)

# Make sure the data directory (and any missing parents) exists
Path(data_dir).mkdir(parents=True, exist_ok=True)

The following input directories are all relative to `data_dir`:

In [None]:
# Each directory holds one NetCDF file per timestep; the expected filename
# pattern is given next to each directory name.

# Seafloor age grids: 'seafloor_age_{time}Ma.nc'
agegrid_dir = "SeafloorAge"

# Seafloor spreading rate: 'spreading_rate_{time}Ma.nc'
spreadrate_dir = "SpreadingRate"

# Seafloor sediment thickness: 'sediment_thickness_{time}Ma.nc'
sedthick_dir = "SedimentThickness"

# Seafloor carbonate sediment thickness: 'carbonate_thickness_{time}Ma.nc'
carbonate_dir = "CarbonateThickness"

# Oceanic crustal CO2 density: 'crustal_co2_{time}Ma.nc'
co2_dir = "CrustalCO2"

# Overriding plate thickness: 'crustal_thickness_{time}Ma.nc'
crustal_thickness_dir = "CrustalThickness"

# Erosion/deposition rate: 'erosion_deposition_{time}Ma.nc'
erodep_dir = "ErosionDeposition"

In [None]:
# Handle relative file/directory paths

# The deposits file is used as given if it exists; otherwise it is looked up
# inside `data_dir`
if not os.path.isfile(deposits_filename):
    deposits_filename = os.path.join(data_dir, deposits_filename)

# Apply the same fallback to the optional regions file. Without it, a regions
# file stored in `data_dir` would fail the later `os.path.isfile` check and the
# region assignment would be skipped silently. The configured value is kept
# unchanged if the file cannot be found in `data_dir` either.
if regions_filename is not None and not os.path.isfile(regions_filename):
    regions_candidate = os.path.join(data_dir, regions_filename)
    if os.path.isfile(regions_candidate):
        regions_filename = regions_candidate

# Input raster directories are always relative to `data_dir`
agegrid_dir = os.path.join(data_dir, agegrid_dir)
spreadrate_dir = os.path.join(data_dir, spreadrate_dir)
sedthick_dir = os.path.join(data_dir, sedthick_dir)
carbonate_dir = os.path.join(data_dir, carbonate_dir)
co2_dir = os.path.join(data_dir, co2_dir)
crustal_thickness_dir = os.path.join(data_dir, crustal_thickness_dir)
erodep_dir = os.path.join(data_dir, erodep_dir)

# Files and directories written by this notebook
subduction_data_filename = os.path.join(output_dir, "subducting_plate_data.csv")
study_area_dir = os.path.join(output_dir, "study_area_polygons")
output_filename = os.path.join(output_dir, "training_data_global_deformation.csv")

### Subducting plate data

This cell will extract the subduction kinematics data from the plate model, along with datasets relating to the subducting oceanic plate: seafloor age, sediment and carbonate thickness, etc.
However, if this data has already been written to `subducting_plate_data.csv` in the output directory (e.g. by another notebook) and `overwrite` has not been set to `True`, then the data will be read from that file instead.

In [None]:
# Reuse previously extracted data unless the user asked to overwrite it
cached_data_available = (
    subduction_data_filename is not None
    and os.path.isfile(subduction_data_filename)
)
if cached_data_available and not overwrite:
    subduction_data = pd.read_csv(subduction_data_filename)
else:
    # Subduction kinematics from the plate model
    subduction_data = run_calculate_convergence(
        nprocs=n_jobs,
        min_time=min(times),
        max_time=max(times),
        plate_reconstruction=plate_model,
        verbose=verbose,
    )

    # Attach the ocean raster data (seafloor age, spreading rate, sediment and
    # carbonate thickness, crustal CO2)
    subduction_data = run_coregister_ocean_rasters(
        nprocs=n_jobs,
        times=times,
        input_data=subduction_data,
        agegrid_dir=agegrid_dir,
        spreadrate_dir=spreadrate_dir,
        plate_reconstruction=plate_model,
        sedthick_dir=sedthick_dir,
        carbonate_dir=carbonate_dir,
        co2_dir=co2_dir,
        verbose=verbose,
    )

    # Plate thickness is derived from the seafloor age column
    subduction_data["plate_thickness (m)"] = plate_isotherm_depth(
        subduction_data["seafloor_age (Ma)"],
        maxiter=100,
    )

    # Further derived quantities; each step returns the updated table
    subduction_data = calculate_water_thickness(data=subduction_data)
    subduction_data = calculate_carbon(subduction_data)
    subduction_data = calculate_slab_flux(subduction_data)
    subduction_data = calculate_slab_dip(subduction_data)
    subduction_data = extract_subducted_thickness(
        subduction_data,
        plate_reconstruction=plate_model,
    )

    # Fluxes: source quantity times the orthogonal convergence rate
    # (cm/yr scaled by 1e-2 to m/yr), with negative values clipped to zero
    convergence_column = "convergence_rate_orthogonal (cm/yr)"
    flux_sources = {
        "sediment_flux (m^2/yr)": "sediment_thickness (m)",
        "carbon_flux (t/m/yr)": "total_carbon_density (t/m^2)",
        "water_flux (m^2/yr)": "total_water_thickness (m)",
    }
    for flux_column, source_column in flux_sources.items():
        subduction_data[flux_column] = (
            subduction_data[source_column]
            * subduction_data[convergence_column] * 1.0e-2
        ).clip(0.0, np.inf)

    # Cache the result so later runs can skip the extraction
    if subduction_data_filename is not None:
        subduction_data.to_csv(subduction_data_filename, index=False)

### Create study area polygons along subduction zones

Here we define our study area as all points on the overriding plate within a certain distance of the subduction zone (by default, $6^\circ \approx 660\,\mathrm{km}$).

In [11]:
from lib.create_study_area_polygons import DEFAULT_SZ_BUFFER_DISTANCE

# Distance from the subduction zone that bounds the study area
buffer_distance = DEFAULT_SZ_BUFFER_DISTANCE  # 6.0

# Existing polygons are reused unless `overwrite` is set
polygons_missing = not os.path.isdir(study_area_dir)
if overwrite or polygons_missing:
    run_create_study_area_polygons(
        times=times,
        plate_reconstruction=plate_model,
        buffer_distance=buffer_distance,
        output_dir=study_area_dir,
        nprocs=n_jobs,
        verbose=verbose,
        return_output=False,
    )

### Generate random unlabelled data points

The unlabelled set is created by generating uniformly-distributed random points within the polygons created in the previous cell. To change the number of points generated at each timestep, modify the `num_unlabelled` parameter defined earlier.

In [12]:
# Random points inside the study area polygons, `num_unlabelled` per timestep;
# the seed keeps the sample reproducible between runs
unlabelled = generate_unlabelled_points(
    input_dir=study_area_dir,
    plate_reconstruction=plate_model,
    times=times,
    num=num_unlabelled,
    seed=random_seed,
    threads=n_jobs,
    verbose=verbose,
)

### Combine labelled deposit/non-deposit data with random unlabelled data

The function below wrangles the points generated in the previous cell into the same format as the deposit location data.

In [13]:
# Merge the deposit records with the unlabelled points
combined_points = combine_point_data(
    deposit_data=deposits_filename,
    unlabelled_data=unlabelled,
    study_area_dir=study_area_dir,
    plate_reconstruction=plate_model,
    min_time=min(times),
    max_time=max(times),
    n_jobs=n_jobs,
    verbose=verbose,
)

# The unlabelled points now live in `combined_points`; release the original
del unlabelled

# Discard rows that lack present-day coordinates
combined_points = combined_points.dropna(
    subset=["present_lon", "present_lat"],
)

### Assign subduction data to point deposit/non-deposit/unlabelled data

Here we assign the appropriate values for the subduction-related parameters (kinematics, seafloor age, etc.) to the deposit sites and random locations.

In [None]:
# Attach the subduction parameters to every deposit/random point
coregistered_data = run_coregister_combined_point_data(
    subduction_data=subduction_data,
    point_data=combined_points,
    n_jobs=n_jobs,
    verbose=verbose,
)

# Both inputs are now merged into `coregistered_data`; free their memory
del combined_points, subduction_data

### Assign crustal thickness data to point data

This cell extracts the overriding plate thickness at each point.

In [15]:
# Add the overriding plate thickness at each point, read from the
# crustal thickness directory
coregistered_data = run_coregister_crustal_thickness(
    input_dir=crustal_thickness_dir,
    point_data=coregistered_data,
    n_jobs=n_jobs,
    verbose=verbose,
)

### Calculate cumulative erosion

Here we calculate the cumulative erosion experienced by each deposit/random point since its time of formation.

In [16]:
# Cumulative erosion is stored in the "erosion (m)" column
coregistered_data = calculate_erodep(
    coregistered_data,
    column_name="erosion (m)",
    input_dir=erodep_dir,
    n_jobs=n_jobs,
    verbose=verbose,
)

### Assign data to regions

To divide the data into individual regions for later analysis, we use the file given by `regions_filename` (defined earlier). If no such file is provided, this step is skipped and no `region` column is added.

In [17]:
# Region assignment is optional: it only runs when a regions file exists
regions_available = (
    regions_filename is not None
    and os.path.isfile(regions_filename)
)
if regions_available:
    # Build point geometries from the present-day coordinates, keeping the
    # table's index so the result lines up with its rows
    coregistered_data["region"] = assign_regions(
        gpd.GeoSeries.from_xy(
            coregistered_data["present_lon"],
            coregistered_data["present_lat"],
            index=coregistered_data.index,
        ),
        regions=regions_filename,
    )

### Add strain rate and cumulative strain

In [18]:
# NOTE(review): both values below are fixed here rather than taken from the
# notebook parameters (units presumably Ma/Myr) — confirm this is intended
strain_rate_max_time = 170
strain_history_window = 10

# One topological model, built from the plate model, serves both extractions
topological_model = pygplates.TopologicalModel(
    topological_features=plate_model.topology_features,
    rotation_model=plate_model.rotation_model,
)

coregistered_data = extract_strain_and_rate(
    coregistered_data,
    topological_model=topological_model,
    max_time=strain_rate_max_time,
)
coregistered_data = extract_strain_history(
    coregistered_data,
    topological_model=topological_model,
    time_window=strain_history_window,
)

### Save to file

Finally, we write the dataset to a CSV file.

In [19]:
# Write the final training dataset (without the index column)
coregistered_data.to_csv(output_filename, index=False)

# Summarise the number of points per label, split by region when available.
# The "region" column only exists if the optional region assignment ran, so
# fall back to grouping by label alone instead of raising a KeyError.
summary_columns = (
    ["region", "label"]
    if "region" in coregistered_data.columns
    else ["label"]
)
coregistered_data.groupby(summary_columns).size()

region          label     
East Asia       negative         7
                positive         2
                unlabelled    2773
North America   negative        45
                positive       192
                unlabelled    2663
Other           negative       203
                positive         1
                unlabelled    1394
South America   negative      1096
                positive       188
                unlabelled    2653
Southeast Asia  negative         4
                positive        53
                unlabelled    3347
Tethys          negative        20
                positive        56
                unlabelled    2254
dtype: int64