In [1]:
import glob
import os
import tempfile
import warnings

import geopandas as gpd
from gplately.tools import plate_isotherm_depth

from lib.assign_regions import assign_regions
from lib.calculate_convergence import run_calculate_convergence
from lib.check_files import (
    check_source_data,
    check_plate_model,
)
from lib.combine_point_data import combine_point_data
from lib.coregister_combined_point_data import run_coregister_combined_point_data
from lib.coregister_crustal_thickness import run_coregister_crustal_thickness
from lib.coregister_magnetic import coregister_magnetic
from lib.coregister_ocean_rasters import run_coregister_ocean_rasters
from lib.create_study_area_polygons import run_create_study_area_polygons
from lib.erodep import calculate_erodep
from lib.generate_unlabelled_points import generate_unlabelled_points
from lib.misc import (
    calculate_slab_flux,
    # calculate_water_thickness,
)
from lib.slab_dip import calculate_slab_dip
from lib.water import calculate_water_thickness

# Suppress occasional joblib warnings
%env PYTHONWARNINGS=ignore::UserWarning

warnings.simplefilter("ignore", UserWarning)




In [2]:
# --- Sampling -------------------------------------------------------------
random_seed = 1234  # seed handed to the unlabelled-point generator
num_unlabelled = 200  # unlabelled points drawn per timestep

# --- Timesteps ------------------------------------------------------------
# 0, 1, ..., 170 inclusive; min(times)/max(times) bound the calculations below.
times = range(0, 171)

# --- Execution ------------------------------------------------------------
# Number of parallel workers: taken from the N_JOBS environment variable when
# it is set, otherwise 8.
n_jobs = int(os.getenv("N_JOBS", 8))
verbose = False


### Input and output files

In [3]:
# Plate model and source data roots (relative to the notebook's working
# directory).
# NOTE(review): check_plate_model / check_source_data are project helpers;
# presumably they verify that the required inputs are present -- confirm.
model_dir = "plate_model"
check_plate_model(model_dir, verbose=verbose)

data_dir = "source_data"
check_source_data(data_dir, verbose=verbose)

# Individual input files
deposits_filename = os.path.join(data_dir, "deposit_data.csv")
regions_filename = os.path.join(data_dir, "regions.shp")
mag_anomaly_filename = os.path.join(
    data_dir, "MagneticAnomaly", "emag2_upcont_interpolated.nc"
)

# Input directories, each a sub-directory of data_dir
agegrid_dir = os.path.join(data_dir, "AgeGrids")
sedthick_dir = os.path.join(data_dir, "SedimentThickness")
carbonate_dir = os.path.join(data_dir, "CarbonateThickness")
co2_dir = os.path.join(data_dir, "CrustalCO2")
crustal_thickness_dir = os.path.join(data_dir, "CrustalThickness")
subducted_quantities_dir = os.path.join(data_dir, "SubductedQuantities")
erodep_dir = os.path.join(data_dir, "ErosionDeposition")

# Everything this notebook writes goes beneath a single directory, created
# here if it does not already exist.
output_dir = "extracted_data"
os.makedirs(output_dir, exist_ok=True)

# Intermediate products
study_area_dir = os.path.join(output_dir, "study_area_polygons")
subduction_data_filename = os.path.join(output_dir, "subducting_plate_data.csv")

# Final table
output_filename = os.path.join(output_dir, "training_data.csv")

# Plate model inputs: every .gpml file directly inside model_dir is used as a
# topology feature file, and every .rot file as a rotation file.
#
# glob.glob() returns matches in a filesystem-dependent order, so the lists are
# sorted to guarantee that the files are handed to the downstream steps in the
# same order on every machine and every run.
feature_filenames = sorted(glob.glob(os.path.join(model_dir, "*.gpml")))
rotation_filenames = sorted(glob.glob(os.path.join(model_dir, "*.rot")))

# Static polygons file, at a fixed location inside the plate model directory.
static_polygons_filename = os.path.join(
    model_dir,
    "StaticGeometries",
    "StaticPolygons",
    "Clennett_2020_StaticPolygons.gpml",
)


### Subducting plate data

In [4]:
# Keyword argument of run_coregister_ocean_rasters -> sub-directory of
# subducted_quantities_dir that is passed for it.
subducted_grid_dirs = {
    "subducted_thickness_dir": "plate_thickness",
    "subducted_sediments_dir": "sediment_thickness",
    "subducted_carbonates_dir": "carbonate_thickness",
    "subducted_water_dir": "water_thickness",
}

# The convergence output is only needed as input to the coregistration step,
# so it lives in a scratch directory that is removed when the block exits.
with tempfile.TemporaryDirectory() as subduction_kinematics_dir:
    run_calculate_convergence(
        min_time=min(times),
        max_time=max(times),
        topology_filenames=feature_filenames,
        rotation_filenames=rotation_filenames,
        output_dir=subduction_kinematics_dir,
        nprocs=n_jobs,
        verbose=verbose,
    )

    subduction_data = run_coregister_ocean_rasters(
        times=times,
        input_data=subduction_kinematics_dir,
        topology_features=feature_filenames,
        rotation_model=rotation_filenames,
        agegrid_dir=agegrid_dir,
        sedthick_dir=sedthick_dir,
        carbonate_dir=carbonate_dir,
        co2_dir=co2_dir,
        nprocs=n_jobs,
        verbose=verbose,
        **{
            kwarg: os.path.join(subducted_quantities_dir, subdir)
            for kwarg, subdir in subducted_grid_dirs.items()
        },
    )

# Plate thickness column computed from the seafloor age column with
# gplately's plate_isotherm_depth (at most 100 iterations).
subduction_data["plate_thickness (m)"] = plate_isotherm_depth(
    subduction_data["seafloor_age (Ma)"], maxiter=100
)

# Derived quantities, applied in this order: water thickness, slab flux,
# slab dip. Each helper takes the table and returns the updated table.
subduction_data = (
    subduction_data
    .pipe(calculate_water_thickness)
    .pipe(calculate_slab_flux)
    .pipe(calculate_slab_dip)
)

# Keep a copy on disk unless the output filename has been disabled (None).
if subduction_data_filename is not None:
    subduction_data.to_csv(subduction_data_filename, index=False)


### Create study area polygons along subduction zones

In [5]:
# Create the study area polygons for the configured timesteps, with
# study_area_dir as the output directory. return_output=False: the result is
# not kept in memory; later steps are given study_area_dir instead.
run_create_study_area_polygons(
    times=times,
    topological_features=feature_filenames,
    rotation_model=rotation_filenames,
    output_dir=study_area_dir,
    return_output=False,
    nprocs=n_jobs,
    verbose=verbose,
)


### Generate random unlabelled data points

In [6]:
# Draw num_unlabelled random points per timestep, using the study area
# polygons in study_area_dir as input. The fixed seed keeps the draw
# repeatable between runs.
unlabelled = generate_unlabelled_points(
    times=times,
    num=num_unlabelled,
    seed=random_seed,
    input_dir=study_area_dir,
    topological_features=feature_filenames,
    rotation_model=rotation_filenames,
    threads=n_jobs,
    verbose=verbose,
)


### Combine labelled deposit/non-deposit data with random unlabelled data

In [7]:
# Merge the labelled deposit data (read from deposits_filename) with the
# random unlabelled points into a single table.
combined_points = combine_point_data(
    deposit_data=deposits_filename,
    unlabelled_data=unlabelled,
    study_area_dir=study_area_dir,
    static_polygons=static_polygons_filename,
    topological_features=feature_filenames,
    rotation_model=rotation_filenames,
    min_time=min(times),
    max_time=max(times),
    n_jobs=n_jobs,
    verbose=verbose,
)

# The unlabelled points are not referenced again; drop the name to release
# memory. (Re-running this cell therefore requires regenerating `unlabelled`.)
del unlabelled


### Assign subduction data to point deposit/non-deposit/unlabelled data

In [8]:
# Attach the subducting plate data to the combined point data.
coregistered_data = run_coregister_combined_point_data(
    point_data=combined_points,
    subduction_data=subduction_data,
    n_jobs=n_jobs,
    verbose=verbose,
)

# Both inputs are no longer needed once coregistered; release them.
# (Re-running this cell requires rebuilding both tables first.)
del combined_points
del subduction_data


### Assign magnetic anomaly and crustal thickness data to point data

In [9]:
# Step 1: crustal thickness, read from crustal_thickness_dir.
coregistered_data = run_coregister_crustal_thickness(
    point_data=coregistered_data,
    input_dir=crustal_thickness_dir,
    verbose=verbose,
    n_jobs=n_jobs,
)

# Step 2: magnetic anomaly, read from the single file mag_anomaly_filename.
# (No verbose flag is passed to this helper.)
coregistered_data = coregister_magnetic(
    data=coregistered_data,
    filename=mag_anomaly_filename,
    n_jobs=n_jobs,
)


### Calculate cumulative erosion

In [10]:
# Add the erosion/deposition result to the table under the column
# "erosion (m)", using the inputs found in erodep_dir.
coregistered_data = calculate_erodep(
    coregistered_data,
    column_name="erosion (m)",
    input_dir=erodep_dir,
    n_jobs=n_jobs,
    verbose=verbose,
)


### Assign data to regions

In [11]:
# Build point geometries from the present-day lon/lat columns (sharing the
# table's index so the result aligns row-for-row) and label each one with a
# region from the regions shapefile.
# NOTE(review): no CRS is set on these points -- confirm assign_regions does
# not need one.
coregistered_data["region"] = assign_regions(
    gpd.GeoSeries.from_xy(
        x=coregistered_data["present_lon"],
        y=coregistered_data["present_lat"],
        index=coregistered_data.index,
    ),
    regions=regions_filename,
)


In [12]:
# Write the final table (without the index column) to output_filename.
coregistered_data.to_csv(output_filename, index=False)

# Sanity check displayed as the cell output: number of rows for each
# (source, region, label) combination.
coregistered_data.groupby(by=["source", "region", "label"]).size()


source                      region          label     
Diaz-Rodriguez et al. 2021  North America   negative        45
                                            positive       168
                            South America   negative       979
                                            positive       126
random                      East Asia       unlabelled    4544
                            North America   unlabelled    7240
                            Other           unlabelled    4186
                            South America   unlabelled    6243
                            Southeast Asia  unlabelled    7589
                            Tethys          unlabelled    6060
dtype: int64