## Train classifiers

If training data has been extracted from the source datasets by running the `00a-extract_training_data.ipynb` and `00b-extract_grid_data.ipynb` notebooks, set the `use_extracted_data` variable below to `True` to train on that extracted data instead of the pre-prepared training data from the [Zenodo repository](https://zenodo.org/record/8157691).

In [1]:
# Selects the input directory used further down: True reads from
# "extracted_data", False reads from "prepared_data".
use_extracted_data = False


In [2]:
import os
import time
import warnings
from datetime import timedelta

import pandas as pd
from joblib import dump
from pulearn.bagging import BaggingPuClassifier
from sklearn.base import clone
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC

from lib.check_files import check_prepared_data
from lib.pu import (
    BASE_MODELS,
    PU_PARAMS,
    UNUSED_COLUMNS,
    downsample_unlabelled,
)

# Suppress FutureWarning for some versions of Scikit-learn
# The filter is set through the environment as well as in-process below --
# presumably so that subprocesses started later (e.g. parallel workers)
# also ignore FutureWarning; TODO confirm.
%env PYTHONWARNINGS=ignore::FutureWarning

# Ignore FutureWarning in the current kernel process
warnings.simplefilter("ignore", FutureWarning)




In [3]:
# Seed shared by every stochastic component so that runs are repeatable
random_seed = 1234

# Worker count for model training; override via the N_JOBS environment variable
n_jobs = int(os.getenv("N_JOBS", 8))

# Keyword arguments for each estimator
imputer_params = dict(random_state=random_seed, add_indicator=False)
rf_params = dict(random_state=random_seed)

# Start from the local defaults, then let entries in PU_PARAMS take precedence
pu_params = {"n_jobs": n_jobs, "random_state": random_seed}
pu_params.update(PU_PARAMS)

svm_params = dict(kernel="rbf", probability=True, random_state=random_seed)


In [4]:
# Input: directory holding the training data CSV
data_dir = "extracted_data" if use_extracted_data else "prepared_data"
if not use_extracted_data:
    # check_prepared_data comes from lib.check_files; presumably it validates
    # the pre-prepared dataset before it is read -- TODO confirm
    check_prepared_data(data_dir, verbose=False)
data_filename = os.path.join(data_dir, "training_data.csv")

# Output: one sub-directory per classifier type, created if missing
output_dir = os.path.join("outputs", "Americas")
pu_dir = os.path.join(output_dir, "PU")
svm_dir = os.path.join(output_dir, "SVM")
for directory in (output_dir, pu_dir, svm_dir):
    os.makedirs(directory, exist_ok=True)

pu_filename = os.path.join(pu_dir, "classifier.joblib")
svm_filename = os.path.join(svm_dir, "classifier.joblib")


In [5]:
# Load the training table and keep only samples from the Americas
data = pd.read_csv(data_filename)
data = data.loc[data["region"].isin({"North America", "South America"})]

# Set the negatives aside, downsample the remaining (positive and unlabelled)
# samples so that both classes are equal in number, then recombine
negatives = data.loc[data["label"] == "negative"]
non_negatives = data.loc[data["label"] != "negative"]
combined = pd.concat(
    [
        downsample_unlabelled(non_negatives, random_state=random_seed),
        negatives,
    ]
)
del non_negatives

# Drop the columns listed in UNUSED_COLUMNS (missing ones are ignored)
cleaned = combined.drop(columns=list(UNUSED_COLUMNS), errors="ignore")

# Sample counts per region and label
print(combined.groupby(["region", "label"]).size())


region         label     
North America  negative       57
               positive      170
               unlabelled    158
South America  negative      979
               positive      126
               unlabelled    138
dtype: int64


### Train the PU classifier

#### All data

In [6]:
# Wrangle training data: the PU classifier is trained on positive and
# unlabelled samples only
train_pu = cleaned[cleaned["label"].isin({"positive", "unlabelled"})]
x_pu = train_pu.drop(columns="label")
# Encode labels with an explicit mapping and integer dtype rather than
# `Series.replace`, whose implicit object-to-int downcast is deprecated in
# newer pandas releases and would leave the target as object dtype
y_pu = train_pu["label"].map({"positive": 1, "unlabelled": 0}).astype("int64")

# Use a random forest as the base classifier
base_model = clone(BASE_MODELS["randomforest"])
base_model.set_params(**rf_params)

# Impute missing values and scale before training
pu_model = make_pipeline(
    IterativeImputer(**imputer_params),
    RobustScaler(),
    BaggingPuClassifier(base_model, **pu_params),
)
pu_model.set_output(transform="pandas")

# Train model
t0 = time.time()
pu_model.fit(x_pu, y_pu)

# Save to file (the reported duration covers both fitting and saving)
dump(pu_model, pu_filename, compress=True)
duration = timedelta(seconds=time.time() - t0)
print(f"Model training time: {duration}")


Model training time: 0:00:07.011656


In [7]:
# Display the training table after it was restricted to the Americas
# (includes negative, positive and unlabelled samples)
data

Unnamed: 0,present_lon,present_lat,age (Ma),label,source,plate_id,lon,lat,overriding_plate_id,region,...,crustal_thickness_n,crustal_thickness_range (m),magnetic_anomaly_mean (nT),magnetic_anomaly_min (nT),magnetic_anomaly_max (nT),magnetic_anomaly_median (nT),magnetic_anomaly_std (nT),magnetic_anomaly_n,magnetic_anomaly_range (nT),erosion (m)
0,-153.091800,60.031900,0,negative,Diaz-Rodriguez et al. 2021,1073.0,-153.091800,60.031900,101,North America,...,224,3295.085938,154.652344,35.811516,282.693878,149.069107,62.332241,57,246.882362,-0.000000
1,-153.435000,59.362600,0,negative,Diaz-Rodriguez et al. 2021,1073.0,-153.435000,59.362600,101,North America,...,222,3179.992188,73.226997,67.446449,90.968697,71.927338,5.519875,58,23.522247,-0.000000
2,-153.535100,58.859600,0,negative,Diaz-Rodriguez et al. 2021,1073.0,-153.535100,58.859600,101,North America,...,219,2803.796875,137.284744,122.035904,156.593994,135.497070,9.474295,57,34.558090,-0.000000
3,-153.673800,58.770300,0,negative,Diaz-Rodriguez et al. 2021,1073.0,-153.673800,58.770300,101,North America,...,219,2763.070312,134.857651,88.042747,158.585251,144.321899,20.285322,56,70.542503,-0.000000
4,-154.357300,58.452800,0,negative,Diaz-Rodriguez et al. 2021,1073.0,-154.357300,58.452800,101,North America,...,216,2659.718750,34.768494,23.034136,52.113960,34.099560,7.306161,53,29.079824,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37302,-86.833202,16.027304,170,unlabelled,random,,-75.962592,7.547731,9020,North America,...,0,,47.997517,37.580315,58.946045,47.636066,6.158432,30,21.365730,3828.529550
37307,-163.833216,61.215475,170,unlabelled,random,,-129.825589,61.419039,9040,North America,...,39,0.000000,-53.238571,-168.160477,76.782150,-45.935848,61.840675,58,244.942627,2053.613637
37310,-148.391585,61.788103,170,unlabelled,random,,-77.204341,44.840847,1072,North America,...,12,2200.000000,127.109818,47.917496,298.118469,108.855057,67.063446,60,250.200974,8466.513303
37314,-77.023983,7.706606,170,unlabelled,random,,-72.434033,-0.491494,9020,South America,...,0,,-39.352089,-83.110275,3.672881,-43.054024,25.887815,29,86.783156,4181.183118


#### Separate regions

In [8]:
# Train and save one PU classifier per region
for region, subset in combined.groupby("region"):
    region = str(region)
    # Regions with fewer than 50 positive samples are not modelled separately
    if (subset["label"] == "positive").sum() < 50:
        print(f"Skipping region: {region}")
        continue
    print(f"Region: {region}")
    # File name uses the lower-cased region with whitespace replaced by "_"
    r = "_".join(region.lower().split())
    output_subset = os.path.join(
        pu_dir,
        f"classifier_{r}.joblib",
    )

    # Keep positive/unlabelled samples and drop the unused columns
    subset = subset[subset["label"].isin({"positive", "unlabelled"})]
    subset = subset.drop(columns=list(UNUSED_COLUMNS), errors="ignore")
    print(subset.groupby("label").size())
    x_pu_subset = subset.drop(columns="label")
    # Explicit mapping + integer dtype instead of `Series.replace`, whose
    # implicit object-to-int downcast is deprecated in newer pandas releases
    y_pu_subset = (
        subset["label"].map({"positive": 1, "unlabelled": 0}).astype("int64")
    )

    # Unfitted copy of the all-data pipeline with the same parameters
    pu_model_subset = clone(pu_model)
    t0 = time.time()
    pu_model_subset.fit(x_pu_subset, y_pu_subset)
    # Save the regional model; previously the all-data `pu_model` was written
    # here, so every regional file contained the wrong classifier
    dump(pu_model_subset, output_subset, compress=True)
    duration = timedelta(seconds=time.time() - t0)
    print(f"Model training time: {duration}")

    print("")


Region: North America
label
positive      170
unlabelled    158
dtype: int64


Model training time: 0:00:04.178691

Region: South America
label
positive      126
unlabelled    138
dtype: int64


Model training time: 0:00:04.181627



### Train the SVM classifier

#### All data

In [9]:
# Wrangle training data: the SVM is trained on positive and negative
# samples only
train_svm = cleaned[cleaned["label"].isin({"positive", "negative"})]
x_svm = train_svm.drop(columns="label")
# Encode labels with an explicit mapping and integer dtype rather than
# `Series.replace`, whose implicit object-to-int downcast is deprecated in
# newer pandas releases and would leave the target as object dtype
y_svm = train_svm["label"].map({"positive": 1, "negative": 0}).astype("int64")

# Impute missing values and scale before training
svm_model = make_pipeline(
    IterativeImputer(**imputer_params),
    RobustScaler(),
    SVC(**svm_params),
)
svm_model.set_output(transform="pandas")

# Train model
t0 = time.time()
svm_model.fit(x_svm, y_svm)

# Save to file (the reported duration covers both fitting and saving)
dump(svm_model, svm_filename, compress=True)
duration = timedelta(seconds=time.time() - t0)
print(f"Model training time: {duration}")


Model training time: 0:00:00.250057


#### Separate regions

In [10]:
# Train and save one SVM classifier per region
for region, subset in combined.groupby("region"):
    region = str(region)
    # Regions with fewer than 50 positive samples are not modelled separately
    if (subset["label"] == "positive").sum() < 50:
        print(f"Skipping region: {region}")
        continue
    print(f"Region: {region}")
    # File name uses the lower-cased region with whitespace replaced by "_"
    r = "_".join(region.lower().split())
    output_subset = os.path.join(
        svm_dir,
        f"classifier_{r}.joblib",
    )

    # Keep positive/negative samples and drop the unused columns
    subset = subset[subset["label"].isin({"positive", "negative"})]
    subset = subset.drop(columns=list(UNUSED_COLUMNS), errors="ignore")
    print(subset.groupby("label").size())
    x_svm_subset = subset.drop(columns="label")
    # Explicit mapping + integer dtype instead of `Series.replace`, whose
    # implicit object-to-int downcast is deprecated in newer pandas releases
    y_svm_subset = (
        subset["label"].map({"positive": 1, "negative": 0}).astype("int64")
    )

    # Unfitted copy of the all-data pipeline with the same parameters
    svm_model_subset = clone(svm_model)
    t0 = time.time()
    svm_model_subset.fit(x_svm_subset, y_svm_subset)
    dump(svm_model_subset, output_subset, compress=True)
    duration = timedelta(seconds=time.time() - t0)
    print(f"Model training time: {duration}")

    print("")


Region: North America
label
negative     57
positive    170
dtype: int64
Model training time: 0:00:00.097714

Region: South America
label
negative    979
positive    126
dtype: int64


Model training time: 0:00:00.211793

