## Train positive-unlabelled (PU) classifier

If training data has been extracted from the source datasets by running the `00a-extract_training_data.ipynb` and `00b-extract_grid_data.ipynb` notebooks, set the `use_extracted_data` variable below to `True` to use this dataset instead of the pre-prepared training data from the [Zenodo repository](https://zenodo.org/record/8157691).

In [1]:
# Switch between the two training-data sources: True reads from the
# `extracted_data` directory, False (default) reads from `prepared_data`.
use_extracted_data = False

In [2]:
import os
import pprint
import warnings

import pandas as pd
from joblib import dump
from pulearn.bagging import BaggingPuClassifier
from sklearn.base import clone

from lib.check_files import check_prepared_data
from lib.pu import (
    BASE_MODELS,
    PU_PARAMS,
    downsample_unlabelled,
    get_xy,
)

# Suppress FutureWarning for some versions of Scikit-learn
# The filter is applied twice: via the PYTHONWARNINGS environment variable
# and via the in-process `warnings` filter below.
# NOTE(review): the environment variable is presumably there so that worker
# processes started for parallel training inherit the filter — confirm.
%env PYTHONWARNINGS=ignore::FutureWarning

warnings.simplefilter("ignore", FutureWarning)



In [3]:
# Seed passed to every stochastic step below, for reproducibility
random_seed = 1234

# Number of parallel jobs requested from the PU classifier
n_jobs = 4

# Keyword arguments applied to the base random forest
rf_params = dict(random_state=random_seed)

# Keyword arguments for the bagging PU classifier. The notebook-level values
# are set first and PU_PARAMS is merged on top, so any key that PU_PARAMS
# defines takes precedence.
pu_params = {"n_jobs": n_jobs, "random_state": random_seed}
pu_params.update(PU_PARAMS)

In [4]:
# Input: training data comes either from the locally extracted dataset or
# from the pre-prepared one; only the latter is passed through
# `check_prepared_data` before use.
data_dir = "extracted_data" if use_extracted_data else "prepared_data"
if not use_extracted_data:
    check_prepared_data(data_dir, verbose=True)
data_filename = os.path.join(data_dir, "training_data.csv")

# Output: trained models are written here (directory created if missing)
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)
output_filename = os.path.join(output_dir, "pu_classifier.joblib")

### Load and process training data

In [5]:
# Read the training table and filter it in one chain:
#   1. drop negatively-labelled samples;
#   2. keep only rows whose region is in the Americas (NAm or SAm).
data = (
    pd.read_csv(data_filename)
    .loc[lambda df: df["label"] != "negative"]
    .loc[lambda df: df["region"].isin({"NAm", "SAm"})]
)

# Equal number of positive and unlabelled samples
data_downsampled = downsample_unlabelled(data, random_state=random_seed)

# Report how many samples of each label remain per region
print(data_downsampled.groupby(["region", "label"]).size())

region  label     
NAm     positive      217
        unlabelled    183
SAm     positive      130
        unlabelled    164
dtype: int64


In [6]:
# Extract relevant columns in NumPy array form
# (`x` and `y` are the two arrays later passed to `fit`; presumably the
# feature matrix and label vector — see `lib.pu.get_xy` to confirm)
x, y = get_xy(data_downsampled)
# Print the array shapes as a quick sanity check
print(x.shape, y.shape)

(633, 25) (633,)


### Train the PU classifier

In [7]:
# Use a random forest as the base classifier: work on a clone of the
# template estimator from BASE_MODELS, with the notebook's settings
# (`rf_params`) applied to that clone.
base_model = clone(BASE_MODELS["randomforest"]).set_params(**rf_params)
print("Base random forest model:")
pprint.pprint(base_model.get_params())

Base random forest model:
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 1234,
 'verbose': 0,
 'warm_start': False}


In [8]:
# Train the PU classifier
# Wrap the base random forest in a bagging PU classifier configured by
# `pu_params`.
pu_model = BaggingPuClassifier(base_estimator=base_model, **pu_params)
print("PU model:")
# Show the full parameter set (including the nested base-estimator
# parameters) before fitting.
pprint.pprint(pu_model.get_params())
# Fit on the arrays extracted from the downsampled data; the value returned
# by `fit` is rebound to `pu_model`.
pu_model = pu_model.fit(x, y)

PU model:
{'base_estimator': RandomForestClassifier(n_estimators=50, n_jobs=1, random_state=1234),
 'base_estimator__bootstrap': True,
 'base_estimator__ccp_alpha': 0.0,
 'base_estimator__class_weight': None,
 'base_estimator__criterion': 'gini',
 'base_estimator__max_depth': None,
 'base_estimator__max_features': 'sqrt',
 'base_estimator__max_leaf_nodes': None,
 'base_estimator__max_samples': None,
 'base_estimator__min_impurity_decrease': 0.0,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_weight_fraction_leaf': 0.0,
 'base_estimator__n_estimators': 50,
 'base_estimator__n_jobs': 1,
 'base_estimator__oob_score': False,
 'base_estimator__random_state': 1234,
 'base_estimator__verbose': 0,
 'base_estimator__warm_start': False,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 100,
 'n_jobs': 4,
 'oob_score': True,
 'random_state': 1234,
 'verbose': 0,
 'warm_start': False}

### Use `joblib` to save model to file

In [9]:
# Serialise the fitted PU classifier to `output_filename`
# (outputs/pu_classifier.joblib).
dump(pu_model, output_filename)

['outputs/pu_classifier.joblib']

### Train separate models for North America (NAm) and South America (SAm)

In [10]:
def _fit_pu_model(samples: pd.DataFrame):
    """Fit a fresh bagging PU classifier on ``samples``.

    A new random forest is cloned from ``BASE_MODELS["randomforest"]`` and
    configured with the notebook-level ``rf_params``; it is then wrapped in a
    ``BaggingPuClassifier`` configured with ``pu_params``.

    All intermediate objects are local to this function, so calling it does
    not overwrite the notebook-level ``x``, ``y``, ``base_model`` or
    ``pu_model`` that belong to the combined (all-regions) model.

    Parameters
    ----------
    samples : pandas.DataFrame
        Training samples, in the form accepted by ``get_xy``.

    Returns
    -------
    The value returned by ``BaggingPuClassifier.fit`` (the fitted model).
    """
    features, labels = get_xy(samples)
    forest = clone(BASE_MODELS["randomforest"])
    forest.set_params(**rf_params)
    model = BaggingPuClassifier(base_estimator=forest, **pu_params)
    return model.fit(features, labels)


# Train and save one model per region present in `data`.
for region_key, region_data in data.groupby("region"):
    region_name = str(region_key)
    print(f"Region: {region_name}")

    # File names use the first two characters of the region code
    # (e.g. "NAm" -> pu_classifier_NA.joblib).
    region_output = os.path.join(
        output_dir,
        f"pu_classifier_{region_name[:2]}.joblib",
    )

    # Balance positive and unlabelled samples within this region only
    region_balanced = downsample_unlabelled(
        region_data,
        random_state=random_seed,
    )
    print(region_balanced.groupby(["region", "label"]).size())

    dump(_fit_pu_model(region_balanced), region_output)

    print("")

Region: NAm
region  label     
NAm     positive      217
        unlabelled    217
dtype: int64

Region: SAm
region  label     
SAm     positive      130
        unlabelled    130
dtype: int64

