# Setup

In [25]:
import json
import os
import sys
import warnings

sys.path.append('..')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import balanced_accuracy_score, recall_score, mean_absolute_error, root_mean_squared_error
from sklearn.dummy import DummyRegressor

from pymer4 import Lmer
from flaml import AutoML

from utils import read_csv_non_utf, preprocess_data, get_zero_nonzero_datasets, test_thresholds, ratios_to_DI_cats
from model_utils import HurdleModelEstimator, PymerModelWrapper
from custom_metrics import balanced_accuracy_FLAML, mean_absolute_error_range

In [26]:
# Loading in general configuration
#  (the path is relative to the notebook's working directory)
with open('../config.json', 'r') as f:
    config = json.load(f)

# Getting filepaths
#  the three folder components that make up the root of the datasets directory
gdrive_fp, LIFE_fp, dataset_fp = (config[key] for key in ('gdrive_path', 'LIFE_folder', 'datasets_path'))

#  file names of the individual datasets, stored in a nested section of the config
indiv_data_paths = config['indiv_data_paths']
benitez_lopez2019 = indiv_data_paths['benitez_lopez2019']
ferreiro_arias2024 = indiv_data_paths['ferreiro_arias2024']

#  both datasets live under the same root folder, so build it once and reuse it
datasets_root = os.path.join(gdrive_fp, LIFE_fp, dataset_fp)
ben_lop_path = os.path.join(datasets_root, benitez_lopez2019)
fer_ari_path = os.path.join(datasets_root, ferreiro_arias2024)

In [27]:
# Reading in the datasets
#  mammals (Benitez-Lopez 2019 path): read through the project helper read_csv_non_utf
#  - presumably because the file is not UTF-8 encoded; confirm in utils
mammal_data = read_csv_non_utf(ben_lop_path)

#  birds (Ferreiro-Arias 2024 path): plain pandas reader
bird_data = pd.read_csv(fer_ari_path)

#  quick look at the first few bird records
bird_data.head()

Unnamed: 0,Study,Dataset,Reviewer,Order,Family,Species,BirdLife_Species,BirdTree_Species,IUCN_Species,Hunting,...,TravDist,PopDens,Stunting,FoodBiomass,Reserve,Forest_cover,NPP,CountryNum,Food,Hunted
0,442,9,AB,Galliformes,Cracidae,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,S,...,2389.0,0.0,14.7,17.695656,1,100.0,207.0,76,Yes,Yes
1,442,9,AB,Galliformes,Cracidae,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,S,...,2389.0,0.0,14.7,17.695656,1,100.0,207.0,76,Yes,Yes
2,442,9,AB,Galliformes,Cracidae,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,S,...,2389.0,0.0,14.7,17.695656,1,100.0,207.0,76,Yes,Yes
3,442,9,AB,Galliformes,Cracidae,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,S,...,2389.0,0.0,14.7,17.695656,1,100.0,207.0,76,Yes,Yes
4,442,9,AB,Galliformes,Cracidae,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,Mitu tuberosum,S,...,2389.0,0.0,14.7,17.695656,1,100.0,207.0,76,Yes,Yes


In [30]:
# Number of mammal records per region - used to see which regions are large enough
# to serve as a held-out test set in the cross-continent section
mammal_data['Region'].value_counts()

Region
S America    1414
Africa       1164
C America     531
Asia          172
Name: count, dtype: int64

# Cross-taxa generalisation

Training on birds and evaluating on mammals, and vice versa.

In [18]:
# Choosing which dataset to train on - "mammals" or "birds"
train_dataset = 'birds'

#  the held-out taxon is whichever one is not used for training; an unrecognised value is
#  rejected here so that a typo cannot silently fall through to the "birds" configuration
held_out_taxon = {'mammals' : 'birds', 'birds' : 'mammals'}
if train_dataset not in held_out_taxon:
    raise ValueError(f'train_dataset must be "mammals" or "birds", got "{train_dataset}".')

test_dataset = held_out_taxon[train_dataset]
print(f'Training on {train_dataset} and testing on {test_dataset}')

Training on birds and testing on mammals


In [None]:
# Aligning the two datasets
#  NOTE: this cell overwrites mammal_data/bird_data with a reduced set of columns, so it is
#  not idempotent - re-run the "Reading in the datasets" cell before running it a second time
mammal_cols = ['Order', 'Family', 'Species', 'ratio', 'X', 'Y', 'Country', 'BM', 'DistKm', 'PopDens',
               'Stunting', 'TravTime', 'LivestockBio', 'Reserve']
bird_cols = ['Order', 'Family', 'Species', 'RR', 'Latitude', 'Longitude', 'Country', 'Body_Mass',
             'Dist_Hunters', 'PopDens', 'Stunting', 'TravDist', 'FoodBiomass', 'Reserve']

#  mapping from the bird column names onto the mammal naming scheme
bird_to_mammal_names = {'RR' : 'ratio', 'Longitude' : 'X', 'Latitude' : 'Y',
                        'Dist_Hunters' : 'DistKm', 'TravDist' : 'TravTime',
                        'FoodBiomass' : 'LivestockBio', 'Body_Mass' : 'BM'}

#  explicit copy so later edits act on an independent frame rather than a slice of the raw data
mammal_data = mammal_data[mammal_cols].copy()

#  building the bird frame in one chain avoids assigning into a column-subset slice, which
#  is what triggers pandas' SettingWithCopyWarning
bird_data = (
    bird_data[bird_cols]
    .assign(Reserve = lambda df: df['Reserve'].replace({0 : 'No', 1 : 'Yes'})) # aligning the coding of this binary column to the mammal dataset
    .rename(columns = bird_to_mammal_names)
)

#  the two frames are later stacked with an inner join, which would silently drop any column
#  that is not shared - fail loudly here instead
assert set(mammal_data.columns) == set(bird_data.columns), 'Mammal and bird columns are not aligned.'

In [19]:
# Preparing the data
#  stacking mammals on top of birds; the inner join keeps only columns present in both frames
dataset = 'both'
n_mammals = len(mammal_data)
data = pd.concat([mammal_data, bird_data], axis = 0, join = 'inner', ignore_index = True)

#  mammals occupy the first n_mammals rows of the stacked frame, birds the remainder
mammal_idxs = list(range(n_mammals))
bird_idxs = list(range(n_mammals, len(data)))

#  train on the chosen taxon, test on the other one
if train_dataset == 'mammals':
    idxs = {'train' : mammal_idxs, 'test' : bird_idxs}
else:
    idxs = {'train' : bird_idxs, 'test' : mammal_idxs}

#  the split is handed to the preprocessing helper alongside the data
#  (presumably so that standardisation is fit on the training rows only - confirm in utils)
pp_data = preprocess_data(data,
                          include_indicators = False,
                          standardize = True,
                          log_trans_cont = False,
                          polynomial_features = 0,
                          train_test_idxs = idxs,
                          embeddings_to_use = None,
                          embeddings_args = None,
                          dataset = dataset)

#  positional split, with each piece re-indexed from zero
train_data = pp_data.iloc[idxs['train']].reset_index(drop = True)
test_data = pp_data.iloc[idxs['test']].reset_index(drop = True)

In [20]:
# General parameters
#  NOTE(review): 0.1 min (6 s) per model is a smoke-test budget - the FLAML log captured for
#  this section reports "Estimated necessary time budget=13s", so raise this for real results
time_budget_mins = 0.1
base_path = os.path.join('..', 'model_saves')
os.makedirs(base_path, exist_ok = True) # the log file paths below point into this folder, so make sure it exists
verbose = 3

#  using only shared columns for predictors
zero_columns = ['BM', 'DistKm', 'PopDens', 'Stunting', 'TravTime', 'LivestockBio', 'Reserve']
nonzero_columns = list(zero_columns) # independent copy, so editing one list cannot silently change the other
indicator_columns = []

#  custom balanced-accuracy metric for the zero (classification) model, MSE for the nonzero
#  (regression) model
zero_metric = balanced_accuracy_FLAML
nonzero_metric = 'mse'

# Setting up the zero and nonzero models
zero_model = AutoML()
nonzero_model = AutoML()

#  specify fitting parameters
zero_settings = {
    'time_budget' : time_budget_mins * 60,  # in seconds
    'metric' : zero_metric,
    'task' : 'classification',
    'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_ZERO.log'),
    'seed' : 1693,
    'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf',
                        'extra_tree', 'kneighbor', 'lrl1', 'lrl2'],
    'early_stop' : True,
    'verbose' : verbose,
    'keep_search_state' : True,
    'eval_method' : 'cv'
}

#  same search setup as above, minus the two logistic-regression learners
nonzero_settings = {
    'time_budget' : time_budget_mins * 60,  # in seconds
    'metric' : nonzero_metric,
    'task' : 'regression',
    'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_NONZERO.log'),
    'seed' : 1693,
    'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'kneighbor'],
    'early_stop' : True,
    'verbose' : verbose,
    'keep_search_state' : True,
    'eval_method' : 'cv'
}

extirp_pos = False
settings = {'zero' : zero_settings, 'nonzero' : nonzero_settings}

#  dumping everything into the hurdle model wrapper
data_args = {'indicator_columns' : indicator_columns,
             'nonzero_columns' : nonzero_columns,
             'zero_columns' : zero_columns,
             'embeddings_to_use' : None,
             'dataset' : dataset}
hurdle_model = HurdleModelEstimator(zero_model, nonzero_model, extirp_pos = extirp_pos,
                                    data_args = data_args, verbose = True)

In [21]:
# Fitting the two constituent models
#  `settings` carries one dict of fitting options per component under the keys 'zero' and
#  'nonzero' (presumably forwarded to the matching AutoML search - confirm in model_utils);
#  the captured log below starts with the nonzero (regression) model
hurdle_model.fit(train_data, fit_args = settings)

Fitting the nonzero model...
[flaml.automl.logger: 06-25 12:32:02] {1680} INFO - task = regression
[flaml.automl.logger: 06-25 12:32:02] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-25 12:32:02] {1789} INFO - Minimizing error metric: mse
[flaml.automl.logger: 06-25 12:32:02] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'kneighbor']
[flaml.automl.logger: 06-25 12:32:02] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-25 12:32:03] {2345} INFO - Estimated sufficient time budget=1259s. Estimated necessary time budget=13s.
[flaml.automl.logger: 06-25 12:32:03] {2392} INFO -  at 0.2s,	estimator lgbm's best error=0.9040,	best estimator lgbm's best error=0.9040
[flaml.automl.logger: 06-25 12:32:03] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-25 12:32:03] {2392} INFO -  at 0.3s,	estimator lgbm's best error=0.8814,	best estimator lgbm's best error=0.8814
[flaml.auto

<class 'TimeoutError'> [Errno 14] <frame at 0x1263ce670, file '/Users/emiliolr/miniforge3/envs/life-hunting/lib/python3.12/site-packages/joblib/parallel.py', line 1707, code _retrieve>


[flaml.automl.logger: 06-25 12:32:15] {2392} INFO -  at 7.0s,	estimator lrl2's best error=0.5000,	best estimator lgbm's best error=0.3590
[flaml.automl.logger: 06-25 12:32:15] {2628} INFO - retrain lgbm for 0.1s
[flaml.automl.logger: 06-25 12:32:15] {2631} INFO - retrained model: LGBMClassifier(colsample_bytree=0.7568488517531023, learning_rate=1.0,
               max_bin=127, min_child_samples=25, n_estimators=1, n_jobs=-1,
               num_leaves=11, reg_alpha=0.6359407655146174,
               reg_lambda=2.397491917044793, verbose=-1)
[flaml.automl.logger: 06-25 12:32:16] {1931} INFO - fit succeeded
[flaml.automl.logger: 06-25 12:32:16] {1932} INFO - Time taken to find the best model: 5.821398019790649


In [22]:
# Tuning the probability threshold for the zero model
#  NOTE(review): the threshold is selected on the same training rows the classifier was fit
#  on, so it may be optimistic - consider a held-out split
X_zero, y_zero, _, _ = get_zero_nonzero_datasets(
    train_data,
    pred = False,
    extirp_pos = hurdle_model.extirp_pos,
    **hurdle_model.data_args
)

#  keeping only the second probability column (presumably the positive class - confirm)
zero_probs = hurdle_model.zero_model.predict_proba(X_zero)
y_pred = zero_probs[ : , 1]

#  first element returned by the helper is the selected threshold; the second is not needed
opt_thresh, _ = test_thresholds(y_pred, y_zero)
hurdle_model.prob_thresh = round(opt_thresh, 3)
print(f'Optimal threshold was found to be {hurdle_model.prob_thresh}')

Optimal threshold was found to be 0.8


In [23]:
# Predicting on the test set
y_pred = hurdle_model.predict(test_data)

#  exponentiating every prediction that is not exactly zero, leaving the zeros untouched
#  (presumably the nonzero model predicts on a log scale - confirm in model_utils)
nonzero_mask = y_pred != 0
y_pred[nonzero_mask] = np.exp(y_pred[nonzero_mask])

#  independent copy of the observed ratios
y_test = test_data['ratio'].copy(deep = True)

#  getting DI categories
pred_DI_cats = ratios_to_DI_cats(y_pred)
true_DI_cats = ratios_to_DI_cats(y_test)

In [24]:
# Getting performance metrics
#  regression-style errors on the ratios
mae = mean_absolute_error(y_test, y_pred)
mae_01, _ = mean_absolute_error_range(y_test, y_pred, 0, 1) # second return value is not needed here
rmse = root_mean_squared_error(y_test, y_pred)

#  classification-style score on the DI categories
ba = balanced_accuracy_score(true_DI_cats, pred_DI_cats)

#  reporting everything together, rounded to three decimals
print(f'MAE: {round(mae, 3)}')
print(f'MAE (0-1 range): {round(mae_01, 3)}')
print(f'RMSE: {round(rmse, 3)}')
print(f'BA: {round(ba, 3)}')

MAE: 1.063
MAE (0-1 range): 0.601
RMSE: 3.833
BA: 0.343


# Cross-continent generalisation
For mammals, we test on either South America or Africa and train on all remaining regions; for birds, we test on either the Neotropical or the Indomalayan realm and train on all remaining realms.

In [69]:
# Choosing which dataset to use ("mammals" or "birds") and which region to hold out for testing
dataset = 'mammals'
test_region = 'S America' # for mammals, either "S America" or "Africa", and for birds, either "Neotropic" or "Indomalayan"

#  the regions that may be held out for each dataset
valid_test_regions = {'mammals' : ['S America', 'Africa'],
                      'birds' : ['Neotropic', 'Indomalayan']}

#  explicit exceptions rather than assert statements, so the checks still run under "python -O"
#  and an unrecognised dataset name is rejected instead of skipping validation altogether
if dataset not in valid_test_regions:
    raise ValueError(f'Unknown dataset "{dataset}" - expected "mammals" or "birds".')
if test_region not in valid_test_regions[dataset]:
    options = ' or '.join(f'"{region}"' for region in valid_test_regions[dataset])
    raise ValueError(f'The only valid test regions for {dataset} are {options}.')

print(f'{dataset.title()} dataset, testing on {test_region}')

Mammals dataset, testing on S America


In [71]:
# Preparing the data
#  Re-reading the raw file for the chosen taxon: the "Aligning the two datasets" cell overwrites
#  mammal_data/bird_data with a reduced (and, for birds, renamed) set of columns that no longer
#  contains the Region/Realm column or some of the predictors used in this section, so relying
#  on those variables here only works with out-of-order execution
raw_data = read_csv_non_utf(ben_lop_path) if dataset == 'mammals' else pd.read_csv(fer_ari_path)
data = raw_data.reset_index(drop = True) # row labels now equal row positions, which .iloc below relies on
col = 'Region' if dataset == 'mammals' else 'Realm'

#  boolean mask of the held-out rows, turned into positional indices for both splits
is_test = (data[col] == test_region).to_numpy()
if not is_test.any():
    raise ValueError(f'No rows with {col} == "{test_region}" in the {dataset} dataset.')

test_idxs = np.flatnonzero(is_test).tolist()
train_idxs = np.flatnonzero(~is_test).tolist()
idxs = {'train' : train_idxs, 'test' : test_idxs}

#  the split is handed to the preprocessing helper alongside the data
#  (presumably so that standardisation is fit on the training rows only - confirm in utils)
pp_data = preprocess_data(data, include_indicators = False, standardize = True, log_trans_cont = False,
                          polynomial_features = 0, train_test_idxs = idxs, embeddings_to_use = None,
                          embeddings_args = None, dataset = dataset)

#  positional split, re-indexed from zero to match the cross-taxa section
train_data = pp_data.iloc[idxs['train']].reset_index(drop = True)
test_data = pp_data.iloc[idxs['test']].reset_index(drop = True)

In [77]:
# General parameters
#  NOTE(review): 0.1 min (6 s) per model is a smoke-test budget - the FLAML log captured for
#  this section reports "Estimated necessary time budget=24s", so raise this for real results
time_budget_mins = 0.1
base_path = os.path.join('..', 'model_saves')
os.makedirs(base_path, exist_ok = True) # the log file paths below point into this folder, so make sure it exists
verbose = 3

# Getting the predictors
#  each dataset is used with its own column names in this section
if dataset == 'mammals':
    zero_columns = ['BM', 'DistKm', 'PopDens', 'Stunting', 'TravTime', 'LivestockBio', 'Reserve', 'Literacy']
elif dataset == 'birds':
    zero_columns = ['Dist_Hunters', 'TravDist', 'PopDens', 'Stunting', 'FoodBiomass', 'Forest_cover', 'NPP', 'Body_Mass']
else:
    #  without this branch a stale zero_columns from an earlier cell would be reused silently
    raise ValueError(f'Unknown dataset "{dataset}" - expected "mammals" or "birds".')
nonzero_columns = list(zero_columns) # independent copy, so editing one list cannot silently change the other
indicator_columns = []

#  custom balanced-accuracy metric for the zero (classification) model, MSE for the nonzero
#  (regression) model
zero_metric = balanced_accuracy_FLAML
nonzero_metric = 'mse'

# Setting up the zero and nonzero models
zero_model = AutoML()
nonzero_model = AutoML()

#  specify fitting parameters
#  NOTE(review): the log file names are the same as in the cross-taxa section, so the two
#  experiments point at the same log files
zero_settings = {
    'time_budget' : time_budget_mins * 60,  # in seconds
    'metric' : zero_metric,
    'task' : 'classification',
    'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_ZERO.log'),
    'seed' : 1693,
    'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf',
                        'extra_tree', 'kneighbor', 'lrl1', 'lrl2'],
    'early_stop' : True,
    'verbose' : verbose,
    'keep_search_state' : True,
    'eval_method' : 'cv'
}

#  same search setup as above, minus the two logistic-regression learners
nonzero_settings = {
    'time_budget' : time_budget_mins * 60,  # in seconds
    'metric' : nonzero_metric,
    'task' : 'regression',
    'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_NONZERO.log'),
    'seed' : 1693,
    'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'kneighbor'],
    'early_stop' : True,
    'verbose' : verbose,
    'keep_search_state' : True,
    'eval_method' : 'cv'
}

extirp_pos = False
settings = {'zero' : zero_settings, 'nonzero' : nonzero_settings}

#  dumping everything into the hurdle model wrapper
data_args = {'indicator_columns' : indicator_columns,
             'nonzero_columns' : nonzero_columns,
             'zero_columns' : zero_columns,
             'embeddings_to_use' : None,
             'dataset' : dataset}
hurdle_model = HurdleModelEstimator(zero_model, nonzero_model, extirp_pos = extirp_pos,
                                    data_args = data_args, verbose = True)

In [78]:
# Fitting the two constituent models
#  `settings` carries one dict of fitting options per component under the keys 'zero' and
#  'nonzero' (presumably forwarded to the matching AutoML search - confirm in model_utils);
#  the captured log below starts with the nonzero (regression) model
hurdle_model.fit(train_data, fit_args = settings)

Fitting the nonzero model...
[flaml.automl.logger: 06-25 12:59:00] {1680} INFO - task = regression
[flaml.automl.logger: 06-25 12:59:00] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-25 12:59:00] {1789} INFO - Minimizing error metric: mse
[flaml.automl.logger: 06-25 12:59:00] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'kneighbor']
[flaml.automl.logger: 06-25 12:59:00] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-25 12:59:00] {2345} INFO - Estimated sufficient time budget=2429s. Estimated necessary time budget=24s.
[flaml.automl.logger: 06-25 12:59:00] {2392} INFO -  at 0.3s,	estimator lgbm's best error=0.9901,	best estimator lgbm's best error=0.9901
[flaml.automl.logger: 06-25 12:59:00] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-25 12:59:00] {2392} INFO -  at 0.4s,	estimator lgbm's best error=0.9397,	best estimator lgbm's best error=0.9397
[flaml.auto

<class 'TimeoutError'> [Errno 14] <frame at 0x166dff780, file '/Users/emiliolr/miniforge3/envs/life-hunting/lib/python3.12/site-packages/joblib/parallel.py', line 1707, code _retrieve>


[flaml.automl.logger: 06-25 12:59:13] {2392} INFO -  at 7.0s,	estimator lrl2's best error=0.5000,	best estimator lgbm's best error=0.3148
[flaml.automl.logger: 06-25 12:59:13] {2628} INFO - retrain lgbm for 0.0s
[flaml.automl.logger: 06-25 12:59:13] {2631} INFO - retrained model: LGBMClassifier(colsample_bytree=0.5587141427298338, learning_rate=1.0,
               max_bin=1023, min_child_samples=41, n_estimators=1, n_jobs=-1,
               num_leaves=19, reg_alpha=0.03170864738238316,
               reg_lambda=1.0104135255795006, verbose=-1)
[flaml.automl.logger: 06-25 12:59:13] {1931} INFO - fit succeeded
[flaml.automl.logger: 06-25 12:59:13] {1932} INFO - Time taken to find the best model: 5.09814190864563


In [79]:
# Tuning the probability threshold for the zero model
#  NOTE(review): the threshold is selected on the same training rows the classifier was fit
#  on, so it may be optimistic - consider a held-out split
X_zero, y_zero, _, _ = get_zero_nonzero_datasets(
    train_data,
    pred = False,
    extirp_pos = hurdle_model.extirp_pos,
    **hurdle_model.data_args
)

#  keeping only the second probability column (presumably the positive class - confirm)
zero_probs = hurdle_model.zero_model.predict_proba(X_zero)
y_pred = zero_probs[ : , 1]

#  first element returned by the helper is the selected threshold; the second is not needed
opt_thresh, _ = test_thresholds(y_pred, y_zero)
hurdle_model.prob_thresh = round(opt_thresh, 3)
print(f'Optimal threshold was found to be {hurdle_model.prob_thresh}')

Optimal threshold was found to be 0.8


In [80]:
# Predicting on the test set
y_pred = hurdle_model.predict(test_data)

#  exponentiating every prediction that is not exactly zero, leaving the zeros untouched
#  (presumably the nonzero model predicts on a log scale - confirm in model_utils)
nonzero_mask = y_pred != 0
y_pred[nonzero_mask] = np.exp(y_pred[nonzero_mask])

#  independent copy of the observed ratios
y_test = test_data['ratio'].copy(deep = True)

#  getting DI categories
pred_DI_cats = ratios_to_DI_cats(y_pred)
true_DI_cats = ratios_to_DI_cats(y_test)

In [81]:
# Getting performance metrics
#  regression-style errors on the ratios
mae = mean_absolute_error(y_test, y_pred)
mae_01, _ = mean_absolute_error_range(y_test, y_pred, 0, 1) # second return value is not needed here
rmse = root_mean_squared_error(y_test, y_pred)

#  classification-style score on the DI categories
ba = balanced_accuracy_score(true_DI_cats, pred_DI_cats)

#  reporting everything together, rounded to three decimals
print(f'MAE: {round(mae, 3)}')
print(f'MAE (0-1 range): {round(mae_01, 3)}')
print(f'RMSE: {round(rmse, 3)}')
print(f'BA: {round(ba, 3)}')

MAE: 0.921
MAE (0-1 range): 0.407
RMSE: 2.564
BA: 0.436
