# Setup

In [1]:
import json
import os
import sys
import warnings

sys.path.append('..')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import balanced_accuracy_score, recall_score, mean_absolute_error, root_mean_squared_error
from sklearn.dummy import DummyRegressor

from pymer4 import Lmer
from flaml import AutoML

from utils import read_csv_non_utf, preprocess_data, get_zero_nonzero_datasets, test_thresholds, ratios_to_DI_cats
from model_utils import HurdleModelEstimator, PymerModelWrapper
from custom_metrics import balanced_accuracy_FLAML, mean_absolute_error_range

In [2]:
# Read the shared project configuration (JSON file one directory above the notebook)
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

# Pull the individual path components out of the configuration
gdrive_fp = config['gdrive_path']
LIFE_fp = config['LIFE_folder']
dataset_fp = config['datasets_path']

indiv_data_paths = config['indiv_data_paths']
benitez_lopez2019 = indiv_data_paths['benitez_lopez2019']
ferreiro_arias2024 = indiv_data_paths['ferreiro_arias2024']

# Both dataset files sit under the same <gdrive>/<LIFE>/<datasets> directory
datasets_dir = os.path.join(gdrive_fp, LIFE_fp, dataset_fp)
ben_lop_path = os.path.join(datasets_dir, benitez_lopez2019)
fer_ari_path = os.path.join(datasets_dir, ferreiro_arias2024)

In [3]:
# Load the two raw tables: mammals via the project's non-UTF reader, birds via plain pandas
mammal_data = read_csv_non_utf(ben_lop_path)
bird_data = pd.read_csv(fer_ari_path)

# Cross-taxa generalisation

Training on birds and evaluating on mammals, and vice versa.

In [135]:
# Choosing which dataset to train on - "mammals" or "birds"
train_dataset = 'mammals'

#  fail fast on typos: any other value would otherwise silently fall through to
#  "train on <typo>, test on mammals" and only break (or mislead) further down
assert train_dataset in ['mammals', 'birds'], 'The only valid training datasets are "mammals" or "birds".'

#  the test taxon is always the one we did not train on
test_dataset = 'birds' if train_dataset == 'mammals' else 'mammals'
print(f'Training on {train_dataset} and testing on {test_dataset}')

Training on mammals and testing on birds


In [136]:
# Choosing the model to use - pymer, FLAML_hurdle, or dummy_regressor
model_to_use = 'dummy_regressor'

# The cross-taxa experiment always works on the combined mammal + bird table, so the
# dataset label handed to the helpers is "both". It is defined here (rather than only in
# the data-preparation cell further down) so this cell runs on a fresh kernel and cannot
# pick up a stale value left behind by another section of the notebook.
dataset = 'both'

if model_to_use == 'pymer':
    #  setting up the equations for each model - chosen by the taxon we TRAIN on
    if train_dataset == 'mammals':
        formula_zero = 'local_extirpation ~ BM + DistKm + I(DistKm^2) + PopDens + Stunting + Reserve + (1|Country) + (1|Species)'
        formula_nonzero = 'RR ~ BM + DistKm + I(DistKm^2) + PopDens + I(PopDens^2) + BM*DistKm + (1|Country) + (1|Species)'
    elif train_dataset == 'birds':
        formula_zero = 'local_extirpation ~ BM + DistKm + TravTime + PopDens + Stunting + Reserve + BM*DistKm + BM*TravTime + BM*Stunting + (1|Country) + (1|Species)'
        formula_nonzero = 'RR ~ BM + DistKm + TravTime + PopDens + Stunting + Reserve + BM*DistKm + BM*TravTime + BM*Stunting + (1|Country) + (1|Species)'
    else:
        raise ValueError(f'Unknown train_dataset "{train_dataset}" - expected "mammals" or "birds".')

    control_str = "optimizer='bobyqa', optCtrl=list(maxfun=1e5)"

    #  hurdle model params
    use_rfx = False
    extirp_pos = False

    #  NOTE(review): the cutoff is keyed on the training taxon here (15 for mammals, 5 for
    #  birds, mirroring the per-taxon values used in the cross-continent section) - confirm
    #  this is the intended cutoff when the helpers are told the dataset is "both".
    outlier_cutoff = 15 if train_dataset == 'mammals' else 5
    data_args = {'outlier_cutoff' : outlier_cutoff, 'dataset' : dataset}

    #  setting up the hurdle model: binomial stage for zero/nonzero, gaussian stage for the nonzero response
    zero_model = PymerModelWrapper(Lmer, formula = formula_zero, family = 'binomial', control_str = control_str,
                                   use_rfx = use_rfx)
    nonzero_model = PymerModelWrapper(Lmer, formula = formula_nonzero, family = 'gaussian', control_str = control_str,
                                      use_rfx = use_rfx)

    hurdle_model = HurdleModelEstimator(zero_model, nonzero_model, extirp_pos = extirp_pos, data_args = data_args,
                                        verbose = True)

    fit_args = None
    pp_args = {'include_indicators' : False,
               'include_categorical' : True,
               'polynomial_features' : 0,
               'log_trans_cont' : True,
               'dataset' : dataset}

    #  results saving params
    model_name = 'pymer_hurdle'
    model_name += '_w_rfx' if use_rfx else '_wo_rfx'

elif model_to_use == 'FLAML_hurdle':
    #  general parameters
    time_budget_mins = 0.1
    model_name = f'FLAML_hurdle_{time_budget_mins}mins'
    base_path = os.path.join('..', 'model_saves')
    verbose = 3

    #  getting the predictors - only columns shared by both (aligned) datasets
    zero_columns = ['BM', 'DistKm', 'PopDens', 'Stunting', 'TravTime', 'LivestockBio', 'Reserve']
    nonzero_columns = zero_columns
    indicator_columns = []

    zero_metric = balanced_accuracy_FLAML
    nonzero_metric = 'mse'

    #  setting up the zero and nonzero models
    zero_model = AutoML()
    nonzero_model = AutoML()

    #  specify fitting parameters
    zero_settings = {
        'time_budget' : time_budget_mins * 60,  # in seconds
        'metric' : zero_metric,
        'task' : 'classification',
        'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_ZERO.log'),
        'seed' : 1693,
        'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf',
                            'extra_tree', 'kneighbor', 'lrl1', 'lrl2'],
        'early_stop' : True,
        'verbose' : verbose,
        'keep_search_state' : True,
        'eval_method' : 'cv'
    }

    nonzero_settings = {
        'time_budget' : time_budget_mins * 60,  # in seconds
        'metric' : nonzero_metric,
        'task' : 'regression',
        'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_NONZERO.log'),
        'seed' : 1693,
        'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'kneighbor'],
        'early_stop' : True,
        'verbose' : verbose,
        'keep_search_state' : True,
        'eval_method' : 'cv'
    }

    extirp_pos = False
    fit_args = {'zero' : zero_settings, 'nonzero' : nonzero_settings}

    #  dumping everything into the hurdle model wrapper
    data_args = {'indicator_columns' : indicator_columns,
                 'nonzero_columns' : nonzero_columns,
                 'zero_columns' : zero_columns,
                 'embeddings_to_use' : None,
                 'dataset' : dataset}
    hurdle_model = HurdleModelEstimator(zero_model, nonzero_model, extirp_pos = extirp_pos,
                                        data_args = data_args, verbose = True)

    #  defining preprocessing
    pp_args = {'include_indicators' : False,
               'include_categorical' : False,
               'polynomial_features' : 0,
               'log_trans_cont' : False,
               'dataset' : dataset,
               'embeddings_to_use' : None}

elif model_to_use == 'dummy_regressor':
    strat = 'mean' # either mean or median
    model = DummyRegressor(strategy = strat)

    pp_args = {'include_indicators' : False,
               'include_categorical' : False,
               'polynomial_features' : 0,
               'log_trans_cont' : False,
               'dataset' : dataset}

    #  results saving params
    model_name = 'dummy_regressor'

else:
    #  without this, a typo would silently reuse model_name / pp_args left over from an earlier run
    raise ValueError(f'Unknown model_to_use "{model_to_use}" - expected "pymer", "FLAML_hurdle", or "dummy_regressor".')

print(model_name)

dummy_regressor


In [137]:
# Bring the two datasets onto one shared schema (the mammal column names are the target)
mammal_cols = ['Order', 'Family', 'Species', 'ratio', 'X', 'Y', 'Country', 'BM', 'DistKm', 'PopDens',
               'Stunting', 'TravTime', 'LivestockBio', 'Reserve']
bird_cols = ['Order', 'Family', 'Species', 'RR', 'Latitude', 'Longitude', 'Country', 'Body_Mass',
             'Dist_Hunters', 'PopDens', 'Stunting', 'TravDist', 'FoodBiomass', 'Reserve']

#  deep copies so the raw tables are never modified by the steps below
mammal_data_sub = mammal_data[mammal_cols].copy(deep = True)
bird_data_sub = bird_data[bird_cols].copy(deep = True)

#  recode the 0/1 reserve flag of the bird table to the 'No'/'Yes' labels
reserve_labels = {0 : 'No', 1 : 'Yes'}
bird_data_sub['Reserve'] = bird_data_sub['Reserve'].replace(reserve_labels)

#  bird column name -> mammal column name
bird_to_mammal_names = {'RR' : 'ratio',
                        'Longitude' : 'X',
                        'Latitude' : 'Y',
                        'Dist_Hunters' : 'DistKm',
                        'TravDist' : 'TravTime',
                        'FoodBiomass' : 'LivestockBio',
                        'Body_Mass' : 'BM'}
bird_data_sub = bird_data_sub.rename(columns = bird_to_mammal_names)

In [138]:
# Preparing the data
dataset = 'both'

#  Response column of the aligned table: the bird 'RR' column is renamed to 'ratio' when
#  the datasets are aligned, so both taxa share it. Defined here because the fitting,
#  prediction and scoring cells of this section read `resp_col`; otherwise they would
#  depend on whatever value another section of the notebook last assigned (or fail with
#  a NameError on a fresh kernel).
resp_col = 'ratio'

#  stack mammals first, then birds, keeping only the columns common to both
data = pd.concat((mammal_data_sub, bird_data_sub), join = 'inner', axis = 0, ignore_index = True)

#  because of ignore_index, rows [0, n_mammals) are mammals and the remainder are birds
n_mammals = len(mammal_data_sub)
mammal_idxs = list(range(n_mammals))
bird_idxs = list(range(n_mammals, len(data)))
idxs = {'train' : mammal_idxs if train_dataset == 'mammals' else bird_idxs,
        'test' : bird_idxs if train_dataset == 'mammals' else mammal_idxs}

pp_data = preprocess_data(data, standardize = True, train_test_idxs = idxs, **pp_args)

#  positional split; each half gets a fresh 0..n-1 index
train_data, test_data = pp_data.iloc[idxs['train']].reset_index(drop = True), pp_data.iloc[idxs['test']].reset_index(drop = True)

In [139]:
# Fit whichever model was configured above on the training split
if model_to_use in ('FLAML_hurdle', 'pymer'):
    #  the hurdle wrapper receives the whole training frame plus any per-stage fit settings
    hurdle_model.fit(train_data, fit_args = fit_args)
elif model_to_use == 'dummy_regressor':
    #  the baseline takes a separate predictors / response pair
    y_train = train_data[resp_col]
    X_train = train_data.drop(columns = resp_col)
    model.fit(X_train, y_train)

In [140]:
# Pick the zero-model probability threshold using the training split (hurdle models only)
if model_to_use in ('FLAML_hurdle', 'pymer'):
    #  rebuild the zero-stage inputs with the same settings the hurdle model holds
    X_zero, y_zero, _, _ = get_zero_nonzero_datasets(train_data,
                                                     extirp_pos = hurdle_model.extirp_pos,
                                                     pred = False,
                                                     **hurdle_model.data_args)

    #  second column of the predict_proba output
    zero_probs = hurdle_model.zero_model.predict_proba(X_zero)
    y_pred = zero_probs[ : , 1]

    #  first value returned by test_thresholds is stored, rounded to 3 decimals
    opt_thresh, _ = test_thresholds(y_pred, y_zero)
    hurdle_model.prob_thresh = round(opt_thresh, 3)
    print(f'Optimal threshold was found to be {hurdle_model.prob_thresh}')

In [141]:
# Predicting on the test set
if model_to_use in ['FLAML_hurdle', 'pymer']:
    y_pred = hurdle_model.predict(test_data)
    #  exponentiate the nonzero predictions and leave exact zeros untouched
    #  (presumably undoing a log transform of the response - confirm against HurdleModelEstimator)
    nonzero_mask = y_pred != 0
    y_pred[nonzero_mask] = np.exp(y_pred[nonzero_mask])
elif model_to_use == 'dummy_regressor':
    X_test = test_data.drop(columns = resp_col)
    y_pred = model.predict(X_test)

y_test = test_data[resp_col].copy(deep = True)

#  getting DI categories
true_DI_cats = ratios_to_DI_cats(y_test)
pred_DI_cats = ratios_to_DI_cats(y_pred)

#  saving predictions
#  NOTE(review): '>' is not a legal filename character on Windows - confirm the target platform
save_filename = f'{model_name}_SPECIAL_{train_dataset}-->{test_dataset}.csv'
preds_df = pd.DataFrame({'index' : test_data.index,
                         'actual' : y_test,
                         'predicted' : y_pred})
preds_df = preds_df.set_index('index').sort_index()

#  make sure the output folder exists so to_csv does not fail on a fresh checkout
save_dir = os.path.join('..', 'results', 'raw_predictions')
os.makedirs(save_dir, exist_ok = True)
preds_df.to_csv(os.path.join(save_dir, save_filename))
print(f'Results saved to: {save_filename}')

Results saved to: dummy_regressor_SPECIAL_mammals-->birds.csv


In [142]:
# Score the test-set predictions: compute all four metrics first, then report them
mae = mean_absolute_error(y_test, y_pred)
mae_01, _ = mean_absolute_error_range(y_test, y_pred, 0, 1)
rmse = root_mean_squared_error(y_test, y_pred)
ba = balanced_accuracy_score(true_DI_cats, pred_DI_cats)  # scored on the DI categories, not the raw ratios

#  every value is reported to 3 decimal places
print(f'MAE: {round(mae, 3)}')
print(f'MAE (0-1 range): {round(mae_01, 3)}')
print(f'RMSE: {round(rmse, 3)}')
print(f'BA: {round(ba, 3)}')

MAE: 0.961
MAE (0-1 range): 0.774
RMSE: 1.726
BA: 0.333


# Cross-continent generalisation
For mammals, we test on either South America or Africa and train on all other regions; for birds, we test on either the Neotropical or the Indomalayan realm and train on all other realms.

In [82]:
# Choosing the dataset ("mammals" or "birds") and the region to hold out for testing
dataset = 'mammals'
test_region = 'Africa' # for mammals, either "S America" or "Africa", and for birds, either "Neotropic" or "Indomalayan"

#  validate the dataset name first - otherwise a typo would skip both region checks below
assert dataset in ['mammals', 'birds'], 'The only valid datasets are "mammals" or "birds".'

#  each dataset only supports its own hold-out regions
if dataset == 'mammals':
    assert test_region in ['S America', 'Africa'], 'The only valid test regions for mammals are "S America" or "Africa".'
elif dataset == 'birds':
    assert test_region in ['Neotropic', 'Indomalayan'], 'The only valid test regions for birds are "Neotropic" or "Indomalayan".'

print(f'{dataset.title()} dataset, testing on {test_region}')

Mammals dataset, testing on Africa


In [83]:
# Choosing the model to use - pymer, FLAML_hurdle, or dummy_regressor
model_to_use = 'dummy_regressor'

if model_to_use == 'pymer':
    #  setting up the equations for each model (column names differ between the two raw datasets)
    if dataset == 'mammals':
        formula_zero = 'local_extirpation ~ BM + DistKm + I(DistKm^2) + PopDens + Stunting + Reserve + (1|Country) + (1|Species) + (1|Study)'
        formula_nonzero = 'RR ~ BM + DistKm + I(DistKm^2) + PopDens + I(PopDens^2) + BM*DistKm + (1|Country) + (1|Species) + (1|Study)'
    elif dataset == 'birds':
        formula_zero = 'local_extirpation ~ Body_Mass + Dist_Hunters + TravDist + PopDens + Stunting + NPP + Reserve + Body_Mass*Dist_Hunters + Body_Mass*TravDist + Body_Mass*Stunting + NPP*Dist_Hunters + (1|Country) + (1|Species)'
        formula_nonzero = 'RR ~ Body_Mass + Dist_Hunters + TravDist + PopDens + Stunting + NPP + Reserve + Body_Mass*Dist_Hunters + Body_Mass*TravDist + Body_Mass*Stunting + NPP*Dist_Hunters + (1|Country) + (1|Species)'
    else:
        #  without this, the formulas from a previous run would be silently reused
        raise ValueError(f'Unknown dataset "{dataset}" - expected "mammals" or "birds".')

    control_str = "optimizer='bobyqa', optCtrl=list(maxfun=1e5)"

    #  hurdle model params
    use_rfx = False
    extirp_pos = False

    outlier_cutoff = 15 if dataset == 'mammals' else 5
    data_args = {'outlier_cutoff' : outlier_cutoff, 'dataset' : dataset}

    #  setting up the hurdle model: binomial stage for zero/nonzero, gaussian stage for the nonzero response
    zero_model = PymerModelWrapper(Lmer, formula = formula_zero, family = 'binomial', control_str = control_str,
                                   use_rfx = use_rfx)
    nonzero_model = PymerModelWrapper(Lmer, formula = formula_nonzero, family = 'gaussian', control_str = control_str,
                                      use_rfx = use_rfx)

    hurdle_model = HurdleModelEstimator(zero_model, nonzero_model, extirp_pos = extirp_pos, data_args = data_args,
                                        verbose = True)

    fit_args = None
    pp_args = {'include_indicators' : False,
               'include_categorical' : True,
               'polynomial_features' : 0,
               'log_trans_cont' : True,
               'dataset' : dataset}

    #  results saving params
    model_name = 'pymer_hurdle'
    model_name += '_w_rfx' if use_rfx else '_wo_rfx'

elif model_to_use == 'FLAML_hurdle':
    #  general parameters
    time_budget_mins = 0.1
    model_name = f'FLAML_hurdle_{time_budget_mins}mins'
    base_path = os.path.join('..', 'model_saves')
    verbose = 3

    #  getting the predictors
    if dataset == 'mammals':
        zero_columns = ['BM', 'DistKm', 'PopDens', 'Stunting', 'TravTime', 'LivestockBio', 'Reserve', 'Literacy']
    elif dataset == 'birds':
        zero_columns = ['Dist_Hunters', 'TravDist', 'PopDens', 'Stunting', 'FoodBiomass', 'Forest_cover', 'NPP', 'Body_Mass']
    else:
        #  without this, the predictor list from a previous run would be silently reused
        raise ValueError(f'Unknown dataset "{dataset}" - expected "mammals" or "birds".')
    nonzero_columns = zero_columns
    indicator_columns = []

    zero_metric = balanced_accuracy_FLAML
    nonzero_metric = 'mse'

    #  setting up the zero and nonzero models
    zero_model = AutoML()
    nonzero_model = AutoML()

    #  specify fitting parameters
    zero_settings = {
        'time_budget' : time_budget_mins * 60,  # in seconds
        'metric' : zero_metric,
        'task' : 'classification',
        'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_ZERO.log'),
        'seed' : 1693,
        'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf',
                            'extra_tree', 'kneighbor', 'lrl1', 'lrl2'],
        'early_stop' : True,
        'verbose' : verbose,
        'keep_search_state' : True,
        'eval_method' : 'cv'
    }

    nonzero_settings = {
        'time_budget' : time_budget_mins * 60,  # in seconds
        'metric' : nonzero_metric,
        'task' : 'regression',
        'log_file_name' : os.path.join(base_path, 'nonlinear_hurdle_NONZERO.log'),
        'seed' : 1693,
        'estimator_list' : ['lgbm', 'xgboost', 'xgb_limitdepth', 'rf', 'extra_tree', 'kneighbor'],
        'early_stop' : True,
        'verbose' : verbose,
        'keep_search_state' : True,
        'eval_method' : 'cv'
    }

    extirp_pos = False
    fit_args = {'zero' : zero_settings, 'nonzero' : nonzero_settings}

    #  dumping everything into the hurdle model wrapper
    data_args = {'indicator_columns' : indicator_columns,
                 'nonzero_columns' : nonzero_columns,
                 'zero_columns' : zero_columns,
                 'embeddings_to_use' : None,
                 'dataset' : dataset}
    hurdle_model = HurdleModelEstimator(zero_model, nonzero_model, extirp_pos = extirp_pos,
                                        data_args = data_args, verbose = True)

    #  defining preprocessing
    pp_args = {'include_indicators' : False,
               'include_categorical' : False,
               'polynomial_features' : 0,
               'log_trans_cont' : False,
               'dataset' : dataset,
               'embeddings_to_use' : None}

elif model_to_use == 'dummy_regressor':
    strat = 'mean' # either mean or median
    model = DummyRegressor(strategy = strat)

    pp_args = {'include_indicators' : False,
               'include_categorical' : False,
               'polynomial_features' : 0,
               'log_trans_cont' : False,
               'dataset' : dataset}

    #  results saving params
    model_name = 'dummy_regressor'

else:
    #  without this, a typo would silently reuse the model / pp_args configured by an earlier cell
    raise ValueError(f'Unknown model_to_use "{model_to_use}" - expected "pymer", "FLAML_hurdle", or "dummy_regressor".')

print(model_name)

dummy_regressor


In [84]:
# Preparing the data
data = mammal_data if dataset == 'mammals' else bird_data
col = 'Region' if dataset == 'mammals' else 'Realm'   # column holding the geographic label
resp_col = 'ratio' if dataset == 'mammals' else 'RR'  # response column of the chosen dataset

#  One vectorised comparison splits the rows; this replaces a per-row `not in` lookup
#  against a Python list, which was quadratic in the number of rows.
is_test_row = (data[col] == test_region).to_numpy()
test_idxs = data.index[is_test_row].to_list()
train_idxs = data.index[~is_test_row].to_list()

#  an empty hold-out set would only surface later as an obscure metric error
assert len(test_idxs) > 0, f'No rows found where {col} equals "{test_region}".'

idxs = {'train' : train_idxs, 'test' : test_idxs}

pp_data = preprocess_data(data, standardize = True, train_test_idxs = idxs, **pp_args)

#  NOTE(review): the index labels are used positionally here, which assumes `data` carries
#  a default 0..n-1 index and that preprocess_data keeps the row order - confirm.
train_data, test_data = pp_data.iloc[idxs['train']], pp_data.iloc[idxs['test']]

In [85]:
# Fit whichever model was configured above on the training regions
if model_to_use in ('FLAML_hurdle', 'pymer'):
    #  the hurdle wrapper receives the whole training frame plus any per-stage fit settings
    hurdle_model.fit(train_data, fit_args = fit_args)
elif model_to_use == 'dummy_regressor':
    #  the baseline takes a separate predictors / response pair
    y_train = train_data[resp_col]
    X_train = train_data.drop(columns = resp_col)
    model.fit(X_train, y_train)

In [86]:
# Pick the zero-model probability threshold using the training regions (hurdle models only)
if model_to_use in ('FLAML_hurdle', 'pymer'):
    #  rebuild the zero-stage inputs with the same settings the hurdle model holds
    X_zero, y_zero, _, _ = get_zero_nonzero_datasets(train_data,
                                                     extirp_pos = hurdle_model.extirp_pos,
                                                     pred = False,
                                                     **hurdle_model.data_args)

    #  second column of the predict_proba output
    zero_probs = hurdle_model.zero_model.predict_proba(X_zero)
    y_pred = zero_probs[ : , 1]

    #  first value returned by test_thresholds is stored, rounded to 3 decimals
    opt_thresh, _ = test_thresholds(y_pred, y_zero)
    hurdle_model.prob_thresh = round(opt_thresh, 3)
    print(f'Optimal threshold was found to be {hurdle_model.prob_thresh}')

In [87]:
# Predicting on the test set
if model_to_use in ['FLAML_hurdle', 'pymer']:
    y_pred = hurdle_model.predict(test_data)
    #  exponentiate the nonzero predictions and leave exact zeros untouched
    #  (presumably undoing a log transform of the response - confirm against HurdleModelEstimator)
    nonzero_mask = y_pred != 0
    y_pred[nonzero_mask] = np.exp(y_pred[nonzero_mask])
elif model_to_use == 'dummy_regressor':
    X_test = test_data.drop(columns = resp_col)
    y_pred = model.predict(X_test)

y_test = test_data[resp_col].copy(deep = True)

#  getting DI categories
true_DI_cats = ratios_to_DI_cats(y_test)
pred_DI_cats = ratios_to_DI_cats(y_pred)

#  saving predictions
#  The region tag is built outside the f-string: reusing the same quote character inside
#  an f-string replacement field is a SyntaxError on Python versions before 3.12.
region_tag = test_region.replace(' ', '')
save_filename = f'{model_name}_SPECIAL_test-region-{region_tag}.csv'
preds_df = pd.DataFrame({'index' : test_data.index,
                         'actual' : y_test,
                         'predicted' : y_pred})
preds_df = preds_df.set_index('index').sort_index()

#  make sure the output folder exists so to_csv does not fail on a fresh checkout
save_dir = os.path.join('..', 'results', 'raw_predictions')
os.makedirs(save_dir, exist_ok = True)
preds_df.to_csv(os.path.join(save_dir, save_filename))
print(f'Results saved to: {save_filename}')

Results saved to: dummy_regressor_SPECIAL_test-region-Africa.csv


In [88]:
# Score the held-out region: compute all four metrics first, then report them
mae = mean_absolute_error(y_test, y_pred)
mae_01, _ = mean_absolute_error_range(y_test, y_pred, 0, 1)
rmse = root_mean_squared_error(y_test, y_pred)
ba = balanced_accuracy_score(true_DI_cats, pred_DI_cats)  # scored on the DI categories, not the raw ratios

#  every value is reported to 3 decimal places
print(f'MAE: {round(mae, 3)}')
print(f'MAE (0-1 range): {round(mae_01, 3)}')
print(f'RMSE: {round(rmse, 3)}')
print(f'BA: {round(ba, 3)}')

MAE: 0.961
MAE (0-1 range): 0.646
RMSE: 4.479
BA: 0.333
