# Using Maxent as base models
Yangkang Chen<br>
Sep 12, 2023

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib
import warnings
import pickle
import os
import json

# matplotlib.style.use('ggplot')
# plt.rcParams['axes.facecolor']='w'
# warnings.filterwarnings('ignore')
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Please download the sample data from:
# https://figshare.com/articles/dataset/Sample_data_Mallard_csv/24080745
# The rest of this notebook assumes the file has been downloaded and saved as './Sample_data_Mallard.csv'

# You can also try other species, for example:
# https://figshare.com/articles/dataset/Sample_data_Alder_Flycatcher_csv/24080751
# https://figshare.com/articles/dataset/Sample_data_Short-eared_Owl_csv/24080742
# https://figshare.com/articles/dataset/Sample_data_Eurasian_Tree_Sparrow_csv/24080748


In [4]:
# Load the sample data (download links are in the previous cell) and drop the
# 'sampling_event_identifier' column in one expression, so `data` is bound
# exactly once. The path is a plain string literal: the original f-string had
# no placeholders.
data = (
    pd.read_csv('./Sample_data_Mallard.csv')
    .drop(columns=['sampling_event_identifier'])
)

# Get X and y

In [5]:
# Features: every column of `data` except the target column 'count'.
X = data.drop(columns=['count'])
# Target: the 'count' column as a NumPy array.
y = data['count'].to_numpy()


# First things first: spatio-temporal train-test split

In [6]:
from stemflow.model_selection import ST_train_test_split

# Split features and target into train and test sets with a fixed seed.
# NOTE(review): test_size=0.3 presumably holds out ~30% of the data, selected by
# spatio-temporal blocks (50 spatial x 50 temporal) -- confirm in stemflow docs.
X_train, X_test, y_train, y_test = ST_train_test_split(
    X,
    y,
    Spatio_blocks_count=50,
    Temporal_blocks_count=50,
    random_state=42,
    test_size=0.3,
)

# Train AdaSTEM classifier with Maxent as the base model

In [7]:
from stemflow.model.AdaSTEM import AdaSTEM, AdaSTEMClassifier, AdaSTEMRegressor
from xgboost import XGBClassifier, XGBRegressor
from stemflow.model.Hurdle import Hurdle_for_AdaSTEM, Hurdle
import elapid as ela


In [8]:
## create model instance
# AdaSTEM classifier whose base model is an elapid Maxent model
# (cloglog transform, beta multiplier 2.0).
model = AdaSTEMClassifier(
    base_model=ela.MaxentModel(transform='cloglog', beta_multiplier=2.0),
    save_gridding_plot=True,
    # ensemble settings
    ensemble_fold=10,
    min_ensemble_required=7,
    # grid-size thresholds (longitude / latitude)
    grid_len_lon_upper_threshold=50,
    grid_len_lon_lower_threshold=5,
    grid_len_lat_upper_threshold=50,
    grid_len_lat_lower_threshold=5,
    # temporal settings
    temporal_step=50,
    temporal_bin_interval=50,
    points_lower_threshold=100,
    njobs=4,
)


In [9]:
# Display the training features for a quick visual check.
X_train

Unnamed: 0,longitude,latitude,DOY,duration_minutes,Traveling,Stationary,Area,effort_distance_km,number_observers,obsvr_species_count,time_observation_started_minute_of_day,elevation_mean,slope_mean,eastness_mean,northness_mean,bio1,bio2,bio3,bio4,bio5,bio6,bio7,bio8,bio9,bio10,bio11,bio12,bio13,bio14,bio15,bio16,bio17,bio18,bio19,closed_shrublands,cropland_or_natural_vegetation_mosaics,croplands,deciduous_broadleaf_forests,deciduous_needleleaf_forests,evergreen_broadleaf_forests,evergreen_needleleaf_forests,grasslands,mixed_forests,non_vegetated_lands,open_shrublands,permanent_wetlands,savannas,urban_and_built_up_lands,water_bodies,woody_savannas,entropy
0,-83.472224,8.859308,22,300.0,1,0,0,4.828,5.0,34.0,476,7.555556,0.758156,0.036083,-0.021484,24.883502,5.174890,59.628088,93.482247,30.529131,21.850519,8.678612,24.302626,26.536822,26.213334,23.864924,0.720487,0.127594,0.003156,0.001451,0.332425,0.026401,0.044218,0.260672,0.0,0.000000,0.000000,0.000000,0.0,0.138889,0.000000,0.000000,0.000000,0.0,0.0,0.777778,0.000000,0.000000,0.083333,0.000000,0.676720
1,-2.687724,43.373323,290,90.0,1,0,0,0.570,2.0,151.0,1075,30.833336,3.376527,0.050544,-0.099299,14.107917,5.224109,31.174167,376.543853,23.219421,6.461607,16.757814,9.048385,19.092725,19.236082,9.287841,0.171423,0.035598,0.004512,0.000081,0.084657,0.018400,0.030210,0.065007,0.0,0.000000,0.000000,0.000000,0.0,0.333333,0.000000,0.000000,0.083333,0.0,0.0,0.000000,0.194444,0.027778,0.000000,0.361111,1.359063
2,-89.884770,35.087255,141,10.0,0,1,0,-1.000,2.0,678.0,575,91.777780,0.558100,-0.187924,-0.269078,17.396487,8.673912,28.688889,718.996078,32.948335,2.713938,30.234397,14.741099,13.759220,26.795849,7.747272,0.187089,0.031802,0.005878,0.000044,0.073328,0.026618,0.039616,0.059673,0.0,0.055556,0.000000,0.000000,0.0,0.000000,0.000000,0.305556,0.000000,0.0,0.0,0.000000,0.527778,0.000000,0.000000,0.111111,1.104278
3,-99.216873,31.218510,104,9.0,1,0,0,0.805,2.0,976.0,657,553.166700,0.856235,-0.347514,-0.342971,20.740836,10.665164,35.409121,666.796919,35.909941,5.790119,30.119822,18.444353,30.734456,29.546417,11.701038,0.084375,0.025289,0.000791,0.000052,0.052866,0.004096,0.006064,0.015965,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000
4,-124.426730,43.065847,96,30.0,1,0,0,0.161,2.0,654.0,600,6.500000,0.491816,-0.347794,-0.007017,11.822340,6.766870,35.672897,396.157833,22.608788,3.639569,18.969219,8.184412,16.290802,17.258721,7.319234,0.144122,0.044062,0.000211,0.000147,0.089238,0.004435,0.004822,0.040621,0.0,0.000000,0.000000,0.000000,0.0,0.361111,0.166667,0.000000,0.472222,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.020754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399992,-72.866418,42.284674,135,26.0,0,1,0,-1.000,1.0,82.0,876,261.833340,6.557083,0.032050,0.010564,11.311626,8.425120,26.173181,865.498760,29.085216,-3.104682,32.189899,0.779390,23.451954,23.133536,0.558971,0.146514,0.022105,0.005105,0.000020,0.052974,0.019771,0.022775,0.044293,0.0,0.000000,0.000000,0.861111,0.0,0.000000,0.000000,0.000000,0.138889,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.402941
399995,-93.856544,45.343599,173,7.0,0,1,0,-1.000,1.0,734.0,703,288.333340,0.291276,0.058862,-0.073619,8.071018,8.432168,21.100676,1105.670016,28.693508,-11.268091,39.961599,22.654184,-7.615305,22.926627,-6.419971,0.100922,0.018951,0.001125,0.000026,0.045567,0.007614,0.042444,0.007614,0.0,0.111111,0.777778,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.111111,0.000000,0.000000,0.683739
399996,-103.538804,41.770665,298,16.0,1,0,0,0.161,1.0,803.0,726,1186.111200,0.958184,0.459900,-0.046327,12.325220,12.542043,34.888082,881.115618,31.890323,-4.059039,35.949362,14.831920,2.051258,24.926994,1.965077,0.049444,0.013380,0.000833,0.000012,0.027230,0.002873,0.011386,0.005529,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000000
399997,-68.873896,47.671126,208,15.0,1,0,0,0.300,1.0,363.0,435,160.222230,1.210733,0.369554,0.018077,4.520960,7.485217,20.590898,1020.163575,23.542224,-12.809842,36.352066,5.497164,16.976383,18.200694,-7.881880,0.133743,0.022637,0.002391,0.000025,0.053372,0.016781,0.022682,0.030404,0.0,0.000000,0.000000,0.166667,0.0,0.000000,0.000000,0.000000,0.805556,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.027778,0.572348


In [10]:
## fit model
# Binarize the target: 1 where the count is positive, 0 otherwise.
model.fit(X_train, (np.asarray(y_train) > 0).astype(int))

Generating Ensemble:   0%|          | 0/10 [00:00<?, ?it/s]

Generating Ensemble: 100%|██████████| 10/10 [00:12<00:00,  1.21s/it]
100%|██████████| 13370/13370 [32:04<00:00,  6.95it/s] 


In [11]:
## predict
# Run the fitted AdaSTEM model on the held-out test features.
pred_adastem = model.predict(X_test)


In [12]:
## save prediction results
# Pair the true counts with the AdaSTEM predictions. Negative predictions are
# clipped to 0, then rows with a missing value in either column are dropped.
# NOTE(review): NaN predictions presumably mark points the ensemble could not
# cover -- confirm against the stemflow docs.
pred_df = pd.DataFrame({
    'y_true': y_test.flatten(),
    'y_pred_adastem': np.where(pred_adastem.flatten() < 0, 0, pred_adastem.flatten()),
}).dropna()

## calculate metrics
# `pred_df` is already clipped and NaN-free, so its columns are passed as-is;
# clipping a second time (as the earlier version did) cannot change any value.
metric_dict = AdaSTEM.eval_STEM_res(
    'hurdle',
    pred_df['y_true'].to_numpy(),
    pred_df['y_pred_adastem'].to_numpy(),
)



In [13]:
# Persist the AdaSTEM metrics to disk as JSON.
with open('./Maxent_AdasTEM_metrics.json', 'w') as f:
    json.dump(metric_dict, fp=f)
    

In [14]:
# Show only the classification metrics of interest.
{
    name: value
    for name, value in metric_dict.items()
    if name in ['AUC', 'kappa', 'f1', 'precision', 'recall', 'average_precision']
}

{'AUC': 0.7302657060170704,
 'kappa': 0.3891521535228686,
 'f1': 0.5171835328255351,
 'precision': 0.4349003011933356,
 'recall': 0.6378683666051558,
 'average_precision': 0.341234738072168}

# Comparison with a simple Maxent model

In [15]:
## create model instance
# Stand-alone Maxent model built with the same arguments as the base model
# passed to AdaSTEMClassifier above (cloglog transform, beta multiplier 2.0).
model_me = ela.MaxentModel(transform='cloglog', beta_multiplier=2.0)

In [16]:
## fit model
# The stand-alone model is fitted without the coordinate columns, on the same
# binarized target (1 where count > 0, else 0).
model_me.fit(
    X_train.drop(columns=['longitude', 'latitude']),
    (np.asarray(y_train) > 0).astype(int),
)

In [17]:
## predict on test set
# Drop the same coordinate columns that were excluded at fit time.
pred_me = model_me.predict(X_test.drop(columns=['longitude', 'latitude']))


In [18]:
## save prediction results
# Binarize the Maxent output at 0.5 (1 if above, else 0) and pair it with the
# true counts; rows with missing values are dropped.
pred_df = pd.DataFrame({
    'y_true': y_test.flatten(),
    'y_pred': (pred_me.flatten() > 0.5).astype(int),
}).dropna()

## calculate metrics
metrics_me = AdaSTEM.eval_STEM_res(
    'hurdle',
    pred_df['y_true'],
    pred_df['y_pred'],
)


In [19]:
# Persist the stand-alone Maxent metrics to disk as JSON.
with open('./Maxent_simple_metrics.json', 'w') as f:
    json.dump(metrics_me, fp=f)
    

In [20]:
# Show the same subset of metrics for the stand-alone Maxent model.
{
    name: value
    for name, value in metrics_me.items()
    if name in ['AUC', 'kappa', 'f1', 'precision', 'recall', 'average_precision']
}

{'AUC': 0.7166285212372914,
 'kappa': 0.3200639824175635,
 'f1': 0.46625433043730613,
 'precision': 0.35706745193910433,
 'recall': 0.6716308721325182,
 'average_precision': 0.2938855307639262}

In [21]:
from watermark import watermark
# Record the execution environment: first the default watermark() report,
# then the installed versions of the packages listed below.
print(watermark())
print(watermark(packages="stemflow,numpy,scipy,pandas,xgboost,tqdm,matplotlib,h3pandas,geopandas,scikit-learn,watermark"))


Last updated: 2023-09-12T22:42:16.805375+08:00

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.15.0

Compiler    : Clang 15.0.7 
OS          : Darwin
Release     : 21.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit

stemflow    : 0.0.10
numpy       : 1.25.2
scipy       : 1.11.2
pandas      : 2.1.0
xgboost     : 2.0.0
tqdm        : 4.66.1
matplotlib  : 3.7.3
h3pandas    : 0.2.4
geopandas   : 0.13.2
scikit-learn: 1.3.0
watermark   : 2.4.3

