# Using Maxent as base model
Yangkang Chen<br>
Sep 5, 2023

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib
import warnings
import pickle
import geopandas as gpd
import os
import json

# matplotlib.style.use('ggplot')
# plt.rcParams['axes.facecolor']='w'
# warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to imported packages (e.g. BirdSTEM) are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Read the sample Mallard table; the path is relative to the notebook's
# working directory.
data = pd.read_csv('./BirdSTEM/dataset/test_data/Sample_data_Mallard.csv')

# Get X and y

In [4]:
# Response: the raw 'count' column as a NumPy array.
y = data['count'].to_numpy()
# Predictors: every remaining column of the table.
X = data.drop(columns='count')


# First things first: spatio-temporal train-test split

In [5]:
from BirdSTEM.model_selection import ST_train_test_split

# Hold out 30% of the data (test_size=0.3) with a fixed seed. The split is
# driven by 50 spatial and 50 temporal blocks.
# NOTE(review): exact blocking semantics live in ST_train_test_split - confirm there.
X_train, X_test, y_train, y_test = ST_train_test_split(
    X,
    y,
    Spatio_blocks_count=50,
    Temporal_blocks_count=50,
    random_state=42,
    test_size=0.3,
)

# Train AdaSTEM classifier with Maxent as the base model

In [6]:
from BirdSTEM.model.AdaSTEM import AdaSTEM, AdaSTEMClassifier, AdaSTEMRegressor
from xgboost import XGBClassifier, XGBRegressor
from BirdSTEM.model.Hurdle import Hurdle_for_AdaSTEM, Hurdle
import elapid as ela


In [7]:
# AdaSTEM classifier wrapping an elapid Maxent model (cloglog output,
# beta_multiplier=2.0) as the per-grid base learner.
model = AdaSTEMClassifier(
    base_model=ela.MaxentModel(transform='cloglog', beta_multiplier=2.0),
    save_gridding_plot=True,
    # Ensemble settings.
    ensemble_fold=10,
    min_ensemble_required=7,
    # Spatial grid size bounds (longitude / latitude).
    grid_len_lon_upper_threshold=50,
    grid_len_lon_lower_threshold=5,
    grid_len_lat_upper_threshold=50,
    grid_len_lat_lower_threshold=5,
    # Temporal windowing.
    temporal_step=50,
    temporal_bin_interval=50,
    # NOTE(review): presumably the minimum number of points per grid cell - confirm in AdaSTEM.
    points_lower_threshold=100,
)


In [8]:
# Disabled scratch code (not executed): attaches y_train to a copy of X_train,
# draws a 5000-row sample, and produces the features plus a 0/1 target.
# NOTE(review): presumably kept for quick smoke tests - delete if no longer needed.
# new_X_train = X_train.copy()
# new_X_train['y_'] = y_train
# new_X_train = new_X_train.sample(5000)
# new_X_train.iloc[:,:-1], np.where(new_X_train.iloc[:,-1].values>0,1,0)

In [9]:
# Fit on presence/absence: counts > 0 become 1, everything else 0.
# The recorded run below took roughly two hours for the training loop.
model.fit(
    X_train,
    np.where(y_train > 0, 1, 0),
)

Generating Ensemble: 100%|██████████| 10/10 [00:14<00:00,  1.47s/it]
training: 100%|██████████| 13142/13142 [1:57:47<00:00,  1.86it/s]  


In [10]:
# Persist the fitted AdaSTEM model. Create the output folder first: on a fresh
# checkout './test_output' may not exist, and open() would raise
# FileNotFoundError only after the long training cell has already finished.
os.makedirs('./test_output', exist_ok=True)
with open('./test_output/maxent_adastem.pkl', 'wb') as f:
    pickle.dump(model, f)
    

In [11]:
# Predict on the held-out test set with the fitted AdaSTEM classifier.
# NOTE(review): the next cell calls dropna(), so predictions presumably can
# contain NaN for some rows - confirm against AdaSTEM.predict.
pred_adastem = model.predict(X_test)


In [12]:
# Pair the true counts with the AdaSTEM predictions; negative predictions are
# floored at 0.
pred_df = pd.DataFrame(
    {
        'y_true': y_test.flatten(),
        'y_pred_adastem': np.where(pred_adastem.flatten() < 0, 0, pred_adastem.flatten()),
    }
)
# Keep only rows where both columns are present.
pred_df = pred_df.dropna()


In [13]:
# Score the AdaSTEM predictions against the true counts.
# NOTE(review): the task is passed as 'hurdle' although the model above is an
# AdaSTEMClassifier - confirm this is the intended evaluation mode.
# NOTE(review): y_pred_adastem was already floored at 0 when pred_df was built,
# so the np.where below looks redundant.
metric_dict = AdaSTEM.eval_STEM_res(
    'hurdle',
    pred_df['y_true'].to_numpy().flatten(),
    np.where(
        pred_df['y_pred_adastem'].to_numpy().flatten() < 0,
        0,
        pred_df['y_pred_adastem'].to_numpy().flatten(),
    ),
)


In [14]:
# Write the AdaSTEM metrics to disk as JSON.
# NOTE(review): the filename spells 'AdasTEM' (not 'AdaSTEM'); left unchanged
# because other code may read this path.
with open('./test_output/Maxent_AdasTEM_metrics.json', mode='w') as f:
    json.dump(obj=metric_dict, fp=f)
    

In [38]:
# Display only the classification metrics from the AdaSTEM evaluation.
{
    name: value
    for name, value in metric_dict.items()
    if name in ['AUC', 'kappa', 'f1', 'precision', 'recall', 'average_precision']
}

{'AUC': 0.7110742597058116,
 'kappa': 0.3594605543477699,
 'f1': 0.512802988882814,
 'precision': 0.43125910031420034,
 'recall': 0.6323744240925947,
 'average_precision': 0.34673055053552077}

# Compare to a simple Maxent model (no AdaSTEM wrapper)

In [15]:
# Stand-alone Maxent baseline, configured like the AdaSTEM base model above.
model_me = ela.MaxentModel(
    transform='cloglog',
    beta_multiplier=2.0,
)

In [16]:
# Fit the baseline on presence/absence (count > 0 -> 1), with the coordinate
# columns removed from the predictors.
model_me.fit(
    X_train.drop(columns=['longitude', 'latitude']),
    np.where(y_train > 0, 1, 0),
)

In [17]:
# Persist the fitted baseline Maxent model. Create the output folder first so
# this cell also works when './test_output' does not exist yet (e.g. a fresh
# checkout, or when only this section of the notebook is run).
os.makedirs('./test_output', exist_ok=True)
with open('./test_output/maxent_simple.pkl', 'wb') as f:
    pickle.dump(model_me, f)
    

In [20]:
# Show the test-set feature table the baseline model will see (coordinates removed).
X_test.drop(columns=['longitude', 'latitude'])

Unnamed: 0,DOY,duration_minutes,Traveling,Stationary,Area,effort_distance_km,number_observers,obsvr_species_count,time_observation_started_minute_of_day,elevation_mean,slope_mean,eastness_mean,northness_mean,bio1,bio2,bio3,bio4,bio5,bio6,bio7,bio8,bio9,bio10,bio11,bio12,bio13,bio14,bio15,bio16,bio17,bio18,bio19,closed_shrublands,cropland_or_natural_vegetation_mosaics,croplands,deciduous_broadleaf_forests,deciduous_needleleaf_forests,evergreen_broadleaf_forests,evergreen_needleleaf_forests,grasslands,mixed_forests,non_vegetated_lands,open_shrublands,permanent_wetlands,savannas,urban_and_built_up_lands,water_bodies,woody_savannas,entropy
5,47,45.0,0,1,0,-1.000,1.0,55.0,430,2084.500000,1.474975,-0.705554,-0.279949,11.873589,12.150509,33.769915,871.207969,29.855962,-6.124314,35.980276,23.600977,17.788865,22.968558,0.133504,0.036888,0.006928,0.000720,0.000003,0.015758,0.003600,0.009808,0.009323,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.944444,0.000000,0.0,0.055556,0.0,0.000000,0.000000,0.000000,0.000000,0.214559
12,89,76.0,1,0,0,3.315,2.0,172.0,814,395.388900,2.308504,0.055726,-0.006490,7.243620,8.839642,21.304950,1112.647946,27.783987,-13.707036,41.491023,23.070757,-7.854962,22.174463,-6.952804,0.086734,0.025014,0.000926,0.000058,0.057456,0.003294,0.057456,0.005661,0.0,0.000000,0.944444,0.000000,0.0,0.0,0.000000,0.055556,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.214559
13,275,37.0,1,0,0,2.300,1.0,720.0,1009,264.888900,5.350314,0.243507,0.037571,15.132397,9.813581,37.641997,544.142596,30.478132,4.407302,26.070830,14.176979,19.783638,22.081275,8.399152,0.110303,0.020415,0.001847,0.000026,0.049038,0.010666,0.017080,0.026240,0.0,0.000000,0.000000,0.138889,0.0,0.0,0.000000,0.722222,0.138889,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.783383
14,251,90.0,1,0,0,0.805,1.0,590.0,510,141.222230,2.794508,-0.154491,-0.031299,13.595076,7.823010,24.654766,799.068812,31.303647,-0.426568,31.730216,9.913604,15.234011,24.641929,3.566562,0.164129,0.024047,0.005409,0.000031,0.065412,0.021523,0.042120,0.044549,0.0,0.000000,0.111111,0.138889,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.138889,0.000000,0.000000,0.611111,1.093450
15,296,30.0,0,1,0,-1.000,1.0,84.0,540,52.944447,2.603995,0.060170,-0.047103,10.923443,6.195472,31.186498,497.756412,22.305800,2.439920,19.865880,5.365580,18.067261,17.753206,5.325704,0.257844,0.059043,0.004477,0.000212,0.124012,0.020481,0.022377,0.106117,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.111111,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.888889,0.000000,0.000000,0.348832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399987,117,103.0,1,0,0,3.856,4.0,248.0,998,213.277770,0.808984,-0.156508,0.026345,12.377936,8.892305,25.787919,890.096242,29.932637,-4.549809,34.482446,15.757534,23.853752,23.961534,0.402520,0.127413,0.022590,0.002771,0.000029,0.053915,0.014413,0.022958,0.027138,0.0,0.000000,1.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,-0.000000
399990,69,8.0,1,0,0,0.130,1.0,160.0,590,0.000000,0.053453,-0.001716,0.015291,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.027778,0.972222,0.000000,0.126931
399993,334,5.0,0,1,0,-1.000,1.0,297.0,773,3.500000,0.660273,-0.050625,0.165530,27.440800,2.173208,50.560509,62.751009,29.554719,25.256487,4.298232,27.629019,27.418639,28.143704,26.472626,0.203693,0.045899,0.003085,0.000220,0.118625,0.009861,0.034195,0.016353,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,-0.000000
399994,230,60.0,1,0,0,1.609,1.0,375.0,523,9.722222,0.165324,0.049576,0.010508,11.121199,6.797107,35.207923,428.067975,22.086899,2.781283,19.305616,6.783609,12.981573,16.814158,6.157729,0.099681,0.013711,0.001484,0.000017,0.039509,0.009661,0.033293,0.021796,0.0,0.027778,0.972222,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.126931


In [21]:
# Baseline predictions on the test set, using the same coordinate-free features
# as in training.
pred_me = model_me.predict(
    X_test.drop(columns=['longitude', 'latitude'])
)


In [22]:
# Pair the true counts with the baseline predictions thresholded at 0.5
# (values above 0.5 -> 1, otherwise 0).
# NOTE: this rebinds `pred_df`, replacing the AdaSTEM prediction frame built earlier.
pred_df = pd.DataFrame(
    {
        'y_true': y_test.flatten(),
        'y_pred': (pred_me.flatten() > 0.5).astype(int),
    }
)
pred_df = pred_df.dropna()


In [23]:
# Score the baseline with the same evaluation routine and task label that were
# used for the AdaSTEM model.
metrics_me = AdaSTEM.eval_STEM_res(
    'hurdle',
    pred_df['y_true'],
    pred_df['y_pred'],
)


In [24]:
# Write the baseline metrics to disk as JSON.
with open('./test_output/Maxent_simple_metrics.json', mode='w') as f:
    json.dump(obj=metrics_me, fp=f)
    

In [39]:
# Display only the classification metrics from the baseline evaluation.
{
    name: value
    for name, value in metrics_me.items()
    if name in ['AUC', 'kappa', 'f1', 'precision', 'recall', 'average_precision']
}

{'AUC': 0.7170955508504191,
 'kappa': 0.320510630875698,
 'f1': 0.466673915407198,
 'precision': 0.3572201414898044,
 'recall': 0.672832732403198,
 'average_precision': 0.29421951675350466}