# Using Maxent as the base model
Yangkang Chen<br>
Sep 5, 2023

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib
import warnings
import pickle
import os
import json

# Optional plot-style and warning-suppression settings, left disabled:
# matplotlib.style.use('ggplot')
# plt.rcParams['axes.facecolor']='w'
# warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Auto-reload imported modules so that edits to their source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the Mallard sample data CSV (path is relative to the working directory).
# A plain string literal is used: the path has no placeholders to format.
data = pd.read_csv('./stemflow/dataset/test_data/Sample_data_Mallard.csv')

# Get X and y

In [4]:
# Response: the raw 'count' column as an array; predictors: every other column.
y = data['count'].values
X = data.drop(columns='count')


# First things first: Spatio-temporal train test split

In [5]:
from stemflow.model_selection import ST_train_test_split

# Split with 50 spatial and 50 temporal blocks, test_size=0.3, and a fixed
# random_state so the split is repeatable.
X_train, X_test, y_train, y_test = ST_train_test_split(
    X,
    y,
    Spatio_blocks_count=50,
    Temporal_blocks_count=50,
    random_state=42,
    test_size=0.3,
)

# Train AdaSTEM classifier with Maxent as the base model

In [6]:
from stemflow.model.AdaSTEM import AdaSTEM, AdaSTEMClassifier, AdaSTEMRegressor
from xgboost import XGBClassifier, XGBRegressor
from stemflow.model.Hurdle import Hurdle_for_AdaSTEM, Hurdle
import elapid as ela


In [7]:
## create model instance
# AdaSTEM classifier wrapping a Maxent base model (cloglog transform, beta_multiplier=2.0).
model = AdaSTEMClassifier(
    base_model=ela.MaxentModel(transform='cloglog', beta_multiplier=2.0),
    save_gridding_plot=True,
    # ensemble settings
    ensemble_fold=10,
    min_ensemble_required=7,
    # upper/lower thresholds on grid length along longitude and latitude
    grid_len_lon_upper_threshold=50,
    grid_len_lon_lower_threshold=5,
    grid_len_lat_upper_threshold=50,
    grid_len_lat_lower_threshold=5,
    # temporal windowing
    temporal_step=50,
    temporal_bin_interval=50,
    # presumably the minimum number of points required per grid cell; confirm against stemflow docs
    points_lower_threshold=100,
)


In [9]:
## fit model
# Binarize counts into presence (1) / absence (0) labels for the classifier.
model.fit(X_train, (y_train > 0).astype(int))

Generating Ensemble: 100%|██████████| 10/10 [00:14<00:00,  1.47s/it]
training: 100%|██████████| 13142/13142 [1:57:47<00:00,  1.86it/s]  


In [10]:
## save model
# Create the output directory first: open() does not create missing parent
# directories, and a failure here would come only after the long-running fit.
os.makedirs('./test_output', exist_ok=True)
with open('./test_output/maxent_adastem.pkl', 'wb') as f:
    pickle.dump(model, f)
    

In [11]:
## predict
# Predict on the held-out test features. The result is flattened and rows with
# NaN are dropped in the next cell, so some entries may be missing (TODO confirm).
pred_adastem = model.predict(X_test)


In [12]:
## save prediction results
# Flatten once and clamp negative predictions to zero; rows containing NaN
# in either column are removed by dropna().
pred_flat = pred_adastem.flatten()
pred_df = pd.DataFrame({
    'y_true': y_test.flatten(),
    'y_pred_adastem': np.where(pred_flat < 0, 0, pred_flat),
}).dropna()

## calculate metrics
# y_pred_adastem was already clamped when the frame was built, so it is
# passed through as-is instead of being clamped a second time.
metric_dict = AdaSTEM.eval_STEM_res(
    'hurdle',
    pred_df['y_true'].to_numpy(),
    pred_df['y_pred_adastem'].to_numpy(),
)



In [14]:
# Persist the AdaSTEM evaluation metrics as JSON.
with open('./test_output/Maxent_AdasTEM_metrics.json', mode='w') as metrics_file:
    json.dump(metric_dict, metrics_file)
    

In [38]:
# Display only the selected subset of metrics.
{name: value for name, value in metric_dict.items()
 if name in ('AUC', 'kappa', 'f1', 'precision', 'recall', 'average_precision')}

{'AUC': 0.7110742597058116,
 'kappa': 0.3594605543477699,
 'f1': 0.512802988882814,
 'precision': 0.43125910031420034,
 'recall': 0.6323744240925947,
 'average_precision': 0.34673055053552077}

# Comparison with a simple Maxent model

In [15]:
## create model instance
# Stand-alone Maxent model using the same transform and beta_multiplier as the
# base model passed to AdaSTEMClassifier, serving as the comparison baseline.
model_me = ela.MaxentModel(transform='cloglog', beta_multiplier=2.0)

In [16]:
## fit model
# Coordinate columns are excluded from the predictors; counts are binarized
# into presence (1) / absence (0) labels.
model_me.fit(X_train.drop(columns=['longitude', 'latitude']), (y_train > 0).astype(int))

In [17]:
## save model
# Serialize the fitted stand-alone Maxent model to disk.
with open('./test_output/maxent_simple.pkl', mode='wb') as model_file:
    pickle.dump(model_me, model_file)
    

In [21]:
## predict on test set
# Drop the coordinate columns so the features match those used for fitting.
pred_me = model_me.predict(X_test.drop(columns=['longitude', 'latitude']))


In [22]:
## save prediction results
# Threshold the Maxent output at 0.5 to get binary presence/absence predictions.
# NOTE(review): a NaN prediction compares False and becomes 0 here, so dropna()
# can only remove rows whose y_true is NaN.
pred_df = pd.DataFrame({
    'y_true': y_test.flatten(),
    'y_pred': (pred_me.flatten() > 0.5).astype(int),
}).dropna()

## calculate metrics
metrics_me = AdaSTEM.eval_STEM_res('hurdle', pred_df['y_true'], pred_df['y_pred'])


In [24]:
# Persist the stand-alone Maxent evaluation metrics as JSON.
with open('./test_output/Maxent_simple_metrics.json', mode='w') as metrics_file:
    json.dump(metrics_me, metrics_file)
    

In [39]:
# Display only the selected subset of metrics.
{name: value for name, value in metrics_me.items()
 if name in ('AUC', 'kappa', 'f1', 'precision', 'recall', 'average_precision')}

{'AUC': 0.7170955508504191,
 'kappa': 0.320510630875698,
 'f1': 0.466673915407198,
 'precision': 0.3572201414898044,
 'recall': 0.672832732403198,
 'average_precision': 0.29421951675350466}