# Prediction and Visualization

Yangkang Chen<br>
Sep 7, 2023

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import matplotlib
import warnings
import pickle
import os
import seaborn as sns

# warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# library source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the sample Mallard dataset. The path is relative to the current working
# directory. (Plain string literal: the previous f-prefix had no placeholders.)
data = pd.read_csv('./stemy/dataset/test_data/Sample_data_Mallard.csv')


# Get X and y

In [4]:
# The 'count' column is the response; every other column is kept as a predictor.
y = data['count'].values
X = data.drop(columns='count')


# First things first: spatio-temporal train-test split

In [5]:
from stemflow.model_selection import ST_train_test_split

# Spatio-temporal split with test_size=0.2, using 50 spatial and 50 temporal blocks.
# random_state is pinned (presumably so the split is repeatable between runs).
X_train, X_test, y_train, y_test = ST_train_test_split(
    X,
    y,
    Spatio_blocks_count=50,
    Temporal_blocks_count=50,
    random_state=42,
    test_size=0.2,
)


# Train AdaSTEM hurdle model

In [6]:
from stemflow.model.AdaSTEM import AdaSTEM, AdaSTEMClassifier, AdaSTEMRegressor
from xgboost import XGBClassifier, XGBRegressor
from stemflow.model.Hurdle import Hurdle_for_AdaSTEM, Hurdle


In [36]:
# Settings shared by BOTH halves of the hurdle model. They were previously
# copy-pasted for the classifier and the regressor; keeping a single definition
# guarantees the two AdaSTEM wrappers are always configured identically.
ADASTEM_PARAMS = dict(
    save_gridding_plot=True,
    ensemble_fold=10,
    min_ensemble_required=7,
    grid_len_lon_upper_threshold=80,
    grid_len_lon_lower_threshold=10,
    grid_len_lat_upper_threshold=80,
    grid_len_lat_lower_threshold=10,
    points_lower_threshold=20,
    temporal_start=1,
    temporal_end=366,
    temporal_step=20,
    temporal_bin_interval=50,
    Spatio1='longitude',
    Spatio2='latitude',
    Temporal1='DOY',
    use_temporal_to_train=True,
)

# Settings shared by the two XGBoost base learners.
XGB_PARAMS = dict(tree_method='hist', random_state=42, verbosity=0, n_jobs=1)

# Hurdle model: an AdaSTEM classifier and an AdaSTEM regressor, each wrapping
# its own XGBoost base model.
model = Hurdle_for_AdaSTEM(
    classifier=AdaSTEMClassifier(
        base_model=XGBClassifier(**XGB_PARAMS),
        **ADASTEM_PARAMS,
    ),
    regressor=AdaSTEMRegressor(
        base_model=XGBRegressor(**XGB_PARAMS),
        **ADASTEM_PARAMS,
    ),
)





In [37]:
# Give the training frame a fresh 0..n-1 index before fitting.
# NOTE(review): presumably needed because the split keeps the original row labels — confirm.
X_train_reindexed = X_train.reset_index(drop=True)
model.fit(X_train_reindexed, y_train)

Generating Ensemble: 100%|██████████| 10/10 [00:38<00:00,  3.90s/it]
training: 100%|██████████| 28547/28547 [12:49<00:00, 37.10it/s] 
Generating Ensemble: 100%|██████████| 10/10 [00:06<00:00,  1.60it/s]
training: 100%|██████████| 8645/8645 [08:18<00:00, 17.34it/s]


# Save model

In [None]:
# Make sure the output folder exists first: open(..., 'wb') does not create
# missing directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./test_output', exist_ok=True)

# Persist the fitted hurdle model so it can be reloaded without retraining.
with open('./test_output/01.demo_adastem_model.pkl','wb') as f:
    pickle.dump(model, f)
    

# Evaluation

In [2]:
# Predict on the held-out test features.
# NOTE(review): the recorded output of this cell is a NameError from a kernel in
# which `model` had not been created; re-run after the training cell.
pred = model.predict(X_test)


NameError: name 'model' is not defined

In [40]:
# Fraction of test rows for which the model returned NaN (no prediction made).
pred_flat = pred.flatten()
n_missing = np.isnan(pred_flat).sum()
perc = n_missing / len(pred_flat)
print(f'Percentage not predictable {round(perc*100, 2)}%')

Percentage not predictable 17.46%


In [41]:
# Pair observed and predicted values for scoring. Negative predictions are
# floored at zero; NaN predictions survive np.where and are then removed
# row-wise by dropna().
y_pred_flat = pred.flatten()
y_pred_floored = np.where(y_pred_flat < 0, 0, y_pred_flat)
pred_df = pd.DataFrame({
    'y_true': y_test.flatten(),
    'y_pred': y_pred_floored,
}).dropna()


In [42]:
# Score the predictions with the 'hurdle' task setting; the call returns a dict
# of metrics, which is displayed as the cell output.
AdaSTEM.eval_STEM_res('hurdle', pred_df['y_true'], pred_df['y_pred'])


{'AUC': 0.6980345039052719,
 'kappa': 0.43077631408096195,
 'f1': 0.5304336055588142,
 'precision': 0.6028099682083888,
 'recall': 0.4735739606832098,
 'average_precision': 0.39021517760120394,
 'Spearman_r': 0.44708603015455384,
 'Pearson_r': 0.1433973532055428,
 'R2': -0.3195830752536277,
 'MAE': 5.2570484840707286,
 'MSE': 2409.862916210604,
 'poisson_deviance_explained': 0.11704860924872906}

# Predict

In [43]:
# Load the prediction set (2020, per the file name) that the fitted model is
# applied to in the following cells. The path is relative to the working directory.
pred_set = pd.read_csv('./stemy/dataset/test_data/Predset_2020.csv')


In [44]:
## reduce the prediction size
# Bin every point onto a coarse lon/lat lattice (500 bin edges per axis).
lng_edges = np.linspace(-180, 180, 500)
lat_edges = np.linspace(-90, 90, 500)
pred_set['lng_grid'] = np.digitize(pred_set.longitude, lng_edges)
pred_set['lat_grid'] = np.digitize(pred_set.latitude, lat_edges)

# Shuffle, then keep one representative per occupied grid cell
# (GroupBy.first takes the first non-null value of each column).
# random_state is fixed so the thinned prediction set -- and therefore every
# downstream map -- is identical on each run; without it the shuffle differs
# every time the cell executes.
# The helper columns lng_grid/lat_grid become the group index and are discarded
# by reset_index(drop=True), so no explicit drop is needed afterwards.
pred_set = (
    pred_set
    .sample(frac=1, replace=False, random_state=42)
    .groupby(['lng_grid', 'lat_grid'])
    .first()
    .reset_index(drop=True)
)



In [45]:
# Covariates held constant for every prediction day. They do not depend on the
# day of year, so they are written once instead of on each of the 366 iterations.
FIXED_COVARIATES = {
    'duration_minutes': 60,
    'Traveling': 1,
    'Stationary': 0,
    'Area': 0,
    'effort_distance_km': 1,
    'number_observers': 1,
    'obsvr_species_count': 500,
    'time_observation_started_minute_of_day': 420,
}

# Create DOY first (it is overwritten inside the loop) so that any newly added
# columns end up in the same order as when everything was assigned per iteration.
pred_set['DOY'] = 1
for covariate, value in FIXED_COVARIATES.items():
    pred_set[covariate] = value

# One prediction array per day of year, in order: pred_list[0] is DOY 1,
# pred_list[365] is DOY 366.
pred_list = []
for doy in tqdm(range(1, 367)):
    pred_set['DOY'] = doy
    # Missing predictor values are encoded as -1 before being passed to the model.
    # The result is bound to `doy_pred` rather than `pred`, so the test-set
    # predictions used by the evaluation cells are not overwritten.
    doy_pred = model.predict(pred_set.fillna(-1))
    pred_list.append(doy_pred)


  0%|          | 0/366 [00:00<?, ?it/s]

In [71]:
# Build one long-format frame per predicted day.
# pred_list[i] holds the prediction for DOY i + 1 (the prediction loop iterates
# over range(1, 367)), so enumeration must start at 1. Starting at 0 labelled
# every frame one day early (DOY 0..365 instead of 1..366).
pred_df = [
    pd.DataFrame({
        'longitude': pred_set.longitude.values,
        'latitude': pred_set.latitude.values,
        'DOY': doy,
        'pred': np.array(doy_pred).flatten(),
    })
    for doy, doy_pred in enumerate(pred_list, start=1)
]

In [72]:
# Stack the per-day frames row-wise into one long table.
# Note: this rebinds `pred_df` from a list of frames to a single DataFrame, so
# the cell cannot be re-run on its own -- re-run the cell that builds the list first.
pred_df = pd.concat(pred_df, axis=0)

## Make GIF

In [73]:
from stemflow.utils.plot_gif import make_sample_gif

In [None]:
# Render the stacked daily predictions ('pred' column) to ./test_output/pred_gif.gif,
# with longitude/latitude as the spatial axes and DOY as the time axis.
make_sample_gif(
    pred_df,
    './test_output/pred_gif.gif',
    col='pred',
    max_frame=366,
    log_scale=True,
    Spatio1='longitude',
    Spatio2='latitude',
    Temporal1='DOY',
    figsize=(18, 9),
    xlims=(-180, 180),
    ylims=(-90, 90),
    grid=True,
    xtick_interval=20,
    ytick_interval=20,
    lng_size=360,
    lat_size=180,
    dpi=300,
    fps=30,
)


![Animated GIF of the daily predictions](../pred_gif.gif)

In [152]:
# Record the environment of this run (date, Python/IPython versions, versions
# of the imported packages, watermark version) alongside the results.
%load_ext watermark
%watermark -n -u -v -iv -w

Last updated: Thu Sep 07 2023

Python implementation: CPython
Python version       : 3.9.7
IPython version      : 8.14.0

seaborn   : 0.11.2
matplotlib: 3.7.1
numpy     : 1.24.3
pandas    : 2.0.3

Watermark: 2.3.1

