# GDVSDM

## Initialise GDVSDM

### Load packages

In [None]:
%matplotlib inline
%load_ext autoreload

import os
import sys
import gdal
import pandas as pd
import numpy as np
import xarray as xr
import datacube
import matplotlib.pyplot as plt

# add the shared scripts folder to the import path; this must run before
# the dea_* imports below
# NOTE(review): presumably the DEA notebooks 'Scripts' folder - confirm
sys.path.append('../Scripts')
from dea_datahandling import load_ard
from dea_dask import create_local_dask_cluster
from dea_plotting import display_map, rgb

# add the local scripts folder to the import path before importing gdvsdm
# NOTE(review): presumably gdvsdm.py lives in ./scripts - confirm
sys.path.append('./scripts')
import gdvsdm

## Prepare shapefiles and rasters

### Set shapefile paths

In [None]:
# occurrence (presence) point shapefile used for modelling
# alternative example dataset:
#occur_shp_path = r'./data_rf/bradypus/bradypus_p_webmerc.shp'
occur_shp_path = './data_testing/yandi/points/presence_yandi_subset.shp'

# example polygon mask shapefile
# NOTE(review): this mask is from the bradypus example while the occurrence
# points are from the yandi test data - confirm this pairing is intended
mask_shp_path = './data_rf/bradypus/sa_polymask_diss_webmerc.shp'

# both shapefile paths together (only needed for input validation)
shp_path_list = [
    occur_shp_path,
    mask_shp_path,
]

### Set raster paths

In [None]:
# todo
# in arcgis, on prompt, let the user select continuous and categorical
# raster layers; they are then brought in here as lists of file paths

# earlier example of continuous rasters, kept for reference only (the
# string below is never assigned or used)
"""
rast_cont_list = [
    './data_rf/tif/tmx6190_ann.tif',
    './data_rf/tif/dtr6190_ann.tif',
    './data_rf/tif/h_dem.tif',
    './data_rf/tif/tmn6190_ann.tif',
    './data_rf/tif/pre6190_l4.tif',
    './data_rf/tif/vap6190_ann.tif',
    './data_rf/tif/pre6190_l1.tif',
    './data_rf/tif/cld6190_ann.tif',
    './data_rf/tif/pre6190_l10.tif',
    './data_rf/tif/pre6190_ann.tif',
    './data_rf/tif/pre6190_l7.tif',
    './data_rf/tif/tmp6190_ann.tif',
    './data_rf/tif/frs6190_ann.tif'
]
"""

# folder holding the yandi test rasters and the layer names to include
yandi_rast_dir = './data_testing/yandi/rasters'
yandi_cont_layers = [
    'aspect',
    'curvature',
    'dem_filled',
    'dissection',
    'eastness',
    'hillshade',
    'hli',
    'northness',
    #'roughness',
    'slope_deg',
    'solar_rad',
    'tpi',
    'tri',
    'twi',
]

# continuous rasters: one tif path per layer name, in the order listed above
rast_cont_list = [yandi_rast_dir + '/' + layer + '.tif' for layer in yandi_cont_layers]

# categorical rasters (none selected; example: './data_rf/tif/ecoreg.tif')
rast_cate_list = []

# all raster paths, continuous first and categorical after
rast_path_list = rast_cont_list + rast_cate_list

### Validate input layers

In [None]:
# pass every selected shapefile and raster path to the gdvsdm validator
# before any further processing
# NOTE(review): presumably raises if a layer is invalid - confirm in gdvsdm
gdvsdm.validate_input_data(shp_path_list, rast_path_list)

## Prepare species presence point locations

### Read occurrence shapefile

In [None]:
# read the occurrence shapefile into df_presence
# NOTE(review): presumably a table of point x and y coordinates (the name
# suggests a dataframe rather than a numpy array) - confirm in gdvsdm
df_presence = gdvsdm.read_coordinates_shp(occur_shp_path)

# display result
#print(df_presence)

### Read mask shapefile

In [None]:
# read the mask shapefile into mask_geom
# NOTE(review): presumably dissolves the polygons and returns a single
# multipolygon geometry - confirm in gdvsdm
mask_geom = gdvsdm.read_mask_shp(mask_shp_path)

# display result
#print(mask_geom)

## Load rasters

### Convert to dataset

In [None]:
# combine all selected raster paths into one dataset, ds
# NOTE(review): presumably an xarray dataset with one variable per raster -
# confirm in gdvsdm (todo send to gdv_tools)
ds = gdvsdm.rasters_to_dataset(rast_path_list)

# display result
#print(ds)

## Prepare species pseudo-absence point locations

### Generate pseudo-absence coordinates

In [None]:
# alternative (disabled): generate absences from the shapefile mask instead
#df_absence = gdvsdm.generate_absences_from_shp(mask_shp_path=mask_shp_path, num_abse=500,
#                                               occur_shp_path=occur_shp_path, buff_m=50000)

# generate pseudo-absence points from the dataset pixels and occurrence points
# NOTE(review): num_abse is presumably the number of points requested and
# buff_m a buffer distance in metres around occurrences - confirm in gdvsdm
df_absence = gdvsdm.generate_absences_from_dataset(
    ds=ds,
    num_abse=5000,
    occur_shp_path=occur_shp_path,
    buff_m=500,
    res_factor=3,
    nodata_value=-9999,
)

# display result
#print(df_absence)

### Extract variable values at observed point locations

In [None]:
# extract the dataset variable values at each presence point coordinate
# NOTE(review): res_factor is presumably a multiplier on pixel resolution
# used as the sampling tolerance - confirm in gdvsdm
df_presence_data = gdvsdm.extract_dataset_values(ds=ds, coords=df_presence, res_factor=3)

# display result
#print(df_presence_data)

In [None]:
# remove all presence records containing nodata values, keeping the result
# under the same name
df_presence_data = gdvsdm.remove_nodata_records(df_presence_data)

# display result
#print(df_presence_data)

### Extract variable values at pseudoabsence point locations

In [None]:
# extract the dataset variable values at each pseudo-absence point coordinate
# NOTE(review): res_factor is presumably a multiplier on pixel resolution
# used as the sampling tolerance - confirm in gdvsdm
df_absence_data = gdvsdm.extract_dataset_values(ds=ds, coords=df_absence, res_factor=3)

# display result
#print(df_absence_data)

In [None]:
# remove all pseudo-absence records containing nodata values, keeping the
# result under the same name
df_absence_data = gdvsdm.remove_nodata_records(df_absence_data)

# display result
#print(df_absence_data)

### Equalise records

In [None]:
# equalise absence to match number of presence
# (currently disabled: the call below is commented out, so the absence
# records are passed on unchanged)
#df_absence_data = gdvsdm.equalise_absence_records(df_presence_data, df_absence_data)

# display result
#print(df_absence_data)

### Combine presence and absence records

In [None]:
# combine the presence and pseudo-absence records into one table
# NOTE(review): presumably adds a presence/absence indicator column -
# confirm column name and coding in gdvsdm
df_pres_abse_data = gdvsdm.combine_presence_absence_records(df_presence_data, df_absence_data)

# display result
#print(df_pres_abse_data)

## Investigate variable correlation and variance inflation

### Generate Pearson's correlation matrix

In [None]:
# generate the Pearson's correlation matrix and show it as a figure
# rule of thumb: < 0.6 = weak collinearity, 0.6-0.8 = moderate, >= 0.8 = strong
# NOTE(review): the categorical raster list is presumably passed so those
# variables can be excluded - confirm in gdvsdm
gdvsdm.generate_correlation_matrix(df_pres_abse_data, rast_cate_list, show_fig=True)

### Generate Variance Inflation Factor (VIF) Score

In [None]:
# generate the variance inflation factor (VIF) scores
# rule of thumb: 1 = no multicollinearity, 1-5 = moderate, > 5 = high, > 10 = remove from model
# NOTE(review): the categorical raster list is presumably passed so those
# variables can be excluded - confirm in gdvsdm
gdvsdm.generate_vif_scores(df_pres_abse_data, rast_cate_list)

## Perform Species Distribution Modelling 

### Create estimator

In [None]:
# create the estimator of type 'rf' with n_estimators set to 500
# NOTE(review): 'rf' is presumably a random forest whose other parameters
# are left at their sklearn defaults - confirm in gdvsdm
estimator = gdvsdm.create_estimator(estimator_type='rf', n_estimators=500)

### Generate SDM

In [None]:
# fit the species distribution model: 5 replicates with a test ratio of 0.1,
# test set not equalised, accuracy statistics requested
ds_sdm = gdvsdm.generate_sdm(
    ds,
    df_pres_abse_data,
    estimator,
    rast_cont_list,
    rast_cate_list,
    replicates=5,
    test_ratio=0.1,
    equalise_test_set=False,
    calc_accuracy_stats=True,
)

### Display SDM result

In [None]:
# choose which sdm variable to draw (i.e. sdm_mean, sdm_stdv, sdm_cvar)
metric_name = 'sdm_mean'

# open a new 9 x 7 figure at 85 dpi to hold the map
fig = plt.figure(dpi=85, figsize=(9, 7))

# draw the chosen variable with the 'jet' colour map, no robust stretch
sdm_layer = ds_sdm[metric_name]
sdm_layer.plot(cmap='jet', robust=False)

In [None]:
# keep the selected sdm variable as its own data array for export
da = ds_sdm[metric_name]

In [None]:
# save the whole sdm dataset to a netcdf file in the working directory
ds_sdm.to_netcdf('yandi_sdm.nc')

In [None]:
# cloud optimised geotiff writer from datacube
from datacube.utils.cog import write_cog

# coordinate reference system for the exported raster
crs = 'EPSG:3577'

# export the selected sdm layer with nodata -9999, replacing any existing file
out_fname = 'yandi_sdm_mean.tif'
write_cog(
    geo_im=da,
    fname=out_fname,
    crs=crs,
    nodata=-9999,
    overwrite=True,
)
        

## Temporary dem retriever

### Set up a dask cluster

In [None]:
# initialise the local dask cluster using the dea_dask helper.
# paste the url it reports into the dask panel for more info.
create_local_dask_cluster()

In [None]:
# open up a datacube connection, identifying this notebook as app 'gdvsdm'
dc = datacube.Datacube(app='gdvsdm')

## Study area and data setup

### Set study area

In [None]:
# centre point of each gdv project testing area as (lat, lon), i.e. (y, x)
loc_dict = dict(
    test_a=(-23.28043, 119.85931),
    test_b=(-31.60693, 116.94264),
    test_c=(-22.64623, 120.16237),
    test_d=(-22.74596, 119.10474),
)

# buffer around each centre point as (x, y), i.e. (lon, lat)
buff_dict = dict(
    test_a=(0.5, 0.5),
    test_b=(0.5, 0.5),
    test_c=(0.5, 0.5),
    test_d=(0.25, 0.1),
)

In [None]:
# name of the testing area to work on (a key of loc_dict and buff_dict)
study_area = 'test_d'

# unpack that area's buffer, stored as (x, y), into lon and lat buffer sizes
lon_buff, lat_buff = buff_dict[study_area]

In [None]:
# unpack the centre point of the selected area, stored as (lat, lon)
lat, lon = loc_dict[study_area]

# study boundary: centre point minus/plus the buffer along each axis
lat_extent = (lat - lat_buff, lat + lat_buff)
lon_extent = (lon - lon_buff, lon + lon_buff)

# display onto interactive map
display_map(x=lon_extent, y=lat_extent)

### Load SRTM Digital Elevation Model (Resampled to 30m)

In [None]:
# build the datacube query from the study extent above
# resolution is given as (y, x) in output_crs units and includes direction:
# y is negative so that rows run north to south (a positive y value yields
# a south-up grid)
query = {
    'x': lon_extent,
    'y': lat_extent,
    'measurements': ['dem'],
    'output_crs': 'EPSG:3577',
    'resolution': (-80, 80),
    'group_by': 'solar_day',
}


# load srtm dem dataset # multi_scale_topographic_position
ds_dem = dc.load(product='ga_srtm_dem1sv1_0', **query)

# display dataset
#print(ds_dem)

# plot
#ds_dem['dem'].plot(robust=True, cmap='terrain_r')

In [None]:
# remove the 'time' variable if present, then squeeze out the 'time'
# dimension without keeping it as a coordinate
ds_dem = ds_dem.drop('time', errors='ignore').squeeze('time', drop=True)
#ds_dem['dem'] = ds_dem['dem'].astype('int32')

In [None]:
# helpers used below; they are not imported anywhere else in this notebook
from datacube.testutils.io import rio_slurp_xarray
from datacube.utils.masking import mask_invalid_data

# Load raster and reproject to match satellite dataset
# NOTE(review): ds_10m is not defined anywhere in this notebook - it must be
# loaded beforehand (presumably a 10 m satellite dataset) - confirm
raster_path = './dem_80m.tif'
raster_reprojected = rio_slurp_xarray(raster_path,
                                      ds_10m.geobox,
                                      resampling="bilinear")

# Set nodata to `NaN`
raster_reprojected = mask_invalid_data(raster_reprojected)

In [None]:
# display the dataset
# NOTE(review): ds_10m is not defined anywhere in this notebook - confirm
# where it is meant to be loaded
ds_10m

In [None]:
# store the reprojected raster as the 'dem' variable of ds_10m.
# NaN (masked nodata) is replaced by -9999 before the integer cast, because
# NaN has no int16 representation and would be cast to an arbitrary value
ds_10m['dem'] = raster_reprojected.fillna(-9999).astype('int16')

In [None]:
from odc.algo import xr_reproject

In [None]:
# record the nodata value in the dem variable's attributes
# (a call such as attrs.get(...) cannot be assigned to; index the dict)
ds_dem['dem'].attrs['nodata'] = -9999

In [None]:
# record the nodata value in the dem variable's attributes
dem_attrs = ds_dem['dem'].attrs
dem_attrs['nodata'] = -9999
ds_dem['dem'].attrs = dem_attrs

# cloud optimised geotiff writer from datacube
from datacube.utils.cog import write_cog

# coordinate reference system for the exported raster
crs = 'EPSG:3577'

# export the dem as int16, replacing any existing file
# NOTE(review): the file name says 30m but the dc.load query in this
# notebook requests 80 m pixels - confirm which is intended
dem_int16 = ds_dem['dem'].astype('int16')
write_cog(
    geo_im=dem_int16,
    fname='yandi_srtm_dem_30m.tif',
    crs=crs,
    overwrite=True,
)
        