In [2]:
"""
Script to cluster tree observations based on spatial proximity and membership to same biome. Uses HDBSCAN algorithm. 
"""

import pandas as pd
import geopandas as gpd
from sklearn import cluster
import numpy as np

from dataset_creation import add_feature_from_polygon_layer

In [4]:


df = pd.read_csv("/home/dibepa/git/global.agb.ml/data/training/raw/tree_allometry/Tallo.csv")
print(df)

tallo = gpd.GeoDataFrame(
    df, 
    geometry = gpd.points_from_xy(df.longitude, df.latitude)
)
tallo.crs = "EPSG:4326"

         tree_id    division      family       genus             species  \
0            T_1  Angiosperm  Betulaceae      Betula    Betula pubescens   
1            T_2  Gymnosperm    Pinaceae       Picea         Picea abies   
2            T_3  Gymnosperm    Pinaceae       Picea         Picea abies   
3            T_4  Gymnosperm    Pinaceae       Picea         Picea abies   
4            T_5  Gymnosperm    Pinaceae       Picea         Picea abies   
...          ...         ...         ...         ...                 ...   
498833  T_498834  Angiosperm   Myrtaceae  Eucalyptus  Eucalyptus regnans   
498834  T_498835  Angiosperm   Myrtaceae  Eucalyptus  Eucalyptus regnans   
498835  T_498836  Angiosperm   Myrtaceae  Eucalyptus  Eucalyptus regnans   
498836  T_498837  Angiosperm   Myrtaceae  Eucalyptus  Eucalyptus regnans   
498837  T_498838  Angiosperm   Myrtaceae  Eucalyptus  Eucalyptus regnans   

        latitude  longitude  stem_diameter_cm  height_m  crown_radius_m  \
0        67.

In [6]:
# Identify biomes to ensure bioclimatic similarity during clustering.

biomes = "/home/dibepa/git/global.agb.ml/data/training/raw/biome_data/Ecoregions2017.shp"

tallo = add_feature_from_polygon_layer(tallo,"BIOME_NAME",biomes,"bio")    
tallo = add_feature_from_polygon_layer(tallo,"REALM",biomes,"bgr") 

In [34]:
# Clustering per biome
previous_cluster = 0
tallo_cluster = tallo.copy(deep=True)
cluster_df = pd.DataFrame()

for bio in np.unique(tallo_cluster["bio"]):

    df = tallo_cluster[tallo_cluster["bio"] == bio]
    
    if len(df.index) > 25:
        min_size = 25
    else: 
        min_size = len(df.index)   

    hdb = cluster.HDBSCAN(min_cluster_size=min_size)
    labels = hdb.fit_predict(df[["latitude","longitude"]])

    labels = np.where(labels > -1, labels + previous_cluster, labels)

    previous_cluster = np.max(labels)

    label_df = pd.DataFrame(columns=["cluster"],index=df.index)
    label_df["cluster"] = labels

    cluster_df = pd.concat([cluster_df,label_df])

tallo_cluster = tallo_cluster.join(cluster_df)
print(tallo_cluster)

         tree_id    division      family       genus           species  \
0            T_1  Angiosperm  Betulaceae      Betula  Betula pubescens   
1            T_2  Gymnosperm    Pinaceae       Picea       Picea abies   
2            T_3  Gymnosperm    Pinaceae       Picea       Picea abies   
3            T_4  Gymnosperm    Pinaceae       Picea       Picea abies   
4            T_5  Gymnosperm    Pinaceae       Picea       Picea abies   
...          ...         ...         ...         ...               ...   
498425  T_498247  -9999.9999  -9999.9999  -9999.9999        -9999.9999   
498426  T_498268  -9999.9999  -9999.9999  -9999.9999        -9999.9999   
498427  T_498274  -9999.9999  -9999.9999  -9999.9999        -9999.9999   
498428  T_498286  -9999.9999  -9999.9999  -9999.9999        -9999.9999   
498429  T_498319  -9999.9999  -9999.9999  -9999.9999        -9999.9999   

        latitude  longitude  stem_diameter_cm  height_m  crown_radius_m  \
0         67.395     28.723         

In [35]:
# Testing

print(len(np.unique(tallo_cluster["cluster"])))
print(np.max(np.unique(tallo_cluster["cluster"])))
print(np.min(np.unique(tallo_cluster["cluster"])))
print(len(tallo_cluster[tallo_cluster["cluster"]==-1].index))
print(len(tallo_cluster[tallo_cluster["cluster"]==0].index))

1902
1900
-1
33432
130


In [37]:
tallo_cluster = tallo_cluster.drop("geometry",axis='columns')
tallo_cluster.to_csv("/home/dibepa/git/global.agb.ml/data/training/tmp_preprocessed/tallo_clusters.csv",index=False)