# Dynamic ecoregions

## Define inputs

In [1]:
# Run configuration: every value a reader may want to change lives in this cell.

# Files read and written by the run.
input_file = "data/data-continental-united-states.csv"
output_file = "output.geojson"
log_file = "run.log"

# Analysis parameters passed to the build steps further down.
geohash_precision = 4
num_clusters = 12

## Set up logging

In [2]:
import logging

# Send INFO-and-above log records to `log_file`.
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() is a silent no-op whenever the root logger is
# already configured (re-running this cell, or a kernel that pre-installs
# handlers), and nothing would be written to `log_file`.
logging.basicConfig(
    filename=log_file,
    encoding="utf-8",
    level=logging.INFO,
    force=True,
)

## Data flow

In [3]:
from src.mermaid import build_mermaid_graph

# Last expression of the cell, so whatever build_mermaid_graph() returns is
# shown as the cell output (presumably a diagram of the data flow; confirm
# against src/mermaid).
build_mermaid_graph()

## `DarwinCoreCsvLazyFrame`

### Build

In [4]:
from src.lazyframes.darwin_core_csv import DarwinCoreCsvLazyFrame

# Wrap the occurrence CSV named by `input_file`; later build steps take this
# object as their source of occurrence records.
darwin_core_csv_lazy_frame = DarwinCoreCsvLazyFrame.build(input_file)

### Preview

In [5]:
# Peek at the first three occurrence rows; collect() materializes the lazy query.
darwin_core_csv_lazy_frame.lf.head(3).collect()

gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,infraspecificEpithet,taxonRank,scientificName,verbatimScientificName,verbatimScientificNameAuthorship,countryCode,locality,stateProvince,occurrenceStatus,individualCount,publishingOrgKey,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,coordinatePrecision,elevation,elevationAccuracy,depth,depthAccuracy,eventDate,day,month,year,taxonKey,speciesKey,basisOfRecord,institutionCode,collectionCode,catalogNumber,recordNumber,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
str,str,str,enum,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,u64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""4424226317""","""50c9509d-22c7-4a22-a47d-8c4842…","""https://www.inaturalist.org/ob…","""Animalia""","""Arthropoda""","""Insecta""","""Odonata""","""Libellulidae""","""Sympetrum""",,,"""GENUS""","""Sympetrum Newman, 1833""","""Sympetrum""",,"""US""",,"""New York""","""PRESENT""",,"""28eb1a3f-1c15-4a95-931a-4af90e…",42.96266,-76.740947,"""2191.0""",,,,,,"""2019-09-06T10:30""","""6""","""9""","""2019""",1428195,,"""HUMAN_OBSERVATION""","""iNaturalist""","""Observations""","""32505913""",,"""Amy Guala""","""2019-09-11T01:33:07""","""CC_BY_NC_4_0""","""Amy Guala""","""Amy Guala""",,,"""2024-12-03T00:52:33.467Z""","""StillImage""","""COORDINATE_ROUNDED;CONTINENT_D…"
"""2423028148""","""50c9509d-22c7-4a22-a47d-8c4842…","""https://www.inaturalist.org/ob…","""Animalia""","""Arthropoda""","""Insecta""","""Lepidoptera""","""Geometridae""","""Nematocampa""","""Nematocampa resistaria""",,"""SPECIES""","""Nematocampa resistaria (Herric…","""Nematocampa resistaria""",,"""US""",,"""Vermont""","""PRESENT""",,"""28eb1a3f-1c15-4a95-931a-4af90e…",43.822608,-72.641412,"""5.0""",,,,,,"""2019-09-11T22:08:52""","""11""","""9""","""2019""",1991012,"""1991012""","""HUMAN_OBSERVATION""","""iNaturalist""","""Observations""","""32566956""",,"""bugguytad""","""2019-09-12T02:25:38""","""CC_BY_NC_4_0""","""bugguytad""","""bugguytad""",,,"""2024-12-03T01:38:46.617Z""","""StillImage;StillImage;StillIma…","""COORDINATE_ROUNDED;CONTINENT_D…"
"""2423041120""","""50c9509d-22c7-4a22-a47d-8c4842…","""https://www.inaturalist.org/ob…","""Animalia""","""Arthropoda""","""Insecta""","""Odonata""","""Coenagrionidae""","""Argia""","""Argia moesta""",,"""SPECIES""","""Argia moesta (Hagen, 1861)""","""Argia moesta""",,"""US""",,"""Texas""","""PRESENT""",,"""28eb1a3f-1c15-4a95-931a-4af90e…",30.99463,-103.662658,,,,,,,"""2019-09-08T09:01""","""8""","""9""","""2019""",5051399,"""5051399""","""HUMAN_OBSERVATION""","""iNaturalist""","""Observations""","""32612673""",,"""Todd Fitzgerald""","""2019-09-12T22:11:30""","""CC_BY_NC_4_0""","""Todd Fitzgerald""","""Todd Fitzgerald""",,,"""2024-12-03T01:38:10.816Z""","""StillImage""","""COORDINATE_ROUNDED;CONTINENT_D…"


## `GeohashDataFrame`

### Build

In [6]:
from src.dataframes.geohash import GeohashDataFrame

# Build the geohash table from the occurrence data at the configured precision.
# The resulting `.df` is what the connectivity matrix is later built from.
geohash_dataframe = GeohashDataFrame.build(darwin_core_csv_lazy_frame, geohash_precision)

Before:  (20759, 2)
After:  (20547, 2)


### Preview

In [7]:
# First six geohashes together with their neighbor lists.
geohash_dataframe.df.head(6)

geohash,neighbors
str,list[str]
"""97zz""","[""9ebn""]"
"""9ebn""","[""97zz""]"
"""9ec6""","[""9ecd"", ""9ece""]"
"""9ecd""","[""9ece"", ""9ecg"", ""9ec6""]"
"""9ece""","[""9ecg"", ""9ecu"", … ""9ec6""]"
"""9ecg""","[""9ecu"", ""9efh"", … ""9ece""]"


## `TaxonomyDataFrame`

### Build

In [8]:
from src.dataframes.taxonomy import TaxonomyDataFrame

# Build the taxonomy table from the same occurrence source; it is consumed
# later when the per-cluster taxa statistics are computed.
taxonomy_dataframe = TaxonomyDataFrame.build(darwin_core_csv_lazy_frame)

### Preview

In [9]:
# First three taxonomy rows.
taxonomy_dataframe.df.head(3)

kingdom,phylum,class,order,family,genus,species
enum,str,str,str,str,str,str
"""Animalia""","""Arthropoda""","""Insecta""","""Coleoptera""","""Apionidae""","""Aspidapion""","""Aspidapion radiolus"""
"""Animalia""","""Arthropoda""","""Insecta""","""Lepidoptera""","""Pyralidae""","""Moodna""","""Moodna ostrinella"""
"""Animalia""","""Porifera""","""Demospongiae""","""Verongiida""","""Aplysinidae""","""Aplysina""","""Aplysina fistularis"""


## `GeohashSpeciesCountsDataFrame`

### Build

In [10]:
from src.dataframes.geohash_species_counts import GeohashSpeciesCountsDataFrame

# Build the per-geohash count table from the occurrences and the geohash table.
# NOTE(review): the variable says "taxa_counts" while the class says
# "SpeciesCounts"; consider aligning the two names.
geohash_taxa_counts_dataframe = GeohashSpeciesCountsDataFrame.build(
    darwin_core_csv_lazy_frame,
    geohash_dataframe,
    geohash_precision,
)

### Shape

In [18]:
# (rows, columns) of the count table, as a quick size check before the
# matrix-building steps.
geohash_taxa_counts_dataframe.df.shape

(7709233, 5)

### Preview

In [11]:
# First three rows of the per-geohash count table.
geohash_taxa_counts_dataframe.df.head(3)

geohash,kingdom,rank,name,count
str,enum,enum,str,u32
"""97zz""","""Animalia""","""species""","""Eschrichtius robustus""",1
"""9ebn""","""Animalia""","""species""","""Eschrichtius robustus""",1
"""9ec6""","""Animalia""","""species""",,1


## `ConnectivityMatrix`

### Build

In [12]:
from src.matrices.connectivity import ConnectivityMatrix

# Build the connectivity matrix from the geohash table alone, so its size
# follows the number of geohashes in `geohash_dataframe`.
connectivity_matrix = ConnectivityMatrix.build(geohash_dataframe)

In [21]:
# TMP: list the distinct geohashes in `geohash_dataframe`.
# TODO: delete this cell once the row-count mismatch between the connectivity
# matrix and the distance matrix is resolved. When investigating, compare this
# set with the distinct geohashes in `geohash_taxa_counts_dataframe`.
geohash_dataframe.df["geohash"].unique()


geohash
str
"""dptv"""
"""9udn"""
"""dr8z"""
"""d5c7"""
"""dj29"""
…
"""9zz5"""
"""c8sk"""
"""9zzr"""
"""f0k0"""


### Preview

In [14]:
# Display the underlying array for a visual sanity check.
# NOTE(review): this reaches into a private attribute; prefer a public
# accessor if ConnectivityMatrix offers one.
connectivity_matrix._connectivity_matrix

array([[0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0]], shape=(20547, 20547))

## `DistanceMatrix`

### Build

In [15]:
from src.matrices.distance import DistanceMatrix

# Build the pairwise distance matrix from the count table only, so its size
# follows the geohashes present in `geohash_taxa_counts_dataframe`, not those
# in `geohash_dataframe`.
distance_matrix = DistanceMatrix.build(geohash_taxa_counts_dataframe)

### Preview

In [16]:
# Display the distances as a square (n x n) array.
# NOTE(review): this materializes the full dense matrix just to show its repr.
distance_matrix.squareform()

array([[0.        , 0.        , 0.0705359 , ..., 0.06098313, 0.14197394,
        0.13851372],
       [0.        , 0.        , 0.0705359 , ..., 0.06098313, 0.14197394,
        0.13851372],
       [0.0705359 , 0.0705359 , 0.        , ..., 0.03423142, 0.12815178,
        0.12587236],
       ...,
       [0.06098313, 0.06098313, 0.03423142, ..., 0.        , 0.12217617,
        0.12030488],
       [0.14197394, 0.14197394, 0.12815178, ..., 0.12217617, 0.        ,
        0.16932232],
       [0.13851372, 0.13851372, 0.12587236, ..., 0.12030488, 0.16932232,
        0.        ]], shape=(20539, 20539))

## `GeohashClusterDataFrame`

### Build

In [17]:
from src.dataframes.geohash_cluster import GeohashClusterDataFrame

# Cluster the geohashes using the distance matrix, the connectivity matrix and
# the requested number of clusters.
# NOTE(review): the recorded run of this cell fails with
# "Wrong shape for connectivity matrix: (20547, 20547) when X is (20539, 20539)".
# The connectivity matrix is built from `geohash_dataframe`, the distance matrix
# from `geohash_taxa_counts_dataframe`; presumably some geohashes have no rows
# in the count table. Both matrices must cover the same geohashes before this
# call can succeed. TODO: confirm and align the inputs upstream.
geohash_cluster_dataframe = GeohashClusterDataFrame.build(
    geohash_dataframe,
    distance_matrix,
    connectivity_matrix,
    num_clusters,
)

ValueError: Wrong shape for connectivity matrix: (20547, 20547) when X is (20539, 20539)

### Preview

In [None]:
# First three geohash-to-cluster assignments.
geohash_cluster_dataframe.df.head(3)

## `ClusterColorDataFrame`

### Build

In [18]:
from src.dataframes.cluster_color import ClusterColorDataFrame

# Derive the cluster colors from the cluster assignments; the plots and the
# GeoJSON/HTML outputs below look colors up through this object.
cluster_colors_dataframe = ClusterColorDataFrame.build(geohash_cluster_dataframe)

### Preview

In [None]:
# First three cluster color rows.
cluster_colors_dataframe.df.head(3)

## Silhouette score

In [None]:
# TMP

from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import polars as pl

# Build the square distance matrix and fetch the label column once, then reuse
# them for both silhouette computations instead of rebuilding them per call.
silhouette_distances = distance_matrix.squareform()
cluster_labels = geohash_cluster_dataframe.df["cluster"]

# Mean silhouette coefficient over all geohashes. Kept in a variable so it can
# be shown in the figure title rather than being computed and discarded.
mean_silhouette = silhouette_score(
    X=silhouette_distances,
    labels=cluster_labels,
    metric="precomputed",
)

# Per-geohash silhouette coefficients, in the same row order as the labels.
samples = silhouette_samples(
    X=silhouette_distances,
    labels=cluster_labels,
    metric="precomputed",
)

# Attach each geohash's silhouette coefficient to its cluster assignment.
new = geohash_cluster_dataframe.df.with_columns(
    pl.Series(
        name="sample",
        values=samples,
    )
)

# Sorted so that clusters are stacked in the same order on every run.
distinct_clusters = cluster_labels.unique().sort()
n_clusters = len(distinct_clusters)
n_geohashes = len(silhouette_distances)

# The dense matrix is no longer needed; drop the reference so the kernel can
# reclaim the memory.
del silhouette_distances

# Single axes holding one horizontal silhouette band per cluster.
fig, ax1 = plt.subplots()
fig.set_size_inches(18, 7)
# The (n_clusters + 1) * 10 term leaves blank space between the silhouette
# bands of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, n_geohashes + (n_clusters + 1) * 10])
ax1.set_title(f"Silhouette plot (mean silhouette score: {mean_silhouette:.3f})")
ax1.set_xlabel("Silhouette coefficient")
ax1.set_ylabel("Geohashes, grouped by cluster")

y_lower = 10
for cluster in distinct_clusters:
    # Coefficients of this cluster, largest first, so each band tapers.
    ith_cluster_silhouette_values = (
        new.filter(pl.col("cluster") == cluster)
        .sort("sample", descending=True)
        .get_column("sample")
    )

    size_cluster_i = len(ith_cluster_silhouette_values)
    y_upper = y_lower + size_cluster_i

    color = cluster_colors_dataframe.get_color_for_cluster(cluster)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values.to_numpy(),
        facecolor=color,
        edgecolor=color,
    )

    # Start the next band 10 rows above this one (the blank gap).
    y_lower = y_upper + 10


## `ClusterTaxaStatisticsDataFrame`

### Build

In [21]:
from src.dataframes.cluster_taxa_statistics import ClusterTaxaStatisticsDataFrame

# Combine the per-geohash counts, the cluster assignments and the taxonomy
# table into per-cluster taxa statistics.
cluster_taxa_statistics_dataframe = ClusterTaxaStatisticsDataFrame.build(
    geohash_taxa_counts_dataframe,
    geohash_cluster_dataframe,
    taxonomy_dataframe,
)

### Preview

In [None]:
# First three rows of the per-cluster taxa statistics.
cluster_taxa_statistics_dataframe.df.head(3)

## `ClusterSignificantDifferencesDataFrame`

### Build

In [23]:
from src.dataframes.cluster_significant_differences import ClusterSignificantDifferencesDataFrame

# Derive the significant-differences table from the per-cluster taxa
# statistics; it feeds the HTML report and the clustermap columns below.
cluster_significant_differences_dataframe = ClusterSignificantDifferencesDataFrame.build(
    cluster_taxa_statistics_dataframe,
)

### Preview

In [None]:
# First three rows of the significant-differences table.
cluster_significant_differences_dataframe.df.head(3)

## Build and plot GeoJSON feature collection

In [None]:
from src.geojson import build_geojson_feature_collection, write_geojson
from src.render import plot_clusters

# Turn the cluster assignments and their colors into a GeoJSON feature
# collection.
feature_collection = build_geojson_feature_collection(
    geohash_cluster_dataframe,
    cluster_colors_dataframe,
)

# NOTE(review): stale call left commented out; none of its arguments are
# defined in this notebook. Delete it or restore it against current names.
# print_results(darwin_core_aggregations, all_stats, cluster_dataframe)

# Persist the collection to the path configured in `output_file`.
write_geojson(feature_collection, output_file)

# Render the same collection inline.
plot_clusters(feature_collection)

## Build and display HTML output

In [None]:
from src.html_output import build_html_output
from IPython.display import display, HTML

# Build the HTML report from the cluster colors and the significant-differences
# table.
html = build_html_output(
    cluster_colors_dataframe,
    cluster_significant_differences_dataframe,
)

# Wrap the markup in HTML(...) so the notebook renders it instead of printing
# the raw string.
display(HTML(html))


In [None]:
from sklearn.manifold import TSNE

# Build the square distance matrix once; it is needed both to cap the
# perplexity and as the input to fit_transform().
tsne_distances = distance_matrix.squareform()

tsne = TSNE(
    n_components=2,
    random_state=42,  # fixed seed so the embedding is reproducible
    metric="precomputed",
    init="random",  # required by scikit-learn when metric="precomputed"
    # t-SNE requires perplexity < n_samples; cap it so small datasets still run.
    perplexity=min(30, tsne_distances.shape[0] - 1),
)
X_tsne = tsne.fit_transform(tsne_distances)

# The dense matrix is no longer needed; let the kernel reclaim the memory.
del tsne_distances

# Final KL divergence of the fit, shown as the cell output.
tsne.kl_divergence_


In [None]:
import seaborn as sns

# Plot the 2-D t-SNE embedding, one point per geohash, colored by its cluster
# using the shared cluster palette.
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    alpha=1,
    palette=cluster_colors_dataframe.to_dict(),
    hue=geohash_cluster_dataframe.df["cluster"],
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import polars as pl
from scipy.cluster.hierarchy import linkage

from src.darwin_core import TaxonRank

# Hierarchical linkage over the geohashes, reused as the row dendrogram of the
# clustermap.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; confirm that the metric behind `distance_matrix`
# qualifies.
linkage_array = linkage(distance_matrix.condensed(), "ward")

# maintain_order=True keeps the geohashes in the order they appear in the
# frame. Without it polars may return them in any order, so the heatmap rows
# and row colors could not line up with the rows of the linkage.
# NOTE(review): this assumes the distance matrix rows follow the same order and
# cover the same geohashes as `geohash_dataframe`. TODO: confirm.
ordered_geohashes = geohash_dataframe.df["geohash"].unique(maintain_order=True)


def min_max_normalize(series: pl.Series) -> pl.Series:
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1.

    A series with no spread (empty, all null, or all values equal) has no
    meaningful scale; it is returned as all zeros instead of dividing by zero.
    """
    low = series.min()
    high = series.max()
    if low is None or high == low:
        return pl.Series(
            name=series.name,
            values=[0.0] * len(series),
            dtype=pl.Float64,
        )
    return (series - low) / (high - low)


# One color per geohash, in row order (passed as the clustermap's row colors).
col_colors = [
    cluster_colors_dataframe.get_color_for_cluster(
        geohash_cluster_dataframe.cluster_for_geohash(geohash)
    )
    for geohash in ordered_geohashes
]

taxa_counts = geohash_taxa_counts_dataframe.df

# Total count per geohash. This does not depend on the species, so it is
# aggregated once up front instead of once per (species, geohash) pair.
total_count_by_geohash = dict(
    taxa_counts.group_by("geohash").agg(pl.col("count").sum()).iter_rows()
)

data = {}
for species in cluster_significant_differences_dataframe.df["taxon"].unique():
    # Overall average for this species (the row whose cluster is null). It does
    # not depend on the geohash, so it is looked up once per species.
    all_average = (
        cluster_taxa_statistics_dataframe.df.filter(
            # pl.col("kingdom") == kingdom, FIXME
            pl.col("name") == species,
            pl.col("cluster").is_null(),
            pl.col("rank") == TaxonRank.species,
        )
        .get_column("average")
        .item()
    )

    # Count of this species per geohash, aggregated in a single pass.
    species_count_by_geohash = dict(
        taxa_counts.filter(pl.col("name") == species)
        .group_by("geohash")
        .agg(pl.col("count").sum())
        .iter_rows()
    )

    counts = []
    for geohash in ordered_geohashes:
        geohash_counts_all = total_count_by_geohash.get(geohash, 0)
        geohash_counts_species = species_count_by_geohash.get(geohash, 0)
        # A geohash with no counted occurrences contributes a share of 0
        # rather than raising ZeroDivisionError.
        if geohash_counts_all:
            geohash_average = geohash_counts_species / geohash_counts_all
        else:
            geohash_average = 0.0
        counts.append(geohash_average - all_average)
    counts = pl.Series(
        values=counts,
        name=species,
    )
    data[species] = min_max_normalize(counts)


In [None]:
# Show the mapping of species name to its normalized per-geohash series.
# NOTE(review): this dumps every series; consider showing a summary instead.
data

In [None]:
# One column per species, one row per geohash.
dataframe = pl.DataFrame(data=data)

# Heatmap whose rows follow the precomputed linkage; columns stay in their
# given order and are labelled with the species names.
sns.clustermap(
    data=dataframe,
    row_cluster=True,
    row_linkage=linkage_array,
    row_colors=col_colors,
    col_cluster=False,
    yticklabels=False,
    xticklabels=dataframe.columns,
)