In [None]:
! pip install xarray cfgrib eccodes --quiet

In [6]:
import os

import duckdb
import xarray as xr
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMapWithTime

from sklearn.neighbors import BallTree

# Never elide columns when a DataFrame is rendered (None = no column limit).
pd.options.display.max_columns = None

In [7]:
# Path of the DuckDB database file this notebook queries.
# Set the INATURALIST_DUCKDB_PATH environment variable to run the notebook on
# another machine; when it is unset the original cluster path is used.
DB_PATH = os.environ.get(
    "INATURALIST_DUCKDB_PATH",
    '/network/scratch/g/guzmand/Fall-2025/COMP-685-ML-for-Climate-Change/data/iNaturalist.duckdb',
)
con = duckdb.connect(DB_PATH)
print("connected!")

connected!


# Data

## iNaturalist joined with ERA5
The idea is that for a given observation with (latitude, longitude, time) we can join this with the ERA5 dataset to get relevant climate features for that specific place and time. 

Our proposed framework integrates spatial and temporal information through location and time encoders, or an LLM, to produce joint spatio-temporal embeddings for downstream climate-related prediction tasks.

- **Location Encoder:** A pretrained geographic encoder $f_{\text{loc}}$, such as SatCLIP, transforms coordinates into dense location embeddings $\mathbf{z}_{\text{loc}} = f_{\text{loc}}(\text{lat}, \text{long})$.
- **Time Encoder:** Consider a relevant climate time series (e.g., temperature, precipitation, wind) with $n$ observations at a given location, $x_{t} = (x_{t_{1}}, x_{t_{2}}, \ldots, x_{t_{n}})$, where each $x_{t_{i}}$ is the value at time step $t_{i}$. This sequence is passed through a time encoder $f_{\text{time}}$, such as TimesFM, to produce a fixed-length embedding $\mathbf{z}_{\text{time}} = f_{\text{time}}(x_{t})$.
- **Downstream Tasks:** The joint embeddings feed task specific predictors for applications where spatio-temporal data is relevant. For instance, in species distribution modeling, the embeddings could help predict how suitable habitats shift under changing temperature and precipitation regimes, offering insights for biodiversity conservation.

![Location-Time Encoders](../images/location-time-encoders.jpg)

These are the variables in the ERA5 dataset

```python
era5_variable_names = {
    "u10": "10m u-component of wind",
    "v10": "10m v-component of wind",
    "d2m": "2m dewpoint temperature",
    "t2m": "2m temperature",
    "msl": "Mean sea level pressure",
    "mwd": "Mean wave direction",
    "mwp": "Mean wave period",
    "sst": "Sea surface temperature",
    "swh": "Significant height of combined wind waves and swell",
    "sp": "Surface pressure",
    "tp": "Total precipitation"
}
```

### Daily
The resulting dataframe provides species + climate data aligned at the **daily** level. From here, it can be aggregated further, for example, by week, month, or by ERA5 grid cell.

1. Filter observations by species, date, location, and valid coordinates.
2. Map each observation to its nearest ERA5 grid cell.
3. Extract a 7-day climate history (the observation day + previous 6 days) for each ERA5 grid cell, using a window function over ERA5 daily reanalysis variables (e.g., temperature, precipitation, wind).
4. Join iNaturalist + ERA5 so each biological observation has geographic metadata, its mapped grid-cell coordinates, and a set of short climate histories (arrays), one for each variable.



In [71]:
verbatimScientificName = "Danaus plexippus"
min_date = "2020-01-01"
max_date = "2025-12-31"
# Number of daily ERA5 values kept per observation: the observation day itself
# plus the (history_days - 1) days before it.
history_days = 7

# The species and the date bounds are bound as DuckDB named parameters instead of
# being pasted into the SQL text, so a value containing a quote (e.g. an
# apostrophe in a species name) cannot break the statement. Only the integer
# window size is interpolated, because it is part of the window-frame syntax.
query = f"""
WITH filtered_iNaturalist AS (
    SELECT
        gbifID,
        verbatimScientificName,
        -- Keep only the date part. eventDate values matching none of the
        -- listed formats (including NULL / empty strings) parse to NULL.
        CAST(
            TRY_STRPTIME(eventDate, ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M', '%Y-%m-%d'])
            AS DATE
        ) AS dateOcurrence,
        decimalLatitude,
        decimalLongitude,
        countryCode
    FROM iNaturalist
    WHERE
        decimalLatitude IS NOT NULL
        AND decimalLongitude IS NOT NULL
        -- Compare on the DATE, not the timestamp: otherwise an observation made
        -- on max_date at any time after midnight falls outside the range.
        -- Rows whose eventDate could not be parsed are NULL here and are dropped.
        AND CAST(
            TRY_STRPTIME(eventDate, ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M', '%Y-%m-%d'])
            AS DATE
        ) BETWEEN CAST($min_date AS DATE) AND CAST($max_date AS DATE)
        AND verbatimScientificName = $species
        AND countryCode IN ('US', 'CA', 'MX')
),

mapped_iNaturalist AS (
    -- Attach the precomputed nearest ERA5 grid cell to every observation
    SELECT
        filtered_iNaturalist.verbatimScientificName,
        CAST(filtered_iNaturalist.dateOcurrence AS DATE) AS dateOcurrence,
        filtered_iNaturalist.decimalLatitude,
        filtered_iNaturalist.decimalLongitude,
        filtered_iNaturalist.countryCode,
        iNaturalist_grid_mapping.nearest_grid_id,
        1 AS countOccurrences
    FROM filtered_iNaturalist AS filtered_iNaturalist
    JOIN iNaturalist_grid_mapping AS iNaturalist_grid_mapping
    USING (gbifID)
),

mapped_ERA5 AS (
    SELECT
        CAST(ERA5_Daily.valid_time AS DATE) AS valid_time,
        ERA5_Daily.latitude,
        ERA5_Daily.longitude,
        ERA5_grid.grid_id,
        -- Build the array of the previous days + the current day (ordered by date)
        LIST(u10) OVER w AS u10_hist,
        LIST(v10) OVER w AS v10_hist,
        LIST(d2m) OVER w AS d2m_hist,
        LIST(t2m) OVER w AS t2m_hist,
        LIST(msl) OVER w AS msl_hist,
        LIST(mwd) OVER w AS mwd_hist,
        LIST(mwp) OVER w AS mwp_hist,
        LIST(sst) OVER w AS sst_hist,
        LIST(swh) OVER w AS swh_hist,
        LIST(sp)  OVER w AS sp_hist,
        LIST(tp)  OVER w AS tp_hist

    FROM ERA5_Daily
    JOIN ERA5_grid
      ON ERA5_Daily.latitude = ERA5_grid.latitude
     AND ERA5_Daily.longitude = ERA5_grid.longitude
    WHERE
        -- Start the ERA5 range before min_date so that observations made on the
        -- first days of the range still get a full history whenever the
        -- earlier ERA5 days exist in the table.
        CAST(ERA5_Daily.valid_time AS DATE)
            BETWEEN CAST($min_date AS DATE) - {history_days - 1}
                AND CAST($max_date AS DATE)
        AND ERA5_grid.grid_id IN (
            SELECT DISTINCT nearest_grid_id
            FROM mapped_iNaturalist
        )

    WINDOW w AS (
        PARTITION BY ERA5_grid.grid_id
        ORDER BY ERA5_Daily.valid_time
        -- The preceding rows plus the current one. This is a ROW-based frame, so it
        -- equals a span of consecutive days only if each grid cell has one row per day.
        ROWS BETWEEN {history_days - 1} PRECEDING AND CURRENT ROW
    )
),

joined_iNaturalist_ERA5 AS (
    -- Inner join: observations without an ERA5 row for their cell and day are dropped
    SELECT
        mapped_iNaturalist.verbatimScientificName,
        mapped_iNaturalist.dateOcurrence,
        mapped_iNaturalist.decimalLatitude,
        mapped_iNaturalist.decimalLongitude,
        mapped_iNaturalist.countryCode,
        mapped_iNaturalist.nearest_grid_id,
        mapped_iNaturalist.countOccurrences,
        mapped_ERA5.latitude AS nearest_grid_id_latitude,
        mapped_ERA5.longitude AS nearest_grid_id_longitude,
        mapped_ERA5.u10_hist,
        mapped_ERA5.v10_hist,
        mapped_ERA5.d2m_hist,
        mapped_ERA5.t2m_hist,
        mapped_ERA5.msl_hist,
        -- mapped_ERA5.mwd_hist, -- these are sea variables, so they appear as NaN on the land
        -- mapped_ERA5.mwp_hist, -- these are sea variables, so they appear as NaN on the land
        -- mapped_ERA5.sst_hist, -- these are sea variables, so they appear as NaN on the land
        -- mapped_ERA5.swh_hist, -- these are sea variables, so they appear as NaN on the land
        mapped_ERA5.sp_hist,
        mapped_ERA5.tp_hist
    FROM mapped_iNaturalist AS mapped_iNaturalist
    JOIN mapped_ERA5 AS mapped_ERA5
      ON mapped_iNaturalist.nearest_grid_id = mapped_ERA5.grid_id
      AND mapped_iNaturalist.dateOcurrence = mapped_ERA5.valid_time
)


-- This can be grouped further
SELECT *
FROM joined_iNaturalist_ERA5
ORDER BY dateOcurrence, decimalLatitude, decimalLongitude;
"""
query_params = {
    "species": verbatimScientificName,
    "min_date": min_date,
    "max_date": max_date,
}
df = con.execute(query, query_params).df()

# Drop observations whose history is shorter than `history_days`. This happens
# on the first days covered by the ERA5 table (no 2019 data was loaded, so early
# January 2020 cannot have a complete history).
has_full_history = df["u10_hist"].map(len) == history_days
# .copy() so later cells can add columns to `df` without touching a slice of the raw result
df = df[has_full_history].copy()
df


Unnamed: 0,verbatimScientificName,dateOcurrence,decimalLatitude,decimalLongitude,countryCode,nearest_grid_id,countOccurrences,nearest_grid_id_latitude,nearest_grid_id_longitude,u10_hist,v10_hist,d2m_hist,t2m_hist,msl_hist,sp_hist,tp_hist
79,Danaus plexippus,2020-01-07,23.299000,-106.443000,MX,653334,1,23.25,253.50,"[-1.1912784576416016, 0.21049053966999054, 1.2...","[0.41495323181152344, 1.2509905099868774, -2.6...","[290.542724609375, 288.5713195800781, 287.7474...","[293.4073791503906, 292.4245910644531, 293.499...","[101282.4296875, 101193.3359375, 101573.125, 1...","[101399.7109375, 101310.8671875, 101690.65625,...","[0.002592881442978978, 0.0005121827125549316, ..."
80,Danaus plexippus,2020-01-07,26.181973,-80.125923,US,670719,1,26.25,279.75,"[-0.017206192016601562, -1.9012037515640259, -...","[-1.6698856353759766, 0.740899384021759, 4.106...","[284.07568359375, 289.0439758300781, 293.88439...","[292.7335510253906, 294.84197998046875, 298.45...","[101772.5625, 101783.7734375, 101789.1875, 101...","[101704.1640625, 101715.6171875, 101721.242187...","[0.0, 0.0, 9.934107083608978e-07, 2.0364919691..."
81,Danaus plexippus,2020-01-07,28.658221,-81.386191,US,685114,1,28.75,278.50,"[1.2142391204833984, -1.514200210571289, -0.46...","[-1.1532840728759766, 1.1368547677993774, 3.67...","[278.1405334472656, 283.8645935058594, 292.412...","[287.3736877441406, 290.4765930175781, 296.133...","[101860.5, 101795.4140625, 101669.6953125, 101...","[101541.1640625, 101479.78125, 101356.28125, 1...","[0.0, 0.0, 7.669131264265161e-06, 0.0005294680..."
82,Danaus plexippus,2020-01-07,29.301348,-94.797696,US,687941,1,29.25,265.25,"[-2.520827531814575, -1.0590814352035522, 1.34...","[1.1612911224365234, 2.0519344806671143, 1.262...","[279.368896484375, 289.3843688964844, 288.0708...","[287.8973693847656, 290.4136047363281, 290.024...","[101831.03125, 100682.0859375, 100917.65625, 1...","[101813.375, 100664.90625, 100900.40625, 10236...","[2.2470951080322266e-05, 0.0011809667339548469..."
83,Danaus plexippus,2020-01-07,29.892725,-95.514687,US,692258,1,30.00,264.50,"[-0.6443220973014832, -0.6580397486686707, 1.8...","[1.1187292337417603, 1.9181455373764038, 0.027...","[275.7267150878906, 287.5146789550781, 284.863...","[284.8507385253906, 288.4826965332031, 287.952...","[101817.625, 100664.2734375, 100910.9453125, 1...","[101321.0, 100177.28125, 100423.3671875, 10193...","[8.960564628068823e-06, 0.0007939140195958316,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216526,Danaus plexippus,2025-10-21,40.526402,-74.495054,US,752822,1,40.50,285.50,"[1.1629976034164429, 1.8447412252426147, 1.530...","[-2.919408082962036, -4.02459192276001, -3.113...","[283.3708801269531, 271.6748352050781, 271.023...","[289.5272521972656, 285.1553955078125, 284.869...","[101545.7421875, 101466.5546875, 101552.40625,...","[100885.0625, 100796.3359375, 100882.1328125, ...","[4.5696893380409165e-07, 0.0, 0.0, 1.293420791..."
216527,Danaus plexippus,2025-10-21,40.860142,-73.933889,US,754264,1,40.75,286.00,"[0.9071788787841797, 1.8905582427978516, 1.594...","[-3.163426637649536, -4.165746212005615, -3.67...","[284.5069274902344, 275.7417907714844, 275.066...","[289.0607604980469, 285.3655090332031, 285.286...","[101516.6875, 101412.7109375, 101498.9296875, ...","[101297.1484375, 101190.0, 101276.4296875, 101...","[2.702077154026483e-06, 0.0, 0.0, 5.3445496632..."
216528,Danaus plexippus,2025-10-21,40.861013,-73.879937,US,754264,1,40.75,286.00,"[0.9071788787841797, 1.8905582427978516, 1.594...","[-3.163426637649536, -4.165746212005615, -3.67...","[284.5069274902344, 275.7417907714844, 275.066...","[289.0607604980469, 285.3655090332031, 285.286...","[101516.6875, 101412.7109375, 101498.9296875, ...","[101297.1484375, 101190.0, 101276.4296875, 101...","[2.702077154026483e-06, 0.0, 0.0, 5.3445496632..."
216529,Danaus plexippus,2025-10-21,40.890367,-73.897522,US,755704,1,41.00,286.00,"[0.9740734100341797, 1.8374576568603516, 1.531...","[-3.212946653366089, -4.0759429931640625, -3.6...","[283.63885498046875, 273.76458740234375, 273.1...","[288.3072814941406, 284.2792663574219, 284.334...","[101534.96875, 101424.09375, 101508.8125, 1015...","[100382.6875, 100255.0, 100340.6796875, 100434...","[2.6424725092510926e-06, 0.0, 0.0, 2.483526805..."


#### Aggregating by `dateOcurrence` and `nearest_grid_id`

We can group the data by `dateOcurrence` and `nearest_grid_id` to keep a daily resolution while combining all observations within the same grid cell into a single aggregated record.

In [73]:
# Column order of the daily, per-grid-cell table shown at the end of this cell.
cols = [
    "dateOcurrence",
    "nearest_grid_id",
    "nearest_grid_id_latitude",
    "nearest_grid_id_longitude",
    "decimalLatitude",
    "decimalLongitude",
    "countryCode",
    "verbatimScientificName",
    "countOccurrences",
    "u10_hist",
    "v10_hist",
    "d2m_hist",
    "t2m_hist",
    "msl_hist",
    "sp_hist",
    "tp_hist",
]

# How each column is collapsed when several observations share a (day, grid cell):
#   - the occurrence counter is summed,
#   - per-observation attributes are collected into a list,
#   - grid-cell attributes and climate histories keep the value of the last row.
summed_columns = ["countOccurrences"]
listed_columns = [
    "verbatimScientificName",
    "decimalLatitude",
    "decimalLongitude",
    "countryCode",
]
last_value_columns = [
    "nearest_grid_id_latitude",
    "nearest_grid_id_longitude",
    "u10_hist",
    "v10_hist",
    "d2m_hist",
    "t2m_hist",
    "msl_hist",
    "sp_hist",
    "tp_hist",
]
daily_agg_spec = {
    **{name: "sum" for name in summed_columns},
    **{name: list for name in listed_columns},
    **{name: "last" for name in last_value_columns},
}

# Order rows chronologically so that "last" picks the most recent row of a group.
df = df.sort_values("dateOcurrence")

daily_groups = df.groupby(["dateOcurrence", "nearest_grid_id"], as_index=False)
agg_df = daily_groups.agg(daily_agg_spec)
agg_df = agg_df[cols]
agg_df


Unnamed: 0,dateOcurrence,nearest_grid_id,nearest_grid_id_latitude,nearest_grid_id_longitude,decimalLatitude,decimalLongitude,countryCode,verbatimScientificName,countOccurrences,u10_hist,v10_hist,d2m_hist,t2m_hist,msl_hist,sp_hist,tp_hist
0,2020-01-07,653334,23.25,253.50,[23.299],[-106.443],[MX],[Danaus plexippus],1,"[-1.1912784576416016, 0.21049053966999054, 1.2...","[0.41495323181152344, 1.2509905099868774, -2.6...","[290.542724609375, 288.5713195800781, 287.7474...","[293.4073791503906, 292.4245910644531, 293.499...","[101282.4296875, 101193.3359375, 101573.125, 1...","[101399.7109375, 101310.8671875, 101690.65625,...","[0.002592881442978978, 0.0005121827125549316, ..."
1,2020-01-07,670719,26.25,279.75,[26.181973],[-80.125923],[US],[Danaus plexippus],1,"[-0.017206192016601562, -1.9012037515640259, -...","[-1.6698856353759766, 0.740899384021759, 4.106...","[284.07568359375, 289.0439758300781, 293.88439...","[292.7335510253906, 294.84197998046875, 298.45...","[101772.5625, 101783.7734375, 101789.1875, 101...","[101704.1640625, 101715.6171875, 101721.242187...","[0.0, 0.0, 9.934107083608978e-07, 2.0364919691..."
2,2020-01-07,685114,28.75,278.50,[28.658221],[-81.386191],[US],[Danaus plexippus],1,"[1.2142391204833984, -1.514200210571289, -0.46...","[-1.1532840728759766, 1.1368547677993774, 3.67...","[278.1405334472656, 283.8645935058594, 292.412...","[287.3736877441406, 290.4765930175781, 296.133...","[101860.5, 101795.4140625, 101669.6953125, 101...","[101541.1640625, 101479.78125, 101356.28125, 1...","[0.0, 0.0, 7.669131264265161e-06, 0.0005294680..."
3,2020-01-07,687941,29.25,265.25,[29.301348],[-94.797696],[US],[Danaus plexippus],1,"[-2.520827531814575, -1.0590814352035522, 1.34...","[1.1612911224365234, 2.0519344806671143, 1.262...","[279.368896484375, 289.3843688964844, 288.0708...","[287.8973693847656, 290.4136047363281, 290.024...","[101831.03125, 100682.0859375, 100917.65625, 1...","[101813.375, 100664.90625, 100900.40625, 10236...","[2.2470951080322266e-05, 0.0011809667339548469..."
4,2020-01-07,692258,30.00,264.50,[29.892725],[-95.514687],[US],[Danaus plexippus],1,"[-0.6443220973014832, -0.6580397486686707, 1.8...","[1.1187292337417603, 1.9181455373764038, 0.027...","[275.7267150878906, 287.5146789550781, 284.863...","[284.8507385253906, 288.4826965332031, 287.952...","[101817.625, 100664.2734375, 100910.9453125, 1...","[101321.0, 100177.28125, 100423.3671875, 10193...","[8.960564628068823e-06, 0.0007939140195958316,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149729,2025-10-21,751348,40.25,277.00,[40.136439],[-83.034339],[US],[Danaus plexippus],1,"[-0.4459705352783203, -0.43085289001464844, -1...","[-2.534479856491089, -2.340550661087036, 1.210...","[282.84564208984375, 279.03533935546875, 276.3...","[286.7170104980469, 285.0031433105469, 283.512...","[102305.53125, 102207.5625, 101895.75, 101178....","[98751.2734375, 98631.9140625, 98333.4140625, ...","[0.0, 0.0, 0.0, 1.7881393432617188e-07, 0.0009..."
149730,2025-10-21,752822,40.50,285.50,[40.526402],[-74.495054],[US],[Danaus plexippus],1,"[1.1629976034164429, 1.8447412252426147, 1.530...","[-2.919408082962036, -4.02459192276001, -3.113...","[283.3708801269531, 271.6748352050781, 271.023...","[289.5272521972656, 285.1553955078125, 284.869...","[101545.7421875, 101466.5546875, 101552.40625,...","[100885.0625, 100796.3359375, 100882.1328125, ...","[4.5696893380409165e-07, 0.0, 0.0, 1.293420791..."
149731,2025-10-21,754264,40.75,286.00,"[40.860142, 40.861013]","[-73.933889, -73.879937]","[US, US]","[Danaus plexippus, Danaus plexippus]",2,"[0.9071788787841797, 1.8905582427978516, 1.594...","[-3.163426637649536, -4.165746212005615, -3.67...","[284.5069274902344, 275.7417907714844, 275.066...","[289.0607604980469, 285.3655090332031, 285.286...","[101516.6875, 101412.7109375, 101498.9296875, ...","[101297.1484375, 101190.0, 101276.4296875, 101...","[2.702077154026483e-06, 0.0, 0.0, 5.3445496632..."
149732,2025-10-21,755704,41.00,286.00,[40.890367],[-73.897522],[US],[Danaus plexippus],1,"[0.9740734100341797, 1.8374576568603516, 1.531...","[-3.212946653366089, -4.0759429931640625, -3.6...","[283.63885498046875, 273.76458740234375, 273.1...","[288.3072814941406, 284.2792663574219, 284.334...","[101534.96875, 101424.09375, 101508.8125, 1015...","[100382.6875, 100255.0, 100340.6796875, 100434...","[2.6424725092510926e-06, 0.0, 0.0, 2.483526805..."


#### Aggregating by `year_week` and `nearest_grid_id`

We can group the data by `year_week` and `nearest_grid_id` to produce weekly aggregated records, combining all observations within the same grid cell for each week.

In [74]:
# Column order of the weekly, per-grid-cell table shown at the end of this cell.
cols = [
    "year_week",
    "dateOcurrence",
    "nearest_grid_id",
    "nearest_grid_id_latitude",
    "nearest_grid_id_longitude",
    "decimalLatitude",
    "decimalLongitude",
    "countryCode",
    "verbatimScientificName",
    "countOccurrences",
    "u10_hist",
    "v10_hist",
    "d2m_hist",
    "t2m_hist",
    "msl_hist",
    "sp_hist",
    "tp_hist",
]

# Make sure the observation date is a datetime column, then derive its ISO calendar.
observation_dates = pd.to_datetime(df["dateOcurrence"])
iso_calendar = observation_dates.dt.isocalendar()
df["dateOcurrence"] = observation_dates

# Week label "<ISO year>-<ISO week, zero-padded to 2 digits>", e.g. "2020-02".
iso_year_text = iso_calendar["year"].astype(str)
iso_week_text = iso_calendar["week"].astype(str).str.zfill(2)
df["year_week"] = iso_year_text + "-" + iso_week_text

# How each column is collapsed when several observations share a (week, grid cell):
#   - the occurrence counter is summed,
#   - per-observation attributes (including the dates) are collected into a list,
#   - grid-cell attributes and climate histories keep the value of the last row,
#     i.e. the histories attached to the most recent observation date of the week.
summed_columns = ["countOccurrences"]
listed_columns = [
    "verbatimScientificName",
    "dateOcurrence",
    "decimalLatitude",
    "decimalLongitude",
    "countryCode",
]
last_value_columns = [
    "nearest_grid_id_latitude",
    "nearest_grid_id_longitude",
    "u10_hist",
    "v10_hist",
    "d2m_hist",
    "t2m_hist",
    "msl_hist",
    "sp_hist",
    "tp_hist",
]
weekly_agg_spec = {
    **{name: "sum" for name in summed_columns},
    **{name: list for name in listed_columns},
    **{name: "last" for name in last_value_columns},
}

# Order rows chronologically so that "last" picks the most recent row of a group.
df = df.sort_values("dateOcurrence")

weekly_groups = df.groupby(["year_week", "nearest_grid_id"], as_index=False)
agg_df = weekly_groups.agg(weekly_agg_spec)
agg_df = agg_df[cols]
agg_df

Unnamed: 0,year_week,dateOcurrence,nearest_grid_id,nearest_grid_id_latitude,nearest_grid_id_longitude,decimalLatitude,decimalLongitude,countryCode,verbatimScientificName,countOccurrences,u10_hist,v10_hist,d2m_hist,t2m_hist,msl_hist,sp_hist,tp_hist
0,2020-02,[2020-01-11 00:00:00],630319,19.25,259.75,[19.365011],[-100.294392],[MX],[Danaus plexippus],1,"[-1.0488017797470093, -0.05954424664378166, -0...","[-0.7950083613395691, -0.10547828674316406, -0...","[277.2191162109375, 278.2710876464844, 278.051...","[287.4580078125, 286.6080322265625, 287.080383...","[102147.8671875, 102011.0234375, 101841.585937...","[81491.2578125, 81391.2578125, 81291.4609375, ...","[0.0, 0.0, 0.0, 2.5510787963867188e-05, 0.0, 0..."
1,2020-02,[2020-01-11 00:00:00],634637,20.00,259.25,[20.040089],[-100.723572],[MX],[Danaus plexippus],1,"[-0.8755432963371277, -0.027887344360351562, -...","[-1.1186574697494507, -0.17933082580566406, -0...","[272.2959289550781, 272.7549743652344, 272.618...","[284.0393981933594, 285.5241394042969, 286.964...","[102507.4296875, 102084.5234375, 101964.25, 10...","[79458.7578125, 79265.7578125, 79213.4140625, ...","[0.0, 0.0, 0.0, 5.086263172415784e-06, 2.66234..."
2,2020-02,"[2020-01-08 00:00:00, 2020-01-12 00:00:00]",641828,21.25,257.00,"[21.176591, 21.176591]","[-102.885647, -102.885647]","[MX, MX]","[Danaus plexippus, Danaus plexippus]",2,"[-0.26393064856529236, -1.13653564453125, -1.0...","[-0.5435479283332825, -0.4225482940673828, -0....","[268.4554748535156, 271.5700988769531, 272.906...","[286.1736145019531, 288.9594421386719, 287.138...","[102131.5234375, 101938.2578125, 102136.054687...","[83103.6796875, 83002.4609375, 83058.171875, 8...","[0.0, 1.748402951307071e-06, 0.0, 0.0, 0.0, 0...."
3,2020-02,[2020-01-08 00:00:00],643048,21.50,202.00,[21.414412],[-157.995773],[US],[Danaus plexippus],1,"[-1.0943189859390259, -4.930917263031006, -6.1...","[-1.8201357126235962, -3.709315061569214, -3.1...","[291.8149719238281, 292.3414001464844, 293.161...","[295.3735046386719, 295.2958984375, 296.111022...","[101708.8125, 101965.5703125, 102216.2890625, ...","[99818.7421875, 100069.03125, 100316.5703125, ...","[7.512172305723652e-05, 0.00014152129006106406..."
4,2020-02,[2020-01-07 00:00:00],653334,23.25,253.50,[23.299],[-106.443],[MX],[Danaus plexippus],1,"[-1.1912784576416016, 0.21049053966999054, 1.2...","[0.41495323181152344, 1.2509905099868774, -2.6...","[290.542724609375, 288.5713195800781, 287.7474...","[293.4073791503906, 292.4245910644531, 293.499...","[101282.4296875, 101193.3359375, 101573.125, 1...","[101399.7109375, 101310.8671875, 101690.65625,...","[0.002592881442978978, 0.0005121827125549316, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97968,2025-43,[2025-10-20 00:00:00],761410,42.00,272.50,[41.876287],[-87.621453],[US],[Danaus plexippus],1,"[-1.2621809244155884, -4.351569652557373, -3.0...","[-4.027459621429443, -4.1240386962890625, 0.73...","[287.65985107421875, 284.39691162109375, 283.4...","[290.7469787597656, 288.3119812011719, 288.462...","[102407.15625, 102455.9609375, 102223.5703125,...","[100352.1171875, 100384.0234375, 100160.085937...","[3.97364289028701e-07, 0.00011267264926573262,..."
97969,2025-43,[2025-10-20 00:00:00],765736,42.75,274.00,[42.776813],[-86.112511],[US],[Danaus plexippus],1,"[-0.026056289672851562, -1.3848539590835571, -...","[-2.3010189533233643, -1.8972727060317993, 0.4...","[285.6966247558594, 279.8614196777344, 279.088...","[289.67333984375, 287.0600280761719, 285.29724...","[102378.7890625, 102504.6171875, 102303.570312...","[99964.2421875, 100064.9375, 99866.0859375, 99...","[0.0, 2.4100145310512744e-05, 0.0, 1.112619997..."
97970,2025-43,[2025-10-20 00:00:00],765755,42.75,278.75,[42.664597],[-81.169833],[CA],[Danaus plexippus],1,"[-0.28692054748535156, 0.8823986053466797, 0.0...","[-2.7191097736358643, -4.3059234619140625, -3....","[283.4685974121094, 279.7908630371094, 275.803...","[287.33642578125, 285.7005920410156, 282.91296...","[102270.1953125, 102320.25, 102267.5078125, 10...","[99668.15625, 99690.1484375, 99621.2890625, 99...","[5.165736070011917e-07, 1.8080075960824615e-06..."
97971,2025-43,[2025-10-20 00:00:00],767164,43.00,271.00,[43.0],[-89.0],[US],[Danaus plexippus],1,"[-1.0250797271728516, -2.7535877227783203, -2....","[-2.5239193439483643, -1.1082509756088257, 0.4...","[281.09051513671875, 282.8287048339844, 282.81...","[286.691162109375, 285.4293518066406, 286.9841...","[102553.7421875, 102583.9765625, 102228.1875, ...","[99433.8203125, 99433.9375, 99109.0, 98167.710...","[1.9868215517249155e-08, 9.560585021972656e-05..."


# SatCLIP embeddings

Let's use the aggregated dataframe by `year_week` and `nearest_grid_id`.

In [77]:
# Show `agg_df` as last assigned above, i.e. the aggregation by
# `year_week` and `nearest_grid_id`.
agg_df

Unnamed: 0,year_week,dateOcurrence,nearest_grid_id,nearest_grid_id_latitude,nearest_grid_id_longitude,decimalLatitude,decimalLongitude,countryCode,verbatimScientificName,countOccurrences,u10_hist,v10_hist,d2m_hist,t2m_hist,msl_hist,sp_hist,tp_hist
0,2020-02,[2020-01-11 00:00:00],630319,19.25,259.75,[19.365011],[-100.294392],[MX],[Danaus plexippus],1,"[-1.0488017797470093, -0.05954424664378166, -0...","[-0.7950083613395691, -0.10547828674316406, -0...","[277.2191162109375, 278.2710876464844, 278.051...","[287.4580078125, 286.6080322265625, 287.080383...","[102147.8671875, 102011.0234375, 101841.585937...","[81491.2578125, 81391.2578125, 81291.4609375, ...","[0.0, 0.0, 0.0, 2.5510787963867188e-05, 0.0, 0..."
1,2020-02,[2020-01-11 00:00:00],634637,20.00,259.25,[20.040089],[-100.723572],[MX],[Danaus plexippus],1,"[-0.8755432963371277, -0.027887344360351562, -...","[-1.1186574697494507, -0.17933082580566406, -0...","[272.2959289550781, 272.7549743652344, 272.618...","[284.0393981933594, 285.5241394042969, 286.964...","[102507.4296875, 102084.5234375, 101964.25, 10...","[79458.7578125, 79265.7578125, 79213.4140625, ...","[0.0, 0.0, 0.0, 5.086263172415784e-06, 2.66234..."
2,2020-02,"[2020-01-08 00:00:00, 2020-01-12 00:00:00]",641828,21.25,257.00,"[21.176591, 21.176591]","[-102.885647, -102.885647]","[MX, MX]","[Danaus plexippus, Danaus plexippus]",2,"[-0.26393064856529236, -1.13653564453125, -1.0...","[-0.5435479283332825, -0.4225482940673828, -0....","[268.4554748535156, 271.5700988769531, 272.906...","[286.1736145019531, 288.9594421386719, 287.138...","[102131.5234375, 101938.2578125, 102136.054687...","[83103.6796875, 83002.4609375, 83058.171875, 8...","[0.0, 1.748402951307071e-06, 0.0, 0.0, 0.0, 0...."
3,2020-02,[2020-01-08 00:00:00],643048,21.50,202.00,[21.414412],[-157.995773],[US],[Danaus plexippus],1,"[-1.0943189859390259, -4.930917263031006, -6.1...","[-1.8201357126235962, -3.709315061569214, -3.1...","[291.8149719238281, 292.3414001464844, 293.161...","[295.3735046386719, 295.2958984375, 296.111022...","[101708.8125, 101965.5703125, 102216.2890625, ...","[99818.7421875, 100069.03125, 100316.5703125, ...","[7.512172305723652e-05, 0.00014152129006106406..."
4,2020-02,[2020-01-07 00:00:00],653334,23.25,253.50,[23.299],[-106.443],[MX],[Danaus plexippus],1,"[-1.1912784576416016, 0.21049053966999054, 1.2...","[0.41495323181152344, 1.2509905099868774, -2.6...","[290.542724609375, 288.5713195800781, 287.7474...","[293.4073791503906, 292.4245910644531, 293.499...","[101282.4296875, 101193.3359375, 101573.125, 1...","[101399.7109375, 101310.8671875, 101690.65625,...","[0.002592881442978978, 0.0005121827125549316, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97968,2025-43,[2025-10-20 00:00:00],761410,42.00,272.50,[41.876287],[-87.621453],[US],[Danaus plexippus],1,"[-1.2621809244155884, -4.351569652557373, -3.0...","[-4.027459621429443, -4.1240386962890625, 0.73...","[287.65985107421875, 284.39691162109375, 283.4...","[290.7469787597656, 288.3119812011719, 288.462...","[102407.15625, 102455.9609375, 102223.5703125,...","[100352.1171875, 100384.0234375, 100160.085937...","[3.97364289028701e-07, 0.00011267264926573262,..."
97969,2025-43,[2025-10-20 00:00:00],765736,42.75,274.00,[42.776813],[-86.112511],[US],[Danaus plexippus],1,"[-0.026056289672851562, -1.3848539590835571, -...","[-2.3010189533233643, -1.8972727060317993, 0.4...","[285.6966247558594, 279.8614196777344, 279.088...","[289.67333984375, 287.0600280761719, 285.29724...","[102378.7890625, 102504.6171875, 102303.570312...","[99964.2421875, 100064.9375, 99866.0859375, 99...","[0.0, 2.4100145310512744e-05, 0.0, 1.112619997..."
97970,2025-43,[2025-10-20 00:00:00],765755,42.75,278.75,[42.664597],[-81.169833],[CA],[Danaus plexippus],1,"[-0.28692054748535156, 0.8823986053466797, 0.0...","[-2.7191097736358643, -4.3059234619140625, -3....","[283.4685974121094, 279.7908630371094, 275.803...","[287.33642578125, 285.7005920410156, 282.91296...","[102270.1953125, 102320.25, 102267.5078125, 10...","[99668.15625, 99690.1484375, 99621.2890625, 99...","[5.165736070011917e-07, 1.8080075960824615e-06..."
97971,2025-43,[2025-10-20 00:00:00],767164,43.00,271.00,[43.0],[-89.0],[US],[Danaus plexippus],1,"[-1.0250797271728516, -2.7535877227783203, -2....","[-2.5239193439483643, -1.1082509756088257, 0.4...","[281.09051513671875, 282.8287048339844, 282.81...","[286.691162109375, 285.4293518066406, 286.9841...","[102553.7421875, 102583.9765625, 102228.1875, ...","[99433.8203125, 99433.9375, 99109.0, 98167.710...","[1.9868215517249155e-08, 9.560585021972656e-05..."
