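To download the [iNaturalist Research-grade Observations](https://www.gbif.org/dataset/50c9509d-22c7-4a22-a47d-8c48425ef4a7) dataset from GBIF, run the following commands in a terminal. They create a `data/` directory and unpack the download archive into it: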


```bash
mkdir data
cd data
wget https://api.gbif.org/v1/occurrence/download/request/0011012-251025141854904.zip
unzip 0011012-251025141854904.zip
```

In [2]:
import duckdb
import pandas as pd

# Show every column when a DataFrame is displayed (None = no column limit).
pd.options.display.max_columns = None

In [None]:
# Run this only once
# One-time setup: load the GBIF tab-separated export into a persistent DuckDB
# file so later cells can query it without re-parsing the raw text file.
# CREATE TABLE IF NOT EXISTS is used so the table is not rebuilt when it is
# already present.

# Create or connect to a persistent DB
con = duckdb.connect('data/iNaturalist.duckdb')
print("connected")

# Import TSV once.
# The path follows the download instructions at the top of the notebook
# (archive 0011012-251025141854904.zip unpacked inside data/), and uses the
# same data/ directory as the database file above.
# NOTE(review): the previous path pointed at '../data/0011983-...csv', which
# matched neither the instructions nor the DB location — confirm the file name
# of your own download.
con.execute("""
    CREATE TABLE IF NOT EXISTS iNaturalist AS
        SELECT * 
        FROM read_csv_auto('data/0011012-251025141854904.csv', sep='\t', header=True);
""")
print("created iNaturalist.duckdb")

# This PRAGMA does NOT create an index; it only switches on DuckDB's
# object-cache setting (documented as a legacy no-op in recent releases —
# verify against the installed DuckDB version).
con.execute("PRAGMA enable_object_cache;")
print("enabled object cache")


# Query repeatedly without rereading the TSV
# Sanity check: total number of imported rows.
result = con.execute("""
    SELECT COUNT(*)
    FROM iNaturalist

""").df()

result.head()

connected


In [4]:
# Open the persistent database file; the query cells below use `con`.
con = duckdb.connect(database='data/iNaturalist.duckdb')
print("connected!")

connected!


In [25]:
# Total number of rows in the iNaturalist table, returned as a one-row
# DataFrame with a single `observation_count` column.
query = """
SELECT COUNT(*) AS observation_count
FROM iNaturalist
"""
df = con.execute(query).df()
df

Unnamed: 0,observation_count
0,131470382


In [None]:
# Preview five rows with every column to inspect the table's schema.
# There is no ORDER BY, so which five rows come back is not guaranteed.
query = """
SELECT *
FROM iNaturalist
LIMIT 5
"""
df = con.execute(query).df()
df

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,infraspecificEpithet,taxonRank,scientificName,verbatimScientificName,verbatimScientificNameAuthorship,countryCode,locality,stateProvince,occurrenceStatus,individualCount,publishingOrgKey,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,coordinatePrecision,elevation,elevationAccuracy,depth,depthAccuracy,eventDate,day,month,year,taxonKey,speciesKey,basisOfRecord,institutionCode,collectionCode,catalogNumber,recordNumber,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
0,4868040692,50c9509d-22c7-4a22-a47d-8c48425ef4a7,https://www.inaturalist.org/observations/18469...,Plantae,Tracheophyta,Magnoliopsida,Lamiales,Linderniaceae,Torenia,Torenia concolor,,SPECIES,Torenia concolor Lindl.,Torenia concolor,,TW,,Taiwan,PRESENT,,28eb1a3f-1c15-4a95-931a-4af90ecb574d,23.912801,120.887476,,,,,,,2023-09-24T10:00,24,9,2023,7331731,7331731,HUMAN_OBSERVATION,iNaturalist,Observations,184692892,,祐,2023-09-24 14:10:04,CC_BY_4_0,祐,祐,,,2025-10-27 18:16:30.103000-04:00,StillImage,COORDINATE_ROUNDED;CONTINENT_DERIVED_FROM_COOR...
1,4416978367,50c9509d-22c7-4a22-a47d-8c48425ef4a7,https://www.inaturalist.org/observations/18476...,Plantae,Tracheophyta,Magnoliopsida,Ericales,Primulaceae,Samolus,Samolus parviflorus,,SPECIES,Samolus parviflorus Raf.,Samolus parviflorus,,US,,Indiana,PRESENT,,28eb1a3f-1c15-4a95-931a-4af90ecb574d,40.19258,-85.400362,34.0,,,,,,2023-09-24T15:57:17,24,9,2023,4005796,4005796,HUMAN_OBSERVATION,iNaturalist,Observations,184760814,,lsodo,2023-09-24 20:48:04,CC_BY_NC_4_0,lsodo,lsodo,,,2025-10-27 18:16:31.279000-04:00,StillImage;StillImage;StillImage;StillImage,COORDINATE_ROUNDED;CONTINENT_DERIVED_FROM_COOR...


In [None]:
# Number of observations per verbatimScientificName, most frequent first.
# No LIMIT is applied: the DataFrame holds one row per distinct name.
query = """
SELECT verbatimScientificName, COUNT(*) AS observation_count
FROM iNaturalist
GROUP BY verbatimScientificName
ORDER BY observation_count DESC
"""

df = con.execute(query).df()
df

Unnamed: 0,verbatimScientificName,observation_count
0,Anas platyrhynchos,491873
1,Apis mellifera,422116
2,Harmonia axyridis,317456
3,Passer domesticus,306767
4,Danaus plexippus,295071
...,...,...
483352,Neolepolepis occidentalis,1
483353,Copestylum willinki,1
483354,Acanthonotozoma cristatum,1
483355,Triodia danthonioides,1


In [None]:
# Preview of the eventDate normalisation: eventDate is parsed with three
# accepted layouts (full datetime, datetime without seconds, date only) and
# re-rendered as 'YYYY-MM-DD HH:MM:SS'. Values that match none of the layouts
# make TRY_STRPTIME return NULL instead of raising.
# Re-enabled (was fully commented out) so the output shown for this cell can
# be reproduced on a fresh run; LIMIT 10 keeps it cheap.
query = """
SELECT 
    verbatimScientificName,
    STRFTIME(
        TRY_STRPTIME(eventDate, ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M', '%Y-%m-%d']),
        '%Y-%m-%d %H:%M:%S'
    ) AS eventDate,
    decimalLatitude,
    decimalLongitude,
    countryCode
FROM iNaturalist
WHERE 
    decimalLatitude IS NOT NULL
    AND decimalLongitude IS NOT NULL
    AND eventDate IS NOT NULL
LIMIT 10
"""

df = con.execute(query).df()
df

Unnamed: 0,verbatimScientificName,eventDate,decimalLatitude,decimalLongitude,countryCode
0,Torenia concolor,2023-09-24 10:00:00,23.912801,120.887476,TW
1,Samolus parviflorus,2023-09-24 15:57:17,40.19258,-85.400362,US
2,Harpaphe haydeniana,2020-05-24 16:33:17,37.262108,-122.241436,US
3,Polistes dominula,2020-05-26 11:07:43,40.701724,-73.99627,US
4,Mergus merganser,2020-05-08 00:00:00,45.501143,-122.306145,US
5,Bombus pratorum,2020-05-26 12:59:00,52.490877,13.321534,DE
6,Lotus corniculatus,2020-05-28 17:41:36,38.968353,-84.43062,US
7,Anthriscus sylvestris,2020-05-29 16:35:00,55.824916,37.57399,RU
8,Lamium galeobdolon,2020-05-30 13:08:05,50.975306,14.037483,DE
9,Setophaga magnolia,2020-05-30 09:20:00,43.945995,-76.879598,CA


In [15]:
# Monarch (Danaus plexippus) observations from 2025-01-01 onwards, aggregated
# per calendar day and exact coordinate.
#
# * eventDate is parsed once (in `parsed`) with the three accepted layouts;
#   NULL, blank or unparseable values yield NULL and are dropped by the date
#   filter, so no separate NULL/blank checks are needed.
# * The lower bound is inclusive (>=). Date-only eventDate values parse to
#   midnight, so a strict '>' silently dropped date-only records dated
#   2025-01-01 while keeping timed records from the same day.
# * The output column keeps its existing spelling `dateOcurrence` so any
#   downstream code that reads it keeps working.
query = """
WITH parsed AS (
    SELECT
        verbatimScientificName,
        TRY_STRPTIME(eventDate, ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M', '%Y-%m-%d']) AS eventTimestamp,
        decimalLatitude,
        decimalLongitude,
        countryCode
    FROM iNaturalist
    WHERE
        verbatimScientificName = 'Danaus plexippus'
        AND decimalLatitude IS NOT NULL
        AND decimalLongitude IS NOT NULL
),
clean_iNaturalist AS (
    SELECT
        verbatimScientificName,
        STRFTIME(eventTimestamp, '%Y-%m-%d') AS dateOcurrence,  -- keep only the date part
        decimalLatitude,
        decimalLongitude,
        countryCode
    FROM parsed
    WHERE
        eventTimestamp >= TIMESTAMP '2025-01-01'
)

SELECT
    verbatimScientificName,
    dateOcurrence,
    decimalLatitude,
    decimalLongitude,
    countryCode,
    COUNT(*) AS countOccurrences
FROM clean_iNaturalist
GROUP BY
    verbatimScientificName,
    dateOcurrence,
    decimalLatitude,
    decimalLongitude,
    countryCode
ORDER BY
    dateOcurrence;
"""
df = con.execute(query).df()
df


Unnamed: 0,verbatimScientificName,dateOcurrence,decimalLatitude,decimalLongitude,countryCode,countOccurrences
0,Danaus plexippus,2025-01-01,-34.806677,138.663421,AU,1
1,Danaus plexippus,2025-01-01,35.129267,-120.632411,US,1
2,Danaus plexippus,2025-01-01,18.369262,-65.634262,PR,1
3,Danaus plexippus,2025-01-01,33.718342,-118.289008,US,1
4,Danaus plexippus,2025-01-01,28.032774,-82.781269,US,1
...,...,...,...,...,...,...
54023,Danaus plexippus,2025-10-21,30.070267,-95.739158,US,1
54024,Danaus plexippus,2025-10-21,37.885839,-122.298224,US,1
54025,Danaus plexippus,2025-10-21,37.885922,-122.298266,US,1
54026,Danaus plexippus,2025-10-21,37.885900,-122.298221,US,1


In [None]:
# Aggregate by date and time
# Monarch (Danaus plexippus) observations from 2000-01-01 onwards, aggregated
# per full timestamp and exact coordinate.
#
# * eventDate is parsed once (in `parsed`) with the three accepted layouts;
#   NULL, blank or unparseable values yield NULL and are dropped by the date
#   filter.
# * Date-only eventDate values are rendered with a 00:00:00 time component.
# * The lower bound is inclusive (>=) so records stamped exactly
#   2000-01-01 00:00:00 (including date-only ones) are not silently excluded.
# * The output column keeps its existing spelling `dateOcurrence`.
query = """
WITH parsed AS (
    SELECT
        verbatimScientificName,
        TRY_STRPTIME(eventDate, ['%Y-%m-%dT%H:%M:%S', '%Y-%m-%dT%H:%M', '%Y-%m-%d']) AS eventTimestamp,
        decimalLatitude,
        decimalLongitude,
        countryCode
    FROM iNaturalist
    WHERE
        verbatimScientificName = 'Danaus plexippus'
        AND decimalLatitude IS NOT NULL
        AND decimalLongitude IS NOT NULL
),
clean_iNaturalist AS (
    SELECT
        verbatimScientificName,
        STRFTIME(eventTimestamp, '%Y-%m-%d %H:%M:%S') AS dateOcurrence,  -- keep full datetime
        decimalLatitude,
        decimalLongitude,
        countryCode
    FROM parsed
    WHERE
        eventTimestamp >= TIMESTAMP '2000-01-01'
)

SELECT
    verbatimScientificName,
    dateOcurrence,
    decimalLatitude,
    decimalLongitude,
    countryCode,
    COUNT(*) AS countOccurrences
FROM clean_iNaturalist
GROUP BY
    verbatimScientificName,
    dateOcurrence,
    decimalLatitude,
    decimalLongitude,
    countryCode
ORDER BY
    dateOcurrence;
"""

df = con.execute(query).df()
df


Unnamed: 0,verbatimScientificName,dateOcurrence,decimalLatitude,decimalLongitude,countryCode,countOccurrences
0,Danaus plexippus,2000-01-29 19:46:00,35.263600,-120.638700,US,1
1,Danaus plexippus,2000-01-29 20:14:00,35.263712,-120.638812,US,1
2,Danaus plexippus,2000-01-29 20:15:00,35.263638,-120.638652,US,1
3,Danaus plexippus,2000-02-28 21:44:00,20.607068,-103.341398,MX,1
4,Danaus plexippus,2000-04-27 16:20:00,36.099136,-112.089027,US,1
...,...,...,...,...,...,...
280161,Danaus plexippus,2025-10-21 17:05:35,33.012233,-83.734720,US,1
280162,Danaus plexippus,2025-10-21 17:19:44,33.465848,-82.230231,US,1
280163,Danaus plexippus,2025-10-21 17:30:21,36.704885,-76.236060,US,1
280164,Danaus plexippus,2025-10-21 18:22:53,-32.340301,116.013917,AU,1
