In [1]:
import duckdb
import pandas as pd, h3

In [2]:
# Connect to the DuckDB database file `ais.duckdb`; `con` executes the SQL in the cells below.
db_path = "ais.duckdb"
con = duckdb.connect(db_path)

In [3]:
# Draw a reproducible 100-row sample of fully populated AIS positions.
#
# Why the subquery: DuckDB applies USING SAMPLE to the FROM clause *before* the
# WHERE clause, so sampling the raw file and filtering afterwards can return
# fewer than 100 rows. Filtering inside the subquery means the sample is taken
# from complete rows only.
#
# Why REPEATABLE: it pins the sampling seed so a fresh "Restart & Run All"
# selects the same rows (and therefore the same worked example further down).
# NOTE(review): DuckDB documents seeded samples as deterministic only for
# single-threaded execution — confirm if exact reproducibility matters.
df = con.execute("""
    SELECT *
    FROM (
        SELECT *
        FROM 'output/mmsi_246714000.parquet'
        WHERE LATITUDE IS NOT NULL
            AND LONGITUDE IS NOT NULL
            AND SPEED IS NOT NULL
            AND HEADING IS NOT NULL
            AND COURSE IS NOT NULL
    ) AS complete_positions
    USING SAMPLE reservoir(100 ROWS) REPEATABLE (42);
""").fetchdf()

# Bare expression -> rich notebook display, consistent with the later df.head() cell.
df.head()

            TIMESTAMP       MMSI  LATITUDE   LONGITUDE  SPEED  HEADING  COURSE
0 2023-02-16 21:19:26  246714000  1.174930  103.926972    0.0     51.0   142.2
1 2023-02-22 13:49:26  246714000  1.175765  103.927030    0.0     55.0   182.1
2 2023-02-06 16:43:31  246714000  1.268458  103.876105    0.0     43.0   351.8
3 2023-02-18 17:46:26  246714000  1.175235  103.926875    0.0     57.0   115.1


In [4]:
res = 8  # H3 resolution for this walkthrough; also read by the cell that builds df["H3"]

# Use the first row of the sample as the worked example.
# Selecting the two float columns before .iloc[0] avoids materialising the whole
# mixed-type row as an object Series just to read two numbers.
first_point = df[["LATITUDE", "LONGITUDE"]].iloc[0]
lat, lon = float(first_point["LATITUDE"]), float(first_point["LONGITUDE"])

# Basic H3 usage
cell = h3.latlng_to_cell(lat, lon, res)
center = h3.cell_to_latlng(cell)
boundary = h3.cell_to_boundary(cell)
neighbors = h3.grid_disk(cell, 1)  # k=1 disk around `cell` (computed but not printed below)
# Parent/children resolutions are derived from `res` instead of being hard-coded
# (previously 7 and 9), so changing `res` above keeps this cell valid.
parent = h3.cell_to_parent(cell, res - 1)
children = h3.cell_to_children(cell, res + 1)

print("Point:", lat, lon)
print("Cell:", cell)
print("Center:", center)
print("Boundary:", boundary)
print("Parent:", parent)
print("Children:", children)

Point: 1.17493 103.92697166666667
Cell: 886526a42bfffff
Center: (1.1754611395439742, 103.92821490627696)
Boundary: ((1.172579335735478, 103.93236717322335), (1.1776581468878855, 103.9328689296913), (1.1805400773090489, 103.92871661246168), (1.1783430389687974, 103.92406248409706), (1.1732641019931986, 103.9235608426224), (1.1703823291715592, 103.9277132145102))
Parent: 876526a42ffffff
Children: ['896526a42a3ffff', '896526a42a7ffff', '896526a42abffff', '896526a42afffff', '896526a42b3ffff', '896526a42b7ffff', '896526a42bbffff']


In [5]:
# Label every sampled position with the H3 cell (at resolution `res`) that contains it.
df["H3"] = list(
    map(
        lambda point_lat, point_lon: h3.latlng_to_cell(point_lat, point_lon, res),
        df["LATITUDE"],
        df["LONGITUDE"],
    )
)
df.head()

Unnamed: 0,TIMESTAMP,MMSI,LATITUDE,LONGITUDE,SPEED,HEADING,COURSE,H3
0,2023-02-16 21:19:26,246714000,1.17493,103.926972,0.0,51.0,142.2,886526a42bfffff
1,2023-02-22 13:49:26,246714000,1.175765,103.92703,0.0,55.0,182.1,886526a42bfffff
2,2023-02-06 16:43:31,246714000,1.268458,103.876105,0.0,43.0,351.8,886520db01fffff
3,2023-02-18 17:46:26,246714000,1.175235,103.926875,0.0,57.0,115.1,886526a42bfffff


In [6]:
# Summarise the sampled positions per H3 cell:
#   ship_points - number of rows (AIS positions) in the cell
#   unique_mmsi - number of distinct MMSI values in the cell
#   mean_speed  - average of SPEED over the cell's rows
per_cell_metrics = {
    "ship_points": pd.NamedAgg(column="MMSI", aggfunc="size"),
    "unique_mmsi": pd.NamedAgg(column="MMSI", aggfunc="nunique"),
    "mean_speed": pd.NamedAgg(column="SPEED", aggfunc="mean"),
}
cells_grouped = df.groupby("H3")
agg = cells_grouped.agg(**per_cell_metrics).reset_index()

In [7]:
# Write the per-cell summary to parquet; the DataFrame index is not stored.
h3_summary_path = "output/mmsi_246714000_h3.parquet"
agg.to_parquet(h3_summary_path, index=False)