In [1]:
from coiled.v2 import Cluster

# Create the Coiled cluster that the rest of the notebook computes on.
# NOTE(review): running this cell requests 50 remote workers — presumably
# billable cloud resources; confirm before re-running.
cluster = Cluster(
    name="spatial-join",
    software="coiled-examples/spatial-join",  # named Coiled software environment used by the cluster
    n_workers=50,
    worker_memory="16Gib",  # memory requested for each worker
)

In [2]:
from distributed import Client
# Attach a distributed Client to the cluster created above. Creating a Client
# makes it the default scheduler, so later dask computations in this notebook
# are sent to that cluster.
client = Client(cluster)

In [3]:
import dask.dataframe as dd

# Open the NYC taxi trip Parquet files as a lazy dask DataFrame; the rows are
# only read when a later cell calls .compute() / .head().
ddf = dd.read_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2009-2013/*",  # glob: every object under this prefix
    engine="pyarrow",
    storage_options={"anon": True},  # anonymous S3 access, no credentials needed
)

In [4]:
# Count missing values per column. .compute() triggers a pass over the whole
# dataset on the cluster, so this cell is not free to re-run.
ddf.isna().sum().compute()

vendor_id            0
pickup_datetime      0
dropoff_datetime     0
passenger_count      0
trip_distance        0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
payment_type         0
fare_amount          0
surcharge            0
tip_amount           0
tolls_amount         0
total_amount         0
dtype: int64

In [5]:
# Show the column dtypes. These come from the collection's metadata, so no
# computation is triggered.
ddf.dtypes

vendor_id                    object
pickup_datetime      datetime64[ns]
dropoff_datetime     datetime64[ns]
passenger_count               int64
trip_distance               float64
pickup_longitude            float64
pickup_latitude             float64
dropoff_longitude           float64
dropoff_latitude            float64
payment_type                 object
fare_amount                 float64
surcharge                   float64
tip_amount                  float64
tolls_amount                float64
total_amount                float64
dtype: object

In [7]:
import dask_geopandas

# Build point geometries from the pickup longitude/latitude columns and make
# them the frame's geometry.
# NOTE(review): `ddf` is rebound here — from this cell on it is the
# geometry-aware frame (presumably a dask_geopandas GeoDataFrame), not the raw
# Parquet read. `set_geometry` on a plain dask DataFrame appears to rely on
# the `import dask_geopandas` above having run first — confirm.
ddf = ddf.set_geometry(
    dask_geopandas.points_from_xy(ddf, "pickup_longitude", "pickup_latitude"),
)

In [8]:
# Declare the pickup points as EPSG:4326 (WGS 84 longitude/latitude). set_crs
# only labels the existing coordinates; it does not reproject them.
ddf = ddf.set_crs(4326)

In [9]:
# Display the CRS to confirm the assignment above took effect.
ddf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [10]:
import geopandas as gpd

# Load the neighbourhood polygons and shape them for the join in a single
# chained expression, so `ngbhoods` is bound exactly once.
ngbhoods = (
    gpd.read_file("CHS_2009_DOHMH_2010B/CHS_2009_DOHMH_2010B.shp")[
        ["FIRST_UHF_", "UHF_CODE", "geometry"]
    ]
    # Keep only the first 34 rows — presumably the remaining rows are not
    # neighbourhood polygons; TODO confirm against the shapefile.
    .iloc[:34]
    .rename(columns={"FIRST_UHF_": "nhbd_name", "UHF_CODE": "nhbd_id"})
    # Reproject to EPSG:4326, the same CRS assigned to the taxi pickup points.
    .to_crs(epsg=4326)
)

ngbhoods.head(3)

Unnamed: 0,nhbd_name,nhbd_id,geometry
0,Kingsbridge - Riverdale,101.0,"POLYGON ((-73.87793 40.90556, -73.87859 40.903..."
1,Northeast Bronx,102.0,"POLYGON ((-73.85253 40.90985, -73.85235 40.909..."
2,Fordham - Bronx Park,103.0,"POLYGON ((-73.85627 40.88315, -73.85666 40.882..."


In [16]:
# Round-trip `ngbhoods` through a single-partition dask collection and
# materialise it again; the result is displayed but not assigned to a name.
# NOTE(review): this cell's execution count (In [16]) is out of sequence with
# its neighbours, and it displays the whole frame rather than a .head() —
# looks like a leftover debugging check; confirm whether it is still needed.
dd.from_pandas(ngbhoods, npartitions=1).compute()

Unnamed: 0,nhbd_name,nhbd_id,geometry
0,Kingsbridge - Riverdale,101.0,"POLYGON ((-73.87793 40.90556, -73.87859 40.903..."
1,Northeast Bronx,102.0,"POLYGON ((-73.85253 40.90985, -73.85235 40.909..."
2,Fordham - Bronx Park,103.0,"POLYGON ((-73.85627 40.88315, -73.85666 40.882..."
3,Pelham - Throgs Neck,104.0,"MULTIPOLYGON (((-73.81625 40.86082, -73.81584 ..."
4,Greenpoint,201.0,"POLYGON ((-73.92436 40.71557, -73.92404 40.714..."
5,Downtown - Heights - Slope,202.0,"POLYGON ((-73.97022 40.70673, -73.97018 40.704..."
6,Bedford Stuyvesant - Crown Heights,203.0,"POLYGON ((-73.94506 40.68910, -73.94491 40.688..."
7,East New York,204.0,"POLYGON ((-73.86651 40.68447, -73.86630 40.683..."
8,Sunset Park,205.0,"MULTIPOLYGON (((-73.99723 40.66921, -73.99687 ..."
9,Borough Park,206.0,"POLYGON ((-73.97379 40.65639, -73.97355 40.655..."


In [10]:
%%time
# Spatial join: attach to each pickup point the neighbourhood polygon it lies
# within. The recorded wall time of ~27 ms suggests this only builds the lazy
# task graph; the join itself runs when a result is requested.
joined1 = ddf.sjoin(ngbhoods, predicate="within")

CPU times: user 25.4 ms, sys: 3.36 ms, total: 28.8 ms
Wall time: 26.7 ms


In [11]:
# First cell that asks for rows of the join, so this is where it executes.
# NOTE(review): the recorded run of this cell failed ("Failed to Serialize",
# then a CancelledError and a lost scheduler connection). The cause is not
# visible in the saved output — TODO investigate before relying on `joined1`.
joined1.head()

distributed.protocol.core - CRITICAL - Failed to Serialize
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/distributed/protocol/core.py", line 76, in dumps
    frames[0] = msgpack.dumps(msg, default=_encode_default, use_bin_type=True)
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/msgpack/__init__.py", line 35, in packb
    return Packer(**kwargs).pack(o)
  File "msgpack/_packer.pyx", line 294, in msgpack._cmsgpack.Packer.pack
  File "msgpack/_packer.pyx", line 300, in msgpack._cmsgpack.Packer.pack
  File "msgpack/_packer.pyx", line 297, in msgpack._cmsgpack.Packer.pack
  File "msgpack/_packer.pyx", line 264, in msgpack._cmsgpack.Packer._pack
  File "msgpack/_packer.pyx", line 231, in msgpack._cmsgpack.Packer._pack
  File "msgpack/_packer.pyx", line 231, in msgpack._cmsgpack.Packer._pack
  File "msgpack/_packer.pyx", line 264, in msgpack._cmsgpack.Packer._pack
  File "msgpack/_pack

CancelledError: ('head-1-5-sjoin-c2d35882e2de0ab2bf53039cd55f43e7', 0)

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/distributed/comm/tcp.py", line 409, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/asyncio/tasks.py", line 492, in wait_for
    fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/distributed/comm/core.py", line 289, in connect
    comm = await asyncio.wait_for(
  File "/Users/rpelgrim/ma