In [1]:
import dask

# Show the installed dask version so the run can be reproduced later.
dask.__version__

'2022.01.0'

In [1]:
from coiled import Cluster

# Request a Coiled cluster named "spatial-join" with 50 workers of 16 GiB each,
# running the "coiled-examples/spatial-join" software environment.
cluster = Cluster(
    name="spatial-join",
    software="coiled-examples/spatial-join",
    n_workers=50,
    worker_memory="16Gib",
)

Output()

In [2]:
from distributed import Client

# Connect this session to the cluster created above so that dask
# computations are scheduled on its workers.
client = Client(cluster)

In [3]:
import dask.dataframe as dd

# Lazily open every Parquet file under the 2009-2013 "nyc-tlc" prefix.
# {"anon": True} requests unauthenticated S3 access, so no AWS credentials
# are needed (presumably the bucket is public -- confirm if access fails).
ddf = dd.read_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2009-2013/*",
    engine="pyarrow",
    storage_options={"anon": True},
)

In [4]:
import dask_geopandas

In [5]:
# Build a point geometry for each row from its pickup coordinates
# (x = longitude, y = latitude) ...
pickup_points = dask_geopandas.points_from_xy(
    ddf, "pickup_longitude", "pickup_latitude"
)

# ... and attach it, turning the dask DataFrame into a dask-geopandas
# GeoDataFrame.
ddf = dask_geopandas.from_dask_dataframe(ddf, geometry=pickup_points)

In [6]:
# Declare the coordinates as EPSG:4326 (WGS 84 lon/lat). set_crs only labels
# the geometries with a CRS; it does not reproject them.
ddf = ddf.set_crs(4326)

In [7]:
# Display the CRS to confirm it was applied.
ddf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [8]:
from pathlib import Path

import geopandas as gpd

# Location of the neighbourhood shapefile. The path is built from the current
# user's home directory instead of a hard-coded "/Users/<name>/..." string so
# the cell is not tied to a single machine. Point DATA_DIR elsewhere if the
# data lives in a different place.
DATA_DIR = Path.home() / "Desktop" / "data"
NGBHOODS_SHP = (
    DATA_DIR
    / "nyc"
    / "nyc-communityhealth-shapefiles"
    / "CHS_2009_DOHMH_2010B"
    / "CHS_2009_DOHMH_2010B.shp"
)

ngbhoods = (
    # Keep only the neighbourhood name, its code, and the polygon.
    gpd.read_file(NGBHOODS_SHP)[["FIRST_UHF_", "UHF_CODE", "geometry"]]
    # Keep the first 34 rows only.
    # NOTE(review): presumably the remaining rows are not real neighbourhoods
    # -- confirm and document why 34.
    .iloc[:34]
    .rename(columns={"FIRST_UHF_": "nhbd_name", "UHF_CODE": "nhbd_id"})
    # Reproject to EPSG:4326 so the polygons share the CRS set on the points.
    .to_crs(epsg=4326)
)

ngbhoods.head(3)

Unnamed: 0,nhbd_name,nhbd_id,geometry
0,Kingsbridge - Riverdale,101.0,"POLYGON ((-73.87793 40.90556, -73.87859 40.903..."
1,Northeast Bronx,102.0,"POLYGON ((-73.85253 40.90985, -73.85235 40.909..."
2,Fordham - Bronx Park,103.0,"POLYGON ((-73.85627 40.88315, -73.85666 40.882..."


In [9]:
# Start computing the GeoDataFrame on the cluster and keep its partitions in
# worker memory, so later cells do not re-read and re-parse the Parquet data.
# NOTE(review): this holds the whole dataset in cluster memory -- check that
# it fits, given the KilledWorker errors recorded further down.
ddf = ddf.persist()

In [10]:
# Spatially join each partition of pickup points against the (small, in-memory)
# neighbourhood polygons by running geopandas.sjoin once per partition.
joined = ddf.map_partitions(
    gpd.sjoin,
    ngbhoods,
    predicate="within",
    align_dataframes=False,
)

In [11]:
# Preview the join result; this forces the lazy join to run.
# NOTE: in the recorded run this cell failed with KilledWorker (see output).
joined.head()

KilledWorker: ('finalize-377d9036-2d75-41f4-82fc-3daedf82c0dc', <WorkerState 'tls://10.4.8.220:37665', name: spatial-join-worker-a756b4b0f3, status: closed, memory: 0, processing: 1>)

## TODO
- Return the main notebook to the state in which the sjoin code was working (`git checkout`).
- Figure out what changed between that working state and the current one.

In [None]:
%%timeit
joined.groupby("ngbd_name").tip_amount.mean().compute()

In [12]:
%%time
# Same spatial join as `joined`, but through the GeoDataFrame's own sjoin
# method instead of map_partitions. The ~25 ms recorded wall time suggests
# this only builds the task graph; the join runs when a result is requested.
joined2 = ddf.sjoin(ngbhoods, predicate="within")

CPU times: user 24.2 ms, sys: 3.06 ms, total: 27.3 ms
Wall time: 25.5 ms


In [13]:
# Preview the result of the sjoin-method variant; this forces the join to run.
# NOTE: in the recorded run this cell also failed with KilledWorker (see output).
joined2.head()

KilledWorker: ('finalize-6a685596-a2ba-43b3-982a-aab013b2a536', <WorkerState 'tls://10.4.0.210:40293', name: spatial-join-worker-03e5b7f792, status: closed, memory: 0, processing: 1>)