<img src="https://warehouse-camo.ingress.cmh1.psfhosted.org/73fb6073b2bac71a627a410020353c89833c447a/68747470733a2f2f6769746875622e636f6d2f686f6c6f76697a2f646174617368616465722f7261772f6d61696e2f646f632f5f7374617469632f6c6f676f5f737461636b65642e706e67"
     align="right"
     width="20%"/>

Visualize 1,000,000,000 Points
==============================

In this notebook we process roughly one billion points and set them up for interactive visualization.

In [None]:
import coiled
import dask.dataframe as dd
import datashader
import holoviews as hv
import hvplot.dask  # imported for its side effect: adds the .hvplot accessor used below
from dask.distributed import Client, wait

## Create Cluster

In [None]:
%%time 

# Request a 20-worker Coiled cluster named "datashader" in region us-east-2,
# then obtain a Dask client connected to it for the cells that follow.
cluster_options = dict(
    name="datashader",
    n_workers=20,
    backend_options={"region_name": "us-east-2"},
)
cluster = coiled.Cluster(**cluster_options)

client = cluster.get_client()

## Load and Prepare Data

In [None]:
%%time

# Read only the four coordinate columns the plot needs.
coordinate_columns = [
    "dropoff_longitude",
    "dropoff_latitude",
    "pickup_longitude",
    "pickup_latitude",
]
df = dd.read_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2009-2013/",
    columns=coordinate_columns,
)

# Keep a trip only when BOTH endpoints lie strictly inside the bounding box
# around NYC: longitude in (-74.1, -73.7), latitude in (40.6, 40.9).
dropoff_in_box = (
    (df["dropoff_longitude"] > -74.1)
    & (df["dropoff_longitude"] < -73.7)
    & (df["dropoff_latitude"] > 40.6)
    & (df["dropoff_latitude"] < 40.9)
)
pickup_in_box = (
    (df["pickup_longitude"] > -74.1)
    & (df["pickup_longitude"] < -73.7)
    & (df["pickup_latitude"] > 40.6)
    & (df["pickup_latitude"] < 40.9)
)
df = df.loc[dropoff_in_box & pickup_in_box]

# Dropoff endpoints: label every row, then switch to the shared long/lat names.
df_drop = (
    df[["dropoff_longitude", "dropoff_latitude"]]
    .assign(journey_type="dropoff")
    .rename(columns={"dropoff_longitude": "long", "dropoff_latitude": "lat"})
)

# Pickup endpoints: same shape, labelled "pickup".
df_pick = (
    df[["pickup_longitude", "pickup_latitude"]]
    .assign(journey_type="pickup")
    .rename(columns={"pickup_longitude": "long", "pickup_latitude": "lat"})
)

# Stack both endpoint sets into one long frame (two rows per trip) and make
# the label column categorical.
df_plot = dd.concat([df_drop, df_pick]).astype({"journey_type": "category"})

# Spell out the two categories explicitly.
# NOTE(review): presumably needed so the categorical aggregation in the plot
# cell sees known categories - confirm.
df_plot = df_plot.assign(
    journey_type=df_plot["journey_type"].cat.set_categories(["dropoff", "pickup"])
)

# Partitions are small at this point, so repartition into ~256 MiB pieces.
# The frame is persisted once before repartitioning and once after.
# NOTE(review): the first persist presumably avoids recomputing the pipeline
# while partition sizes are measured - confirm.
df_plot = df_plot.persist()
df_plot = df_plot.repartition(partition_size="256MiB").persist()

print("Number of records:", len(df_plot))

## Visualize

In [None]:
# Render with the Bokeh backend (holoviews is imported as `hv` in the import
# cell at the top of the notebook).
hv.extension("bokeh")

# One colour per journey type; the keys are the two categories of
# df_plot["journey_type"].
color_key = {"pickup": "#EF1561", "dropoff": "#1F5AFF"}

# Datashaded scatter of every endpoint, aggregated per journey type and
# coloured with `color_key`. Title and axis labels are set so the figure can
# be read on its own. The plot is the cell's last expression so that it is
# displayed.
df_plot.hvplot.scatter(
    x="long",
    y="lat",
    xlabel="Longitude",
    ylabel="Latitude",
    title="Pickup and dropoff locations",
    aggregator=datashader.by("journey_type"),
    datashade=True,
    cnorm="eq_hist",
    frame_width=700,
    aspect=1.33,
    color_key=color_key,
)