# Writing ISD data to partitioned parquet

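You'll need to download a year of ISD (Integrated Surface Database) data, e.g. from Microsoft's Planetary Computer, and place the files in the `isd/2020` directory that the code below reads from.
We're using https://github.com/gadomski/pyisd to read the data.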

In [None]:
import dask
from dask.distributed import Client
import dask.dataframe
import dask_geopandas
import pandas
import os
import isd
import isd.pandas

# Create a Dask distributed client for the hard-coded local address below
# (presumably a scheduler started separately -- adjust host/port to match yours).
client = Client('tcp://127.0.0.1:65076')

def read_to_data_frame(path):
    """Read one ISD file into a pandas data frame with a ``timestamp`` column.

    Args:
        path: Path of the ISD file; passed straight to ``isd.open``.

    Returns:
        The data frame built by ``isd.pandas.data_frame`` from the file's
        records, with an added ``timestamp`` column assembled from the
        ``year``, ``month``, ``day``, ``hour`` and ``minute`` columns.
    """
    with isd.open(path) as iterator:
        # Materialize the records while the file is still open.
        records = list(iterator)
    data_frame = isd.pandas.data_frame(records)
    data_frame["timestamp"] = pandas.to_datetime(
        data_frame[["year", "month", "day", "hour", "minute"]]
    )
    # "timestamp" is intentionally left as a regular column (no set_index
    # here): pandas' set_index returns a new frame rather than mutating, and
    # code consuming this frame sets the index from the column by name.
    return data_frame

# Schedule one lazy read per file, using at most the first ten entries that
# os.listdir returns for the 2020 directory.
data_frames = [
    dask.delayed(read_to_data_frame)(os.path.join("isd/2020", entry))
    for entry in os.listdir("isd/2020")[:10]
]

# Combine the delayed frames into one Dask data frame, index it by the
# "timestamp" column, and repartition it with a 7-day frequency.
data_frame = (
    dask.dataframe.from_delayed(data_frames)
    .set_index("timestamp")
    .repartition(freq="7d")
)

In [None]:
# Build a point geometry from the "longitude" and "latitude" columns, then
# attach it to the geo data frame created from the Dask data frame.
point_geometry = dask_geopandas.points_from_xy(data_frame, "longitude", "latitude")
geo_data_frame = dask_geopandas.from_dask_dataframe(data_frame).set_geometry(
    point_geometry
)

In [None]:
# Write the geo data frame to "isd/parquet", partitioned on the "year" and
# "month" columns.
geo_data_frame.to_parquet("isd/parquet", partition_on=["year", "month"])

In [None]:
# Load the dataset back from "isd/parquet".
data_frame = dask_geopandas.read_parquet("isd/parquet")

# Mean latitude, longitude and air temperature per (usaf_id, ncei_id) pair.
station_keys = ["usaf_id", "ncei_id"]
averaged_columns = ["latitude", "longitude", "air_temperature"]
data_frame.groupby(station_keys)[averaged_columns].mean().compute()

In [None]:
# Select the rows whose usaf_id is "067000", average their air temperature
# over 24-hour bins, and plot the computed result.
station_rows = data_frame[data_frame.usaf_id == "067000"]
daily_mean_temperature = station_rows["air_temperature"].resample("24h").mean()
daily_mean_temperature.compute().plot()