# Create NYC TLC Parquet files

There is New York City Taxi and Limousine Commission (TLC) Trip Record Data stored in S3, [see here](https://registry.opendata.aws/nyc-tlc-trip-records-pds/).

This notebook reads in the CSV data and writes out Parquet files that are easier and more performant to work with.

In [1]:
import coiled
import dask
import dask.dataframe as dd
import pandas as pd

In [2]:
from coiled.v2 import Cluster

cluster = Cluster(
    name="nyc-tlc-cleaning", 
    n_workers=25,
    software="coiled-examples/dask-dataframes",
    scheduler_options={'idle_timeout':'2 hours'},
)

In [3]:
client = dask.distributed.Client(cluster)


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| python  | 3.9.12.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+


## 2009 data create

In [4]:
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2009-*.csv",
    parse_dates=["Trip_Pickup_DateTime", "Trip_Dropoff_DateTime"],
    dtype={
        "Tolls_Amt": "float64",
        "store_and_forward": "object",
    },
)

In [5]:
ddf.head()

Unnamed: 0,vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,mta_tax,Tip_Amt,Tolls_Amt,Total_Amt
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,,,-73.993803,40.695922,CASH,8.9,0.5,,0.0,0.0,9.4
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,,,-73.95585,40.76803,Credit,12.1,0.5,,2.0,0.0,14.6
2,VTS,2009-01-03 15:43:00,2009-01-03 15:57:00,5,10.35,-74.002587,40.739748,,,-73.869983,40.770225,Credit,23.7,0.0,,4.74,0.0,28.44
3,DDS,2009-01-01 20:52:58,2009-01-01 21:14:00,1,5.0,-73.974267,40.790955,,,-73.996558,40.731849,CREDIT,14.9,0.5,,3.05,0.0,18.45
4,DDS,2009-01-24 16:18:23,2009-01-24 16:24:56,1,0.4,-74.00158,40.719382,,,-74.008378,40.72035,CASH,3.7,0.0,,0.0,0.0,3.7


#### Clean Rate_Code NaNs

In [6]:
# # clean rate_code column NaNs
# import math

# def rate_code_to_one(something):
#     if math.isnan(something):
#         return 1
#     else:
#         return something
    
# ddf = ddf.assign(
#     Rate_Code=ddf.Rate_Code.apply(rate_code_to_one, meta=("float64", "int64"))
# )

In [6]:
# vendor_id                     object
# pickup_datetime       datetime64[ns]
# dropoff_datetime      datetime64[ns]
# passenger_count                int64
# trip_distance                float64
# pickup_longitude             float64
# pickup_latitude              float64
# rate_code                      int64
# store_and_fwd_flag            object
# dropoff_longitude            float64
# dropoff_latitude             float64
# payment_type                  object
# fare_amount                  float64
# surcharge                    float64
# mta_tax                      float64
# tip_amount                   float64
# tolls_amount                 float64
# total_amount                 float64
# dtype: object
ddf.dtypes

vendor_name                      object
Trip_Pickup_DateTime     datetime64[ns]
Trip_Dropoff_DateTime    datetime64[ns]
Passenger_Count                   int64
Trip_Distance                   float64
Start_Lon                       float64
Start_Lat                       float64
Rate_Code                       float64
store_and_forward                object
End_Lon                         float64
End_Lat                         float64
Payment_Type                     object
Fare_Amt                        float64
surcharge                       float64
mta_tax                         float64
Tip_Amt                         float64
Tolls_Amt                       float64
Total_Amt                       float64
dtype: object

In [7]:
# rename columns to standardize schema
ddf = ddf.rename(
    columns={
        "vendor_name": "vendor_id",
        "Trip_Pickup_DateTime": "pickup_datetime",
        "Trip_Dropoff_DateTime": "dropoff_datetime",
        "Passenger_Count": "passenger_count",
        "Trip_Distance": "trip_distance",
        "Start_Lon": "pickup_longitude",
        "Start_Lat": "pickup_latitude",
        "Rate_Code": "rate_code",
        "store_and_forward": "store_and_fwd_flag",
        "End_Lon": "dropoff_longitude",
        "End_Lat": "dropoff_latitude",
        "Payment_Type": "payment_type",
        "Fare_Amt": "fare_amount",
        "surcharge": "surcharge",
        "mta_tax": "mta_tax",
        "Tip_Amt": "tip_amount",
        "Tolls_Amt": "tolls_amount",
        "Total_Amt": "total_amount",
    }
)

In [8]:
ddf.dtypes

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                    float64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [9]:
len(ddf.dtypes)

18

In [10]:
ddf.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,,,-73.993803,40.695922,CASH,8.9,0.5,,0.0,0.0,9.4
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,,,-73.95585,40.76803,Credit,12.1,0.5,,2.0,0.0,14.6
2,VTS,2009-01-03 15:43:00,2009-01-03 15:57:00,5,10.35,-74.002587,40.739748,,,-73.869983,40.770225,Credit,23.7,0.0,,4.74,0.0,28.44
3,DDS,2009-01-01 20:52:58,2009-01-01 21:14:00,1,5.0,-73.974267,40.790955,,,-73.996558,40.731849,CREDIT,14.9,0.5,,3.05,0.0,18.45
4,DDS,2009-01-24 16:18:23,2009-01-24 16:24:56,1,0.4,-74.00158,40.719382,,,-74.008378,40.72035,CASH,3.7,0.0,,0.0,0.0,3.7


In [11]:
ddf.npartitions

478

In [12]:
ddf.known_divisions

False

In [13]:
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2009",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=False,
)

[None]

In [14]:
cluster.scale(50)

## 2009 data query

In [15]:
ddf = dd.read_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2009",
    engine="pyarrow",
)

In [16]:
ddf.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1,2.63,-73.991957,40.721567,,,-73.993803,40.695922,CASH,8.9,0.5,,0.0,0.0,9.4
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3,4.55,-73.982102,40.73629,,,-73.95585,40.76803,Credit,12.1,0.5,,2.0,0.0,14.6
2,VTS,2009-01-03 15:43:00,2009-01-03 15:57:00,5,10.35,-74.002587,40.739748,,,-73.869983,40.770225,Credit,23.7,0.0,,4.74,0.0,28.44
3,DDS,2009-01-01 20:52:58,2009-01-01 21:14:00,1,5.0,-73.974267,40.790955,,,-73.996558,40.731849,CREDIT,14.9,0.5,,3.05,0.0,18.45
4,DDS,2009-01-24 16:18:23,2009-01-24 16:24:56,1,0.4,-74.00158,40.719382,,,-74.008378,40.72035,CASH,3.7,0.0,,0.0,0.0,3.7


In [17]:
dtypes_2009 = ddf.dtypes

In [19]:
dtypes_2009

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                    float64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [20]:
# verify by computing summary statistics
ddf.describe().compute()

Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,dropoff_longitude,dropoff_latitude,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
count,170896100.0,170896100.0,170896100.0,170896100.0,0.0,170896100.0,170896100.0,170896100.0,170896100.0,29855490.0,170896100.0,170896100.0,170896100.0
mean,1.691026,2.708163,-73.04755,40.24202,,-73.06975,40.25457,9.905162,0.2124957,0.4483104,0.5733786,0.1531628,10.93008
std,1.317733,3.119908,8.33958,4.634872,,8.217777,4.633053,7.686219,0.3366762,0.1522268,1.428889,0.8581441,8.855952
min,0.0,0.0,-2828.434,-3084.3,,-3509.015,-3579.139,2.5,0.0,0.0,0.0,0.0,2.5
25%,1.0,1.02,-73.99142,40.73691,,-73.99083,40.73623,5.7,0.0,0.5,0.0,0.0,6.7
50%,1.0,1.77,-73.98108,40.75461,,-73.97959,40.75486,7.7,0.0,0.5,0.0,0.0,9.0
75%,2.0,3.13,-73.96643,40.76873,,-73.96378,40.76934,11.3,0.5,0.5,1.0,0.0,13.0
max,255.0,50.0,3570.224,3210.379,,1565.33,3172.506,200.0,12.0,0.5,100.0,20.0,235.5


## 2010 data create

In [22]:
# read data in without dtypes
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2010-*.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
    on_bad_lines="skip",
    dtype={
        'tolls_amount': 'float64',
        "store_and_fwd_flag": "object",
    },
)

In [23]:
ddf.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,VTS,2010-01-26 07:41:00,2010-01-26 07:45:00,1,0.75,-73.956778,40.76775,1,,-73.965957,40.765232,CAS,4.5,0.0,0.5,0.0,0.0,5.0
1,DDS,2010-01-30 23:31:00,2010-01-30 23:46:12,1,5.9,-73.996118,40.763932,1,,-73.981512,40.741193,CAS,15.3,0.5,0.5,0.0,0.0,16.3
2,DDS,2010-01-18 20:22:20,2010-01-18 20:38:12,1,4.0,-73.979673,40.78379,1,,-73.917852,40.87856,CAS,11.7,0.5,0.5,0.0,0.0,12.7
3,VTS,2010-01-09 01:18:00,2010-01-09 01:35:00,2,4.7,-73.977922,40.763997,1,,-73.923908,40.759725,CAS,13.3,0.5,0.5,0.0,0.0,14.3
4,CMT,2010-01-18 19:10:14,2010-01-18 19:17:07,1,0.6,-73.990924,40.734682,1,0.0,-73.995511,40.739088,Cre,5.3,0.0,0.5,0.87,0.0,6.67


In [24]:
%%time
# calculate .describe() to check for dtype issues
ddf.describe().compute()

CPU times: user 666 ms, sys: 75.4 ms, total: 741 ms
Wall time: 39.6 s


Unnamed: 0,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,dropoff_longitude,dropoff_latitude,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
count,168994400.0,168994400.0,168994400.0,168994400.0,168994400.0,168994200.0,168994200.0,168994400.0,168994400.0,168994400.0,168994400.0,168994400.0,168994400.0
mean,1.674221,5.864681,-72.39051,39.88347,1.032463,-72.41661,39.89821,9.844589,0.3221727,0.495594,0.7605548,-0.07496016,11.35107
std,1.300666,5409.394,11.02328,7.053625,0.4236116,10.89876,7.027043,1664.848,0.3693486,0.1345498,173.5413,2336.194,2873.962
min,0.0,-21474830.0,-3509.015,-3579.139,0.0,-3579.139,-3538.432,-21474810.0,-1.0,-1.0,-1677720.0,-21474840.0,-21474830.0
25%,1.0,1.09,-73.99139,40.73673,1.0,-73.99071,40.73553,5.7,0.0,0.5,0.0,0.0,7.0
50%,1.0,1.8,-73.98101,40.75392,1.0,-73.97925,40.75437,8.1,0.5,0.5,0.0,0.0,9.54
75%,3.0,3.25,-73.96582,40.76844,1.0,-73.96257,40.76894,12.1,0.5,0.5,1.6,0.0,13.8
max,255.0,16201630.0,3569.931,3377.993,221.0,3443.651,3510.381,93960.07,615.78,1311.22,938.02,5510.07,93960.57


In [25]:
# verify dtypes and column names are identical for 2009 and 2010
dtypes_2010 = ddf.dtypes
dtypes_2009 == dtypes_2010

vendor_id              True
pickup_datetime        True
dropoff_datetime       True
passenger_count        True
trip_distance          True
pickup_longitude       True
pickup_latitude        True
rate_code             False
store_and_fwd_flag     True
dropoff_longitude      True
dropoff_latitude       True
payment_type           True
fare_amount            True
surcharge              True
mta_tax                True
tip_amount             True
tolls_amount           True
total_amount           True
dtype: bool

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/distributed/comm/tcp.py", line 409, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the 

In [17]:
# repartition and write 2010 data to Parquet 
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2010",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=False,
)

[None]

In [None]:
ddf = dd.read_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2010", 
    engine="pyarrow"
)

In [None]:
# check datatypes are stored correctly in Parquet file
dtypes_2010 == ddf.dtypes

## 2011 data create

In [25]:
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2011-*.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
    dtype={
        "tip_amount": "float64",
        "tolls_amount": "float64",
        # "vendor_name": "string[pyarrow]",
        # "Payment_Type": "string[pyarrow]",
    },
)

In [26]:
ddf.dtypes

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                      int64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [27]:
# verify data types are identical to those in 2010 dataset
dtypes_2010 == ddf.dtypes

vendor_id             True
pickup_datetime       True
dropoff_datetime      True
passenger_count       True
trip_distance         True
pickup_longitude      True
pickup_latitude       True
rate_code             True
store_and_fwd_flag    True
dropoff_longitude     True
dropoff_latitude      True
payment_type          True
fare_amount           True
surcharge             True
mta_tax               True
tip_amount            True
tolls_amount          True
total_amount          True
dtype: bool

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/distributed/comm/tcp.py", line 409, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-dataframe/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the 

In [17]:
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/dask-book/nyc-tlc/2011",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=False,
)

[None]

In [12]:
ddf = dd.read_parquet("s3://coiled-datasets/nyc-tlc/2011", engine="pyarrow")

In [13]:
actual_dtypes_2011 = ddf.dtypes

In [15]:
type(actual_dtypes_2011)

pandas.core.series.Series

In [21]:
actual_dtypes_2011

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                      int64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [20]:
actual_dtypes_2011 == actual_dtypes_2010

vendor_id             True
pickup_datetime       True
dropoff_datetime      True
passenger_count       True
trip_distance         True
pickup_longitude      True
pickup_latitude       True
rate_code             True
store_and_fwd_flag    True
dropoff_longitude     True
dropoff_latitude      True
payment_type          True
fare_amount           True
surcharge             True
mta_tax               True
tip_amount            True
tolls_amount          True
total_amount          True
dtype: bool

## 2012 Data Create

In [22]:
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2012-*.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
    dtype={
        "tip_amount": "float64",
        "tolls_amount": "float64",
    },
)

In [23]:
ddf.dtypes

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                      int64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [24]:
ddf.dtypes == actual_dtypes_2010

vendor_id             True
pickup_datetime       True
dropoff_datetime      True
passenger_count       True
trip_distance         True
pickup_longitude      True
pickup_latitude       True
rate_code             True
store_and_fwd_flag    True
dropoff_longitude     True
dropoff_latitude      True
payment_type          True
fare_amount           True
surcharge             True
mta_tax               True
tip_amount            True
tolls_amount          True
total_amount          True
dtype: bool

In [25]:
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/nyc-tlc/2012",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=False,
)

[None]

## 2013 Data Create

In [26]:
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2013-*.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
    dtype={
        "tip_amount": "float64",
        "tolls_amount": "float64",
    },
)

In [27]:
ddf.dtypes == actual_dtypes_2010

vendor_id             True
pickup_datetime       True
dropoff_datetime      True
passenger_count       True
trip_distance         True
pickup_longitude      True
pickup_latitude       True
rate_code             True
store_and_fwd_flag    True
dropoff_longitude     True
dropoff_latitude      True
payment_type          True
fare_amount           True
surcharge             True
mta_tax               True
tip_amount            True
tolls_amount          True
total_amount          True
dtype: bool

In [28]:
len(ddf.dtypes)

18

In [29]:
ddf.dtypes

vendor_id                     object
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
trip_distance                float64
pickup_longitude             float64
pickup_latitude              float64
rate_code                      int64
store_and_fwd_flag            object
dropoff_longitude            float64
dropoff_latitude             float64
payment_type                  object
fare_amount                  float64
surcharge                    float64
mta_tax                      float64
tip_amount                   float64
tolls_amount                 float64
total_amount                 float64
dtype: object

In [30]:
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/nyc-tlc/2013",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=False,
)

[None]

## 2019 Data Create

In [4]:
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    dtype={
        "RatecodeID": "float64",
        "VendorID": "float64",
        "passenger_count": "float64",
        "payment_type": "object",
    },
)

In [6]:
ddf.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                     object
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

In [16]:
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/nyc-tlc/2019",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=False,
)

[None]

In [7]:
ddf.repartition(partition_size="100MB").to_parquet(
    "s3://coiled-datasets/nyc-tlc-with-metadata/2019",
    engine="pyarrow",
    compression="snappy",
    write_metadata_file=True,
)

[None]

## 2020 Data

In [17]:
ddf = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2020-*.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    dtype={
        "RatecodeID": "float64",
        "VendorID": "float64",
        "passenger_count": "float64",
        "payment_type": "object",
    },
)

In [18]:
ddf.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                     object
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                      int64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

## Data queries

In [86]:
ddf2009 = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc/2009",
    engine="pyarrow",
)

In [87]:
len(ddf2009)

OSError: [Errno 22] Bad Request

In [80]:
ddf2010 = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc/2010",
    engine="pyarrow",
)

In [81]:
ddf2011 = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc/2011",
    engine="pyarrow",
)

In [82]:
ddf2012 = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc/2012",
    engine="pyarrow",
)

In [83]:
ddf2013 = dd.read_parquet(
    "s3://coiled-datasets/nyc-tlc/2013",
    engine="pyarrow",
)

In [84]:
ddf = dd.concat([ddf2009, ddf2010, ddf2011, ddf2012, ddf2013])

In [85]:
len(ddf)

PermissionError: The provided token has expired.