In [7]:
from pathlib import Path

import pandas as pd
import polars as pl

## Requisitos
Para realizar estos ejercicios hay que descargar el dataset de Taxis de NYC (2018 Yellow Taxi Trip Data) y guardarlo como `../data/Taxi_Trip_Data.csv`:<br>
https://data.cityofnewyork.us/Transportation/2018-Yellow-Taxi-Trip-Data/t29m-gskq
<br><br>

También es posible descargar los datos directamente en formato Parquet:
<br>https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
<br><br>

Finalmente, también hay un dataset en Kaggle:<br>
https://www.kaggle.com/c/nyc-taxi-trip-duration

In [8]:
# Quick look at the raw CSV: load only the first 10 rows with pandas so the
# column layout and value formats can be inspected before the full conversion.
sample = pd.read_csv(
    "../data/Taxi_Trip_Data.csv",
    nrows=10,
)
sample

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,11/13/2018 07:45:26 AM,11/13/2018 07:57:39 AM,1,1.5,1,N,246,164,1,9.5,0,0.5,1.0,0.0,0.3,11.3
1,2,11/13/2018 07:24:47 AM,11/13/2018 07:29:45 AM,1,0.88,1,N,140,263,1,5.5,0,0.5,2.0,0.0,0.3,8.3
2,2,11/13/2018 07:31:34 AM,11/13/2018 07:36:04 AM,1,0.54,1,N,236,236,2,5.0,0,0.5,0.0,0.0,0.3,5.8
3,2,11/13/2018 07:36:59 AM,11/13/2018 07:51:07 AM,1,2.43,1,N,236,48,1,11.5,0,0.5,2.46,0.0,0.3,14.76
4,2,11/13/2018 07:53:40 AM,11/13/2018 08:21:53 AM,1,2.96,1,N,163,43,1,18.0,0,0.5,0.0,0.0,0.3,18.8
5,2,11/13/2018 07:26:53 AM,11/13/2018 07:30:55 AM,1,0.94,1,N,141,162,1,5.0,0,0.5,0.7,0.0,0.3,6.5
6,2,11/13/2018 07:44:07 AM,11/13/2018 08:24:08 AM,2,10.43,1,N,162,138,1,35.5,0,0.5,8.41,5.76,0.3,50.47
7,4,11/13/2018 07:11:46 AM,11/13/2018 07:23:47 AM,1,1.86,1,N,170,50,1,10.0,0,0.5,2.16,0.0,0.3,12.96
8,4,11/13/2018 07:26:25 AM,11/13/2018 07:29:20 AM,1,0.48,1,N,50,246,1,4.0,0,0.5,4.22,0.0,0.3,9.02
9,4,11/13/2018 07:34:29 AM,11/13/2018 07:46:43 AM,1,1.43,1,N,68,90,1,9.0,0,0.5,1.96,0.0,0.3,11.76


In [10]:
# Lazily scan the raw CSV and write it out as a Parquet file.
# infer_schema_length=0 disables type inference, so every column is read (and
# therefore stored) as Utf8; later cells cast the columns they need.
taxi_df = pl.scan_csv('../data/Taxi_Trip_Data.csv', infer_schema_length=0)

# The sink does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
Path("../output").mkdir(parents=True, exist_ok=True)

taxi_df.sink_parquet(
    "../output/taxi.parquet",
    compression="snappy",
    row_group_size=1_000_000,
)

In [11]:
# Show the schema of the lazily scanned CSV. Every column is reported as Utf8
# because the scan was created with infer_schema_length=0 (no type inference).
taxi_df.schema

{'VendorID': Utf8,
 'tpep_pickup_datetime': Utf8,
 'tpep_dropoff_datetime': Utf8,
 'passenger_count': Utf8,
 'trip_distance': Utf8,
 'RatecodeID': Utf8,
 'store_and_fwd_flag': Utf8,
 'PULocationID': Utf8,
 'DOLocationID': Utf8,
 'payment_type': Utf8,
 'fare_amount': Utf8,
 'extra': Utf8,
 'mta_tax': Utf8,
 'tip_amount': Utf8,
 'tolls_amount': Utf8,
 'improvement_surcharge': Utf8,
 'total_amount': Utf8}

In [12]:
# Read only the schema of the written Parquet file to confirm which column
# types were actually persisted.
schema = pl.read_parquet_schema(
    "../output/taxi.parquet"
)
schema

{'VendorID': Utf8,
 'tpep_pickup_datetime': Utf8,
 'tpep_dropoff_datetime': Utf8,
 'passenger_count': Utf8,
 'trip_distance': Utf8,
 'RatecodeID': Utf8,
 'store_and_fwd_flag': Utf8,
 'PULocationID': Utf8,
 'DOLocationID': Utf8,
 'payment_type': Utf8,
 'fare_amount': Utf8,
 'extra': Utf8,
 'mta_tax': Utf8,
 'tip_amount': Utf8,
 'tolls_amount': Utf8,
 'improvement_surcharge': Utf8,
 'total_amount': Utf8}

In [16]:
# Grand total of `total_amount` across the whole Parquet file.
# The column is stored as Utf8, so it is cast to Float64 before summing.
taxi_total_amount = (
    pl.scan_parquet("../output/taxi.parquet")
    .select(pl.col("total_amount").cast(pl.Float64))
    .sum()
    .collect()
)

taxi_total_amount

total_amount
f64
1837900000.0


In [20]:
# Sum over all trips of (total_amount - tip_amount), i.e. the amount charged
# excluding tips. Both columns are stored as Utf8 and are cast to Float64 first.
# NOTE(review): the output alias "tolls_amount2" is kept unchanged, but the
# value is total minus tip rather than a tolls figure — consider renaming.
(
    pl.scan_parquet("../output/taxi.parquet")
    .select(
        (
            pl.col("total_amount").cast(pl.Float64)
            - pl.col("tip_amount").cast(pl.Float64)
        ).alias("tolls_amount2")
    )
    .sum()
    .collect()
)

tolls_amount2
f64
1627700000.0
