In [23]:
import polars as pl
# Display settings applied to every Polars frame rendered in this notebook.
pl.Config.set_tbl_rows(15) # Number of rows shown by print and display in Jupyter
pl.Config.set_tbl_cols(50) # Number of columns to display
pl.Config.set_fmt_str_lengths(100) # Length of the strings to display

polars.cfg.Config

In [24]:
# Load at most the first 10000 rows of the parquet file, then preview them.
taxi_sample = pl.read_parquet(
    source='../output/taxi.parquet',
    n_rows=10000,
)
taxi_sample.head(5)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""11/13/2018 07:45:26 AM""","""11/13/2018 07:57:39 AM""","""1""","""1.5""","""1""","""N""","""246""","""164""","""1""","""9.5""","""0""","""0.5""","""1""","""0""","""0.3""","""11.3"""
"""2""","""11/13/2018 07:24:47 AM""","""11/13/2018 07:29:45 AM""","""1""","""0.88""","""1""","""N""","""140""","""263""","""1""","""5.5""","""0""","""0.5""","""2""","""0""","""0.3""","""8.3"""
"""2""","""11/13/2018 07:31:34 AM""","""11/13/2018 07:36:04 AM""","""1""","""0.54""","""1""","""N""","""236""","""236""","""2""","""5""","""0""","""0.5""","""0""","""0""","""0.3""","""5.8"""
"""2""","""11/13/2018 07:36:59 AM""","""11/13/2018 07:51:07 AM""","""1""","""2.43""","""1""","""N""","""236""","""48""","""1""","""11.5""","""0""","""0.5""","""2.46""","""0""","""0.3""","""14.76"""
"""2""","""11/13/2018 07:53:40 AM""","""11/13/2018 08:21:53 AM""","""1""","""2.96""","""1""","""N""","""163""","""43""","""1""","""18""","""0""","""0.5""","""0""","""0""","""0.3""","""18.8"""


In [40]:
def rename_columns(dataframe: pl.DataFrame) -> pl.DataFrame:
    """Return the frame with its source column names replaced by snake_case names.

    Every one of the 17 source columns is listed, including those whose name
    does not change, so the mapping doubles as the expected input schema.

    Args:
        dataframe: Frame carrying the original column names (the keys below).

    Returns:
        The result of ``dataframe.rename`` applied with the mapping.
    """
    # Original name -> new name.
    column_map = {
        'VendorID': 'vendor_id',
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'passenger_count': 'passenger_count',
        'trip_distance': 'trip_distance',
        'RatecodeID': 'rate_code_id',
        'store_and_fwd_flag': 'flag',
        'PULocationID': 'pickup_location_id',
        'DOLocationID': 'dropoff_location_id',
        'payment_type': 'payment_type',
        'fare_amount': 'fare_amount',
        'extra': 'extra_amount',
        'mta_tax': 'mobility_tax_amount',
        'tip_amount': 'tip_amount',
        'tolls_amount': 'tolls_amount',
        'improvement_surcharge': 'improvement_surcharge_amount',
        'total_amount': 'total_amount',
    }
    return dataframe.rename(column_map)

In [36]:
def cast_column_types(dataframe: pl.DataFrame) -> pl.DataFrame:
    """Convert the string columns of the renamed frame to numeric and datetime types.

    Columns are selected by regular expression on their name, so this expects
    the snake_case names produced by ``rename_columns``:

    * names ending in ``amount`` or ``distance`` are cast to ``Float64``;
    * names ending in ``id``, ``count`` or ``type`` are cast to ``Int32``;
    * names ending in ``datetime`` are parsed as 12-hour timestamps such as
      ``"11/13/2018 07:45:26 AM"``.

    Columns matching none of the patterns are left unchanged.

    Args:
        dataframe: Frame whose matching columns hold string values.

    Returns:
        A frame with the matching columns replaced by their typed versions.
        Because parsing is non-strict, a timestamp that does not match the
        format becomes null instead of raising.
    """
    dataframe = dataframe.with_columns(
        pl.col("^.*amount$|^.*distance$").cast(pl.Float64),
        pl.col("^.*id$|^.*count$|^.*type$").cast(pl.Int32),
        # The source uses a 12-hour clock with an AM/PM marker, so the hour
        # directive must be %I. The 24-hour %H combined with %p either ignores
        # the marker or fails on afternoon values, which strict=False would
        # silently turn into nulls.
        pl.col("^.*datetime$").str.strptime(pl.Datetime, "%m/%d/%Y %I:%M:%S %p", strict=False)
    )
    return dataframe

In [38]:
def calculate_columns(dataframe: pl.DataFrame) -> pl.DataFrame:
    """Add a ``trip_duration`` column: drop-off time minus pick-up time.

    Args:
        dataframe: Frame with ``pickup_datetime`` and ``dropoff_datetime`` columns.

    Returns:
        The frame with the additional ``trip_duration`` column.
    """
    elapsed = pl.col("dropoff_datetime") - pl.col("pickup_datetime")
    return dataframe.with_columns(elapsed.alias("trip_duration"))

In [41]:
# Run the three preparation steps in order. rename_columns has to come first:
# the name patterns in cast_column_types and the columns read by
# calculate_columns refer to the renamed columns, and the duration is computed
# from the datetimes parsed in the casting step.
taxi = (taxi_sample
            .pipe(rename_columns)
            .pipe(cast_column_types)
            .pipe(calculate_columns)
)
# Preview the first rows of the prepared frame.
taxi.head()

vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra_amount,mobility_tax_amount,tip_amount,tolls_amount,improvement_surcharge_amount,total_amount,trip_duration
i32,datetime[μs],datetime[μs],i32,f64,i32,str,i32,i32,i32,f64,f64,f64,f64,f64,f64,f64,duration[μs]
1,2018-11-13 07:45:26,2018-11-13 07:57:39,1,1.5,1,"""N""",246,164,1,9.5,0.0,0.5,1.0,0.0,0.3,11.3,12m 13s
2,2018-11-13 07:24:47,2018-11-13 07:29:45,1,0.88,1,"""N""",140,263,1,5.5,0.0,0.5,2.0,0.0,0.3,8.3,4m 58s
2,2018-11-13 07:31:34,2018-11-13 07:36:04,1,0.54,1,"""N""",236,236,2,5.0,0.0,0.5,0.0,0.0,0.3,5.8,4m 30s
2,2018-11-13 07:36:59,2018-11-13 07:51:07,1,2.43,1,"""N""",236,48,1,11.5,0.0,0.5,2.46,0.0,0.3,14.76,14m 8s
2,2018-11-13 07:53:40,2018-11-13 08:21:53,1,2.96,1,"""N""",163,43,1,18.0,0.0,0.5,0.0,0.0,0.3,18.8,28m 13s
