In [1]:
import polars as pl
pl.Config.set_tbl_rows(15) #Esta config controla la cantidad de filas en print y display en Jupyter
pl.Config.set_tbl_cols(50) #Cantidad de columnas a desplegar
pl.Config.set_fmt_str_lengths(100) #Longitud de las cadenas a desplegar

polars.cfg.Config

In [2]:
taxi_sample = pl.read_parquet('../output/taxi.parquet', n_rows=1000000)
taxi_sample.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""11/13/2018 07:45:26 AM""","""11/13/2018 07:57:39 AM""","""1""","""1.5""","""1""","""N""","""246""","""164""","""1""","""9.5""","""0""","""0.5""","""1""","""0""","""0.3""","""11.3"""
"""2""","""11/13/2018 07:24:47 AM""","""11/13/2018 07:29:45 AM""","""1""","""0.88""","""1""","""N""","""140""","""263""","""1""","""5.5""","""0""","""0.5""","""2""","""0""","""0.3""","""8.3"""
"""2""","""11/13/2018 07:31:34 AM""","""11/13/2018 07:36:04 AM""","""1""","""0.54""","""1""","""N""","""236""","""236""","""2""","""5""","""0""","""0.5""","""0""","""0""","""0.3""","""5.8"""
"""2""","""11/13/2018 07:36:59 AM""","""11/13/2018 07:51:07 AM""","""1""","""2.43""","""1""","""N""","""236""","""48""","""1""","""11.5""","""0""","""0.5""","""2.46""","""0""","""0.3""","""14.76"""
"""2""","""11/13/2018 07:53:40 AM""","""11/13/2018 08:21:53 AM""","""1""","""2.96""","""1""","""N""","""163""","""43""","""1""","""18""","""0""","""0.5""","""0""","""0""","""0.3""","""18.8"""


In [3]:
def drop_columns(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.select(
        pl.exclude('RatecodeID','store_and_fwd_flag','fare_amount','extra','mta_tax','tolls_amount','improvement_surcharge')
    )
    return dataframe

In [4]:
def rename_columns(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.rename({
        'VendorID' : 'vendor_id',
        'tpep_pickup_datetime' : 'pickup_datetime',
        'tpep_dropoff_datetime' : 'dropoff_datetime',
        'passenger_count' : 'passenger_count',
        'trip_distance' : 'trip_distance',
        'PULocationID' : 'pickup_location_id',
        'DOLocationID' : 'dropoff_location_id',
        'payment_type' : 'payment_type',
        'tip_amount' : 'tip_amount',
        'total_amount' : 'total_amount'
    })
    return dataframe

In [5]:
def cast_column_types(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.with_columns(
        pl.col("^.*amount$|^.*distance$").cast(pl.Float64),
        pl.col("^.*id$|^.*count$|^.*type$").cast(pl.Int32),
        pl.col("^.*datetime$").str.strptime(pl.Datetime, "%m/%d/%Y %I:%M:%S %p",strict=False)
    )     
    return dataframe

In [35]:
def fix_bad_values(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.with_columns(
        
        pl.col('trip_distance').clip(0,80).keep_name(),
        pl.col('passenger_count').clip(0,9).keep_name(),
        #pl.col('tip_amount').clip(0, pl.col('tip_amount').quantile(0.9999)).keep_name(),
        #pl.col('total_amount').clip(0, pl.col('tip_amount').quantile(0.9999)).keep_name(),
        pl.when(pl.col("^.*datetime$").dt.year() != 2018)
            .then(pl.col("^.*datetime$").dt.strftime('2018/%m/%d %H:%M:%S').str.strptime(pl.Datetime, '%Y/%m/%d %H:%M:%S'))
            .otherwise(pl.col("^.*datetime$"))
            .keep_name(),            
    )
    return dataframe
         

In [32]:
def calculate_columns(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.with_columns(
        trip_duration = (pl.col("dropoff_datetime") - pl.col("pickup_datetime")).dt.minutes().clip(0,180),        
    ) \
    .with_columns(
        trip_speed = pl.when((pl.col('trip_duration') > 1) & (pl.col('trip_distance') > 0.1))
          .then((pl.col('trip_distance') * pl.lit(1.60934) * pl.lit(60)) / pl.col('trip_duration'))
          .otherwise(None)
    )
    return dataframe

In [36]:
taxi_sample_clean = (taxi_sample.lazy()
            .pipe(drop_columns)
            .pipe(rename_columns)
            .pipe(cast_column_types)
            .pipe(fix_bad_values)
            .pipe(calculate_columns)         
)
taxi_sample_clean.collect().head()

thread '<unnamed>' panicked at 'called `Result::unwrap()` on an `Err` value: PyErr { type: <class 'RuntimeError'>, value: RuntimeError('BindingsError: "object type not supported <polars.internals.expr.expr.Expr object at 0x15e98f640>"'), traceback: None }', src/lazy/dsl.rs:402:53


PanicException: called `Result::unwrap()` on an `Err` value: PyErr { type: <class 'RuntimeError'>, value: RuntimeError('BindingsError: "object type not supported <polars.internals.expr.expr.Expr object at 0x15e98f640>"'), traceback: None }

In [34]:
taxi_sample_clean.collect().describe()

describe,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,payment_type,tip_amount,total_amount,trip_duration,trip_speed
str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1000000.0,"""1000000""","""1000000""",1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
"""null_count""",0.0,"""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23154.0
"""mean""",1.628152,,,1.535628,2.899356,166.266773,164.578035,1.278467,2.141617,17.791381,17.036333,16.39244
"""std""",0.57436,,,1.209576,3.88934,66.421972,69.432221,0.468597,3.087204,16.404734,18.056075,10.70029
"""min""",1.0,"""2018-01-01 00:05:38.000000""","""2018-01-01 04:50:14.000000""",0.0,0.0,1.0,1.0,1.0,-11.46,-225.3,0.0,0.059009
"""max""",4.0,"""2018-12-31 23:03:08.000000""","""2018-12-31 19:03:26.000000""",9.0,80.0,265.0,265.0,4.0,333.0,700.3,180.0,2574.944
"""median""",2.0,,,1.0,1.51,162.0,162.0,1.0,1.55,12.74,12.0,14.112674


In [24]:
taxi_clean = pl.scan_parquet('../output/taxi.parquet') \
            .pipe(drop_columns) \
            .pipe(rename_columns) \
            .pipe(cast_column_types) \
            .pipe(fix_bad_values) \
            .pipe(calculate_columns)

In [None]:
taxi_clean = taxi_clean.collect(streaming=True)

In [None]:
taxi_clean.describe()

describe,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra_amount,mobility_tax_amount,tip_amount,tolls_amount,improvement_surcharge_amount,total_amount,trip_duration
str,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""count""",112234626.0,"""112234626""","""112234626""",112234626.0,112234626.0,112234626.0,"""112234626""",112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,"""112234626"""
"""null_count""",0.0,"""0""","""0""",0.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",1.589763,,,1.595515,2.92944,1.049446,,163.179016,161.402084,1.313831,13.026152,0.331008,0.497152,1.872474,0.345069,0.299622,16.37517,"""1011062109"""
"""std""",0.517911,,,1.241769,18.337395,0.738067,,66.540181,70.424256,0.4844,147.031043,0.465949,0.052183,2.607947,1.781228,0.604608,147.279794,
"""min""",1.0,"""2001-01-01 00:01:48.000000""","""1926-08-10 21:42:11.000000""",0.0,0.0,1.0,"""N""",1.0,1.0,1.0,-800.0,-80.0,-0.5,-322.42,-52.5,-0.3,-800.3,"""-2911502804000000"""
"""max""",4.0,"""2084-11-04 12:32:24.000000""","""2084-11-04 12:47:41.000000""",192.0,189483.84,99.0,"""Y""",265.0,265.0,5.0,907070.24,96.64,150.0,945.97,1650.0,4000.3,907071.04,"""1730218000000"""
"""median""",2.0,,,1.0,1.6,1.0,,162.0,162.0,1.0,9.5,0.0,0.5,1.4,0.0,0.3,11.8,


In [37]:
taxi_clean.filter(pl.col('trip_duration')<0).height

AttributeError: 'LazyFrame' object has no attribute 'height'

In [None]:
taxi_clean.select(pl.all()).filter(pl.col('trip_duration')<0).height

1047

In [None]:
taxi_clean.filter((pl.col('dropoff_datetime').dt.year()>2023) | (pl.col('dropoff_datetime').dt.year()<2017))

vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra_amount,mobility_tax_amount,tip_amount,tolls_amount,improvement_surcharge_amount,total_amount,trip_duration
i32,datetime[μs],datetime[μs],i32,f64,i32,str,i32,i32,i32,f64,f64,f64,f64,f64,f64,f64,duration[μs]
2,2008-12-31 11:00:13,2008-12-31 11:29:13,1,2.87,1,"""N""",142,75,1,18.0,0.0,0.5,4.7,0.0,0.3,23.5,29m
2,2008-12-31 18:40:01,2008-12-31 19:03:26,2,2.12,1,"""N""",246,229,1,14.5,1.0,0.5,3.26,0.0,0.3,19.56,23m 25s
1,2018-11-13 20:28:55,1926-08-10 21:42:11,1,2.5,1,"""Y""",79,162,1,10.5,0.5,0.5,1.18,0.0,0.3,12.98,-33697d -22h -46m -44s
2,2009-01-01 13:12:35,2009-01-01 13:44:35,1,9.46,1,"""N""",66,262,1,30.0,0.0,0.5,2.58,0.0,0.3,33.38,32m
2,2009-01-01 00:05:38,2009-01-01 17:46:30,1,0.0,1,"""N""",193,193,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17h 40m 52s
2,2009-01-01 14:33:53,2009-01-01 15:03:53,1,2.59,1,"""N""",237,230,2,18.0,0.0,0.5,0.0,0.0,0.3,18.8,30m
2,2008-12-31 23:03:08,2009-01-01 04:50:14,1,1.48,1,"""N""",170,229,1,6.0,0.5,0.5,2.19,0.0,0.3,9.49,5h 47m 6s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,2009-01-01 00:24:52,2009-01-01 00:38:45,1,1.24,1,"""N""",151,236,2,7.0,0.0,0.5,0.0,0.0,0.3,7.8,13m 53s
2,2008-12-31 07:13:31,2008-12-31 07:20:31,1,1.16,1,"""N""",140,229,1,6.5,0.0,0.5,1.46,0.0,0.3,8.76,7m


In [None]:
taxi_clean.select(pl.all().null_count())

vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra_amount,mobility_tax_amount,tip_amount,tolls_amount,improvement_surcharge_amount,total_amount,trip_duration
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
taxi_clean.select(pl.col('passenger_count').value_counts()).unnest('passenger_count')

passenger_count,counts
i32,u32
1,79786664
0,1003298
2,16468127
3,4684094
6,3040893
9,290
8,349
96,1
4,2209579
192,1


In [None]:
taxi_clean.select(pl.col('dropoff_datetime').dt.year().value_counts()).unnest('dropoff_datetime').sort(by='counts')

dropoff_datetime,counts
i32,u32
1926,1
1998,1
2021,1
2032,1
2053,1
2041,1
2029,2
...,...
2084,16
2003,30
