In [2]:
import polars as pl
pl.Config.set_tbl_rows(15) #Esta config controla la cantidad de filas en print y display en Jupyter
pl.Config.set_tbl_cols(50) #Cantidad de columnas a desplegar
pl.Config.set_fmt_str_lengths(100) #Longitud de las cadenas a desplegar

polars.cfg.Config

In [3]:
taxi_sample = pl.read_parquet('../output/taxi.parquet', n_rows=1000000)
taxi_sample.head()

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""11/13/2018 07:45:26 AM""","""11/13/2018 07:57:39 AM""","""1""","""1.5""","""1""","""N""","""246""","""164""","""1""","""9.5""","""0""","""0.5""","""1""","""0""","""0.3""","""11.3"""
"""2""","""11/13/2018 07:24:47 AM""","""11/13/2018 07:29:45 AM""","""1""","""0.88""","""1""","""N""","""140""","""263""","""1""","""5.5""","""0""","""0.5""","""2""","""0""","""0.3""","""8.3"""
"""2""","""11/13/2018 07:31:34 AM""","""11/13/2018 07:36:04 AM""","""1""","""0.54""","""1""","""N""","""236""","""236""","""2""","""5""","""0""","""0.5""","""0""","""0""","""0.3""","""5.8"""
"""2""","""11/13/2018 07:36:59 AM""","""11/13/2018 07:51:07 AM""","""1""","""2.43""","""1""","""N""","""236""","""48""","""1""","""11.5""","""0""","""0.5""","""2.46""","""0""","""0.3""","""14.76"""
"""2""","""11/13/2018 07:53:40 AM""","""11/13/2018 08:21:53 AM""","""1""","""2.96""","""1""","""N""","""163""","""43""","""1""","""18""","""0""","""0.5""","""0""","""0""","""0.3""","""18.8"""


In [4]:
def drop_columns(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.select(
        pl.exclude('RatecodeID','store_and_fwd_flag','fare_amount','extra','mta_tax','tolls_amount','improvement_surcharge')
    )
    return dataframe

In [5]:
def rename_columns(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.rename({
        'VendorID' : 'vendor_id',
        'tpep_pickup_datetime' : 'pickup_datetime',
        'tpep_dropoff_datetime' : 'dropoff_datetime',
        'passenger_count' : 'passenger_count',
        'trip_distance' : 'trip_distance',
        'PULocationID' : 'pickup_location_id',
        'DOLocationID' : 'dropoff_location_id',
        'payment_type' : 'payment_type',
        'tip_amount' : 'tip_amount',
        'total_amount' : 'total_amount'
    })
    return dataframe

In [6]:
def cast_column_types(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.with_columns(
        pl.col("^.*amount$|^.*distance$").cast(pl.Float64),
        pl.col("^.*id$|^.*count$|^.*type$").cast(pl.Int32),
        pl.col("^.*datetime$").str.strptime(pl.Datetime, "%m/%d/%Y %I:%M:%S %p",strict=False)
    )     
    return dataframe

In [20]:
def fix_bad_values(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    quantile_tip_amount = dataframe.select(pl.col('tip_amount').quantile(0.9999)).collect().item()
    quantile_total_amount = dataframe.select(pl.col('total_amount').quantile(0.9999)).collect().item()
    dataframe = dataframe.with_columns(
        
        pl.col('trip_distance').clip(0,80).keep_name(),
        pl.col('passenger_count').clip(0,9).keep_name(),
        pl.col('tip_amount').clip(0, quantile_tip_amount).keep_name(),
        pl.col('total_amount').clip(0, quantile_total_amount).keep_name(),
        pl.when(pl.col("^.*datetime$").dt.year() != 2018)
            .then(pl.col("^.*datetime$").dt.strftime('2018/%m/%d %H:%M:%S').str.strptime(pl.Datetime, '%Y/%m/%d %H:%M:%S'))
            .otherwise(pl.col("^.*datetime$"))
            .keep_name(),            
    )
    return dataframe
         

In [8]:
def calculate_columns(dataframe: pl.LazyFrame) -> pl.LazyFrame:
    dataframe = dataframe.with_columns(
        trip_duration = (pl.col("dropoff_datetime") - pl.col("pickup_datetime")).dt.minutes().clip(0,180),        
    ) \
    .with_columns(
        trip_speed = pl.when((pl.col('trip_duration') > 1) & (pl.col('trip_distance') > 0.1))
          .then((pl.col('trip_distance') * pl.lit(1.60934) * pl.lit(60)) / pl.col('trip_duration'))
          .otherwise(None)
    )
    return dataframe

In [21]:
taxi_sample_clean = (taxi_sample.lazy()
            .pipe(drop_columns)
            .pipe(rename_columns)
            .pipe(cast_column_types)
            .pipe(fix_bad_values)
            .pipe(calculate_columns)         
)
taxi_sample_clean.collect().head()

vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,payment_type,tip_amount,total_amount,trip_duration,trip_speed
i32,datetime[μs],datetime[μs],i32,f64,i32,i32,i32,f64,f64,i64,f64
1,2018-11-13 07:45:26,2018-11-13 07:57:39,1,1.5,246,164,1,1.0,11.3,12,12.07005
2,2018-11-13 07:24:47,2018-11-13 07:29:45,1,0.88,140,263,1,2.0,8.3,4,21.243288
2,2018-11-13 07:31:34,2018-11-13 07:36:04,1,0.54,236,236,2,0.0,5.8,4,13.035654
2,2018-11-13 07:36:59,2018-11-13 07:51:07,1,2.43,236,48,1,2.46,14.76,14,16.760127
2,2018-11-13 07:53:40,2018-11-13 08:21:53,1,2.96,163,43,1,0.0,18.8,28,10.207814


In [22]:
taxi_sample_clean.collect().describe()

describe,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,payment_type,tip_amount,total_amount,trip_duration,trip_speed
str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1000000.0,"""1000000""","""1000000""",1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
"""null_count""",0.0,"""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23154.0
"""mean""",1.628152,,,1.535628,2.899356,166.266773,164.578035,1.278467,2.137468,17.795293,17.036333,16.39244
"""std""",0.57436,,,1.209576,3.88934,66.421972,69.432221,0.468597,2.940436,16.233516,18.056075,10.70029
"""min""",1.0,"""2018-01-01 00:05:38.000000""","""2018-01-01 04:50:14.000000""",0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.059009
"""max""",4.0,"""2018-12-31 23:03:08.000000""","""2018-12-31 19:03:26.000000""",9.0,80.0,265.0,265.0,4.0,62.27,287.27,180.0,2574.944
"""median""",2.0,,,1.0,1.51,162.0,162.0,1.0,1.55,12.74,12.0,14.112674


In [23]:
taxi_clean = pl.scan_parquet('../output/taxi.parquet') \
            .pipe(drop_columns) \
            .pipe(rename_columns) \
            .pipe(cast_column_types) \
            .pipe(fix_bad_values) \
            .pipe(calculate_columns)

In [24]:
taxi_clean = taxi_clean.collect(streaming=True)

In [25]:
taxi_clean.describe()

describe,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,payment_type,tip_amount,total_amount,trip_duration,trip_speed
str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",112234626.0,"""112234626""","""112234626""",112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0,112234626.0
"""null_count""",0.0,"""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2694977.0
"""mean""",1.589763,,,1.595513,2.926978,163.179016,161.402084,1.313831,1.86961,16.322072,14.184362,19.134251
"""std""",0.517911,,,1.241607,3.777462,66.540181,70.424256,0.4844,2.503802,14.151166,14.133129,10.547828
"""min""",1.0,"""2018-01-01 00:00:00.000000""","""2018-01-01 00:00:00.000000""",0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.059009
"""max""",4.0,"""2018-12-31 23:59:59.000000""","""2018-12-31 23:59:59.000000""",9.0,80.0,265.0,265.0,5.0,45.5,232.86,180.0,3862.416
"""median""",2.0,,,1.0,1.6,162.0,162.0,1.0,1.4,11.8,11.0,16.89807


In [26]:
taxi_clean.filter(pl.col('trip_duration')<0).height

0

In [28]:
taxi_clean.filter((pl.col('dropoff_datetime').dt.year()>2023) | (pl.col('dropoff_datetime').dt.year()<2017))

vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,payment_type,tip_amount,total_amount,trip_duration,trip_speed
i32,datetime[μs],datetime[μs],i32,f64,i32,i32,i32,f64,f64,i64,f64


In [29]:
taxi_clean.select(pl.all().null_count())

vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,payment_type,tip_amount,total_amount,trip_duration,trip_speed
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,2694977


In [31]:
taxi_clean.select(pl.col('passenger_count').value_counts()).unnest('passenger_count').sort('passenger_count')

passenger_count,counts
i32,u32
0,1003298
1,79786664
2,16468127
3,4684094
4,2209579
5,5040905
6,3040893
7,425
8,349
9,292


In [32]:
taxi_clean.select(pl.col('dropoff_datetime').dt.year().value_counts()).unnest('dropoff_datetime')

dropoff_datetime,counts
i32,u32
2018,112234626


In [44]:
taxi_stats = (taxi_clean.lazy()
    .sort('pickup_datetime')
    .groupby_dynamic(
        'pickup_datetime', 
        every='1d',
        closed='both',
        by='vendor_id')
    .agg(
        pl.count().alias('trips_count'),
        pl.col(pl.NUMERIC_DTYPES).sum().suffix('_sum'),
        pl.col(pl.NUMERIC_DTYPES).mean().suffix('_mean'),
        pl.col(pl.NUMERIC_DTYPES).median().suffix('_median')       
    )
)

In [45]:
taxi_stats.select(
    'vendor_id',
    pl.col('pickup_datetime').cast(pl.Date),
    'trips_count',
    pl.col(r'^.*sum$')
).collect()

vendor_id,pickup_datetime,trips_count,passenger_count_sum,trip_distance_sum,pickup_location_id_sum,dropoff_location_id_sum,payment_type_sum,tip_amount_sum,total_amount_sum,trip_duration_sum,trip_speed_sum
i32,date,u32,i32,f64,i32,i32,i32,f64,f64,i64,f64
2,2017-12-31,5,5,48.36,1105,492,6,25.89,203.67,0,
2,2018-01-01,141708,270950,488256.77,22373769,21978156,200804,239081.62,2.3419e6,1757068,3.4909e6
2,2018-01-02,134594,257633,424216.99,21924065,21655690,185176,231022.7,2.1644e6,1696152,2.8857e6
2,2018-01-03,148674,282689,430242.46,24303628,24031963,200713,256164.56,2.3414e6,1989931,2.8654e6
2,2018-01-04,73431,139188,192916.33,11881249,11740675,101262,114500.22,1.0728e6,938488,1.3696e6
2,2018-01-05,152395,291154,405219.97,24793017,24450355,201491,263773.72,2.3535e6,2136202,2.5979e6
2,2018-01-06,160673,309555,457802.46,25994157,25589999,214933,267182.92,2.4334e6,2031899,3.1864e6
...,...,...,...,...,...,...,...,...,...,...,...
4,2018-12-25,1511,1586,5009.36,248002,240663,2243,2361.26,24090.42,17329,36024.474564
4,2018-12-26,2343,2491,7065.0,385630,378264,3367,3909.92,38405.93,30748,47332.921475
