## Pandas vs Polars vs FireDucks

This notebook is contributed by [Mr. Avi Chawla](https://www.linkedin.com/in/avi-chawla/), Co-founder @ [Daily Dose of Data Science](https://www.dailydoseofds.com/) 

We would like to express our deepest gratitude for his kind contribution.


In [1]:
!pip install -q -U fireducks polars linetimer

In [2]:
# download the dataset:
!wget -q https://modin-datasets.s3.amazonaws.com/testing/yellow_tripdata_2015-01.csv

In [3]:
import polars as pl
# Lazily scan the downloaded CSV, stack 20 references to that same lazy frame,
# then materialize the concatenation and write it out as one parquet file.
df = pl.scan_csv("yellow_tripdata_2015-01.csv")
big_df = pl.concat([df] * 20)
big_df.collect().write_parquet("taxi.parquet")

In [4]:
# confirm that taxi.parquet was written and show its on-disk size
!ls -lah | grep taxi

-rw-r--r-- 1 sourav scaleup 613M  1月 13 15:49 taxi.parquet


In [5]:
import platform, psutil
# Report the machine the benchmark runs on: a banner line followed by one
# "label: value" line per property.
banner = "=" * 30
print(banner, "Evaluation Environment Information", banner)
for label, value in (
    ("platform", platform.system()),
    ("architecture", platform.machine()),
    ("processor", platform.processor()),
    ("cpu", psutil.cpu_count()),
):
    print(f'{label}: {value}')

platform: Linux
architecture: x86_64
processor: x86_64
cpu: 128


# Pandas


In [6]:
# defining query to be performed on pandas DataFrame

from linetimer import CodeTimer

def pandas_query(key):
  """Time a read + group-by + mean aggregation over taxi.parquet.

  Reads "taxi.parquet", groups the rows by ``key`` and computes the per-group
  mean of mta_tax, tip_amount, tolls_amount and trip_distance. CodeTimer
  reports the elapsed time of the whole pipeline in seconds.

  NOTE: ``pd`` is looked up in the notebook globals when the function is
  called, so the query runs on whichever module is bound to ``pd`` at that
  moment.

  Args:
    key: name of the column to group by (e.g. "PULocationID").

  Returns:
    The aggregated frame, indexed by ``key``, with the columns mean_mta_tax,
    mean_tip_amount, mean_tolls_amount and mean_trip_distance.
  """
  # Plain "{key}" (no leading "$") so the timer label has the same format as
  # the FireDucks and Polars timers.
  with CodeTimer(name=f"Overall execution for {key} using {pd.__name__}", unit="s"):
    res = (
        pd.read_parquet("taxi.parquet")
        .groupby(key)
        .agg(
            mean_mta_tax=("mta_tax", "mean"),
            mean_tip_amount=("tip_amount", "mean"),
            mean_tolls_amount=("tolls_amount", "mean"),
            mean_trip_distance=("trip_distance", "mean"),
        )
    )
    return res

In [7]:
import pandas as pd
# NOTE: pandas_query() resolves the global name `pd` at call time. A later cell
# rebinds `pd` to fireducks.pandas, so the pandas baseline cells must be run
# before that cell to actually measure stock pandas.
pd.__version__

'2.2.3'

In [8]:
# pandas baseline, grouped by PULocationID
pandas_query("PULocationID")

Code block 'Overall execution for $PULocationID using pandas' took: 10.41286 s


Unnamed: 0_level_0,mean_mta_tax,mean_tip_amount,mean_tolls_amount,mean_trip_distance
PULocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.104895,5.670350,2.147483,1.645385
2,0.500000,5.030000,1.332500,10.280000
3,0.500000,1.368750,1.332500,9.281250
4,0.497843,1.401985,0.066277,2.877373
6,0.500000,0.000000,21.320000,36.700000
...,...,...,...,...
261,0.496182,1.193111,0.162796,3.924002
262,0.498785,1.318876,0.184178,2.736880
263,0.499182,1.282391,0.136376,2.596841
264,0.487009,1.526466,0.239003,2.742721


In [9]:
# pandas baseline, grouped by DOLocationID
pandas_query("DOLocationID")

Code block 'Overall execution for $DOLocationID using pandas' took: 9.12239 s


Unnamed: 0_level_0,mean_mta_tax,mean_tip_amount,mean_tolls_amount,mean_trip_distance
DOLocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.012980,6.649897,12.258724,17.091866
2,0.500000,3.894000,3.198000,13.406000
3,0.500000,3.138618,1.753171,14.017805
4,0.499095,1.294488,0.057856,2.517735
5,0.500000,1.992222,12.678889,26.704444
...,...,...,...,...
261,0.499016,1.467040,0.137750,4.153200
262,0.499334,1.409528,0.271510,2.807141
263,0.499432,1.379754,0.211886,2.671808
264,0.485516,1.449961,0.269229,2.553356


# FireDucks

In [10]:
# defining query to be performed for FireDucks DataFrame (exact same pandas query with _evaluate to trigger execution)

from linetimer import CodeTimer

def fireducks_query(key):
  """Run the benchmark aggregation through whichever module is bound to ``pd``.

  Same read -> group-by -> mean pipeline as the pandas query, followed by
  ``_evaluate()`` so that execution is triggered inside the timed region.

  Args:
    key: name of the column to group by (e.g. "PULocationID").

  Returns:
    The result of calling ``_evaluate()`` on the aggregated frame.
  """
  with CodeTimer(name=f"Overall execution for {key} using {pd.__name__}", unit="s"):
    # output column -> (source column, aggregation), passed as named aggregations
    mean_specs = {
        "mean_mta_tax": ("mta_tax", "mean"),
        "mean_tip_amount": ("tip_amount", "mean"),
        "mean_tolls_amount": ("tolls_amount", "mean"),
        "mean_trip_distance": ("trip_distance", "mean"),
    }
    # Kept as one chained expression so no intermediate frame stays referenced.
    return (
        pd.read_parquet("taxi.parquet")
        .groupby(key)
        .agg(**mean_specs)
        ._evaluate()
    )

In [11]:
# Make __version__ report the actual FireDucks version
# (presumably it would otherwise report a pandas-compatible version — TODO confirm).
from fireducks.core import set_fireducks_option
set_fireducks_option("fireducks-version", True)

In [12]:
# Load the FireDucks IPython extension (presumably what provides the
# %%fireducks.profile cell magic used in the cells below).
%load_ext fireducks.ipyext
# From here on the global name `pd` refers to fireducks.pandas, not stock pandas.
import fireducks.pandas as pd
pd.__version__

'1.1.6'

In [13]:
%%fireducks.profile
# the cell magic adds a per-operation timing table after the query result
fireducks_query("PULocationID")  # exact same pandas code, but much faster

Code block 'Overall execution for PULocationID using fireducks.pandas' took: 0.21669 s


Unnamed: 0_level_0,mean_mta_tax,mean_tip_amount,mean_tolls_amount,mean_trip_distance
PULocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.104895,5.670350,2.147483,1.645385
2,0.500000,5.030000,1.332500,10.280000
3,0.500000,1.368750,1.332500,9.281250
4,0.497843,1.401985,0.066277,2.877373
6,0.500000,0.000000,21.320000,36.700000
...,...,...,...,...
261,0.496182,1.193111,0.162796,3.924002
262,0.498785,1.318876,0.184178,2.736880
263,0.499182,1.282391,0.136376,2.596841
264,0.487009,1.526466,0.239003,2.742721


Unnamed: 0,name,type,n_calls,duration (msec)
0,read_parquet_with_metadata,kernel,1,156.751307
1,groupby_agg,kernel,1,49.29596
2,DataFrame._repr_html_,fallback,1,2.230732
3,read_parquet_metadata,kernel,1,1.815388
4,to_pandas.frame.metadata,kernel,1,1.339083
5,concat,kernel,1,0.048801
6,slice,kernel,2,0.02396
7,getattr:_repr_html_,fallback,1,0.00525
8,get_shape,kernel,2,0.00191


In [14]:
%%fireducks.profile
# the cell magic adds a per-operation timing table after the query result
fireducks_query("DOLocationID")  # exact same pandas code, but much faster

Code block 'Overall execution for DOLocationID using fireducks.pandas' took: 0.23712 s


Unnamed: 0_level_0,mean_mta_tax,mean_tip_amount,mean_tolls_amount,mean_trip_distance
DOLocationID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.012980,6.649897,12.258724,17.091866
2,0.500000,3.894000,3.198000,13.406000
3,0.500000,3.138618,1.753171,14.017805
4,0.499095,1.294488,0.057856,2.517735
5,0.500000,1.992222,12.678889,26.704444
...,...,...,...,...
261,0.499016,1.467040,0.137750,4.153200
262,0.499334,1.409528,0.271510,2.807141
263,0.499432,1.379754,0.211886,2.671808
264,0.485516,1.449961,0.269229,2.553356


Unnamed: 0,name,type,n_calls,duration (msec)
0,read_parquet_with_metadata,kernel,1,178.157815
1,groupby_agg,kernel,1,49.760924
2,DataFrame._repr_html_,fallback,1,2.696758
3,read_parquet_metadata,kernel,1,2.039001
4,to_pandas.frame.metadata,kernel,1,1.366834
5,concat,kernel,1,0.02013
6,slice,kernel,2,0.01871
7,getattr:_repr_html_,fallback,1,0.00575
8,get_shape,kernel,2,0.00174


# Polars

In [15]:
# defining query to be performed for Polars DataFrame (a little different API from pandas query)

from linetimer import CodeTimer

def polars_query(key):
  """Time the benchmark aggregation expressed with the Polars lazy API.

  Builds a lazy scan of "taxi.parquet" grouped by ``key`` with the mean of
  mta_tax, tip_amount, tolls_amount and trip_distance, executes it through
  ``profile()``, and prints the per-node profile with an extra
  "duration(msec)" column. CodeTimer reports the overall time in seconds.

  Args:
    key: name of the column to group by (e.g. "PULocationID").

  Returns:
    The materialized result frame returned by ``profile()``.
  """
  with CodeTimer(name=f"Overall execution for {key} using {pl.__name__}", unit="s"):
    # output column -> source column whose mean is computed
    mean_columns = {
        "mean_mta_tax": "mta_tax",
        "mean_tip_amount": "tip_amount",
        "mean_tolls_amount": "tolls_amount",
        "mean_trip_distance": "trip_distance",
    }
    aggregations = [pl.mean(source).alias(output) for output, source in mean_columns.items()]
    lazy_plan = pl.scan_parquet("taxi.parquet").group_by(key).agg(aggregations)

    # profile() runs the plan and returns (result, per-node timings)
    ret, prof = lazy_plan.profile()
    # (end - start) / 1e3 is shown under the "duration(msec)" label
    node_duration = (pl.col("end") - pl.col("start")) / 1e3
    print(prof.with_columns(node_duration.alias("duration(msec)")))
    return ret


In [16]:
import polars as pl
# record the Polars version used for this benchmark
pl.__version__

'1.19.0'

In [17]:
polars_query("PULocationID")  # different API, and a little slower than FireDucks

shape: (3, 4)
┌─────────────────────────────────┬────────┬─────────┬────────────────┐
│ node                            ┆ start  ┆ end     ┆ duration(msec) │
│ ---                             ┆ ---    ┆ ---     ┆ ---            │
│ str                             ┆ u64    ┆ u64     ┆ f64            │
╞═════════════════════════════════╪════════╪═════════╪════════════════╡
│ optimization                    ┆ 0      ┆ 8       ┆ 0.008          │
│ parquet(taxi.parquet)           ┆ 8      ┆ 271709  ┆ 271.701        │
│ group_by_partitioned(PULocatio… ┆ 271733 ┆ 1105267 ┆ 833.534        │
└─────────────────────────────────┴────────┴─────────┴────────────────┘
Code block 'Overall execution for PULocationID using polars' took: 1.11330 s


PULocationID,mean_mta_tax,mean_tip_amount,mean_tolls_amount,mean_trip_distance
i64,f64,f64,f64,f64
134,0.481651,1.662844,0.391193,4.269083
3,0.5,1.36875,1.3325,9.28125
265,0.258384,5.619169,2.060236,2.776486
137,0.498752,1.248832,0.178264,2.317984
6,0.5,0.0,21.32,36.7
…,…,…,…,…
125,0.495782,1.379642,0.130522,2.659686
128,0.5,0.0,0.0,1.947778
259,0.5,0.05,0.25381,2.894286
262,0.498785,1.318876,0.184178,2.73688


In [18]:
polars_query("DOLocationID")  # different API, and a little slower than FireDucks

shape: (3, 4)
┌─────────────────────────────────┬───────┬────────┬────────────────┐
│ node                            ┆ start ┆ end    ┆ duration(msec) │
│ ---                             ┆ ---   ┆ ---    ┆ ---            │
│ str                             ┆ u64   ┆ u64    ┆ f64            │
╞═════════════════════════════════╪═══════╪════════╪════════════════╡
│ optimization                    ┆ 0     ┆ 1      ┆ 0.001          │
│ parquet(taxi.parquet)           ┆ 1     ┆ 62090  ┆ 62.089         │
│ group_by_partitioned(DOLocatio… ┆ 62096 ┆ 792385 ┆ 730.289        │
└─────────────────────────────────┴───────┴────────┴────────────────┘
Code block 'Overall execution for DOLocationID using polars' took: 0.79804 s


DOLocationID,mean_mta_tax,mean_tip_amount,mean_tolls_amount,mean_trip_distance
i64,f64,f64,f64,f64
3,0.5,3.138618,1.753171,14.017805
134,0.494242,2.659962,0.910499,8.775873
265,0.271957,6.414913,4.495978,12.248787
6,0.5,3.629737,10.659211,16.370263
137,0.499297,1.238587,0.185286,2.079024
…,…,…,…,…
259,0.498428,2.07239,0.871572,14.047893
125,0.498713,1.352669,0.081436,2.36063
128,0.5,4.161176,0.752471,9.422235
262,0.499334,1.409528,0.27151,2.807141
