In [1]:
import polars as pl
import pandas as pa
import numpy as np
import time

In [2]:
# Build a small two-column demo frame (integer ids and string names)
# and show its text representation.
df = pl.DataFrame(
    dict(
        id=[1, 2, 3, 4, 5],
        name=["Alice", "Bob", "Charlie", "Diana", "Evan"],
    )
)

print(df)

shape: (5, 2)
┌─────┬─────────┐
│ id  ┆ name    │
│ --- ┆ ---     │
│ i64 ┆ str     │
╞═════╪═════════╡
│ 1   ┆ Alice   │
│ 2   ┆ Bob     │
│ 3   ┆ Charlie │
│ 4   ┆ Diana   │
│ 5   ┆ Evan    │
└─────┴─────────┘


## Data

## - dataset_with_lots_of_cols.parquet -> 2.5M rows x 103 cols -> 335 MB
## - dataset_with_lots_of_rows.parquet -> 25M rows x 4 cols -> 167 MB
## - books_rating.csv -> 3M rows x 10 cols -> 2.86 GB
## - used_cars.csv -> 3M rows x 66 cols -> 10 GB

## Generate Data

In [3]:
#import pandas as pa
#import numpy as np

# lots of columns dataset
# n_rows = 2_500_000
# n_cols = 100  # (103 in total)

# lots of rows dataset
# NOTE(review): the "Data" section lists this file as 25M rows — confirm which figure is current.
# n_rows = 20_000_000
# n_cols = 1  # (4 in total)

#data = {f"col{i}": np.random.randint(0, 1000, n_rows) for i in range(n_cols)}
#data["country"] = np.random.choice(["USA", "Canada", "Costa Rica", "Brazil"], n_rows)
#data["city"] = np.random.choice(["NY", "Toronto", "San Jose", "Rio"], n_rows)
#data["subscription_date"] = pa.date_range("2020-01-01", periods=n_rows, freq="s")

#df = pa.DataFrame(data)
#df.to_parquet("dataset_with_lots_of_rows.parquet")

In [6]:
# Relative path of the Parquet file read by the pandas and Polars benchmark cells.
dataset = "Parquet/dataset_with_lots_of_rows.parquet"

## PANDAS

In [7]:
import pandas as pa
import time

# pandas baseline: read the whole Parquet file, keep the "Costa Rica" rows,
# then report the earliest subscription_date per city, sorted by city.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for measuring short durations; time.time() follows the system
# clock and can jump if that clock is adjusted.
start = time.perf_counter()

df = pa.read_parquet(dataset)
df_filtered = df[df["country"] == "Costa Rica"]
df_selected = df_filtered[["city", "subscription_date"]]
df_agg = df_selected.groupby("city", as_index=False).agg({"subscription_date": "min"})
result = df_agg.sort_values("city")

print(result)
# The elapsed time covers the read, the transformations and the print above.
exec_time_pandas = time.perf_counter() - start
print("PANDAS: %.4f seconds" % exec_time_pandas)
# NOTE(review): this binds `results`, which no visible cell reads; it may have
# been meant to clear `result` — confirm before changing.
results = None

       city   subscription_date
0        NY 2020-01-01 00:00:59
1       Rio 2020-01-01 00:00:39
2  San Jose 2020-01-01 00:00:02
3   Toronto 2020-01-01 00:00:13
PANDAS: 2.9563 seconds


## EAGER

In [8]:
import polars as pl
import time

# Polars eager benchmark, written as one method chain: read the Parquet file,
# keep the "Costa Rica" rows, then take the earliest subscription_date per
# city, sorted by city.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for measuring short durations.
start = time.perf_counter()
result = (
    pl.read_parquet(dataset)
      .filter(pl.col("country") == "Costa Rica")
      .select(["city", "subscription_date"])
      .group_by("city")
      .agg(pl.min("subscription_date"))
      .sort("city")
)
print(result)

# The elapsed time covers the read, the query and the print above.
exec_time_eager = time.perf_counter() - start
print("POLARS EAGER: %.4f seconds" % exec_time_eager)
# NOTE(review): this binds `results`, which no visible cell reads; it may have
# been meant to clear `result` — confirm before changing.
results = None

shape: (4, 2)
┌──────────┬─────────────────────┐
│ city     ┆ subscription_date   │
│ ---      ┆ ---                 │
│ str      ┆ datetime[ns]        │
╞══════════╪═════════════════════╡
│ NY       ┆ 2020-01-01 00:00:59 │
│ Rio      ┆ 2020-01-01 00:00:39 │
│ San Jose ┆ 2020-01-01 00:00:02 │
│ Toronto  ┆ 2020-01-01 00:00:13 │
└──────────┴─────────────────────┘
POLARS EAGER: 0.6175 seconds


## EAGER SPLIT

In [9]:
import polars as pl
import time

# Polars eager benchmark with every step bound to its own variable instead of
# one method chain; the operations are the same as in the chained version.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for measuring short durations.
start = time.perf_counter()
df = pl.read_parquet(dataset)   # eager read: every column of the file is loaded, not just the ones used below
df_filtered = df.filter(pl.col("country") == "Costa Rica")
df_selected = df_filtered.select(["city", "subscription_date"])
df_agg = df_selected.group_by("city").agg(pl.min("subscription_date"))
df_sorted = df_agg.sort("city")
print(df_sorted)

# The elapsed time covers the read, the transformations and the print above.
exec_time_eager_split = time.perf_counter() - start
print("POLARS EAGER SPLIT: %.4f seconds" % exec_time_eager_split)
# NOTE(review): this binds `results`, which no visible cell reads — confirm
# whether it was meant to release one of the frames above.
results = None

shape: (4, 2)
┌──────────┬─────────────────────┐
│ city     ┆ subscription_date   │
│ ---      ┆ ---                 │
│ str      ┆ datetime[ns]        │
╞══════════╪═════════════════════╡
│ NY       ┆ 2020-01-01 00:00:59 │
│ Rio      ┆ 2020-01-01 00:00:39 │
│ San Jose ┆ 2020-01-01 00:00:02 │
│ Toronto  ┆ 2020-01-01 00:00:13 │
└──────────┴─────────────────────┘
POLARS EAGER SPLIT: 0.4749 seconds


## LAZY

In [10]:
import polars as pl
import time

# Polars lazy benchmark: the query is only described here and executes when
# collect() is called, so the optimizer sees the whole plan first.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for measuring short durations.
start = time.perf_counter()

q = (
    pl.scan_parquet(dataset)   # lazy reader
      .filter(pl.col("country") == "Costa Rica")
      .select(["city", "subscription_date"])   # projection pushdown
      .group_by("city")
      .agg(pl.min("subscription_date"))
      .sort("city")
)

result_lazy = q.collect()
print(result_lazy)
# Stop the clock before explain(): rendering the plan is extra work that the
# other benchmark cells do not perform, so it must not count towards the
# lazy timing.
exec_time_lazy = time.perf_counter() - start
print(q.explain())
print("POLARS LAZY: %.4f seconds" % exec_time_lazy)
# NOTE(review): this binds `results`, which no visible cell reads; it may have
# been meant to clear `result_lazy` — confirm before changing.
results = None

shape: (4, 2)
┌──────────┬─────────────────────┐
│ city     ┆ subscription_date   │
│ ---      ┆ ---                 │
│ str      ┆ datetime[ns]        │
╞══════════╪═════════════════════╡
│ NY       ┆ 2020-01-01 00:00:59 │
│ Rio      ┆ 2020-01-01 00:00:39 │
│ San Jose ┆ 2020-01-01 00:00:02 │
│ Toronto  ┆ 2020-01-01 00:00:13 │
└──────────┴─────────────────────┘
SORT BY [col("city")]
  AGGREGATE[maintain_order: false]
    [col("subscription_date").min()] BY [col("city")]
    FROM
    simple π 2/2 ["city", "subscription_date"]
      Parquet SCAN [Parquet/dataset_with_lots_of_rows.parquet]
      PROJECT 3/4 COLUMNS
      SELECTION: [(col("country")) == ("Costa Rica")]
POLARS LAZY: 0.1663 seconds


## Compare Execution Time

In [11]:
# Gather the four timings recorded by the benchmark cells and print them
# together, one per line, in the order the benchmarks were run.
timing_report = [
    "PANDAS: %.4f seconds" % exec_time_pandas,
    "POLARS EAGER: %.4f seconds" % exec_time_eager,
    "POLARS EAGER SPLIT: %.4f seconds" % exec_time_eager_split,
    "POLARS LAZY: %.4f seconds" % exec_time_lazy,
]
print("\n".join(timing_report))

PANDAS: 2.9563 seconds
POLARS EAGER: 0.6175 seconds
POLARS EAGER SPLIT: 0.4749 seconds
POLARS LAZY: 0.1663 seconds


## Specify schema to catch type errors faster

In [1]:
import polars as pl
import time

# Explicit dtype for each column of CSV/books_rating.csv, passed to scan_csv
# in the "WITH SCHEMA SPECIFIED" run.
schema = {
    "Id": pl.String,
    "Title": pl.String,
    "Price": pl.Float32,
    "User_id": pl.String,
    "profileName": pl.String,
    "review/helpfulness": pl.String,
    "review/score": pl.Float32,
    "review/time": pl.Datetime,
    "review/summary": pl.String,
    "review/text": pl.String,
}


def time_until_compute_error(label, schema=None):
    """Run the failing query and print how long it took to raise.

    The query scans CSV/books_rating.csv lazily and applies ``.dt.date()`` to
    the "Title" column, which the schema in this cell declares as String.
    This appears to be the deliberate type error that the section heading
    refers to.

    Parameters
    ----------
    label : str
        Prefix for the printed timing line.
    schema : dict or None
        Column-name -> dtype mapping forwarded to ``pl.scan_csv``. ``None``
        (scan_csv's own default) leaves the dtypes to be inferred.

    Notes
    -----
    The timing is printed only when ``pl.exceptions.ComputeError`` is raised.
    Any other exception propagates, and nothing is printed if the query
    succeeds.
    """
    # perf_counter() is monotonic and intended for measuring short durations.
    start = time.perf_counter()
    try:
        q = (
            pl.scan_csv("CSV/books_rating.csv", schema=schema)
            .with_columns(pl.col("Title").dt.date().alias("date"))
        )
        q.collect()
    except pl.exceptions.ComputeError:
        print("%s: %.4f seconds" % (label, time.perf_counter() - start))


time_until_compute_error("WITH SCHEMA SPECIFIED", schema=schema)
time_until_compute_error("WITHOUT SCHEMA SPECIFIED")


WITH SCHEMA SPECIFIED: 0.4681 seconds
WITHOUT SCHEMA SPECIFIED: 3.9672 seconds


## Streaming Mode

In [None]:
import polars as pl
import time

# Lazily scan the used-cars CSV and collect it with the streaming engine.
# ignore_errors=True is passed to the reader. The printed duration covers the
# scan, the collect and the print of the resulting frame.
start = time.time()

q = pl.scan_csv("CSV/used_cars.csv", ignore_errors=True)
result_lazy = q.collect(engine="streaming")

print(result_lazy)
print("LAZY: %.4f seconds" % (time.time() - start))