In [None]:
!pip install polars

In [None]:
import pandas as pd
import polars as pl
import numpy as np
import time

## Large file read

In [None]:
# our best effort with pandas
%timeit pd.read_csv("yellow_tripdata_2020-01.csv.gz", dtype={"PULocationID": np.uint8,"DOLocationID": np.uint8}, parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"], converters={"VendorID": lambda x: np.int8(["", "1", "2"].index(x)),"store_and_fwd_flag": lambda x: ["", "N", "Y"].index(x) - 1,"payment_type": lambda x: -1 if x == "" else int(x),"RatecodeID": lambda x: -1 if x == "" else int(x),"passenger_count": lambda x: -1 if x == "" else int(x)})

In [None]:
# naive polars
%timeit pl.read_csv("yellow_tripdata_2020-01.csv.gz")

## Benchmarking

#### Create DataFrame

In [None]:
# Seed NumPy's global RNG so the synthetic benchmark data is identical on
# every Restart & Run All.
SEED = 42
np.random.seed(SEED)

N = 10_000_000  # number of rows in each benchmark frame
data = {
    # Integers drawn from [0, 100).
    "A": np.random.randint(0, 100, N),
    # Standard-normal samples scaled by 50 and shifted by 100.
    "B": np.random.randn(N) * 50 + 100,
    # Labels sampled from "X", "Y", "Z".
    "C": np.random.choice(["X", "Y", "Z"], N),
}

In [None]:
# Create Pandas DataFrame from the column dict and time the construction.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which is what a short benchmark needs.
start = time.perf_counter()
df_pandas = pd.DataFrame(data)
end = time.perf_counter()
print(f"pandas dataframe takes {end-start} secs. to create")
df_pandas.head()

In [None]:
# Create Polars DataFrame from the same column dict and time the construction.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which is what a short benchmark needs.
start = time.perf_counter()
df_polars = pl.DataFrame(data)
end = time.perf_counter()
print(f"polars dataframe takes {end-start} secs. to create")
df_polars.head()

#### Filtering

In [None]:
# Pandas: keep the rows where column "A" is greater than 50 (boolean mask).
# Timed with time.perf_counter(), the monotonic high-resolution clock meant
# for measuring short durations (time.time() is wall-clock and can jump).
start = time.perf_counter()
filtered_pandas = df_pandas[df_pandas["A"] > 50]
end = time.perf_counter()
print(f"Pandas filtering time: {end - start:.4f} sec")

In [None]:
# Polars: keep the rows where column "A" is greater than 50 (expression filter).
# Timed with time.perf_counter(), the monotonic high-resolution clock meant
# for measuring short durations (time.time() is wall-clock and can jump).
start = time.perf_counter()
filtered_polars = df_polars.filter(pl.col("A") > 50)
end = time.perf_counter()
print(f"Polars filtering time: {end - start:.4f} sec")

#### GroupBy and Aggregation

In [None]:
# Pandas: mean of column "B" within each group of column "C".
# Timed with time.perf_counter(), the monotonic high-resolution clock meant
# for measuring short durations (time.time() is wall-clock and can jump).
start = time.perf_counter()
grouped_pandas = df_pandas.groupby("C")["B"].mean()
end = time.perf_counter()
print(f"Pandas groupby time: {end - start:.4f} sec")

In [None]:
# Polars: mean of column "B" within each group of column "C".
# Timed with time.perf_counter(), the monotonic high-resolution clock meant
# for measuring short durations (time.time() is wall-clock and can jump).
start = time.perf_counter()
grouped_polars = df_polars.group_by("C").agg(pl.col("B").mean())
end = time.perf_counter()
print(f"Polars groupby time: {end - start:.4f} sec")

#### Column Operations

In [None]:
# Pandas: add an "A_squared" column holding the element-wise square of "A".
# Note this assigns the new column onto df_pandas itself.
# Timed with time.perf_counter(), the monotonic high-resolution clock meant
# for measuring short durations (time.time() is wall-clock and can jump).
start = time.perf_counter()
df_pandas["A_squared"] = df_pandas["A"] ** 2
end = time.perf_counter()
print(f"Pandas column operation time: {end - start:.4f} sec")

In [None]:
# Polars: add an "A_squared" column holding the element-wise square of "A".
# with_columns returns a frame, which is rebound to the df_polars name.
# Timed with time.perf_counter(), the monotonic high-resolution clock meant
# for measuring short durations (time.time() is wall-clock and can jump).
start = time.perf_counter()
df_polars = df_polars.with_columns((pl.col("A") ** 2).alias("A_squared"))
end = time.perf_counter()
print(f"Polars column operation time: {end - start:.4f} sec")