In [None]:
# dataset taken from https://www.kaggle.com/datasets/svaningelgem/crypto-currencies-daily-prices

In [65]:
# importing libraries
import pandas as pd
import polars as pl
import numpy as np
import time
import os
from datetime import datetime

In [None]:
# Load every CSV of the dataset into three frames:
#   * crypto_df_arrow  - pandas, CSV parsed with the pyarrow engine
#   * crypto_df_polars - polars
#   * crypto_df_pandas - pandas, CSV parsed with the default engine
#
# NOTE: `engine='pyarrow'` only selects the CSV *parser*. The resulting pandas
# frame still uses the regular NumPy-backed dtypes unless
# `dtype_backend='pyarrow'` is passed as well.

# Directory holding the CSV files. Override with the CRYPTO_DATA_DIR
# environment variable to run the notebook on another machine.
DIR_PATH = os.environ.get(
    "CRYPTO_DATA_DIR",
    r"C:\Users\edvar\Documents\jupyter\pandas_polars\crypto_kaggle\\",
)

# Sorted so the row order is reproducible (os.listdir order is arbitrary);
# restricted to CSV files so stray entries in the directory are skipped.
files = sorted(f for f in os.listdir(DIR_PATH) if f.lower().endswith('.csv'))
if not files:
    raise FileNotFoundError(f"No CSV files found in {DIR_PATH!r}")

# The whole dataset is stacked N_REPEATS times. Nothing is timed in this cell;
# the repetition only makes the frames larger so the benchmarks further down
# operate on more rows. As a consequence each ticker's series appears
# N_REPEATS times in every frame.
N_REPEATS = 5

# Collect the per-file frames first and concatenate once at the end: growing a
# frame with concat inside the loop re-copies all previous rows on every
# iteration (quadratic), and seeding polars with a column-less DataFrame is
# not a valid input for a vertical concat in every polars version.
arrow_parts, polars_parts, pandas_parts = [], [], []
for _ in range(N_REPEATS):
    for file in files:
        csv_path = os.path.join(DIR_PATH, file)
        arrow_parts.append(pd.read_csv(csv_path, engine='pyarrow'))
        polars_parts.append(pl.read_csv(csv_path))
        pandas_parts.append(pd.read_csv(csv_path))

crypto_df_arrow = pd.concat(arrow_parts, ignore_index=True)
crypto_df_polars = pl.concat(polars_parts, how='vertical')
crypto_df_pandas = pd.concat(pandas_parts, ignore_index=True)

# Parse the 'date' column into real datetimes in all three frames.
crypto_df_arrow['date'] = pd.to_datetime(crypto_df_arrow['date'])
crypto_df_pandas['date'] = pd.to_datetime(crypto_df_pandas['date'])
crypto_df_polars = crypto_df_polars.with_columns(
    pl.col('date').str.strptime(pl.Datetime, "%Y-%m-%d")
)

In [57]:
# Preview the first five rows of the pandas frame that was parsed with the pyarrow CSV engine.
crypto_df_arrow.head()

Unnamed: 0,ticker,date,open,high,low,close
0,1INCH,2021-01-08,1.3,1.357,1.143,1.2
1,1INCH,2021-01-09,1.2,1.34,1.114,1.244
2,1INCH,2021-01-10,1.244,1.547,1.1,1.224
3,1INCH,2021-01-11,1.224,1.224,0.9543,1.127
4,1INCH,2021-01-12,1.127,1.21,1.049,1.117


In [78]:
# Show column dtypes, non-null counts and the (shallow) memory footprint of the frame.
crypto_df_arrow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 904720 entries, 0 to 904719
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   ticker  904720 non-null  object        
 1   date    904720 non-null  datetime64[ns]
 2   open    904720 non-null  float64       
 3   high    904720 non-null  float64       
 4   low     904720 non-null  float64       
 5   close   904720 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 41.4+ MB


In [79]:
# Summary statistics for the date and price columns (count, mean, min, quartiles, max, std).
crypto_df_arrow.describe()

Unnamed: 0,date,open,high,low,close
count,904720,904720.0,904720.0,904720.0,904720.0
mean,2022-06-14 20:33:08.165177344,688.2484,710.2264,668.8257,688.5276
min,2010-07-17 00:00:00,3.0196e-07,3.03688e-07,2.99255e-07,3.0256e-07
25%,2020-12-02 00:00:00,0.2306258,0.240286,0.2207243,0.2305295
50%,2022-11-18 00:00:00,1.83,1.923085,1.74123,1.82928
75%,2024-06-05 00:00:00,19.53993,20.4145,18.67125,19.53
max,2026-02-13 00:00:00,125172.7,749251.0,122812.5,125173.0
std,,5784.678,6120.215,5647.717,5785.805


In [81]:
def benchmark(operation, pd_func, pl_func):
    """Time one pandas callable against one polars callable and print the result.

    Each callable is invoked exactly once (pandas first, then polars) and its
    return value is discarded; only the elapsed time is kept.

    Args:
        operation: Label printed at the start of the report line.
        pd_func: Zero-argument callable that runs the pandas version.
        pl_func: Zero-argument callable that runs the polars version.

    Returns:
        None. Prints a line such as
        ``"<operation>: Pandas 0.123s, Polars 0.045s (2.7x)"`` where the
        factor in parentheses is pandas time divided by polars time.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; time.time() is a wall clock whose coarse resolution on
    # some platforms can make a fast operation appear to take exactly 0s.
    start = time.perf_counter()
    pd_func()
    pd_time = time.perf_counter() - start

    start = time.perf_counter()
    pl_func()
    pl_time = time.perf_counter() - start

    # Guard against a zero polars duration instead of raising ZeroDivisionError.
    ratio = pd_time / pl_time if pl_time > 0 else float("inf")
    print(f"{operation}: Pandas {pd_time:.3f}s, Polars {pl_time:.3f}s ({ratio:.1f}x)")

In [82]:
# Preview the first five rows of the polars frame for comparison with the pandas preview.
crypto_df_polars.head()

ticker,date,open,high,low,close
str,datetime[μs],f64,f64,f64,f64
"""1INCH""",2021-01-08 00:00:00,1.3,1.357,1.143,1.2
"""1INCH""",2021-01-09 00:00:00,1.2,1.34,1.114,1.244
"""1INCH""",2021-01-10 00:00:00,1.244,1.547,1.1,1.224
"""1INCH""",2021-01-11 00:00:00,1.224,1.224,0.9543,1.127
"""1INCH""",2021-01-12 00:00:00,1.127,1.21,1.049,1.117


In [87]:
# Benchmarks: pandas frame read with the default CSV parser vs. polars.

# Lower bound for the "2025+" filter. The 'date' column of both frames was
# parsed to datetimes in the loading cell, so it compares directly against a
# Python datetime.
cutoff_2025 = datetime(2025, 1, 1)

# Row filter: BTC rows dated 2025-01-01 or later (matches the printed label).
benchmark("Filter BTC 2025+",
          lambda: crypto_df_pandas[(crypto_df_pandas['ticker'] == 'BTC')
                                   & (crypto_df_pandas['date'] >= cutoff_2025)],
          lambda: crypto_df_polars.filter((pl.col('ticker') == 'BTC')
                                          & (pl.col('date') >= cutoff_2025)))

# Group-by aggregation: mean and max close per ticker.
benchmark("Ticker stats",
          lambda: crypto_df_pandas.groupby('ticker')['close'].agg(['mean', 'max']),
          lambda: crypto_df_polars.group_by('ticker').agg([
              pl.col('close').mean().alias('avg_close'),
              pl.col('close').max().alias('max_close')]))

# Per-ticker percentage change of the close price, stored as a 'returns'
# column on both frames (computed outside the timed section).
crypto_df_pandas['returns'] = crypto_df_pandas.groupby('ticker')['close'].pct_change()
crypto_df_polars = crypto_df_polars.with_columns(
    pl.col('close').pct_change().over('ticker').alias('returns'))

# Row filter on the derived column: returns above 1%.
benchmark("Returns >1%",
          lambda: crypto_df_pandas[crypto_df_pandas['returns'] > 0.01],
          lambda: crypto_df_polars.filter(pl.col('returns') > 0.01))

Filter BTC 2025+: Pandas 0.155s, Polars 0.028s (5.5x)
Ticker stats: Pandas 0.158s, Polars 0.078s (2.0x)
Returns >1%: Pandas 0.048s, Polars 0.031s (1.6x)


In [88]:
# Benchmarks: pandas frame read with the pyarrow CSV parser vs. polars.

# Lower bound for the "2025+" filter. The 'date' column of both frames was
# parsed to datetimes in the loading cell, so it compares directly against a
# Python datetime.
cutoff_2025 = datetime(2025, 1, 1)

# Row filter: BTC rows dated 2025-01-01 or later (matches the printed label).
benchmark("Filter BTC 2025+",
          lambda: crypto_df_arrow[(crypto_df_arrow['ticker'] == 'BTC')
                                  & (crypto_df_arrow['date'] >= cutoff_2025)],
          lambda: crypto_df_polars.filter((pl.col('ticker') == 'BTC')
                                          & (pl.col('date') >= cutoff_2025)))

# Group-by aggregation: mean and max close per ticker.
benchmark("Ticker stats",
          lambda: crypto_df_arrow.groupby('ticker')['close'].agg(['mean', 'max']),
          lambda: crypto_df_polars.group_by('ticker').agg([
              pl.col('close').mean().alias('avg_close'),
              pl.col('close').max().alias('max_close')]))

# Per-ticker percentage change of the close price, stored as a 'returns'
# column on both frames (computed outside the timed section).
crypto_df_arrow['returns'] = crypto_df_arrow.groupby('ticker')['close'].pct_change()
crypto_df_polars = crypto_df_polars.with_columns(
    pl.col('close').pct_change().over('ticker').alias('returns'))

# Row filter on the derived column: returns above 1%.
benchmark("Returns >1%",
          lambda: crypto_df_arrow[crypto_df_arrow['returns'] > 0.01],
          lambda: crypto_df_polars.filter(pl.col('returns') > 0.01))

Filter BTC 2025+: Pandas 0.222s, Polars 0.043s (5.1x)
Ticker stats: Pandas 0.192s, Polars 0.074s (2.6x)
Returns >1%: Pandas 0.104s, Polars 0.036s (2.9x)


In [91]:
# Report the in-memory size of each frame in megabytes (bytes / 1e6).
# pandas sizes use a deep measurement so string contents are counted;
# the polars figure is the library's own size estimate.
numpy_mb = crypto_df_pandas.memory_usage(deep=True).sum() / 1e6
arrow_mb = crypto_df_arrow.memory_usage(deep=True).sum() / 1e6
polars_mb = crypto_df_polars.estimated_size() / 1e6

print(f"NumPy memory: {numpy_mb:.1f} MB")
print(f"Arrow memory: {arrow_mb:.1f} MB")
print(f"Polars:          {polars_mb:.1f} MB")

NumPy memory: 90.8 MB
Arrow memory: 90.8 MB
Polars:          46.6 MB


In [None]:
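# `engine='pyarrow'` in `pd.read_csv` only switches the CSV parser; the resulting frame still uses NumPy-backed dtypes (the ticker column is stored as `object`), which is why the "Arrow" and "NumPy" frames report the same memory usage. Arrow-backed columns, with their more compact string storage, would additionally require `dtype_backend='pyarrow'`.
# Polars is more memory efficient than both here, because
# a) it stores columns in the Apache Arrow format under the hood, so strings live in compact contiguous buffers instead of one Python object per value;
# b) polars is written in Rust, so its data never carries per-value Python object overhead.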