In [1]:
!pip install polars

Collecting polars
  Downloading polars-1.27.1-cp39-abi3-win_amd64.whl (35.6 MB)
Installing collected packages: polars
Successfully installed polars-1.27.1


You should consider upgrading via the 'C:\Users\DENNI\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
#### 1. Generate the example dataset (20,000,000 rows) ####
import numpy as np

n = 20_000_000
np.random.seed(0)  # fixed seed so every run draws the same sequence of values

# Two columns: a categorical group label and a numeric value
group_labels = ["A", "B", "C", "D", "E"]
groups = np.random.choice(group_labels, size=n)
values = np.random.randn(n)

In [3]:
#### 2. With pandas ####
import time

import pandas as pd

# Assemble the pandas DataFrame from the two generated arrays
df_pd = pd.DataFrame(dict(group=groups, value=values))

In [4]:
# Display the pandas DataFrame (the rendered table is truncated to its first and last rows)
df_pd

Unnamed: 0,group,value
0,E,-0.787968
1,A,-0.891982
2,D,-0.281685
3,D,1.770646
4,D,1.023282
...,...,...
19999995,A,-0.571438
19999996,B,-0.873468
19999997,E,-1.245281
19999998,E,0.546941


In [5]:
# Time the filter + group-by pipeline in pandas.
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
filtered_pd = df_pd[df_pd["value"] > 0]  # keep only the rows with a positive value
grouped_pd = filtered_pd.groupby("group")["value"].mean()  # mean value per group
t_pd = time.perf_counter() - start
print(f"Pandas total time: {t_pd:.2f} s")


Pandas total time: 0.71 s


In [6]:
# Per-group mean of the positive values, as computed by pandas
grouped_pd

group
A    0.797175
B    0.797904
C    0.798409
D    0.797725
E    0.796575
Name: value, dtype: float64

In [7]:
# Rows of df_pd whose value is greater than 0 (original row index is kept)
filtered_pd

Unnamed: 0,group,value
3,D,1.770646
4,D,1.023282
5,B,0.232165
6,D,0.078764
7,C,1.424839
...,...,...
19999988,C,0.659023
19999991,C,2.170404
19999993,A,0.951577
19999998,E,0.546941


In [8]:
## ------------------------------

In [9]:
#### 3. With polars (lazy) ####
import polars as pl


In [10]:
# 3. Polars lazy: filter + group-by
df_pl = pl.from_pandas(df_pd)

# Only the query is described here; it is executed later via lazy_pl.collect().
is_positive = pl.col("value") > 0
mean_value = pl.col("value").mean()
lazy_pl = df_pl.lazy().filter(is_positive).group_by("group").agg(mean_value)



In [11]:
# Time the execution of the lazy polars query; the timer wraps only collect().
# time.perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
grouped_pl = lazy_pl.collect()
t_pl = time.perf_counter() - start
print(f"Polars total time: {t_pl:.2f} s")

Polars total time: 0.21 s


In [12]:
# Display the polars DataFrame that was converted from df_pd
df_pl

group,value
str,f64
"""E""",-0.787968
"""A""",-0.891982
"""D""",-0.281685
"""D""",1.770646
"""D""",1.023282
…,…
"""A""",-0.571438
"""B""",-0.873468
"""E""",-1.245281
"""E""",0.546941


In [13]:
# Per-group mean of the positive values, as computed by polars (compare with grouped_pd)
grouped_pl

group,value
str,f64
"""C""",0.798409
"""B""",0.797904
"""A""",0.797175
"""D""",0.797725
"""E""",0.796575
