In [206]:
import polars as pl
from datetime import datetime
import numpy as np


# Declarar

In [207]:
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
    }
)

df.write_csv("output.csv")

In [208]:
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [
            datetime(2025, 1, 1),
            datetime(2025, 1, 2),
            datetime(2025, 1, 3),
        ],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)

print(df)

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date                ┆ float ┆ string │
│ ---     ┆ ---                 ┆ ---   ┆ ---    │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str    │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      │
└─────────┴─────────────────────┴───────┴────────┘


# Grabar y leer

In [209]:
archivo = "output.csv"
df.write_csv(archivo)
df_csv = pl.read_csv(archivo)
print(df_csv)

shape: (3, 4)
┌─────────┬────────────────────────────┬───────┬────────┐
│ integer ┆ date                       ┆ float ┆ string │
│ ---     ┆ ---                        ┆ ---   ┆ ---    │
│ i64     ┆ str                        ┆ f64   ┆ str    │
╞═════════╪════════════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01T00:00:00.000000 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02T00:00:00.000000 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03T00:00:00.000000 ┆ 6.0   ┆ c      │
└─────────┴────────────────────────────┴───────┴────────┘


# Expresiones

* select
* filter
* with_columns
* group_by

## Select
Esto selecciona la o las columnas donde queremos la info, tiene dos partes, primero el df y después la o las columnas

In [210]:
df.select(pl.col("*"))

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""
2,2025-01-02 00:00:00,5.0,"""b"""
3,2025-01-03 00:00:00,6.0,"""c"""


In [211]:
df.select(pl.col("date", "string"))

date,string
datetime[μs],str
2025-01-01 00:00:00,"""a"""
2025-01-02 00:00:00,"""b"""
2025-01-03 00:00:00,"""c"""


## Filter
Crea un subconjunto del DF

In [212]:
df.filter(
    pl.col("date").is_between(datetime(2025, 1, 2), datetime(2025, 1, 3)),
)

integer,date,float,string
i64,datetime[μs],f64,str
2,2025-01-02 00:00:00,5.0,"""b"""
3,2025-01-03 00:00:00,6.0,"""c"""


In [213]:
df.filter((pl.col("date").is_between(datetime(2025, 1, 2), datetime(2025, 1, 3))) & (pl.col("float") > 5))

integer,date,float,string
i64,datetime[μs],f64,str
3,2025-01-03 00:00:00,6.0,"""c"""


## With Columns
Esto sirve para crear nuevas columnas para analizar. 

In [214]:
df.with_columns(pl.col("float").sum().alias("e"), (pl.col("float") + 42).alias("float+42"))

integer,date,float,string,e,float+42
i64,datetime[μs],f64,str,f64,f64
1,2025-01-01 00:00:00,4.0,"""a""",15.0,46.0
2,2025-01-02 00:00:00,5.0,"""b""",15.0,47.0
3,2025-01-03 00:00:00,6.0,"""c""",15.0,48.0


## Group by
Agrupa

In [215]:
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [216]:
df2.group_by("y", maintain_order=True).len()

y,len
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [217]:
df2.group_by("y", maintain_order=True).agg(
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum"),
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


## Combinación de operaciones

In [218]:
df = pl.DataFrame(
    {
        "a": range(5),
        "b":np.random.rand(5),
        "c": [
            datetime(2025, 12, 1),
            datetime(2025, 12, 2),
            datetime(2025, 12, 3),
            datetime(2025, 12, 4),
            datetime(2025, 12, 5),            
        ],
        "d": [1., 2., np.nan, -42, None]
    }
)

In [219]:
df

a,b,c,d
i64,f64,datetime[μs],f64
0,0.639316,2025-12-01 00:00:00,1.0
1,0.246934,2025-12-02 00:00:00,2.0
2,0.563067,2025-12-03 00:00:00,
3,0.765095,2025-12-04 00:00:00,-42.0
4,0.167124,2025-12-05 00:00:00,


In [220]:
df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    pl.all().exclude(["c", "d"])
)

print(df_x)

shape: (5, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i64 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.639316 ┆ 0.0      │
│ 1   ┆ 0.246934 ┆ 0.246934 │
│ 2   ┆ 0.563067 ┆ 1.126135 │
│ 3   ┆ 0.765095 ┆ 2.295286 │
│ 4   ┆ 0.167124 ┆ 0.668495 │
└─────┴──────────┴──────────┘


In [221]:
df_y = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    pl.all().exclude("d")
)

print(df_y)

shape: (5, 4)
┌─────┬──────────┬─────────────────────┬──────────┐
│ a   ┆ b        ┆ c                   ┆ a * b    │
│ --- ┆ ---      ┆ ---                 ┆ ---      │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64      │
╞═════╪══════════╪═════════════════════╪══════════╡
│ 0   ┆ 0.639316 ┆ 2025-12-01 00:00:00 ┆ 0.0      │
│ 1   ┆ 0.246934 ┆ 2025-12-02 00:00:00 ┆ 0.246934 │
│ 2   ┆ 0.563067 ┆ 2025-12-03 00:00:00 ┆ 1.126135 │
│ 3   ┆ 0.765095 ┆ 2025-12-04 00:00:00 ┆ 2.295286 │
│ 4   ┆ 0.167124 ┆ 2025-12-05 00:00:00 ┆ 0.668495 │
└─────┴──────────┴─────────────────────┴──────────┘


# Combinación de DataFrames

## Join

In [222]:
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
    }
)

df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

In [223]:
df

a,b,d
i64,f64,f64
0,0.908194,1.0
1,0.98036,2.0
2,0.948344,
3,0.89787,
4,0.964528,0.0
5,0.693571,-5.0
6,0.642649,-42.0
7,0.563708,


In [224]:
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [225]:
joined = df.join(df2, left_on="a", right_on="x")
print(joined)

shape: (8, 4)
┌─────┬──────────┬───────┬─────┐
│ a   ┆ b        ┆ d     ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ str │
╞═════╪══════════╪═══════╪═════╡
│ 0   ┆ 0.908194 ┆ 1.0   ┆ A   │
│ 1   ┆ 0.98036  ┆ 2.0   ┆ A   │
│ 2   ┆ 0.948344 ┆ NaN   ┆ A   │
│ 3   ┆ 0.89787  ┆ NaN   ┆ B   │
│ 4   ┆ 0.964528 ┆ 0.0   ┆ B   │
│ 5   ┆ 0.693571 ┆ -5.0  ┆ C   │
│ 6   ┆ 0.642649 ┆ -42.0 ┆ X   │
│ 7   ┆ 0.563708 ┆ null  ┆ X   │
└─────┴──────────┴───────┴─────┘


## Join

In [226]:
stacked = df.hstack(df2)
print(stacked)

shape: (8, 5)
┌─────┬──────────┬───────┬─────┬─────┐
│ a   ┆ b        ┆ d     ┆ x   ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ i64 ┆ str │
╞═════╪══════════╪═══════╪═════╪═════╡
│ 0   ┆ 0.908194 ┆ 1.0   ┆ 0   ┆ A   │
│ 1   ┆ 0.98036  ┆ 2.0   ┆ 1   ┆ A   │
│ 2   ┆ 0.948344 ┆ NaN   ┆ 2   ┆ A   │
│ 3   ┆ 0.89787  ┆ NaN   ┆ 3   ┆ B   │
│ 4   ┆ 0.964528 ┆ 0.0   ┆ 4   ┆ B   │
│ 5   ┆ 0.693571 ┆ -5.0  ┆ 5   ┆ C   │
│ 6   ┆ 0.642649 ┆ -42.0 ┆ 6   ┆ X   │
│ 7   ┆ 0.563708 ┆ null  ┆ 7   ┆ X   │
└─────┴──────────┴───────┴─────┴─────┘


# Tipos de datos
Polars se basa en tipos de datos de Arrow , son ene , hay una lista en https://docs.pola.rs/user-guide/concepts/data-types/overview/

# Categorical
Data en *string* que tiene número finito de valores. Esto sirve para ahorrar memoria, igual que en *pandas*. 
Existe un tipo Enum que se usa cuando se conoce de antemano las categorías. En Categorical es más flexible y se pueden ir agregando más categorías ex post, con mayor uso de memoria 

In [227]:
enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])
enum_series = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=enum_dtype)
cat_series = pl.Series(
    ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical
)

In [228]:
enum_dtype

Enum(categories=['Polar', 'Panda', 'Brown'])

In [229]:
cat_series

"""Polar"""
"""Panda"""
"""Brown"""
"""Brown"""
"""Polar"""


# Estructuras de datos
## Series
Igual que las series de pandas, tiene una dimensión y el mismo tipo de datos

In [230]:
import polars as pl

s = pl.Series("a", [1, 2, 3, 4, 5])
print(s)

shape: (5,)
Series: 'a' [i64]
[
	1
	2
	3
	4
	5
]


## DataFrame
Tiene 2 dimensiones, que por detrás son Series, como abstracción es una colección de series.

In [231]:
from datetime import datetime

df = pl.DataFrame(
    {
        "integer": [1, 2, 3, 4, 5],
        "date": [
            datetime(2022, 1, 1),
            datetime(2022, 1, 2),
            datetime(2022, 1, 3),
            datetime(2022, 1, 4),
            datetime(2022, 1, 5),
        ],
        "float": [4.0, 5.0, 6.0, 7.0, 8.0],
    }
)

print(df)

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


## Mirando datos
Igual que en pandas

In [232]:
print(df.head())

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [233]:
print(df.head(3))

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


In [234]:
print(df.tail())

shape: (5, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [235]:
print(df.tail(2))

shape: (2, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 4       ┆ 2022-01-04 00:00:00 ┆ 7.0   │
│ 5       ┆ 2022-01-05 00:00:00 ┆ 8.0   │
└─────────┴─────────────────────┴───────┘


In [236]:
df.sample()

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0


In [237]:
df.sample(3)

integer,date,float
i64,datetime[μs],f64
1,2022-01-01 00:00:00,4.0
3,2022-01-03 00:00:00,6.0
2,2022-01-02 00:00:00,5.0


In [238]:
df.describe()

statistic,integer,date,float
str,f64,str,f64
"""count""",5.0,"""5""",5.0
"""null_count""",0.0,"""0""",0.0
"""mean""",3.0,"""2022-01-03 00:00:00""",6.0
"""std""",1.581139,,1.581139
"""min""",1.0,"""2022-01-01 00:00:00""",4.0
"""25%""",2.0,"""2022-01-02 00:00:00""",5.0
"""50%""",3.0,"""2022-01-03 00:00:00""",6.0
"""75%""",4.0,"""2022-01-04 00:00:00""",7.0
"""max""",5.0,"""2022-01-05 00:00:00""",8.0


# Contextos
Para transformar y leer datos existen contextos y expresiones. Un contexto se refiere al contexto (?) en el cual la expresión debe ser evaluada. Existen 3 contextos principales:
* Selección: df.select, df.with_columns
* Filtro: df.filter()
* Agrupación: df.group_by(..).agg(...)

In [239]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.04727  ┆ A      │
│ 2    ┆ ham   ┆ 0.659535 ┆ A      │
│ 3    ┆ spam  ┆ 0.121668 ┆ B      │
│ null ┆ egg   ┆ 0.016698 ┆ C      │
│ 5    ┆ null  ┆ 0.046269 ┆ B      │
└──────┴───────┴──────────┴────────┘


## Selección
Expresiones sobre columnas, un select puede generar nuevas columnas

In [240]:
out = df.select(
    pl.sum("nrs"),
    pl.col("names").sort(),
    pl.col("names").first().alias("first name"),
    (pl.mean("nrs") * 10).alias("10xnrs"),
)
print(out)

shape: (5, 4)
┌─────┬───────┬────────────┬────────┐
│ nrs ┆ names ┆ first name ┆ 10xnrs │
│ --- ┆ ---   ┆ ---        ┆ ---    │
│ i64 ┆ str   ┆ str        ┆ f64    │
╞═════╪═══════╪════════════╪════════╡
│ 11  ┆ null  ┆ foo        ┆ 27.5   │
│ 11  ┆ egg   ┆ foo        ┆ 27.5   │
│ 11  ┆ foo   ┆ foo        ┆ 27.5   │
│ 11  ┆ ham   ┆ foo        ┆ 27.5   │
│ 11  ┆ spam  ┆ foo        ┆ 27.5   │
└─────┴───────┴────────────┴────────┘


with_Columns mantiene las columnas originales y agrega nuevas, mientras que select bota las originales

In [241]:
df = df.with_columns(
    pl.sum("nrs").alias("nrs_sum"),
    pl.col("random").count().alias("count"),
)
print(df)

shape: (5, 6)
┌──────┬───────┬──────────┬────────┬─────────┬───────┐
│ nrs  ┆ names ┆ random   ┆ groups ┆ nrs_sum ┆ count │
│ ---  ┆ ---   ┆ ---      ┆ ---    ┆ ---     ┆ ---   │
│ i64  ┆ str   ┆ f64      ┆ str    ┆ i64     ┆ u32   │
╞══════╪═══════╪══════════╪════════╪═════════╪═══════╡
│ 1    ┆ foo   ┆ 0.04727  ┆ A      ┆ 11      ┆ 5     │
│ 2    ┆ ham   ┆ 0.659535 ┆ A      ┆ 11      ┆ 5     │
│ 3    ┆ spam  ┆ 0.121668 ┆ B      ┆ 11      ┆ 5     │
│ null ┆ egg   ┆ 0.016698 ┆ C      ┆ 11      ┆ 5     │
│ 5    ┆ null  ┆ 0.046269 ┆ B      ┆ 11      ┆ 5     │
└──────┴───────┴──────────┴────────┴─────────┴───────┘


## Filtrado
Evalua sobre boolean

In [242]:
out = df.filter(pl.col("nrs") > 2)
print(out)

shape: (2, 6)
┌─────┬───────┬──────────┬────────┬─────────┬───────┐
│ nrs ┆ names ┆ random   ┆ groups ┆ nrs_sum ┆ count │
│ --- ┆ ---   ┆ ---      ┆ ---    ┆ ---     ┆ ---   │
│ i64 ┆ str   ┆ f64      ┆ str    ┆ i64     ┆ u32   │
╞═════╪═══════╪══════════╪════════╪═════════╪═══════╡
│ 3   ┆ spam  ┆ 0.121668 ┆ B      ┆ 11      ┆ 5     │
│ 5   ┆ null  ┆ 0.046269 ┆ B      ┆ 11      ┆ 5     │
└─────┴───────┴──────────┴────────┴─────────┴───────┘


## Agregación

In [243]:
out = df.group_by("groups").agg(
    pl.sum("nrs"),  # sum nrs by groups
    pl.col("random").count().alias("count"),  # count group members
    # sum random where name != null
    pl.col("random").filter(pl.col("names").is_not_null()).sum().name.suffix("_sum"),
    pl.col("names").reverse().alias("reversed names"),
)
print(out)

shape: (3, 5)
┌────────┬─────┬───────┬────────────┬────────────────┐
│ groups ┆ nrs ┆ count ┆ random_sum ┆ reversed names │
│ ---    ┆ --- ┆ ---   ┆ ---        ┆ ---            │
│ str    ┆ i64 ┆ u32   ┆ f64        ┆ list[str]      │
╞════════╪═════╪═══════╪════════════╪════════════════╡
│ A      ┆ 3   ┆ 2     ┆ 0.706806   ┆ ["ham", "foo"] │
│ C      ┆ 0   ┆ 1     ┆ 0.016698   ┆ ["egg"]        │
│ B      ┆ 8   ┆ 2     ┆ 0.121668   ┆ [null, "spam"] │
└────────┴─────┴───────┴────────────┴────────────────┘


# Expresiones
Sirve para hacer varias cosas
* Una muestra de columnas
* multiplicar valores en columna
* extrar columna de años en fechas
* convertir columna a mayúsculas

La siguiente expresión selecciona la columna foo, la ordena y toma los dos primeros datos


In [244]:
pl.col("foo").sort().head(2)

# API Floja / Ágil
Hay dos modos de operación: floja ( lazy) y ágil. En la API ágil, la consulta se ejecuta inmediatamente, mientras que en la API floja la consulta solo se evalúa cuando es ‘necesaria’. Postergar la ejecución hasta el último momento puede tener ventajas significativas en el rendimiento y es por eso que en la mayoría de los casos se prefiere la API floja.

In [245]:
iris_filename = "iris/iris.csv"
df = pl.read_csv(iris_filename)
df_small = df.filter(pl.col("sepal.length") > 5)
df_agg = df_small.group_by("variety").agg(pl.col("sepal.width").mean())
print(df_agg)

shape: (3, 2)
┌────────────┬─────────────┐
│ variety    ┆ sepal.width │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ Versicolor ┆ 2.804255    │
│ Setosa     ┆ 3.713636    │
│ Virginica  ┆ 2.983673    │
└────────────┴─────────────┘


En este ejemplo usamos la API ágil para:

	1.	Leer el conjunto de datos iris.
	2.	Filtrar el conjunto de datos en función de la longitud del sépalo.
	3.	Calcular la media del ancho del sépalo por especie.

Cada paso se ejecuta inmediatamente devolviendo los resultados intermedios. Esto puede ser muy ineficiente ya que podríamos estar realizando trabajo o cargando datos adicionales que no se están utilizando. Si en su lugar usamos la API perezosa y esperamos a la ejecución hasta que todos los pasos estén definidos, el planificador de consultas podría realizar varias optimizaciones

La ejecución la hace al llamar collect

In [246]:
q = (
    pl.scan_csv(iris_filename)
    .filter(pl.col("sepal.length") > 5)
    .group_by("variety")
    .agg(pl.col("sepal.width").mean())
)

df = q.collect()

In [247]:
df

variety,sepal.width
str,f64
"""Setosa""",3.713636
"""Versicolor""",2.804255
"""Virginica""",2.983673


# API de Streaming
Un beneficio adicional de la API floja es que permite que las consultas se ejecuten de manera continua (streaming). En lugar de procesar todos los datos de una vez, Polars puede ejecutar la consulta en lotes, lo que te permite procesar conjuntos de datos que son más grandes que la memoria disponible.

Para indicarle a Polars que queremos ejecutar una consulta en modo de streaming, pasamos el argumento streaming=True a la función collect.

Esto todavía está en desarrollo

In [248]:
q = (
    pl.scan_csv(iris_filename)
    .filter(pl.col("sepal.length") > 5)
    .group_by("variety")
    .agg(pl.col("sepal.width").mean())
)

df = q.collect(streaming=True)

In [249]:
df

variety,sepal.width
str,f64
"""Virginica""",2.983673
"""Versicolor""",2.804255
"""Setosa""",3.713636


In [250]:
print(q.explain(streaming=True))

STREAMING:
  AGGREGATE
  	[col("sepal.width").mean()] BY [col("variety")] FROM
    Csv SCAN [iris/iris.csv]
    PROJECT 3/5 COLUMNS
    SELECTION: [(col("sepal.length")) > (5.0)]


In [251]:
q2 = pl.scan_csv(iris_filename).with_columns(
    pl.col("sepal.length").mean().over("variety")
)

print(q2.explain(streaming=True))

 WITH_COLUMNS:
 [col("sepal.length").mean().over([col("variety")])] 
  STREAMING:
    Csv SCAN [iris/iris.csv]
    PROJECT */5 COLUMNS


# Expresiones
## Básicas

In [252]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.661616 ┆ A      │
│ 2    ┆ ham   ┆ 0.638222 ┆ A      │
│ 3    ┆ spam  ┆ 0.909572 ┆ B      │
│ null ┆ egg   ┆ 0.953746 ┆ C      │
│ 5    ┆ null  ┆ 0.872299 ┆ B      │
└──────┴───────┴──────────┴────────┘


In [253]:
df_numerical = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") - 5).alias("nrs - 5"),
    (pl.col("nrs") * pl.col("random")).alias("nrs * random"),
    (pl.col("nrs") / pl.col("random")).alias("nrs / random"),
)
print(df_numerical)

shape: (5, 4)
┌─────────┬─────────┬──────────────┬──────────────┐
│ nrs + 5 ┆ nrs - 5 ┆ nrs * random ┆ nrs / random │
│ ---     ┆ ---     ┆ ---          ┆ ---          │
│ i64     ┆ i64     ┆ f64          ┆ f64          │
╞═════════╪═════════╪══════════════╪══════════════╡
│ 6       ┆ -4      ┆ 0.661616     ┆ 1.511451     │
│ 7       ┆ -3      ┆ 1.276445     ┆ 3.133704     │
│ 8       ┆ -2      ┆ 2.728716     ┆ 3.298254     │
│ null    ┆ null    ┆ null         ┆ null         │
│ 10      ┆ 0       ┆ 4.361494     ┆ 5.73198      │
└─────────┴─────────┴──────────────┴──────────────┘


In [254]:
df_logical = df.select(
    (pl.col("nrs") > 1).alias("nrs > 1"),
    (pl.col("random") <= 0.5).alias("random <= .5"),
    (pl.col("nrs") != 1).alias("nrs != 1"),
    (pl.col("nrs") == 1).alias("nrs == 1"),
    ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"),  # and
    ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"),  # or
)
print(df_logical)

shape: (5, 6)
┌─────────┬──────────────┬──────────┬──────────┬──────────┬─────────┐
│ nrs > 1 ┆ random <= .5 ┆ nrs != 1 ┆ nrs == 1 ┆ and_expr ┆ or_expr │
│ ---     ┆ ---          ┆ ---      ┆ ---      ┆ ---      ┆ ---     │
│ bool    ┆ bool         ┆ bool     ┆ bool     ┆ bool     ┆ bool    │
╞═════════╪══════════════╪══════════╪══════════╪══════════╪═════════╡
│ false   ┆ false        ┆ false    ┆ true     ┆ false    ┆ false   │
│ true    ┆ false        ┆ true     ┆ false    ┆ false    ┆ true    │
│ true    ┆ false        ┆ true     ┆ false    ┆ false    ┆ true    │
│ null    ┆ false        ┆ null     ┆ null     ┆ false    ┆ null    │
│ true    ┆ false        ┆ true     ┆ false    ┆ false    ┆ true    │
└─────────┴──────────────┴──────────┴──────────┴──────────┴─────────┘


## Selección columnas

In [255]:
from datetime import date, datetime

import polars as pl

df = pl.DataFrame(
    {
        "id": [9, 4, 2],
        "place": ["Mars", "Earth", "Saturn"],
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
        "sales": [33.4, 2142134.1, 44.7],
        "has_people": [False, True, False],
        "logged_at": pl.datetime_range(
            datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
        ),
    }
).with_row_index("index")
print(df)

shape: (3, 7)
┌───────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ index ┆ id  ┆ place  ┆ date       ┆ sales     ┆ has_people ┆ logged_at           │
│ ---   ┆ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        ┆ ---                 │
│ u32   ┆ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       ┆ datetime[μs]        │
╞═══════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0     ┆ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1     ┆ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2     ┆ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘


In [256]:
out = df.select(pl.col("*"))

# Is equivalent to
out = df.select(pl.all())
print(out)

shape: (3, 7)
┌───────┬─────┬────────┬────────────┬───────────┬────────────┬─────────────────────┐
│ index ┆ id  ┆ place  ┆ date       ┆ sales     ┆ has_people ┆ logged_at           │
│ ---   ┆ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        ┆ ---                 │
│ u32   ┆ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       ┆ datetime[μs]        │
╞═══════╪═════╪════════╪════════════╪═══════════╪════════════╪═════════════════════╡
│ 0     ┆ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1     ┆ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2     ┆ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      ┆ 2022-12-01 00:00:02 │
└───────┴─────┴────────┴────────────┴───────────┴────────────┴─────────────────────┘


In [257]:
out = df.select(pl.col("*").exclude("logged_at", "index"))
print(out)

shape: (3, 5)
┌─────┬────────┬────────────┬───────────┬────────────┐
│ id  ┆ place  ┆ date       ┆ sales     ┆ has_people │
│ --- ┆ ---    ┆ ---        ┆ ---       ┆ ---        │
│ i64 ┆ str    ┆ date       ┆ f64       ┆ bool       │
╞═════╪════════╪════════════╪═══════════╪════════════╡
│ 9   ┆ Mars   ┆ 2022-01-01 ┆ 33.4      ┆ false      │
│ 4   ┆ Earth  ┆ 2022-01-02 ┆ 2142134.1 ┆ true       │
│ 2   ┆ Saturn ┆ 2022-01-03 ┆ 44.7      ┆ false      │
└─────┴────────┴────────────┴───────────┴────────────┘


In [258]:
out = df.select(pl.col("date", "logged_at").dt.to_string("%Y-%h-%d"))
print(out)

shape: (3, 2)
┌─────────────┬─────────────┐
│ date        ┆ logged_at   │
│ ---         ┆ ---         │
│ str         ┆ str         │
╞═════════════╪═════════════╡
│ 2022-Jan-01 ┆ 2022-Dec-01 │
│ 2022-Jan-02 ┆ 2022-Dec-01 │
│ 2022-Jan-03 ┆ 2022-Dec-01 │
└─────────────┴─────────────┘


In [259]:
out = df.select(pl.col("^.*(as|sa).*$"))
print(out)

shape: (3, 2)
┌───────────┬────────────┐
│ sales     ┆ has_people │
│ ---       ┆ ---        │
│ f64       ┆ bool       │
╞═══════════╪════════════╡
│ 33.4      ┆ false      │
│ 2142134.1 ┆ true       │
│ 44.7      ┆ false      │
└───────────┴────────────┘


In [260]:
out = df.select(pl.col(pl.Int64, pl.UInt32, pl.Boolean).n_unique())
print(out)

shape: (1, 3)
┌───────┬─────┬────────────┐
│ index ┆ id  ┆ has_people │
│ ---   ┆ --- ┆ ---        │
│ u32   ┆ u32 ┆ u32        │
╞═══════╪═════╪════════════╡
│ 3     ┆ 3   ┆ 2          │
└───────┴─────┴────────────┘


In [261]:
import polars.selectors as cs

#Para seleccionar solo las columnas de tipo entero y cadena.
out = df.select(cs.integer(), cs.string())
print(out)

shape: (3, 3)
┌───────┬─────┬────────┐
│ index ┆ id  ┆ place  │
│ ---   ┆ --- ┆ ---    │
│ u32   ┆ i64 ┆ str    │
╞═══════╪═════╪════════╡
│ 0     ┆ 9   ┆ Mars   │
│ 1     ┆ 4   ┆ Earth  │
│ 2     ┆ 2   ┆ Saturn │
└───────┴─────┴────────┘


In [262]:
# Selecciona las columnas numéricas excepto la primera columna que indica el número de fila.
out = df.select(cs.numeric() - cs.first())
print(out)

shape: (3, 2)
┌─────┬───────────┐
│ id  ┆ sales     │
│ --- ┆ ---       │
│ i64 ┆ f64       │
╞═════╪═══════════╡
│ 9   ┆ 33.4      │
│ 4   ┆ 2142134.1 │
│ 2   ┆ 44.7      │
└─────┴───────────┘


In [263]:
#Selecciona las columnas numéricas, excepto la primera columna que indica los números de fila.
out = df.select(cs.numeric() - cs.first())
print(out)

shape: (3, 2)
┌─────┬───────────┐
│ id  ┆ sales     │
│ --- ┆ ---       │
│ i64 ┆ f64       │
╞═════╪═══════════╡
│ 9   ┆ 33.4      │
│ 4   ┆ 2142134.1 │
│ 2   ┆ 44.7      │
└─────┴───────────┘


In [264]:
#También podemos seleccionar el número de fila por nombre y cualquier columna no numérica.
out = df.select(cs.by_name("index") | ~cs.numeric())
print(out)

shape: (3, 5)
┌───────┬────────┬────────────┬────────────┬─────────────────────┐
│ index ┆ place  ┆ date       ┆ has_people ┆ logged_at           │
│ ---   ┆ ---    ┆ ---        ┆ ---        ┆ ---                 │
│ u32   ┆ str    ┆ date       ┆ bool       ┆ datetime[μs]        │
╞═══════╪════════╪════════════╪════════════╪═════════════════════╡
│ 0     ┆ Mars   ┆ 2022-01-01 ┆ false      ┆ 2022-12-01 00:00:00 │
│ 1     ┆ Earth  ┆ 2022-01-02 ┆ true       ┆ 2022-12-01 00:00:01 │
│ 2     ┆ Saturn ┆ 2022-01-03 ┆ false      ┆ 2022-12-01 00:00:02 │
└───────┴────────┴────────────┴────────────┴─────────────────────┘


In [265]:
# Por patrones
out = df.select(cs.contains("index"), cs.matches(".*_.*"))
out

index,has_people,logged_at
u32,bool,datetime[μs]
0,False,2022-12-01 00:00:00
1,True,2022-12-01 00:00:01
2,False,2022-12-01 00:00:02


In [266]:

out = df.select(cs.temporal().as_expr().dt.to_string("%Y-%h-%d"))
out

date,logged_at
str,str
"""2022-Jan-01""","""2022-Dec-01"""
"""2022-Jan-02""","""2022-Dec-01"""
"""2022-Jan-03""","""2022-Dec-01"""


In [267]:
# Para debug

from polars.selectors import is_selector

out = cs.numeric()
print (is_selector(out))

out = cs.boolean() | cs.numeric()
print (is_selector(out))
 
out = cs.numeric() + pl.lit(123)
print (is_selector(out))


True
True
False


In [268]:
pl.lit?

[0;31mSignature:[0m
[0mpl[0m[0;34m.[0m[0mlit[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mvalue[0m[0;34m:[0m [0;34m'Any'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0;34m'PolarsDataType | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mallow_object[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'Expr'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return an expression representing a literal value.

Parameters
----------
value
    Value that should be used as a `literal`.
dtype
    The data type of the resulting expression.
    If set to `None` (default), the data type is inferred from the `value` input.
allow_object
    If type is unknown use an 'object' type.
    By default, we will raise a `ValueException`
    if the type is unknown.

Notes
-----
Expected datatypes:

- `pl.li

In [269]:
from polars.selectors import expand_selector

out = cs.temporal()
print (expand_selector(df, out))

out = ~(cs.temporal() | cs.numeric())
print (expand_selector(df, out))
print (type(expand_selector(df, out)))

('date', 'logged_at')
('place', 'has_people')
<class 'tuple'>


In [270]:
cs.temporal?

[0;31mSignature:[0m [0mcs[0m[0;34m.[0m[0mtemporal[0m[0;34m([0m[0;34m)[0m [0;34m->[0m [0;34m'SelectorType'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Select all temporal columns.

See Also
--------
by_dtype : Select all columns matching the given dtype(s).
date : Select all date columns.
datetime : Select all datetime columns, optionally filtering by time unit/zone.
duration : Select all duration columns, optionally filtering by time unit.
time : Select all time columns.

Examples
--------
>>> from datetime import date, time
>>> import polars.selectors as cs
>>> df = pl.DataFrame(
...     {
...         "dt": [date(2021, 1, 1), date(2021, 1, 2)],
...         "tm": [time(12, 0, 0), time(20, 30, 45)],
...         "value": [1.2345, 2.3456],
...     }
... )

Match all temporal columns:

>>> df.select(cs.temporal())
shape: (2, 2)
┌────────────┬──────────┐
│ dt         ┆ tm       │
│ ---        ┆ ---      │
│ date       ┆ time     │
╞════════════╪══════════╡
│ 2021-01-0

## Funciones

In [271]:
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", "spam"],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.568712 ┆ A      │
│ 2    ┆ ham   ┆ 0.130073 ┆ A      │
│ 3    ┆ spam  ┆ 0.384082 ┆ B      │
│ null ┆ egg   ┆ 0.018753 ┆ C      │
│ 5    ┆ spam  ┆ 0.041167 ┆ B      │
└──────┴───────┴──────────┴────────┘


### Nombre columnas
Por defecto si usamos una expresión, va a mantener el mismo nombre que la columna original

In [272]:
df.select(pl.col("nrs")+5)

nrs
i64
6.0
7.0
8.0
""
10.0


Esto puecde ser un problema si hacemos dos operaciones sobre la misma columna, entonces se usa alias

In [273]:
df_alias = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") - 5).alias("nrs - 5"),
)
print(df_alias)

shape: (5, 2)
┌─────────┬─────────┐
│ nrs + 5 ┆ nrs - 5 │
│ ---     ┆ ---     │
│ i64     ┆ i64     │
╞═════════╪═════════╡
│ 6       ┆ -4      │
│ 7       ┆ -3      │
│ 8       ┆ -2      │
│ null    ┆ null    │
│ 10      ┆ 0       │
└─────────┴─────────┘


In [274]:
df_alias = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") - 5),
)
print(df_alias)

shape: (5, 2)
┌─────────┬──────┐
│ nrs + 5 ┆ nrs  │
│ ---     ┆ ---  │
│ i64     ┆ i64  │
╞═════════╪══════╡
│ 6       ┆ -4   │
│ 7       ┆ -3   │
│ 8       ┆ -2   │
│ null    ┆ null │
│ 10      ┆ 0    │
└─────────┴──────┘


Podemos contar valores únicos

In [275]:
df.select(pl.col("names").n_unique())

names
u32
4


In [276]:
df_alias = df.select(
    pl.col("names").n_unique().alias("unique"),
    pl.approx_n_unique("names").alias("unique_approx"),
)
print(df_alias)

shape: (1, 2)
┌────────┬───────────────┐
│ unique ┆ unique_approx │
│ ---    ┆ ---           │
│ u32    ┆ u32           │
╞════════╪═══════════════╡
│ 4      ┆ 4             │
└────────┴───────────────┘


Podemos aplicar condicionales con when, then, otherwise. 

In [277]:
df_conditional = df.select(
    pl.col("nrs"),
    pl.when(pl.col("nrs") > 2)
    .then(pl.lit(True))
    .otherwise(pl.lit(False))
    .alias("conditional"),
)
print(df_conditional)

shape: (5, 2)
┌──────┬─────────────┐
│ nrs  ┆ conditional │
│ ---  ┆ ---         │
│ i64  ┆ bool        │
╞══════╪═════════════╡
│ 1    ┆ false       │
│ 2    ┆ false       │
│ 3    ┆ true        │
│ null ┆ false       │
│ 5    ┆ true        │
└──────┴─────────────┘


	•	pl.col("nrs"): Selecciona la columna "nrs" del DataFrame original.
	•	pl.when(pl.col("nrs") > 2): Evalúa si los valores de la columna "nrs" son mayores que 2.
	•	.then(pl.lit(True)): Si la condición es verdadera, el valor será True.
	•	.otherwise(pl.lit(False)): Si la condición es falsa, el valor será False.
	•	.alias("conditional"): La columna resultante se renombra a "conditional".

## Casting
### Numéricos

In [278]:
df = pl.DataFrame(
    {
        "integers": [1, 2, 3, 4, 5],
        "big_integers": [1, 10000002, 3, 10000004, 10000005],
        "floats": [4.0, 5.0, 6.0, 7.0, 8.0],
        "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
    }
)

print(df)

shape: (5, 4)
┌──────────┬──────────────┬────────┬─────────────────────┐
│ integers ┆ big_integers ┆ floats ┆ floats_with_decimal │
│ ---      ┆ ---          ┆ ---    ┆ ---                 │
│ i64      ┆ i64          ┆ f64    ┆ f64                 │
╞══════════╪══════════════╪════════╪═════════════════════╡
│ 1        ┆ 1            ┆ 4.0    ┆ 4.532               │
│ 2        ┆ 10000002     ┆ 5.0    ┆ 5.5                 │
│ 3        ┆ 3            ┆ 6.0    ┆ 6.5                 │
│ 4        ┆ 10000004     ┆ 7.0    ┆ 7.5                 │
│ 5        ┆ 10000005     ┆ 8.0    ┆ 8.5                 │
└──────────┴──────────────┴────────┴─────────────────────┘


In [279]:
out = df.select(
    pl.col("integers").cast(pl.Float32).alias("integers_as_floats"),
    pl.col("floats").cast(pl.Int32).alias("floats_as_integers"),
    pl.col("floats_with_decimal")
    .cast(pl.Int32)
    .alias("floats_with_decimal_as_integers"),
)
print(out)


shape: (5, 3)
┌────────────────────┬────────────────────┬─────────────────────────────────┐
│ integers_as_floats ┆ floats_as_integers ┆ floats_with_decimal_as_integer… │
│ ---                ┆ ---                ┆ ---                             │
│ f32                ┆ i32                ┆ i32                             │
╞════════════════════╪════════════════════╪═════════════════════════════════╡
│ 1.0                ┆ 4                  ┆ 4                               │
│ 2.0                ┆ 5                  ┆ 5                               │
│ 3.0                ┆ 6                  ┆ 6                               │
│ 4.0                ┆ 7                  ┆ 7                               │
│ 5.0                ┆ 8                  ┆ 8                               │
└────────────────────┴────────────────────┴─────────────────────────────────┘


In [280]:
out = df.select(
    pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"),
    pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"),
)
print(out)


shape: (5, 2)
┌─────────────────────────┬───────────────────────┐
│ integers_smallfootprint ┆ floats_smallfootprint │
│ ---                     ┆ ---                   │
│ i16                     ┆ f32                   │
╞═════════════════════════╪═══════════════════════╡
│ 1                       ┆ 4.0                   │
│ 2                       ┆ 5.0                   │
│ 3                       ┆ 6.0                   │
│ 4                       ┆ 7.0                   │
│ 5                       ┆ 8.0                   │
└─────────────────────────┴───────────────────────┘


In [281]:
try:
    out = df.select(pl.col("big_integers").cast(pl.Int8))
    print(out)
except Exception as e:
    print(e)

conversion from `i64` to `i8` failed in column 'big_integers' for 3 out of 5 values: [10000002, 10000004, 10000005]


In [282]:
out = df.select(pl.col("big_integers").cast(pl.Int8, strict=False))
print(out)

shape: (5, 1)
┌──────────────┐
│ big_integers │
│ ---          │
│ i8           │
╞══════════════╡
│ 1            │
│ null         │
│ 3            │
│ null         │
│ null         │
└──────────────┘


In [None]:
t