In [2]:
# see https://docs.pola.rs/user-guide/getting-started/#reading-writing
import polars as pl
from datetime import datetime

df = pl.DataFrame({
    "integer": [1, 2, 3],
    "date": [
        datetime(2025, 1, 1),
        datetime(2025, 1, 2),
        datetime(2025, 1, 3),
    ],
    "float": [4.0, 5.0, 6.0],
    "string": ["a", "b", "c"],
})
print(df)

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date                ┆ float ┆ string │
│ ---     ┆ ---                 ┆ ---   ┆ ---    │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str    │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      │
└─────────┴─────────────────────┴───────┴────────┘


In [3]:
df.write_csv("../../data/polars.csv")
df_csv = pl.read_csv("../../data/polars.csv")
print(df_csv)

shape: (3, 4)
┌─────────┬────────────────────────────┬───────┬────────┐
│ integer ┆ date                       ┆ float ┆ string │
│ ---     ┆ ---                        ┆ ---   ┆ ---    │
│ i64     ┆ str                        ┆ f64   ┆ str    │
╞═════════╪════════════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01T00:00:00.000000 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02T00:00:00.000000 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03T00:00:00.000000 ┆ 6.0   ┆ c      │
└─────────┴────────────────────────────┴───────┴────────┘


In [10]:
df.select(pl.col("*"))

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""
2,2025-01-02 00:00:00,5.0,"""b"""
3,2025-01-03 00:00:00,6.0,"""c"""


In [8]:
df.filter(pl.col("date").is_between(datetime(2025, 1, 1), datetime(2025, 1, 2)))

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""
2,2025-01-02 00:00:00,5.0,"""b"""


In [9]:
df.filter((pl.col("integer") <= 3) & (pl.col("string").is_not_nan()))

InvalidOperationError: `is_not_nan` operation not supported for dtype `str`

In [11]:
df.with_columns(pl.col("float").sum().alias("e"), (pl.col("float") + 42).alias("b+42"))

integer,date,float,string,e,b+42
i64,datetime[μs],f64,str,f64,f64
1,2025-01-01 00:00:00,4.0,"""a""",15.0,46.0
2,2025-01-02 00:00:00,5.0,"""b""",15.0,47.0
3,2025-01-03 00:00:00,6.0,"""c""",15.0,48.0


In [12]:
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

In [13]:
df2.group_by("y", maintain_order=True).len()

y,len
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [14]:
df2.group_by("y", maintain_order=True).agg(
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum"),
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [16]:
df_x = df.with_columns(
    (pl.col("integer") * pl.col("float")).alias("a * b")).select(
    pl.all().exclude(["date", "string"])
)
print(df_x)

shape: (3, 3)
┌─────────┬───────┬───────┐
│ integer ┆ float ┆ a * b │
│ ---     ┆ ---   ┆ ---   │
│ i64     ┆ f64   ┆ f64   │
╞═════════╪═══════╪═══════╡
│ 1       ┆ 4.0   ┆ 4.0   │
│ 2       ┆ 5.0   ┆ 10.0  │
│ 3       ┆ 6.0   ┆ 18.0  │
└─────────┴───────┴───────┘


In [17]:
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
    }
)

df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
joined = df.join(df2, left_on="a", right_on="x")
print(joined)

NameError: name 'np' is not defined