In [1]:
#polars practice
import polars as pl
from datetime import datetime

In [2]:
df = pl.DataFrame(
    {
        "integer":[1,2,3],
        "date":[
            datetime(2025,1,1),
            datetime(2025,1,2),
            datetime(2025,1,3),
        ],
        "float":[4.0, 5.0, 6.0],
        "string":["a", "b","c"],
    }
)
print(df)

shape: (3, 4)
┌─────────┬─────────────────────┬───────┬────────┐
│ integer ┆ date                ┆ float ┆ string │
│ ---     ┆ ---                 ┆ ---   ┆ ---    │
│ i64     ┆ datetime[μs]        ┆ f64   ┆ str    │
╞═════════╪═════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01 00:00:00 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02 00:00:00 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03 00:00:00 ┆ 6.0   ┆ c      │
└─────────┴─────────────────────┴───────┴────────┘


In [3]:
df.write_csv('output.csv')
df_csv=pl.read_csv('output.csv')
print(df_csv)

shape: (3, 4)
┌─────────┬────────────────────────────┬───────┬────────┐
│ integer ┆ date                       ┆ float ┆ string │
│ ---     ┆ ---                        ┆ ---   ┆ ---    │
│ i64     ┆ str                        ┆ f64   ┆ str    │
╞═════════╪════════════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01T00:00:00.000000 ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02T00:00:00.000000 ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03T00:00:00.000000 ┆ 6.0   ┆ c      │
└─────────┴────────────────────────────┴───────┴────────┘


In [4]:
df.select(pl.col('*'))

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""
2,2025-01-02 00:00:00,5.0,"""b"""
3,2025-01-03 00:00:00,6.0,"""c"""


In [5]:
df.select(pl.col("date", "float"))

date,float
datetime[μs],f64
2025-01-01 00:00:00,4.0
2025-01-02 00:00:00,5.0
2025-01-03 00:00:00,6.0


In [6]:
df.filter(pl.col("date").is_between(datetime(2025, 1, 1), datetime(2025, 1, 2))
          )

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""
2,2025-01-02 00:00:00,5.0,"""b"""


In [7]:
df.filter((pl.col("integer") <= 1) & (pl.col('float').is_not_nan()))

integer,date,float,string
i64,datetime[μs],f64,str
1,2025-01-01 00:00:00,4.0,"""a"""


In [11]:
df.with_columns(pl.col('integer').sum().alias('float'), (pl.col('float')+42).alias('float+42'))

integer,date,float,string,float+42
i64,datetime[μs],i64,str,f64
1,2025-01-01 00:00:00,6,"""a""",46.0
2,2025-01-02 00:00:00,6,"""b""",47.0
3,2025-01-03 00:00:00,6,"""c""",48.0


In [13]:
df2 = pl.DataFrame(
    {
        "x":range(8),
        "y":["A", "A", "A","B", "B", "C", "X", "X"],
    }
)
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [14]:
df2.group_by("y", maintain_order=True).len()
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [15]:
df2.group_by("y", maintain_order=True).agg(
    pl.col("*").count().alias("count"),
    pl.col("*").sum().alias("sum"),
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [17]:
df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    pl.all().exclude(["c", "d"])
)

print(df_x)

ColumnNotFoundError: a

In [18]:
df_y = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    pl.all().exclude("d")
)

print(df_y)

ColumnNotFoundError: a

In [20]:
import numpy as np
df = pl.DataFrame(
    {
        "a": range(8),
        "b": np.random.rand(8),
        "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
    }
)

df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)
joined = df.join(df2, left_on="a", right_on="x")
print(joined)

shape: (8, 4)
┌─────┬──────────┬───────┬─────┐
│ a   ┆ b        ┆ d     ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ str │
╞═════╪══════════╪═══════╪═════╡
│ 0   ┆ 0.097743 ┆ 1.0   ┆ A   │
│ 1   ┆ 0.857446 ┆ 2.0   ┆ A   │
│ 2   ┆ 0.795493 ┆ NaN   ┆ A   │
│ 3   ┆ 0.271588 ┆ NaN   ┆ B   │
│ 4   ┆ 0.673882 ┆ 0.0   ┆ B   │
│ 5   ┆ 0.653872 ┆ -5.0  ┆ C   │
│ 6   ┆ 0.586638 ┆ -42.0 ┆ X   │
│ 7   ┆ 0.199789 ┆ null  ┆ X   │
└─────┴──────────┴───────┴─────┘


In [21]:
stacked = df.hstack(df2)
print(stacked)

shape: (8, 5)
┌─────┬──────────┬───────┬─────┬─────┐
│ a   ┆ b        ┆ d     ┆ x   ┆ y   │
│ --- ┆ ---      ┆ ---   ┆ --- ┆ --- │
│ i64 ┆ f64      ┆ f64   ┆ i64 ┆ str │
╞═════╪══════════╪═══════╪═════╪═════╡
│ 0   ┆ 0.097743 ┆ 1.0   ┆ 0   ┆ A   │
│ 1   ┆ 0.857446 ┆ 2.0   ┆ 1   ┆ A   │
│ 2   ┆ 0.795493 ┆ NaN   ┆ 2   ┆ A   │
│ 3   ┆ 0.271588 ┆ NaN   ┆ 3   ┆ B   │
│ 4   ┆ 0.673882 ┆ 0.0   ┆ 4   ┆ B   │
│ 5   ┆ 0.653872 ┆ -5.0  ┆ 5   ┆ C   │
│ 6   ┆ 0.586638 ┆ -42.0 ┆ 6   ┆ X   │
│ 7   ┆ 0.199789 ┆ null  ┆ 7   ┆ X   │
└─────┴──────────┴───────┴─────┴─────┘
