In [2]:
from datetime import date, time

import numpy as np
import polars as pl

# LazyFrame

In [3]:
# Build a LazyFrame from a dict that maps column names to lists of values,
# then materialise it with collect() to look at the contents.
data = dict(a=[1, 2], b=[3, 4])
lf = pl.LazyFrame(data)
lf.collect()

a,b
i64,i64
1,3
2,4


In [4]:
# Resolve the frame's schema and list the dtype of each column.
lf.collect_schema().dtypes()

[Int64, Int64]

In [5]:
# Passing `schema` as a {name: dtype} mapping sets the column dtypes
# explicitly: the integer input for "col1" is stored as Float32.
data = dict(col1=[0, 2], col2=[3, 7])
lf2 = pl.LazyFrame(data, schema=dict(col1=pl.Float32, col2=pl.Int64))
lf2.collect()

col1,col2
f32,i64
0.0,3
2.0,7


In [6]:
# The schema can also be given as a sequence of (name, dtype) pairs.
data = dict(col1=[1, 2], col2=[3, 4])
lf3 = pl.LazyFrame(
    data,
    schema=[("col1", pl.Float32), ("col2", pl.Int64)],
)
lf3.collect()

col1,col2
f32,i64
1.0,3
2.0,4


In [7]:
# A list of named, typed Series: each Series becomes one column and carries
# its own name and dtype, so no separate schema is passed.
data = [
    pl.Series(name="col1", values=[1, 2], dtype=pl.Float32),
    pl.Series(name="col2", values=[3, 4], dtype=pl.Int64),
]
lf4 = pl.LazyFrame(data)
lf4.collect()

col1,col2
f32,i64
1.0,3
2.0,4


In [8]:
# Build a LazyFrame from a 2-D NumPy array. The array has no column names,
# so `schema` supplies them. With orient="col" each inner sequence of the
# array is read as one column: (1, 2) becomes "a" and (3, 4) becomes "b".
# (`np` is imported in the notebook's top import cell.)
data = np.array([(1, 2), (3, 4)], dtype=np.int64)
lf5 = pl.LazyFrame(data, schema=["a", "b"], orient="col")
lf5.collect()

a,b
i64,i64
1,3
2,4


In [9]:
# With orient="row" each inner list is one row of the frame; the column
# names come from `schema`.
data = [
    [1, 2, 3],
    [4, 5, 6],
]
lf6 = pl.LazyFrame(
    data,
    schema=["a", "b", "c"],
    orient="row",
)
lf6.collect()

a,b,c
i64,i64,i64
1,2,3
4,5,6


## Aggregation

### count

In [10]:
# count() reports the number of non-null values per column: "b" contains one
# null and "c" is entirely null.
lf = pl.LazyFrame(
    {
        "a": [1, 2, 3, 4],
        "b": [1, 2, 1, None],
        "c": [None] * 4,
    }
)
lf.count().collect()

a,b,c
u32,u32,u32
4,3,0


### max

In [12]:
# Column-wise maximum; the result is a single-row frame.
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.max().collect()

a,b
i64,i64
4,2


### mean

In [13]:
# Column-wise arithmetic mean; integer columns come back as Float64.
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.mean().collect()

a,b
f64,f64
2.5,1.25


### median

In [14]:
# Column-wise median; integer columns come back as Float64.
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.median().collect()

a,b
f64,f64
2.5,1.0


### min

In [15]:
# Column-wise minimum; the result is a single-row frame.
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.min().collect()

a,b
i64,i64
1,1


### null_count

In [16]:
# Sample frame with one missing value in "foo" and one in "bar"; "ham" is
# complete. Collected here only to show the raw data.
lf = pl.LazyFrame(
    dict(
        foo=[1, None, 3],
        bar=[6, 7, None],
        ham=["a", "b", "c"],
    )
)
lf.collect()

foo,bar,ham
i64,i64,str
1.0,6.0,"""a"""
,7.0,"""b"""
3.0,,"""c"""


In [18]:
# Count the null values in each column.
lf.null_count().collect()

foo,bar,ham
u32,u32,u32
1,1,0


### quantile

In [19]:
# 0.7 quantile of every column, using quantile()'s default settings.
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.quantile(0.7).collect()

a,b
f64,f64
3.0,1.0


In [20]:
# Without collect() this only builds the lazy query; the values are computed
# in the next cell.
lf.quantile(0.2)

In [21]:
# Execute the 0.2-quantile query and show the result.
lf.quantile(0.2).collect()

a,b
f64,f64
2.0,1.0


### std

In [22]:
# Column-wise standard deviation with the default ddof (the output matches
# the sample standard deviation, i.e. ddof=1).
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.std().collect()

a,b
f64,f64
1.290994,0.5


In [23]:
# ddof=0 divides by N instead of N - 1 (population standard deviation).
lf.std(ddof=0).collect()

a,b
f64,f64
1.118034,0.433013


### sum

In [24]:
# Column-wise sum; the result is a single-row frame.
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.sum().collect()

a,b
i64,i64
10,5


### var

In [25]:
# Column-wise variance with the default ddof (the output matches the sample
# variance, i.e. ddof=1).
lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [1, 2, 1, 1]})
lf.var().collect()

a,b
f64,f64
1.666667,0.25


In [26]:
# ddof=0 divides by N instead of N - 1 (population variance).
lf.var(ddof=0).collect()

a,b
f64,f64
1.25,0.1875


## Attributes

### columns

In [27]:
# Frame used throughout the "Attributes" section: three input columns, of
# which only "foo" and "bar" are selected.
# NOTE(review): the stray `lf.columns` echo in the recorded output looks like
# the tail of a warning raised by this attribute access - presumably polars'
# PerformanceWarning for resolving the schema of a LazyFrame; confirm.
lf = pl.LazyFrame(
    dict(
        foo=[1, 2, 3],
        bar=[6, 7, 8],
        ham=["a", "b", "c"],
    )
).select("foo", "bar")
lf.columns

  lf.columns


['foo', 'bar']

In [29]:
# Same column names, obtained by explicitly resolving the schema.
lf.collect_schema().names()

['foo', 'bar']

In [30]:
# Time the attribute access; compare with collect_schema().names() below.
%timeit lf.columns



9.74 μs ± 1.2 μs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [31]:
# Time the explicit schema resolution for the same column names.
%timeit lf.collect_schema().names()

1.12 μs ± 7.18 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


### schema

In [32]:
# Schema via the attribute; compare with collect_schema() in the next cell.
lf.schema

  lf.schema


Schema([('foo', Int64), ('bar', Int64)])

In [33]:
# Resolve the schema explicitly; returns the same Schema object contents.
lf.collect_schema()

Schema([('foo', Int64), ('bar', Int64)])

In [34]:
# Time the attribute access; compare with collect_schema() below.
%timeit lf.schema



8.21 μs ± 670 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [35]:
# Time the explicit schema resolution.
%timeit lf.collect_schema()

941 ns ± 2.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


### dtypes

In [36]:
# Column dtypes via the attribute; compare with collect_schema().dtypes().
lf.dtypes

  lf.dtypes


[Int64, Int64]

In [38]:
# Same dtypes, obtained by explicitly resolving the schema.
lf.collect_schema().dtypes()

[Int64, Int64]

### width

In [39]:
# Number of columns via the attribute; compare with collect_schema().len().
lf.width

  lf.width


2

In [40]:
# Same column count, obtained by explicitly resolving the schema.
lf.collect_schema().len()

2

## Descriptive

### describe
This method does not maintain the laziness of the frame, and will collect the final result. This could potentially be an expensive operation.

We do not guarantee the output of describe to be stable. It will show statistics that we deem informative, and may be updated in the future. Using describe programmatically (versus interactive exploration) is not recommended for this reason.

In [42]:
# Summary statistics for the "foo"/"bar" frame; describe() collects the
# result itself, so no collect() call is needed.
lf.describe()

statistic,foo,bar
str,f64,f64
"""count""",3.0,3.0
"""null_count""",0.0,0.0
"""mean""",2.0,7.0
"""std""",1.0,1.0
"""min""",1.0,6.0
"""25%""",2.0,7.0
"""50%""",2.0,7.0
"""75%""",3.0,8.0
"""max""",3.0,8.0


In [43]:
# Frame with one column per kind of dtype (float, nullable int, bool, str,
# date, time) to show how describe() treats each of them.
# (`date` and `time` are imported in the notebook's top import cell.)
lf = pl.LazyFrame(
    dict(
        float=[1.0, 2.8, 3.0],
        int=[40, 50, None],
        bool=[True, False, True],
        str=["zz", "xx", "yy"],
        date=[date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
        time=[time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
    )
)

In [44]:
# Statistics that do not apply to a column's dtype come back as null
# (for example "std" for the str, date and time columns).
lf.describe()

statistic,float,int,bool,str,date,time
str,f64,f64,f64,str,str,str
"""count""",3.0,2.0,3.0,"""3""","""3""","""3"""
"""null_count""",0.0,1.0,0.0,"""0""","""0""","""0"""
"""mean""",2.266667,45.0,0.666667,,"""2021-07-02 16:00:00""","""16:07:10"""
"""std""",1.101514,7.071068,,,,
"""min""",1.0,40.0,0.0,"""xx""","""2020-01-01""","""10:20:30"""
"""25%""",2.8,40.0,,,"""2021-07-05""","""14:45:50"""
"""50%""",2.8,50.0,,,"""2021-07-05""","""14:45:50"""
"""75%""",3.0,50.0,,,"""2022-12-31""","""23:15:10"""
"""max""",3.0,50.0,1.0,"""zz""","""2022-12-31""","""23:15:10"""


In [46]:
    # Custom percentiles with linear interpolation between data points.
    # NOTE(review): this cell carries a stray leading indent, and the recorded
    # output elides rows ("…") because of the table display limit.
    lf.describe(percentiles=[0.1, 0.3, 0.5, 0.7, 0.9], interpolation="linear")

statistic,float,int,bool,str,date,time
str,f64,f64,f64,str,str,str
"""count""",3.0,2.0,3.0,"""3""","""3""","""3"""
"""null_count""",0.0,1.0,0.0,"""0""","""0""","""0"""
"""mean""",2.266667,45.0,0.666667,,"""2021-07-02 16:00:00""","""16:07:10"""
"""std""",1.101514,7.071068,,,,
"""min""",1.0,40.0,0.0,"""xx""","""2020-01-01""","""10:20:30"""
…,…,…,…,…,…,…
"""30%""",2.08,43.0,,,"""2020-11-26""","""12:59:42"""
"""50%""",2.8,45.0,,,"""2021-07-05""","""14:45:50"""
"""70%""",2.88,47.0,,,"""2022-02-07""","""18:09:34"""
"""90%""",2.96,49.0,,,"""2022-09-13""","""21:33:18"""


### explain

In [47]:
# Return the query plan of the current frame as a string.
lf.explain()

'DF ["float", "int", "bool", "str"]; PROJECT */6 COLUMNS; SELECTION: None'

In [48]:
# Query plan for a multi-step query: group by "a", sum every other column,
# then sort by "a". The plan string reads bottom-up, from the data source to
# the final SORT.
lf = pl.LazyFrame(
    dict(
        a=["a", "b", "a", "b", "b", "c"],
        b=[1, 2, 3, 4, 5, 6],
        c=[6, 5, 4, 3, 2, 1],
    )
)
(
    lf.group_by("a", maintain_order=True)
    .agg(pl.all().sum())
    .sort("a")
    .explain()
)

'SORT BY [col("a")]\n  AGGREGATE\n  \t[col("b").sum(), col("c").sum()] BY [col("a")] FROM\n    DF ["a", "b", "c"]; PROJECT 3/3 COLUMNS; SELECTION: None'