In [7]:
import polars as pl
import numpy as np

In [4]:
from datetime import datetime

# Small demo frame with one integer, one datetime and one float column.
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        # Midnight on 1, 2 and 3 January 2022.
        "date": [datetime(2022, 1, day) for day in range(1, 4)],
        "float": [4.0, 5.0, 6.0],
    }
)

print(df)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


## Contexts 

Expressions are evaluated within one of 3 contexts:
1. Selection: `df.select(...)`, `df.with_columns(...)` etc.
2. Filtering: `df.filter(...)`
3. Group by / Aggregation: `df.groupby(...).agg(...)`



In [8]:
# Demo frame used by the context examples that follow: an integer column and a
# string column that each contain one null, a float column of uniform random
# numbers, and a grouping key.
df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        # Seeded generator: the "random" column is the same on every
        # Restart & Run All instead of changing with each execution.
        "random": np.random.default_rng(seed=42).random(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
print(df)

shape: (5, 4)
┌──────┬───────┬──────────┬────────┐
│ nrs  ┆ names ┆ random   ┆ groups │
│ ---  ┆ ---   ┆ ---      ┆ ---    │
│ i64  ┆ str   ┆ f64      ┆ str    │
╞══════╪═══════╪══════════╪════════╡
│ 1    ┆ foo   ┆ 0.984354 ┆ A      │
│ 2    ┆ ham   ┆ 0.21297  ┆ A      │
│ 3    ┆ spam  ┆ 0.304313 ┆ B      │
│ null ┆ egg   ┆ 0.895808 ┆ C      │
│ 5    ┆ null  ┆ 0.024642 ┆ B      │
└──────┴───────┴──────────┴────────┘


> The selection context applies expressions over columns. A select may produce new columns that are aggregations, combinations of expressions, or literals.

> The expressions in a selection context must produce Series that are all the same length or have a length of 1. Literals are treated as length-1 Series.

In [9]:
# Selection context: the sum of "nrs" is a single value, so it is repeated
# alongside every row of the sorted "names" column.
df.select(
    pl.col("nrs").sum(),
    pl.col("names").sort(),
)

nrs,names
i64,str
11,
11,"""egg"""
11,"""foo"""
11,"""ham"""
11,"""spam"""


> Similar to the `select` statement, the `with_columns` statement also enters into the selection context. The main difference between `with_columns` and `select` is that `with_columns` retains the original columns and adds new ones, whereas `select` drops the original columns.

In [10]:
# with_columns keeps the four original columns and appends the two aggregates,
# each repeated on every row.
df.with_columns(
    pl.col("nrs").sum().alias("nrs_sum"),
    pl.col("random").count().alias("count"),
)

nrs,names,random,groups,nrs_sum,count
i64,str,f64,str,i64,u32
1.0,"""foo""",0.984354,"""A""",11,5
2.0,"""ham""",0.21297,"""A""",11,5
3.0,"""spam""",0.304313,"""B""",11,5
,"""egg""",0.895808,"""C""",11,5
5.0,,0.024642,"""B""",11,5


## Expressions

> Polars expressions are a mapping from a series to a series (or mathematically Fn(Series) -> Series). Because an expression takes a Series as input and produces a Series as output, it is straightforward to chain a sequence of expressions (similar to method chaining in pandas).

```python
pl.col("foo").sort().head(2)
```

> The snippet above says:
>
> 1. Select column "foo"
> 2. Then sort the column (not in reversed order)
> 3. Then take the first two values of the sorted output

> The power of expressions is that every expression produces a new expression, and that they can be piped together. You can run an expression by passing it to one of Polars' execution contexts.

## Aggregation


In [11]:
url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"

# Read these text columns as Categorical rather than plain strings.
dtypes = {
    column: pl.Categorical
    for column in ("first_name", "gender", "type", "state", "party")
}

# Download the CSV, then parse the "birthday" text column into dates.
# NOTE(review): strict=False presumably turns unparseable values into nulls
# instead of raising; confirm against the installed Polars version.
dataset = (
    pl.read_csv(url, dtypes=dtypes)
    .with_columns(pl.col("birthday").str.to_date(strict=False))
)

In [15]:
# Preview the first five rows of the loaded legislators table.
dataset.head(5)

last_name,first_name,middle_name,suffix,nickname,full_name,birthday,gender,type,state,district,senate_class,party,url,address,phone,contact_form,rss_url,twitter,twitter_id,facebook,youtube,youtube_id,mastodon,bioguide_id,thomas_id,opensecrets_id,lis_id,fec_ids,cspan_id,govtrack_id,votesmart_id,ballotpedia_id,washington_post_id,icpsr_id,wikipedia_id
str,cat,str,str,str,str,date,cat,cat,cat,i64,i64,cat,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,i64,str
"""Bassett""","""Richard""",,,,,1745-04-02,"""M""","""sen""","""DE""",,2.0,"""Anti-Administr…",,,,,,,,,,,,"""B000226""",,,,,,401222,,,,507,"""Richard Basset…"
"""Bland""","""Theodorick""",,,,,1742-03-21,"""M""","""rep""","""VA""",9.0,,,,,,,,,,,,,,"""B000546""",,,,,,401521,,,,786,"""Theodorick Bla…"
"""Burke""","""Aedanus""",,,,,1743-06-16,"""M""","""rep""","""SC""",2.0,,,,,,,,,,,,,,"""B001086""",,,,,,402032,,,,1260,"""Aedanus Burke"""
"""Carroll""","""Daniel""",,,,,1730-07-22,"""M""","""rep""","""MD""",6.0,,,,,,,,,,,,,,"""C000187""",,,,,,402334,,,,1538,"""Daniel Carroll…"
"""Clymer""","""George""",,,,,1739-03-16,"""M""","""rep""","""PA""",-1.0,,,,,,,,,,,,,,"""C000538""",,,,,,402671,,,,1859,"""George Clymer"""
