<a href="https://colab.research.google.com/github/cloudhood/learn-polars/blob/main/notebooks/polars_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Polars Tutorial](https://pola-rs.github.io/polars-book/user-guide/quickstart/quick-exploration-guide.html#installation-and-import)

In [1]:
!pip install -U polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting polars
  Downloading polars-0.15.11-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: polars
Successfully installed polars-0.15.11


In [2]:
import polars as pl
import numpy as np 
from datetime import datetime, timedelta 

In [3]:
# A Series is a single typed column; here it is given the name "a" and the
# dtype is inferred from the Python integers.
series = pl.Series(name="a", values=[1, 2, 3, 4, 5])
print(series)

shape: (5,)
Series: 'a' [i64]
[
	1
	2
	3
	4
	5
]


In [4]:
# Same values without a name: the recorded output shows an empty name ('').
series = pl.Series(values=[1, 2, 3, 4, 5])
print(series)


shape: (5,)
Series: '' [i64]
[
	1
	2
	3
	4
	5
]


In [6]:
# Column-oriented construction: every dict key becomes a column name and
# every list holds that column's values.
dataframe = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        "date": [datetime(2022, 1, day) for day in (1, 2, 3)],
        "float": [4.0, 5.0, 6.0],
    }
)

print(dataframe)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


In [7]:
# Persist the frame to a CSV file in the current working directory.
dataframe.write_csv(file="output.csv")

In [8]:
# Read the file back with default settings. In the recorded output the
# `date` column comes back as `str`, not as a datetime.
df_csv = pl.read_csv("output.csv")

print(df_csv)

shape: (3, 3)
┌─────────┬────────────────────────────┬───────┐
│ integer ┆ date                       ┆ float │
│ ---     ┆ ---                        ┆ ---   │
│ i64     ┆ str                        ┆ f64   │
╞═════════╪════════════════════════════╪═══════╡
│ 1       ┆ 2022-01-01T00:00:00.000000 ┆ 4.0   │
│ 2       ┆ 2022-01-02T00:00:00.000000 ┆ 5.0   │
│ 3       ┆ 2022-01-03T00:00:00.000000 ┆ 6.0   │
└─────────┴────────────────────────────┴───────┘


In [9]:
# Same file, but ask the reader to parse date-like columns: the recorded
# output shows `date` as datetime[μs] instead of str.
df_csv_with_dates = pl.read_csv(
    "output.csv",
    parse_dates=True,
)

print(df_csv_with_dates)

shape: (3, 3)
┌─────────┬─────────────────────┬───────┐
│ integer ┆ date                ┆ float │
│ ---     ┆ ---                 ┆ ---   │
│ i64     ┆ datetime[μs]        ┆ f64   │
╞═════════╪═════════════════════╪═══════╡
│ 1       ┆ 2022-01-01 00:00:00 ┆ 4.0   │
│ 2       ┆ 2022-01-02 00:00:00 ┆ 5.0   │
│ 3       ┆ 2022-01-03 00:00:00 ┆ 6.0   │
└─────────┴─────────────────────┴───────┘


In [10]:
# Demo frame used by the following cells.
# - "b" is drawn from a seeded generator so a fresh run reproduces the same
#   values instead of new random numbers each time.
# - "d" mixes NaN (a float value) with None (a missing value, shown as null).
#   `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
rng = np.random.default_rng(seed=42)
df = pl.DataFrame({"a": np.arange(0, 8),
                   "b": rng.random(8),
                   "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
                   "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None]
                  })

print(df)

shape: (8, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 0   ┆ 0.172639 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 1   ┆ 0.804131 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.205621 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.623705 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.918769 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.768446 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.928955 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.674262 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘


In [11]:
# First rows of the frame; five rows is what the bare `head()` call shows.
df.head(5)

a,b,c,d
i64,f64,datetime[μs],f64
0,0.172639,2022-12-01 00:00:00,1.0
1,0.804131,2022-12-02 00:00:00,2.0
2,0.205621,2022-12-03 00:00:00,
3,0.623705,2022-12-04 00:00:00,
4,0.918769,2022-12-05 00:00:00,0.0


In [12]:
# Last rows of the frame; five rows is what the bare `tail()` call shows.
df.tail(5)

a,b,c,d
i64,f64,datetime[μs],f64
3,0.623705,2022-12-04 00:00:00,
4,0.918769,2022-12-05 00:00:00,0.0
5,0.768446,2022-12-06 00:00:00,-5.0
6,0.928955,2022-12-07 00:00:00,-42.0
7,0.674262,2022-12-08 00:00:00,


In [14]:
# Five rows picked at random; the selection changes between runs.
df.sample(n=5)

a,b,c,d
i64,f64,datetime[μs],f64
0,0.172639,2022-12-01 00:00:00,1.0
6,0.928955,2022-12-07 00:00:00,-42.0
2,0.205621,2022-12-03 00:00:00,
5,0.768446,2022-12-06 00:00:00,-5.0
4,0.918769,2022-12-05 00:00:00,0.0


In [15]:
# Per-column summary statistics; the recorded output lists count,
# null_count, mean, std, min, max and median.
df.describe()

describe,a,b,c,d
str,f64,f64,str,f64
"""count""",8.0,8.0,"""8""",8.0
"""null_count""",0.0,0.0,"""0""",1.0
"""mean""",3.5,0.637066,,
"""std""",2.44949,0.295955,,
"""min""",0.0,0.172639,"""2022-12-01 00:...",-42.0
"""max""",7.0,0.928955,"""2022-12-08 00:...",2.0
"""median""",3.5,0.721354,,1.0


In [16]:
# Select every column; `pl.all()` is the wildcard selector.
df.select(pl.all())

a,b,c,d
i64,f64,datetime[μs],f64
0,0.172639,2022-12-01 00:00:00,1.0
1,0.804131,2022-12-02 00:00:00,2.0
2,0.205621,2022-12-03 00:00:00,
3,0.623705,2022-12-04 00:00:00,
4,0.918769,2022-12-05 00:00:00,0.0
5,0.768446,2022-12-06 00:00:00,-5.0
6,0.928955,2022-12-07 00:00:00,-42.0
7,0.674262,2022-12-08 00:00:00,


In [19]:
# A single `pl.col` call can take a list of column names.
df.select(pl.col(["a", "b"]))

a,b
i64,f64
0,0.172639
1,0.804131
2,0.205621
3,0.623705
4,0.918769
5,0.768446
6,0.928955
7,0.674262


In [21]:
# Equivalent selection written as one expression per column, then keep
# only the first three rows.
df.select([pl.col(name) for name in ("a", "b")]).limit(3)

a,b
i64,f64
0,0.172639
1,0.804131
2,0.205621


In [22]:
# Everything except column "a".
df.select(pl.all().exclude("a"))

b,c,d
f64,datetime[μs],f64
0.172639,2022-12-01 00:00:00,1.0
0.804131,2022-12-02 00:00:00,2.0
0.205621,2022-12-03 00:00:00,
0.623705,2022-12-04 00:00:00,
0.918769,2022-12-05 00:00:00,0.0
0.768446,2022-12-06 00:00:00,-5.0
0.928955,2022-12-07 00:00:00,-42.0
0.674262,2022-12-08 00:00:00,


In [27]:
# Keep rows whose "c" timestamp lies between the two dates.
# In the recorded output both endpoints (2022-12-02 and 2022-12-08) are
# excluded. NOTE(review): endpoint handling may differ between polars
# releases -- confirm against the installed version.
df.filter(
    pl.col("c").is_between(datetime(2022, 12, 2), datetime(2022, 12, 8))
)

a,b,c,d
i64,f64,datetime[μs],f64
2,0.205621,2022-12-03 00:00:00,
3,0.623705,2022-12-04 00:00:00,
4,0.918769,2022-12-05 00:00:00,0.0
5,0.768446,2022-12-06 00:00:00,-5.0
6,0.928955,2022-12-07 00:00:00,-42.0


In [28]:
# Combine two predicates with `&`: "a" at most 3 AND "d" not NaN.
# The comparison needs its own parentheses because `&` binds tighter
# than `<=`; the method call on the right does not.
df.filter(
    (pl.col("a") <= 3) & pl.col("d").is_not_nan()
)

a,b,c,d
i64,f64,datetime[μs],f64
0,0.172639,2022-12-01 00:00:00,1.0
1,0.804131,2022-12-02 00:00:00,2.0


In [31]:
# Add two derived columns:
# - "e": the sum of "b", a single value repeated on every row
# - "b+42": "b" shifted by 42, computed row by row
df.with_columns(
    [
        pl.col("b").sum().alias("e"),
        (pl.col("b") + 42).alias("b+42"),
    ]
)


a,b,c,d,e,b+42
i64,f64,datetime[μs],f64,f64,f64
0,0.172639,2022-12-01 00:00:00,1.0,5.096527,42.172639
1,0.804131,2022-12-02 00:00:00,2.0,5.096527,42.804131
2,0.205621,2022-12-03 00:00:00,,5.096527,42.205621
3,0.623705,2022-12-04 00:00:00,,5.096527,42.623705
4,0.918769,2022-12-05 00:00:00,0.0,5.096527,42.918769
5,0.768446,2022-12-06 00:00:00,-5.0,5.096527,42.768446
6,0.928955,2022-12-07 00:00:00,-42.0,5.096527,42.928955
7,0.674262,2022-12-08 00:00:00,,5.096527,42.674262


In [32]:
# Second demo frame: an integer key "x" and a group label "y".
df2 = pl.DataFrame(
    {
        "x": np.arange(8),
        "y": list("AAABBCXX"),
    }
)

print(df2)

shape: (8, 2)
┌─────┬─────┐
│ x   ┆ y   │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═════╡
│ 0   ┆ A   │
│ 1   ┆ A   │
│ 2   ┆ A   │
│ 3   ┆ B   │
│ 4   ┆ B   │
│ 5   ┆ C   │
│ 6   ┆ X   │
│ 7   ┆ X   │
└─────┴─────┘


In [33]:
# Count the rows in each "y" group.
# maintain_order=True keeps the groups in order of first appearance;
# without it the order of the returned groups is not guaranteed.
df2.groupby("y", maintain_order=True).count()

y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [34]:
# Aggregate "x" per "y" group: number of rows and sum of the values.
# The column is named explicitly instead of using the "*" wildcard: a
# wildcard combined with a fixed alias would give every expanded column the
# same output name, which fails as soon as the frame has a second value
# column.
df2.groupby("y", maintain_order=True).agg([
    pl.col("x").count().alias("count"),
    pl.col("x").sum().alias("sum")
])

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [35]:
# Create a new column that multiplies column `a` by column `b`, then select
# all the columns except `c` and `d` for the final DataFrame.
# `with_columns` (plural) is used here, as in the earlier cell that adds
# "e" and "b+42"; the singular `with_column` is not available in newer
# polars releases.
df_x = df.with_columns([
    (pl.col("a") * pl.col("b")).alias("a * b")
]).select([
    pl.all().exclude(['c', 'd'])
])

print(df_x)

shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i64 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.172639 ┆ 0.0      │
│ 1   ┆ 0.804131 ┆ 0.804131 │
│ 2   ┆ 0.205621 ┆ 0.411241 │
│ 3   ┆ 0.623705 ┆ 1.871114 │
│ 4   ┆ 0.918769 ┆ 3.675075 │
│ 5   ┆ 0.768446 ┆ 3.842229 │
│ 6   ┆ 0.928955 ┆ 5.57373  │
│ 7   ┆ 0.674262 ┆ 4.719835 │
└─────┴──────────┴──────────┘


In [38]:
# Rebuild both demo frames so this cell can run on its own, then join them
# on df.a == df2.x (polars joins are inner joins unless `how` says otherwise).
# - "b" comes from a seeded generator so re-running gives the same values.
# - `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
rng = np.random.default_rng(seed=42)
df = pl.DataFrame({
    "a": np.arange(0, 8),
    "b": rng.random(8),
    "c": [datetime(2022, 12, 1) + timedelta(days=idx) for idx in range(8)],
    "d": [1, 2.0, np.nan, np.nan, 0, -5, -42, None]
})

df2 = pl.DataFrame({
    "x": np.arange(0, 8),
    "y": ['A', 'A', 'A', 'B', 'B', 'C', 'X', 'X'],
})

df.join(df2, left_on="a", right_on="x")

a,b,c,d,y
i64,f64,datetime[μs],f64,str
0,0.357576,2022-12-01 00:00:00,1.0,"""A"""
1,0.633996,2022-12-02 00:00:00,2.0,"""A"""
2,0.021021,2022-12-03 00:00:00,,"""A"""
3,0.293501,2022-12-04 00:00:00,,"""B"""
4,0.796249,2022-12-05 00:00:00,0.0,"""B"""
5,0.920057,2022-12-06 00:00:00,-5.0,"""C"""
6,0.214127,2022-12-07 00:00:00,-42.0,"""X"""
7,0.402607,2022-12-08 00:00:00,,"""X"""


In [39]:
# Place the two frames side by side (column-wise); unlike the join, the
# recorded output keeps both "a" and "x".
pl.concat(
    [df, df2],
    how="horizontal",
)

a,b,c,d,x,y
i64,f64,datetime[μs],f64,i64,str
0,0.357576,2022-12-01 00:00:00,1.0,0,"""A"""
1,0.633996,2022-12-02 00:00:00,2.0,1,"""A"""
2,0.021021,2022-12-03 00:00:00,,2,"""A"""
3,0.293501,2022-12-04 00:00:00,,3,"""B"""
4,0.796249,2022-12-05 00:00:00,0.0,4,"""B"""
5,0.920057,2022-12-06 00:00:00,-5.0,5,"""C"""
6,0.214127,2022-12-07 00:00:00,-42.0,6,"""X"""
7,0.402607,2022-12-08 00:00:00,,7,"""X"""
