# Polars Primer Notebook
### By Kevin Chamberlin
__Updated April 2025__

### Installing codespace requirements 
Depending on your development environment, this step may not be necessary.

In [None]:
%pip install polars --quiet

Note: you may need to restart the kernel to use updated packages.


In [1]:
import polars as pl

### Intializing dataframes
This is also commonly completed by reading data from a datasource.

In [2]:
# Example of manually creating a DataFrame using Polars code 
l_avengers_data = [[1, "Steve", "Rogers", "Brooklyn, NY", "Blue"]
              ,[2, "Tony", "Stark", "Manahattan, NY", "Gold"]
              ,[3, "Peter", "Parker", "Queens, NY", "Blue"]
              ,[4, "Scott", "Lang", "Coral Gables, FL", "Blue"]
              ,[5, "Natasha", "Romanoff", "Stalingrad, USSR", "Black"]
              ,[6, "Clint", "Barton", "Waverly, IA", "Purple"]]

l_avengers_col_names = ["ID", "FirstName", "LastName", "Hometown", "Favorite Color"]

df_avengers = pl.DataFrame(l_avengers_data, schema=l_avengers_col_names)

# Display the DataFrame
print(df_avengers)

shape: (6, 5)
┌─────┬───────────┬──────────┬──────────────────┬────────────────┐
│ ID  ┆ FirstName ┆ LastName ┆ Hometown         ┆ Favorite Color │
│ --- ┆ ---       ┆ ---      ┆ ---              ┆ ---            │
│ i64 ┆ str       ┆ str      ┆ str              ┆ str            │
╞═════╪═══════════╪══════════╪══════════════════╪════════════════╡
│ 1   ┆ Steve     ┆ Rogers   ┆ Brooklyn, NY     ┆ Blue           │
│ 2   ┆ Tony      ┆ Stark    ┆ Manahattan, NY   ┆ Gold           │
│ 3   ┆ Peter     ┆ Parker   ┆ Queens, NY       ┆ Blue           │
│ 4   ┆ Scott     ┆ Lang     ┆ Coral Gables, FL ┆ Blue           │
│ 5   ┆ Natasha   ┆ Romanoff ┆ Stalingrad, USSR ┆ Black          │
│ 6   ┆ Clint     ┆ Barton   ┆ Waverly, IA      ┆ Purple         │
└─────┴───────────┴──────────┴──────────────────┴────────────────┘


  df_avengers = pl.DataFrame(l_avengers_data, schema=l_avengers_col_names)


### Dataframe Manipulations
Getting your hands on the data and making changes to it. Think of this type of operation like how you'd manipulate data in SQL or the SQL functions found in PySpark if you're familiar with that. In Polars documentation, these are often referred to as _Expressions_ and they take the form of an attribute within the Polars class. This means you can access them with the _pl._ prefix alias if you've imported the Polars package as I have at the top of this notebook. Expressions in Polars often take snake_case naming convention.

In [3]:
df_avengers_names = (df_avengers
                      .with_columns((pl.col("FirstName") + pl.lit(" ") + pl.col("LastName")).alias("FullName"))
                      .select(pl.col("ID"), pl.col("FullName"))
                     )

# print(df_avengers_names)
df_avengers_names.head()

ID,FullName
i64,str
1,"""Steve Rogers"""
2,"""Tony Stark"""
3,"""Peter Parker"""
4,"""Scott Lang"""
5,"""Natasha Romanoff"""


In [4]:
# Create a new dataframe completely from scratch

l_hero_data = [[1, "Captain America"]
              ,[2, "Iron Man"]
              ,[3, "Spiderman"]
              ,[4, "Ant-Man"]
              ,[5, "Black Widow"]
              ,[6, "Hawkeye"]]

l_hero_col_names = ["ID", "Hero"]

df_avengers_heroes = pl.DataFrame(l_hero_data, schema=l_hero_col_names)


# Create a new dataframe that matches the columns of an existing dataframe

l_new_avenger_data = [[7, "Wanda", "Maximoff", "Sokovia", "Scarlet"]]

l_new_avenger_col_names = df_avengers.columns # data and metadata from DFs can be called upon

df_avengers_new = pl.DataFrame(l_new_avenger_data, schema=l_new_avenger_col_names)

# Display both new dataframes
print(df_avengers_heroes)

print(df_avengers_new)

shape: (6, 2)
┌─────┬─────────────────┐
│ ID  ┆ Hero            │
│ --- ┆ ---             │
│ i64 ┆ str             │
╞═════╪═════════════════╡
│ 1   ┆ Captain America │
│ 2   ┆ Iron Man        │
│ 3   ┆ Spiderman       │
│ 4   ┆ Ant-Man         │
│ 5   ┆ Black Widow     │
│ 6   ┆ Hawkeye         │
└─────┴─────────────────┘
shape: (1, 5)
┌─────┬───────────┬──────────┬──────────┬────────────────┐
│ ID  ┆ FirstName ┆ LastName ┆ Hometown ┆ Favorite Color │
│ --- ┆ ---       ┆ ---      ┆ ---      ┆ ---            │
│ i64 ┆ str       ┆ str      ┆ str      ┆ str            │
╞═════╪═══════════╪══════════╪══════════╪════════════════╡
│ 7   ┆ Wanda     ┆ Maximoff ┆ Sokovia  ┆ Scarlet        │
└─────┴───────────┴──────────┴──────────┴────────────────┘


  df_avengers_heroes = pl.DataFrame(l_hero_data, schema=l_hero_col_names)
  df_avengers_new = pl.DataFrame(l_new_avenger_data, schema=l_new_avenger_col_names)


In [5]:
# Two ways of combining data
# Union (Concatenation in Polars)
df_avengers_expanded = pl.concat([df_avengers, df_avengers_new], how="vertical")

# Join operation
df_avengers_expanded = df_avengers_expanded.join(df_avengers_heroes, on="ID", how="left")

# NOTE: If a table you're joing with is relatively small a "broadcast join" may improve processing time while distributed

print(df_avengers_expanded)

shape: (7, 6)
┌─────┬───────────┬──────────┬──────────────────┬────────────────┬─────────────────┐
│ ID  ┆ FirstName ┆ LastName ┆ Hometown         ┆ Favorite Color ┆ Hero            │
│ --- ┆ ---       ┆ ---      ┆ ---              ┆ ---            ┆ ---             │
│ i64 ┆ str       ┆ str      ┆ str              ┆ str            ┆ str             │
╞═════╪═══════════╪══════════╪══════════════════╪════════════════╪═════════════════╡
│ 1   ┆ Steve     ┆ Rogers   ┆ Brooklyn, NY     ┆ Blue           ┆ Captain America │
│ 2   ┆ Tony      ┆ Stark    ┆ Manahattan, NY   ┆ Gold           ┆ Iron Man        │
│ 3   ┆ Peter     ┆ Parker   ┆ Queens, NY       ┆ Blue           ┆ Spiderman       │
│ 4   ┆ Scott     ┆ Lang     ┆ Coral Gables, FL ┆ Blue           ┆ Ant-Man         │
│ 5   ┆ Natasha   ┆ Romanoff ┆ Stalingrad, USSR ┆ Black          ┆ Black Widow     │
│ 6   ┆ Clint     ┆ Barton   ┆ Waverly, IA      ┆ Purple         ┆ Hawkeye         │
│ 7   ┆ Wanda     ┆ Maximoff ┆ Sokovia          ┆ S

In [6]:
# Filtering is generally a good skill to be able to utilize

df_avengers_filtered = (df_avengers_expanded
                         .filter(pl.col("Hero").is_not_null())
                         .filter(pl.col("Favorite Color") != "Gold")
                         .with_columns(pl.col("FirstName").str.slice(0, 1).alias("FirstInitial"))
                         .drop("FirstName")
                         .select("FirstInitial", "LastName", "Hometown", "Hero", pl.col("Favorite Color").alias("FavoriteColor"))
                        )

print(df_avengers_filtered)

shape: (5, 5)
┌──────────────┬──────────┬──────────────────┬─────────────────┬───────────────┐
│ FirstInitial ┆ LastName ┆ Hometown         ┆ Hero            ┆ FavoriteColor │
│ ---          ┆ ---      ┆ ---              ┆ ---             ┆ ---           │
│ str          ┆ str      ┆ str              ┆ str             ┆ str           │
╞══════════════╪══════════╪══════════════════╪═════════════════╪═══════════════╡
│ S            ┆ Rogers   ┆ Brooklyn, NY     ┆ Captain America ┆ Blue          │
│ P            ┆ Parker   ┆ Queens, NY       ┆ Spiderman       ┆ Blue          │
│ S            ┆ Lang     ┆ Coral Gables, FL ┆ Ant-Man         ┆ Blue          │
│ N            ┆ Romanoff ┆ Stalingrad, USSR ┆ Black Widow     ┆ Black         │
│ C            ┆ Barton   ┆ Waverly, IA      ┆ Hawkeye         ┆ Purple        │
└──────────────┴──────────┴──────────────────┴─────────────────┴───────────────┘


# A note on Polars execution
Polars is designed to be a fast and efficient DataFrame library for data manipulation. It runs in a multi-threaded environment, taking advantage of modern hardware to perform operations in parallel. 

Polars is built with Rust, which allows it to be highly optimized for performance while maintaining memory safety and concurrency. This results in significant speedups, especially with large datasets, when compared to libraries like Pandas and PySpark.

When writing Polars code, there are some key differences to consider compared to Pandas or PySpark:
- Polars uses eager and lazy execution. While Pandas operates with eager execution (immediately evaluating each operation), Polars allows both eager execution (immediate evaluation) and lazy execution (building a query plan and executing all operations at once, which can be optimized).
- Polars is designed to minimize memory usage and maximize speed by using Arrow memory format under the hood, leading to better performance with large datasets compared to Pandas.
- Polars also scales better in multi-threaded environments, whereas Pandas is more single-threaded. PySpark, on the other hand, can scale to distributed clusters but has more overhead when running locally.
- Polars offers a simpler, more streamlined API, making it easy to switch from Pandas in many cases, but some operations (especially related to SQL queries) may require a different approach than in PySpark.

These factors make Polars an attractive alternative for users looking for high performance without the complexity of distributed systems.


# More common tasks
Below are some examples of common tasks that might need to be done with Polars. I'll continue to add to this section as I learn more. 

In [15]:
# Add a static column to the DF
df_avengers_weights = (df_avengers_filtered
                       .with_columns(pl.lit(200).alias("WeightLbs"))
                       )

# Changing specific rows
df_avengers_weights = (
    df_avengers_weights
    .with_columns([
        pl.when(pl.col("LastName") == "Barton")
          .then(175)  # Set WeightLbs to 175 where LastName is "Barton"
          .when(pl.col("LastName") == "Parker")
          .then(160) # Set WeightLbs to 160 for Parker
          .when(pl.col("LastName") == "Romanoff")
          .then(150) # Set WeightLbs to 150 for Romanoff
          .when(pl.col("LastName") == "Lang")
          .then(0.0001) # Set WeightLbs to 0.0001 for Lang
          .otherwise(pl.col("WeightLbs"))  # Keep existing WeightLbs for all other rows
          .alias("WeightLbs")  # Update the WeightLbs column
    ])
)
print(df_avengers_weights)

# Aggregate sums of columns with a group_by() on other columns
df_avengers_sum = (df_avengers_weights
                   .group_by(pl.col("FavoriteColor"))
                   .agg(pl.sum("WeightLbs").alias("TotalWeightLbs"))
                   )

df_avengers_sum.head(5)

shape: (5, 6)
┌──────────────┬──────────┬──────────────────┬─────────────────┬───────────────┬───────────┐
│ FirstInitial ┆ LastName ┆ Hometown         ┆ Hero            ┆ FavoriteColor ┆ WeightLbs │
│ ---          ┆ ---      ┆ ---              ┆ ---             ┆ ---           ┆ ---       │
│ str          ┆ str      ┆ str              ┆ str             ┆ str           ┆ f64       │
╞══════════════╪══════════╪══════════════════╪═════════════════╪═══════════════╪═══════════╡
│ S            ┆ Rogers   ┆ Brooklyn, NY     ┆ Captain America ┆ Blue          ┆ 200.0     │
│ P            ┆ Parker   ┆ Queens, NY       ┆ Spiderman       ┆ Blue          ┆ 160.0     │
│ S            ┆ Lang     ┆ Coral Gables, FL ┆ Ant-Man         ┆ Blue          ┆ 0.0001    │
│ N            ┆ Romanoff ┆ Stalingrad, USSR ┆ Black Widow     ┆ Black         ┆ 150.0     │
│ C            ┆ Barton   ┆ Waverly, IA      ┆ Hawkeye         ┆ Purple        ┆ 175.0     │
└──────────────┴──────────┴──────────────────┴──────────

FavoriteColor,TotalWeightLbs
str,f64
"""Blue""",360.0001
"""Black""",150.0
"""Purple""",175.0


# Window Functions
As with most SQL operations, windowing can dramatically improve your code's performance and readability. Unlike in PySpark, no additional imports are necessary, and the syntax doesn't require the extra layer of defining a window. I've found them to be the solution to a lot of headaches with compute limitations and below are some simple syntax examples of what might be useful to know. 

In [26]:
# Partitioning within the data
df_avengers_partition = (df_avengers_weights
                         .with_columns(pl.sum("WeightLbs").over(pl.col("FavoriteColor")).alias("ColorSumWeightLbs"))
                         )
print(df_avengers_partition)

shape: (5, 7)
┌──────────────┬──────────┬───────────────┬──────────────┬──────────────┬───────────┬──────────────┐
│ FirstInitial ┆ LastName ┆ Hometown      ┆ Hero         ┆ FavoriteColo ┆ WeightLbs ┆ ColorSumWeig │
│ ---          ┆ ---      ┆ ---           ┆ ---          ┆ r            ┆ ---       ┆ htLbs        │
│ str          ┆ str      ┆ str           ┆ str          ┆ ---          ┆ f64       ┆ ---          │
│              ┆          ┆               ┆              ┆ str          ┆           ┆ f64          │
╞══════════════╪══════════╪═══════════════╪══════════════╪══════════════╪═══════════╪══════════════╡
│ S            ┆ Rogers   ┆ Brooklyn, NY  ┆ Captain      ┆ Blue         ┆ 200.0     ┆ 360.0001     │
│              ┆          ┆               ┆ America      ┆              ┆           ┆              │
│ P            ┆ Parker   ┆ Queens, NY    ┆ Spiderman    ┆ Blue         ┆ 160.0     ┆ 360.0001     │
│ S            ┆ Lang     ┆ Coral Gables, ┆ Ant-Man      ┆ Blue         ┆ 0.0

In [27]:
# Ordering data
df_avengers_ordered = (
    df_avengers_filtered
    .with_columns([
        pl.col("LastName").str.to_lowercase().rank("ordinal").alias("AlphaRank"),
        pl.col("LastName").str.to_lowercase().rank("ordinal").over("FavoriteColor").alias("AlphaRankWithinColor")
    ])
)
print(df_avengers_ordered)

shape: (5, 7)
┌──────────────┬──────────┬───────────────┬──────────────┬──────────────┬───────────┬──────────────┐
│ FirstInitial ┆ LastName ┆ Hometown      ┆ Hero         ┆ FavoriteColo ┆ AlphaRank ┆ AlphaRankWit │
│ ---          ┆ ---      ┆ ---           ┆ ---          ┆ r            ┆ ---       ┆ hinColor     │
│ str          ┆ str      ┆ str           ┆ str          ┆ ---          ┆ u32       ┆ ---          │
│              ┆          ┆               ┆              ┆ str          ┆           ┆ u32          │
╞══════════════╪══════════╪═══════════════╪══════════════╪══════════════╪═══════════╪══════════════╡
│ S            ┆ Rogers   ┆ Brooklyn, NY  ┆ Captain      ┆ Blue         ┆ 4         ┆ 3            │
│              ┆          ┆               ┆ America      ┆              ┆           ┆              │
│ P            ┆ Parker   ┆ Queens, NY    ┆ Spiderman    ┆ Blue         ┆ 3         ┆ 2            │
│ S            ┆ Lang     ┆ Coral Gables, ┆ Ant-Man      ┆ Blue         ┆ 2  

In [28]:
# Leading and Lagging
df_avengers_lag_lead = (
    df_avengers_weights
    .sort("WeightLbs")  # Sort the entire DataFrame based on WeightLbs
    .with_columns([
        # LagWeight: the next highest value
        pl.col("WeightLbs").shift(-1).alias("LagWeight"),

        # LeadWeight: the next lowest value
        pl.col("WeightLbs").shift(1).alias("LeadWeight")
    ])
)

print(df_avengers_lag_lead)


shape: (5, 8)
┌────────────┬──────────┬────────────┬────────────┬────────────┬───────────┬───────────┬───────────┐
│ FirstIniti ┆ LastName ┆ Hometown   ┆ Hero       ┆ FavoriteCo ┆ WeightLbs ┆ LagWeight ┆ LeadWeigh │
│ al         ┆ ---      ┆ ---        ┆ ---        ┆ lor        ┆ ---       ┆ ---       ┆ t         │
│ ---        ┆ str      ┆ str        ┆ str        ┆ ---        ┆ f64       ┆ f64       ┆ ---       │
│ str        ┆          ┆            ┆            ┆ str        ┆           ┆           ┆ f64       │
╞════════════╪══════════╪════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╡
│ S          ┆ Lang     ┆ Coral      ┆ Ant-Man    ┆ Blue       ┆ 0.0001    ┆ 150.0     ┆ null      │
│            ┆          ┆ Gables, FL ┆            ┆            ┆           ┆           ┆           │
│ N          ┆ Romanoff ┆ Stalingrad ┆ Black      ┆ Black      ┆ 150.0     ┆ 160.0     ┆ 0.0001    │
│            ┆          ┆ , USSR     ┆ Widow      ┆            ┆           ┆ 