# Polars Primer Notebook
### By Kevin Chamberlin
__Updated April 2025__

### Installing codespace requirements 
Depending on your development environment, this step may not be necessary.

In [1]:
%pip install polars --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import polars as pl

### Initializing DataFrames
In practice, a DataFrame is more commonly initialized by reading data from a data source.

In [5]:
# Example of manually creating a DataFrame using Polars code.
# Each inner list is one row (one Avenger); the column names are supplied separately.
l_avengers_data = [
    [1, "Steve", "Rogers", "Brooklyn, NY", "Blue"],
    [2, "Tony", "Stark", "Manahattan, NY", "Gold"],
    [3, "Peter", "Parker", "Queens, NY", "Blue"],
    [4, "Scott", "Lang", "Coral Gables, FL", "Blue"],
    [5, "Natasha", "Romanoff", "Stalingrad, USSR", "Black"],
    [6, "Clint", "Barton", "Waverly, IA", "Purple"],
]

l_avengers_col_names = ["ID", "FirstName", "LastName", "Hometown", "Favorite Color"]

# orient="row" states explicitly that every inner list is a row. Without it Polars has to
# guess the orientation from the data/schema dimensions and emits a DataOrientationWarning.
df_avengers = pl.DataFrame(l_avengers_data, schema=l_avengers_col_names, orient="row")

# Display the DataFrame
print(df_avengers)

shape: (6, 5)
┌─────┬───────────┬──────────┬──────────────────┬────────────────┐
│ ID  ┆ FirstName ┆ LastName ┆ Hometown         ┆ Favorite Color │
│ --- ┆ ---       ┆ ---      ┆ ---              ┆ ---            │
│ i64 ┆ str       ┆ str      ┆ str              ┆ str            │
╞═════╪═══════════╪══════════╪══════════════════╪════════════════╡
│ 1   ┆ Steve     ┆ Rogers   ┆ Brooklyn, NY     ┆ Blue           │
│ 2   ┆ Tony      ┆ Stark    ┆ Manahattan, NY   ┆ Gold           │
│ 3   ┆ Peter     ┆ Parker   ┆ Queens, NY       ┆ Blue           │
│ 4   ┆ Scott     ┆ Lang     ┆ Coral Gables, FL ┆ Blue           │
│ 5   ┆ Natasha   ┆ Romanoff ┆ Stalingrad, USSR ┆ Black          │
│ 6   ┆ Clint     ┆ Barton   ┆ Waverly, IA      ┆ Purple         │
└─────┴───────────┴──────────┴──────────────────┴────────────────┘


  df_avengers = pl.DataFrame(l_avengers_data, schema=l_avengers_col_names)


### Dataframe Manipulations
This section covers getting your hands on the data and making changes to it. Think of these operations like manipulating data in SQL, or like the SQL functions found in PySpark if you're familiar with those. In the Polars documentation, these are often referred to as _Expressions_, and they are built from functions exposed at the top level of the Polars package (for example `pl.col` and `pl.lit`). This means you can access them with the `pl.` prefix if you've imported the Polars package under that alias, as I have at the top of this notebook. Expression functions and methods in Polars generally follow the snake_case naming convention.

In [None]:
# Derive a FullName column ("<FirstName> <LastName>"), then keep only ID and FullName
df_avengers_names = (
    df_avengers
    .with_columns(
        (pl.col("FirstName") + pl.lit(" ") + pl.col("LastName")).alias("FullName")
    )
    .select("ID", "FullName")
)

# Ending the cell on an expression (instead of print) renders the first rows
# with the notebook's rich table display
df_avengers_names.head()

ID,FullName
i64,str
1,"""Steve Rogers"""
2,"""Tony Stark"""
3,"""Peter Parker"""
4,"""Scott Lang"""
5,"""Natasha Romanoff"""


In [14]:
# Create a new dataframe completely from scratch.
# Each inner list is one row: [ID, Hero].
l_hero_data = [
    [1, "Captain America"],
    [2, "Iron Man"],
    [3, "Spiderman"],
    [4, "Ant-Man"],
    [5, "Black Widow"],
    [6, "Hawkeye"],
]

l_hero_col_names = ["ID", "Hero"]

# orient="row" makes the row-wise layout explicit, so Polars neither has to infer it
# from the data/schema dimensions nor emit a DataOrientationWarning.
df_avengers_heroes = pl.DataFrame(l_hero_data, schema=l_hero_col_names, orient="row")


# Create a new dataframe that matches the columns of an existing dataframe
l_new_avenger_data = [[7, "Wanda", "Maximoff", "Sokovia", "Scarlet"]]

# Metadata of an existing DataFrame (here: its column names) can be reused directly
l_new_avenger_col_names = df_avengers.columns

df_avengers_new = pl.DataFrame(l_new_avenger_data, schema=l_new_avenger_col_names, orient="row")

# Display both new dataframes
print(df_avengers_heroes)

print(df_avengers_new)

shape: (6, 2)
┌─────┬─────────────────┐
│ ID  ┆ Hero            │
│ --- ┆ ---             │
│ i64 ┆ str             │
╞═════╪═════════════════╡
│ 1   ┆ Captain America │
│ 2   ┆ Iron Man        │
│ 3   ┆ Spiderman       │
│ 4   ┆ Ant-Man         │
│ 5   ┆ Black Widow     │
│ 6   ┆ Hawkeye         │
└─────┴─────────────────┘
shape: (1, 5)
┌─────┬───────────┬──────────┬──────────┬────────────────┐
│ ID  ┆ FirstName ┆ LastName ┆ Hometown ┆ Favorite Color │
│ --- ┆ ---       ┆ ---      ┆ ---      ┆ ---            │
│ i64 ┆ str       ┆ str      ┆ str      ┆ str            │
╞═════╪═══════════╪══════════╪══════════╪════════════════╡
│ 7   ┆ Wanda     ┆ Maximoff ┆ Sokovia  ┆ Scarlet        │
└─────┴───────────┴──────────┴──────────┴────────────────┘


  df_avengers_heroes = pl.DataFrame(l_hero_data, schema=l_hero_col_names)
  df_avengers_new = pl.DataFrame(l_new_avenger_data, schema=l_new_avenger_col_names)


In [16]:
# Two ways of combining data, chained into a single expression
df_avengers_expanded = (
    # 1) Union (concatenation in Polars): stack the new row(s) under the existing ones
    pl.concat([df_avengers, df_avengers_new], how="vertical")
    # 2) Join: attach the hero name by ID; a left join keeps rows that have no match
    .join(df_avengers_heroes, on="ID", how="left")
)

# NOTE (carried over from PySpark): if a table you're joining with is relatively small,
# a "broadcast join" may improve processing time on a distributed engine

print(df_avengers_expanded)

shape: (7, 6)
┌─────┬───────────┬──────────┬──────────────────┬────────────────┬─────────────────┐
│ ID  ┆ FirstName ┆ LastName ┆ Hometown         ┆ Favorite Color ┆ Hero            │
│ --- ┆ ---       ┆ ---      ┆ ---              ┆ ---            ┆ ---             │
│ i64 ┆ str       ┆ str      ┆ str              ┆ str            ┆ str             │
╞═════╪═══════════╪══════════╪══════════════════╪════════════════╪═════════════════╡
│ 1   ┆ Steve     ┆ Rogers   ┆ Brooklyn, NY     ┆ Blue           ┆ Captain America │
│ 2   ┆ Tony      ┆ Stark    ┆ Manahattan, NY   ┆ Gold           ┆ Iron Man        │
│ 3   ┆ Peter     ┆ Parker   ┆ Queens, NY       ┆ Blue           ┆ Spiderman       │
│ 4   ┆ Scott     ┆ Lang     ┆ Coral Gables, FL ┆ Blue           ┆ Ant-Man         │
│ 5   ┆ Natasha   ┆ Romanoff ┆ Stalingrad, USSR ┆ Black          ┆ Black Widow     │
│ 6   ┆ Clint     ┆ Barton   ┆ Waverly, IA      ┆ Purple         ┆ Hawkeye         │
│ 7   ┆ Wanda     ┆ Maximoff ┆ Sokovia          ┆ S

In [19]:
# Filtering rows and reshaping columns in a single chain
df_avengers_filtered = (
    df_avengers_expanded
    # Several predicates passed to one filter() are AND-ed together:
    # keep rows that have a hero name and whose favorite color is not "Gold"
    .filter(
        pl.col("Hero").is_not_null(),
        pl.col("Favorite Color") != "Gold",
    )
    # The first character of FirstName becomes the new FirstInitial column
    .with_columns(pl.col("FirstName").str.slice(0, 1).alias("FirstInitial"))
    .drop("FirstName")
    # Choose the output columns and rename "Favorite Color" to "FavoriteColor"
    .select(
        "FirstInitial",
        "LastName",
        "Hometown",
        "Hero",
        pl.col("Favorite Color").alias("FavoriteColor"),
    )
)

print(df_avengers_filtered)

shape: (5, 5)
┌──────────────┬──────────┬──────────────────┬─────────────────┬───────────────┐
│ FirstInitial ┆ LastName ┆ Hometown         ┆ Hero            ┆ FavoriteColor │
│ ---          ┆ ---      ┆ ---              ┆ ---             ┆ ---           │
│ str          ┆ str      ┆ str              ┆ str             ┆ str           │
╞══════════════╪══════════╪══════════════════╪═════════════════╪═══════════════╡
│ S            ┆ Rogers   ┆ Brooklyn, NY     ┆ Captain America ┆ Blue          │
│ P            ┆ Parker   ┆ Queens, NY       ┆ Spiderman       ┆ Blue          │
│ S            ┆ Lang     ┆ Coral Gables, FL ┆ Ant-Man         ┆ Blue          │
│ N            ┆ Romanoff ┆ Stalingrad, USSR ┆ Black Widow     ┆ Black         │
│ C            ┆ Barton   ┆ Waverly, IA      ┆ Hawkeye         ┆ Purple        │
└──────────────┴──────────┴──────────────────┴─────────────────┴───────────────┘
