# Polars Cheat Sheet

The following Notebook contains detailed Polars examples to help you get started using Polars


#### Things to be added at a later date
- Melt and Pivot

In [3]:
# Import polars
import polars as pl

# Import polars functions
from polars import (
    col, # Allows us to call on columns within a dataframe
    lit, # Allows us to assign static values to columns
    coalesce,
    when
    )

# Import pandas (used for example)
import pandas as pd

### Read in Data
- Show how to change dataframe to pandas and from pandas

In [4]:
# Read in a csv using polars
df = pl.read_csv('GOT_data.csv') # Fake GOT data generated by OpenAI

# Convert polars to pandas (needed as a bridge to use many useful libraries)
df_pandas = df.to_pandas()

# Convert pandas df back to polars
df_polars = pl.from_pandas(df_pandas)

### Exploring Data

In [5]:
# Show the dataframe
display(df)

# Show the number of rows and columns in the dataframe
display(df.shape)

# Show the first n rows of the dataframe
display(df.head(2))

# Show the last n rows of the dataframe
display(df.tail(2))

# Show the columns of the dataframe
display(df.columns)

# Show a quantitative overview of the dataframe
display(df.describe())

# Show the unique values of a column
display(df.select('BirthPlace').unique())

# Show the unique values of a column and the count of each
display(df['BirthPlace'].value_counts())

# Get the number of rows (height)
df.height

# Get the number of columns (width)
df.width

FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64
"""Eddard""","""Stark""","""06/13/263""","""WINTERFELL""","""Winterfell""",5000,800
"""CATELYN""","""TULLY""","""04/03/264""","""Riverrun""","""Riverrun""",4500,600
"""Robb""","""Stark""","""6-04-283""","""WINTERFELL""","""WINTERFELL""",3500,500
"""Sansa""","""STARK""","""Jun 27, 286""","""Winterfell""","""Winterfell""",3000,400
"""Arya""","""Stark""","""Dec 05, 286""","""Winterfell""","""Winterfell""",2500,0
…,…,…,…,…,…,…
"""EDMURE""","""TULLY""","""12/27/267""","""Riverrun""","""Riverrun""",4000,600
"""BRYNDEN""","""Tully""","""Mar 28, 250""","""Riverrun""","""Riverrun""",4500,400
"""Lancel""","""Lannister""","""23 Jul 271""","""King's landing""","""Casterly rock""",3000,500
"""Kevan""","""Lannister""","""11-01-245""","""Casterly rock""","""Casterly rock""",10000,1200


(60, 7)

FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64
"""Eddard""","""Stark""","""06/13/263""","""WINTERFELL""","""Winterfell""",5000,800
"""CATELYN""","""TULLY""","""04/03/264""","""Riverrun""","""Riverrun""",4500,600


FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64
"""Kevan""","""Lannister""","""11-01-245""","""Casterly rock""","""Casterly rock""",10000,1200
"""TOM""","""Of sevens""","""07/27/260""","""The riverlands""","""Unknown""",1500,100


['FirstName',
 'LastName',
 'DateOfBirth',
 'Residence',
 'BirthPlace',
 'NetWorth',
 'TotalDebts']

statistic,FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,str,f64,f64
"""count""","""60""","""50""","""60""","""60""","""60""",60.0,60.0
"""null_count""","""0""","""10""","""0""","""0""","""0""",0.0,0.0
"""mean""",,,,,,4621.666667,720.0
"""std""",,,,,,3835.078576,869.599557
"""min""","""Alliser""","""Aemon""","""01 Dec 260""","""BRAAVOS""","""Asshai""",500.0,0.0
"""25%""",,,,,,2000.0,100.0
"""50%""",,,,,,4000.0,500.0
"""75%""",,,,,,6000.0,1000.0
"""max""","""YGRITTE""","""Zo loraq""","""Sep 23, 270""","""Winterfell""","""Winterfell""",20000.0,5000.0


BirthPlace
str
"""Greywater watch"""
"""HIGHGARDEN"""
"""Casterly rock"""
"""LILAC'S REST"""
"""Kings landing"""
…
"""Dragonstone"""
"""Winterfell"""
"""MEEREEN"""
"""Bear island"""


BirthPlace,count
str,u32
"""EVENFALL HALL""",1
"""LILAC'S REST""",1
"""Beyond the wall""",1
"""WINTERFELL""",2
"""THE WESTERLANDS""",1
…,…
"""Unknown""",1
"""Flea bottom""",1
"""Horn hill""",1
"""CRASTER'S KEEP""",1


7

### Sorting

In [6]:
# Sort by one column
df_sort1 = df.sort('NetWorth')

#Sort by two columns
df_sort2 = df.sort('NetWorth', 'FirstName')

# Sort by two columns and dictate how the columns are ordered
df_sort3 = df.sort(['NetWorth', 'FirstName'], descending=[True, False])
display(df_sort3.head(2))

FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64
"""TYWIN""","""Lannister""","""Jan 07, 243""","""Casterly rock""","""Casterly rock""",20000,1000
"""Cersei""","""LANNISTER""","""Mar 17, 266""","""King's landing""","""KING'S LANDING""",15000,2000


### Filtering

In [7]:
# Filter for a specific value in a specific column
df_filter1 = df.filter(col('BirthPlace') == 'Casterly Rock')

# Filter for where a specific value is not in a specific column
df_filter2 = df.filter(col('BirthPlace') != 'Casterly Rock')  

# Filter for where a column is greater than a specific value
df_filter3 = df.filter(col('NetWorth') > 4000)

# Filter for where a column is less than a specific value
df_filter4 = df.filter(col('TotalDebts') < 1000)

# Filter for where a column is less than or equal to a specific value
df_filter5 = df.filter(col('TotalDebts') <= 200)

# Filter for multiple conditions
df_filter6 = df.filter((col('BirthPlace') == 'Winterfell') & (col('NetWorth') >= 3000))

# Filter for where LastName is null
df_filter7 = df.filter(col('LastName').is_null())

# Filter for where LastName is not null
df_filter8 = df.filter(col('LastName').is_not_null()) 

# Filter for where a column contains a certain value
df_filter9 = df.filter(col('BirthPlace').str.contains('terfe')) # Case sensitive

# Filter for where a column starts with a certain value
df_filter10 = df.filter(col('BirthPlace').str.starts_with('W')) # Case sensitive

# Filter for where a column ends with a certain value
df_filter11 = df.filter(col('BirthPlace').str.ends_with('k')) # Case sensitive

# Filter for where a column value is in a list
df_filter12 = df.filter(col('BirthPlace').is_in(['Casterly Rock', 'Winterfell']))

# Filter for where a column value is not in a list
df_filter13 = df.filter(~col('BirthPlace').is_in(['Casterly Rock', 'Winterfell'])) # Negation (~) can be used with other filters


### Joining

In [8]:
df_sigils = pl.read_csv('GOT_sigils.csv')

# Left Join - Keep all records from the df on left (df) adding the new columns in df_sigils merging on "Residence" where possible
df_combined_left = df.join(df_sigils, on='Residence', how='left')

# Inner Join - Keep all records where a Residence Value has a match (drops those that would be nulls in the new columns)
df_combined_inner = df.join(df_sigils, on='Residence', how='inner') 

### Appending

In [9]:
# Filter into two different dataframes
df_part1 = df.filter(col('NetWorth') >= 2000)
df_part2 = df.filter(col('NetWorth') < 2000)

# Append the dataframes together
df_parts_combined = pl.concat([df_part1, df_part2])

### Working with Columns

In [10]:
# Rename columns
df_col_example = df.rename({'NetWorth':'AccountBalance',
                            'TotalDebts':'Debt'})

# Drop Columns
df_col_example = df_col_example.drop(['DateOfBirth', 'BirthPlace'])

# Create new column
df_col_example = df_col_example.with_columns(
    (col('AccountBalance') - col('Debt')).alias('NetWorth_New')
)

# Create new column with a static value 
df_col_example = df_col_example.with_columns(
    lit('Game of Thrones').alias('Fandom')
)

# Select a subset of columns, reordering columns and renaming in the process 
df_col_example = df_col_example.select('NetWorth_New',
                                        col('FirstName').alias('First Name'),
                                        col('LastName').alias('Last Name'),
                                        'Fandom')

# Select a subset of columns and sort by a column value
df_col_example = df_col_example.select('NetWorth_New', 'First Name', 'Last Name', 'Fandom').sort('NetWorth_New')

display(df_col_example.head(2))

NetWorth_New,First Name,Last Name,Fandom
i64,str,str,str
500,"""YGRITTE""",,"""Game of Thrones"""
500,"""Gilly""",,"""Game of Thrones"""


### Handling Nulls

In [11]:
# Drop rows with any nulls throughout the dataframe
df_dropped = df.drop_nulls()

# Drop rows with any nulls in certain columns
df_dropped2 = df.drop_nulls(subset=['FirstName', 'LastName'])

# Fill nulls with certain values
df_fill_null = df.with_columns(
    col('LastName').fill_null('Unknown')
)

### Group Bys

In [12]:
# Get Total Net Worth by Residence
df_grouped1 = df.group_by('Residence').agg(
    col('NetWorth').sum().alias('Total Wealth')
)

# Get Total Debt and Count of individuals by Residence
df_grouped2 = df.group_by('Residence').agg(
    col('TotalDebts').sum().alias('Total Debt'),
    col('FirstName').count().alias('Total People')
)

# Working with the above, calculate debt per capita and sort by it
df_grouped2 = df_grouped2.with_columns((col('Total Debt') / col('Total People')).alias('Debt per Capita')).sort('Debt per Capita')
display(df_grouped2.head(5))

Residence,Total Debt,Total People,Debt per Capita
str,i64,u32,f64
"""BRAAVOS""",0,1,0.0
"""The wall""",600,6,100.0
"""Greywater watch""",100,1,100.0
"""Winterfell""",700,3,233.333333
"""The north""",500,2,250.0


### Conditional Formatting

In [13]:
# Fix the incorrect naming of King's Landing using conditional formatting
df_conditional = df.with_columns([
    when(col('Residence') == "King'S Landing")
        .then(lit("King's Landing"))
        .otherwise(col('Residence'))
        .alias('Residence'),

    when(col('BirthPlace') == "King'S Landing")
        .then(lit("King's Landing"))
        .otherwise(col('BirthPlace'))
        .alias('BirthPlace')
])

display(df_conditional.filter(col('Residence') == "King's Landing").sort(['TotalDebts'], descending=[True]).head(3))


FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64


### Manipulating Strings

In [14]:
# Set a column of strings to Uppercase
df_upper = df.with_columns([
    col('FirstName').str.to_uppercase().alias('FirstName')
])

# Set a column of strings to Lowercase
df_lower = df.with_columns([
    col('FirstName').str.to_lowercase().alias('FirstName')
])

# Set a column of strings to titlecase
df_title = df.with_columns([
    col('FirstName').str.to_titlecase().alias('FirstName')
])

# Remove leading/trailing whitespace
df_strip1 = df.with_columns([
    col('FirstName').str.strip_chars()
])

# Remove specified characters (e.g., spaces, dots)
df_strip2 = df.with_columns([
    col('FirstName').str.strip_chars(" .")
])

# Replace values
df_replace = df.with_columns([
    col('Residence').str.replace_all("Winterfell", "A cold city")
])

### Handling Duplicates

In [15]:
# Drop duplicate rows
df_unique = df.unique()

# Drop where certain subsets are duplicate
df_unique2 = df.unique(subset=['FirstName', 'LastName'])

# Drop where certain subsets are duplicate choosing which to keep
df_unique3 = df.unique(subset=['FirstName', 'LastName'], keep='last') # default is to keep first

### Using SQL in Polars

In [16]:
# Set up  SQL Context
ctx = pl.SQLContext()
ctx.register("GOT_Data", df)

# Run SQL query
query_example = ctx.execute('''SELECT * FROM GOT_Data
                               WHERE TotalDebts > 1500
                               ORDER BY BirthPlace asc;
                            ''')

df_new = query_example.collect()

display(df_new)

FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64
"""DAENERYS""","""TARGARYEN""","""Jan 14, 284""","""Meereen""","""Dragonstone""",8000,2000
"""VARYS""",,"""07/09/260""","""KING'S LANDING""","""ESSOS""",8000,2000
"""Cersei""","""LANNISTER""","""Mar 17, 266""","""King's landing""","""KING'S LANDING""",15000,2000
"""JAIME""","""LANNISTER""","""08/06/266""","""King's landing""","""KING'S LANDING""",15000,2000
"""Petyr""","""BAELISH""","""Jan 06, 266""","""King's landing""","""LILAC'S REST""",9000,5000
"""Hizdahr""","""Zo loraq""","""07/17/275""","""MEEREEN""","""MEEREEN""",4000,2000
"""Euron""","""Greyjoy""","""11/05/270""","""PYKE""","""Pyke""",6000,3000


## Sampling

In [17]:
# Sample 2 rows without replacement
df.sample(n=2)

# Sample 50% of rows, with replacement
df.sample(fraction=0.5, with_replacement=True)

# Shuffle all rows
df.sample(fraction=1.0)

# Reproducible sample
df.sample(n=3, seed=123)

FirstName,LastName,DateOfBirth,Residence,BirthPlace,NetWorth,TotalDebts
str,str,str,str,str,i64,i64
"""BRYNDEN""","""Tully""","""Mar 28, 250""","""Riverrun""","""Riverrun""",4500,400
"""Gendry""",,"""3-05-284""","""King's landing""","""Kings landing""",2500,300
"""Margaery""","""Tyrell""","""9-11-283""","""KING'S LANDING""","""HIGHGARDEN""",6000,800


## Casting Datatypes

In [18]:
# Cast column as Int
df.with_columns(
    col("NetWorth").cast(pl.Int64)
)

# Cast multiple columns at once
df_casted = df.with_columns([
    pl.col("NetWorth").cast(pl.Int64),
    pl.col("TotalDebts").cast(pl.Float64)
])

## Other things requiring another DF

In [19]:
# Example DataFrame for regex + integers + rounding
df2 = pl.DataFrame({
    "Name": ["Anna123", "Bob_456", "Charlie789", "David_001"],
    "Score": [95.67, 88.23, 74.999, 100.555],
    "Count": [12345, 67890, 23456, 98765]
})

## Regex

In [20]:
# Extract digits from Name
df_regex = df2.with_columns(
    col("Name").str.extract(r"(\d+)", 1).alias("Digits")
)

# Replace underscores with hyphens
df_regex = df_regex.with_columns(
    col("Name").str.replace(r"_", "-").alias("Name_Replaced")
)

# Check if Name contains numbers
df_regex = df_regex.with_columns(
    col("Name").str.contains(r"\d").alias("HasNumber")
)

print(df_regex)

shape: (4, 6)
┌────────────┬─────────┬───────┬────────┬───────────────┬───────────┐
│ Name       ┆ Score   ┆ Count ┆ Digits ┆ Name_Replaced ┆ HasNumber │
│ ---        ┆ ---     ┆ ---   ┆ ---    ┆ ---           ┆ ---       │
│ str        ┆ f64     ┆ i64   ┆ str    ┆ str           ┆ bool      │
╞════════════╪═════════╪═══════╪════════╪═══════════════╪═══════════╡
│ Anna123    ┆ 95.67   ┆ 12345 ┆ 123    ┆ Anna123       ┆ true      │
│ Bob_456    ┆ 88.23   ┆ 67890 ┆ 456    ┆ Bob-456       ┆ true      │
│ Charlie789 ┆ 74.999  ┆ 23456 ┆ 789    ┆ Charlie789    ┆ true      │
│ David_001  ┆ 100.555 ┆ 98765 ┆ 001    ┆ David-001     ┆ true      │
└────────────┴─────────┴───────┴────────┴───────────────┴───────────┘


## Rounding

In [21]:
# Round Score to nearest integer
df_round = df2.with_columns(
    col("Score").round(0).alias("Score_Rounded")
)

# Round Score to 1 decimal place
df_round = df_round.with_columns(
    col("Score").round(1).alias("Score_Rounded_1dp")
)

# Floor & Ceil
df_round = df_round.with_columns([
    col("Score").floor().alias("Score_Floor"),
    col("Score").ceil().alias("Score_Ceil")
])

print(df_round)


shape: (4, 7)
┌────────────┬─────────┬───────┬───────────────┬───────────────────┬─────────────┬────────────┐
│ Name       ┆ Score   ┆ Count ┆ Score_Rounded ┆ Score_Rounded_1dp ┆ Score_Floor ┆ Score_Ceil │
│ ---        ┆ ---     ┆ ---   ┆ ---           ┆ ---               ┆ ---         ┆ ---        │
│ str        ┆ f64     ┆ i64   ┆ f64           ┆ f64               ┆ f64         ┆ f64        │
╞════════════╪═════════╪═══════╪═══════════════╪═══════════════════╪═════════════╪════════════╡
│ Anna123    ┆ 95.67   ┆ 12345 ┆ 96.0          ┆ 95.7              ┆ 95.0        ┆ 96.0       │
│ Bob_456    ┆ 88.23   ┆ 67890 ┆ 88.0          ┆ 88.2              ┆ 88.0        ┆ 89.0       │
│ Charlie789 ┆ 74.999  ┆ 23456 ┆ 75.0          ┆ 75.0              ┆ 74.0        ┆ 75.0       │
│ David_001  ┆ 100.555 ┆ 98765 ┆ 101.0         ┆ 100.6             ┆ 100.0       ┆ 101.0      │
└────────────┴─────────┴───────┴───────────────┴───────────────────┴─────────────┴────────────┘


  df_melted = df_round.melt(


Name,Score,Count
str,str,f64
"""Anna123""","""Score""",95.67
"""Bob_456""","""Score""",88.23
"""Charlie789""","""Score""",74.999
"""David_001""","""Score""",100.555
"""Anna123""","""Count""",12345.0
…,…,…
"""David_001""","""Score_Floor""",100.0
"""Anna123""","""Score_Ceil""",96.0
"""Bob_456""","""Score_Ceil""",89.0
"""Charlie789""","""Score_Ceil""",75.0
