# Replacing missing values

In [1]:
import polars as pl

In [2]:
# Demo frame: two integer columns and one string column, each holding
# nulls in different rows so the fill results are easy to compare.
sample_columns = {
    "col1": [0, None, 2, 3],
    "col2": [0, None, None, 3],
    "strings": ["a", None, "c", "d"],
}
df = pl.DataFrame(sample_columns)
df

col1,col2,strings
i64,i64,str
0.0,0.0,"""a"""
,,
2.0,,"""c"""
3.0,3.0,"""d"""


## Replace missing values with a constant

Using `fill_null`

In [6]:
# Fill every null in every column with the constant 0. The "_new" suffix
# adds the filled copies as extra columns instead of overwriting the originals.
zero_filled = pl.all().fill_null(value=0).name.suffix("_new")
df.with_columns(zero_filled)

col1,col2,strings,col1_new,col2_new,strings_new
i64,i64,str,i64,i64,str
0.0,0.0,"""a""",0,0,"""a"""
,,,0,0,"""0"""
2.0,,"""c""",2,0,"""c"""
3.0,3.0,"""d""",3,3,"""d"""


In [7]:
# Fill every null in every column with the string constant "missing",
# adding the results as "_new" columns next to the originals.
text_filled = pl.all().fill_null(value="missing").name.suffix("_new")
df.with_columns(text_filled)

col1,col2,strings,col1_new,col2_new,strings_new
i64,i64,str,str,str,str
0.0,0.0,"""a""","""0""","""0""","""a"""
,,,"""missing""","""missing""","""missing"""
2.0,,"""c""","""2""","""missing""","""c"""
3.0,3.0,"""d""","""3""","""3""","""d"""


Note that filling with a string made `fill_null` cast the integer columns to the string dtype: `col1_new` and `col2_new` are now `str` rather than `i64`!

## Replace missing values with a strategy
We can also replace missing values using one of the following strategies:
- forward: replace with the previous non-`null` value
- backward: replace with the next non-`null` value
- min: replace with the smallest value in the `Series`
- max: replace with the largest value in the `Series`
- mean: replace with the mean value in the `Series`
- zero: replace with `0`
- one: replace with `1`

### Forward strategy
In the forward strategy, the missing values are replaced with the previous non-`null` values

In [8]:
# Forward fill: each null is replaced by the previous non-null value
# in the same column; results are added as "_new" columns.
forward_filled = pl.all().fill_null(strategy="forward").name.suffix("_new")
df.with_columns(forward_filled)

col1,col2,strings,col1_new,col2_new,strings_new
i64,i64,str,i64,i64,str
0.0,0.0,"""a""",0,0,"""a"""
,,,0,0,"""a"""
2.0,,"""c""",2,0,"""c"""
3.0,3.0,"""d""",3,3,"""d"""


We can cap how many consecutive `null` values are filled forward or backward with the `limit` argument.

In [10]:
# Forward fill, but with limit=1 only the first null after a valid value
# is filled; a second consecutive null is left as null.
forward_fill_once = (
    pl.all()
    .fill_null(strategy="forward", limit=1)
    .name.suffix("_new")
)
df.with_columns(forward_fill_once)

col1,col2,strings,col1_new,col2_new,strings_new
i64,i64,str,i64,i64,str
0.0,0.0,"""a""",0,0.0,"""a"""
,,,0,0.0,"""a"""
2.0,,"""c""",2,,"""c"""
3.0,3.0,"""d""",3,3.0,"""d"""


## Replacing missing values by group

In [11]:
# Rebind `df` to a frame with a grouping key ("group") and a value
# column ("col1") that has one null in each group.
grouped_columns = {
    "group": ["A", "B", "A", "B", "A", "B"],
    "col1": [0, 1, None, 1, 2, None],
}
df = pl.DataFrame(grouped_columns)
df

group,col1
str,i64
"""A""",0.0
"""B""",1.0
"""A""",
"""B""",1.0
"""A""",2.0
"""B""",


To fill missing values within each group, use a *window expression* with `over`.

This is the equivalent of a *window function* in SQL.

In [12]:
# Forward fill evaluated separately within each "group" partition, so a
# null is only filled from an earlier row of the same group.
grouped_forward_fill = (
    pl.all()
    .fill_null(strategy="forward")
    .over("group")
    .name.suffix("_new")
)
df.with_columns(grouped_forward_fill)

group,col1,group_new,col1_new
str,i64,str,i64
"""A""",0.0,"""A""",0
"""B""",1.0,"""B""",1
"""A""",,"""A""",0
"""B""",1.0,"""B""",1
"""A""",2.0,"""A""",2
"""B""",,"""B""",1


## Exercises

### Exercise 1
Filter the `DataFrame` to keep only the two rows with missing values in the `Embarked` column, and then replace those missing values with the string `"unknown"`.

In [13]:
# Path to the Titanic CSV read by the exercises below (relative to the working directory).
csv_file = "data/titanic.csv"

In [15]:
# Rebind `df` to the Titanic data and preview its first five rows.
df = pl.read_csv(source=csv_file)
df.head(n=5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [17]:
# Keep only the rows where "Embarked" is null, then replace those nulls
# with the string "unknown".
embarked = pl.col("Embarked")
rows_without_port = df.filter(embarked.is_null())
rows_without_port.with_columns(embarked.fill_null("unknown"))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
62,1,1,"""Icard, Miss. Amelie""","""female""",38.0,0,0,"""113572""",80.0,"""B28""","""unknown"""
830,1,1,"""Stone, Mrs. George Nelson (Mar…","""female""",62.0,0,0,"""113572""",80.0,"""B28""","""unknown"""


### Exercise 2
Add a new column called `Age_filled` where missing values are replaced with the value from the following row.

In [18]:
# Backward fill: each null "Age" takes the next non-null value below it.
# Show the original and filled columns side by side.
age_backfilled = (
    pl.col("Age")
    .fill_null(strategy="backward")
    .alias("Age_filled")
)
df.with_columns(age_backfilled).select(["Age", "Age_filled"])

Age,Age_filled
f64,f64
22.0,22.0
38.0,38.0
26.0,26.0
35.0,35.0
35.0,35.0
…,…
27.0,27.0
19.0,19.0
,26.0
26.0,26.0


Do the same, but this time fill from the following row within the same passenger class (`Pclass`).

In [20]:
# Backward fill of "Age" evaluated separately within each "Pclass"
# partition, so a null is filled from a later row of the same class.
age_backfilled_by_class = (
    pl.col("Age")
    .fill_null(strategy="backward")
    .over("Pclass")
    .alias("Age_filled")
)
df.with_columns(age_backfilled_by_class).select(["Pclass", "Age", "Age_filled"])

Pclass,Age,Age_filled
i64,f64,f64
3,22.0,22.0
1,38.0,38.0
3,26.0,26.0
1,35.0,35.0
3,35.0,35.0
…,…,…
2,27.0,27.0
1,19.0,19.0
3,,32.0
1,26.0,26.0


Add three new columns called `Age_mean`, `Age_median` and `Age_interpolated`, where missing values are replaced with:
- the mean
- the median
- interpolated values

In [21]:
# Three ways of filling the nulls in "Age". The fills are computed on the
# full column first; the final filter then shows only the rows where the
# original "Age" was null, so the three replacements can be compared.
age = pl.col("Age")
age_fills = [
    age.fill_null(strategy="mean").alias("Age_mean"),
    age.fill_null(age.median()).alias("Age_median"),
    age.interpolate().alias("Age_interpolated"),
]
(
    df.with_columns(age_fills)
    .select(["Age", "Age_mean", "Age_median", "Age_interpolated"])
    .filter(age.is_null())
)

Age,Age_mean,Age_median,Age_interpolated
f64,f64,f64,f64
,29.699118,28.0,44.5
,29.699118,28.0,16.5
,29.699118,28.0,33.0
,29.699118,28.0,28.5
,29.699118,28.0,26.0
…,…,…,…
,29.699118,28.0,32.5
,29.699118,28.0,36.0
,29.699118,28.0,17.5
,29.699118,28.0,37.5
