# Replacing missing values with expressions

In [1]:
import polars as pl

In [2]:
# Example frame: col1 and col2 contain nulls, col3 is fully populated.
df = pl.DataFrame(
    {
        "col1": [0, None, 2, 3],
        "col2": [0, None, None, 3],
        "col3": [4, 5, 6, 7],
    }
)
df

col1,col2,col3
i64,i64,i64
0.0,0.0,4
,,5
2.0,,6
3.0,3.0,7


## Replace missing values using an expression from the `same` column

In [3]:
# Fill the nulls in col1 with col1's own median, keeping the original column
# and adding the filled version as col1_new.
col1_median_filled = pl.col("col1").fill_null(pl.col("col1").median())
df.with_columns(col1_median_filled.name.suffix("_new"))

col1,col2,col3,col1_new
i64,i64,i64,f64
0.0,0.0,4,0.0
,,5,2.0
2.0,,6,2.0
3.0,3.0,7,3.0


### Interpolation
We can also replace missing values using linear interpolation between the neighbouring non-`null` values.

In [5]:
# Interpolate every column and add the results next to the originals
# with a "_new" suffix.
interpolated = pl.all().interpolate()
df.with_columns(interpolated.name.suffix("_new"))

col1,col2,col3,col1_new,col2_new,col3_new
i64,i64,i64,f64,f64,f64
0.0,0.0,4,0.0,0.0,4.0
,,5,1.0,1.0,5.0
2.0,,6,2.0,2.0,6.0
3.0,3.0,7,3.0,3.0,7.0


### Replace missing values using a `different` column

In [6]:
# Where col2 is null, take the value from col3 on the same row.
col2_from_col3 = pl.col("col2").fill_null(pl.col("col3"))
df.with_columns(col2_from_col3.name.suffix("_new"))

col1,col2,col3,col2_new
i64,i64,i64,i64
0.0,0.0,4,0
,,5,5
2.0,,6,6
3.0,3.0,7,3


## Replacing missing values based on a `sequence` of columns

In [7]:
# Row-oriented input: column "a" is entirely null, "b" and "c" are
# progressively less sparse, and the last row is null in every column.
rows = [
    {"a": None, "b": 1.0, "c": 1.0},
    {"a": None, "b": 2.0, "c": 2.0},
    {"a": None, "b": None, "c": 3.0},
    {"a": None, "b": None, "c": None},
]
df_coalesce = pl.DataFrame(rows)
df_coalesce

a,b,c
null,f64,f64
,1.0,1.0
,2.0,2.0
,,3.0
,,


We want to create a new column that has the first non-`null` value as we go through a sequence of columns in order. 

We do this with `pl.coalesce` where we can also specify a default value if all of the columns are `null`.

`pl.coalesce` works the same way as the `COALESCE` function in `SQL`.

In [9]:
# Take the first non-null value across a, b, c (in that order);
# the trailing literal 9.0 is the fallback when all three are null.
first_non_null = pl.coalesce("a", "b", "c", pl.lit(9.0))
df_coalesce.with_columns(first_non_null.alias("d"))

a,b,c,d
null,f64,f64,f64
,1.0,1.0,1.0
,2.0,2.0,2.0
,,3.0,3.0
,,,9.0


## Exercises

### Exercise 1
Replace the `null` values in the `Age` column with the `median` of the `Age` column.

In [15]:
# Load the Titanic data; `df` is reused by the following exercise cell.
csv_file = "data/titanic.csv"
df = pl.read_csv(csv_file)

# Fill missing ages with the overall median age and preview the first 10 rows.
median_age = pl.col("Age").median()
df.with_columns(pl.col("Age").fill_null(median_age)).head(10)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
6,0,3,"""Moran, Mr. James""","""male""",28.0,0,0,"""330877""",8.4583,,"""Q"""
7,0,1,"""McCarthy, Mr. Timothy J""","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S"""
8,0,3,"""Palsson, Master. Gosta Leonard""","""male""",2.0,3,1,"""349909""",21.075,,"""S"""
9,1,3,"""Johnson, Mrs. Oscar W (Elisabe…","""female""",27.0,0,2,"""347742""",11.1333,,"""S"""
10,1,2,"""Nasser, Mrs. Nicholas (Adele A…","""female""",14.0,1,0,"""237736""",30.0708,,"""C"""


Replace the `null` values in the `Age` column with the `median` of the `Age` column, **computed separately for `male` and `female` passengers according to the `Sex` column**.

In [16]:
# Fill each missing Age with the median Age of the passengers that share the
# same Sex. The window expression `.over("Sex")` computes one median per group,
# so the category values are not hard-coded and the fill logic is written once.
# NOTE: rows whose Sex is neither "female" nor "male" now get their own group's
# median instead of silently falling back to the male median.
df.with_columns(
    pl.col("Age")
    .fill_null(pl.col("Age").median().over("Sex"))
    .alias("Age_filled")
).select(
    "Sex", "Age", "Age_filled"
).filter(
    # Keep only the rows that were originally missing, to inspect the imputed values.
    pl.col("Age").is_null()
).head()

Sex,Age,Age_filled
str,f64,f64
"""male""",,29.0
"""male""",,29.0
"""female""",,27.0
"""male""",,29.0
"""female""",,27.0


### Exercise 2
We have the following `DataFrame` with 3 columns

In [18]:
# Exercise frame: each of the three integer columns has at least one null.
df = pl.DataFrame(
    {
        "a": [10, None, 22, 1],
        "b": [8, 12, 19, None],
        "c": [5, None, 19, None],
    }
)
df

a,b,c
i64,i64,i64
10.0,8.0,5.0
,12.0,
22.0,19.0,19.0
1.0,,


Add a new column with values from column `c`. 

If `c` is `null`, then use the value from column `b`, and if `b` is also `null`, use the value from column `a`.

In [20]:
# Prefer c, fall back to b, then to a: the first non-null value wins.
c_then_b_then_a = pl.coalesce("c", "b", "a")
df.with_columns(c_then_b_then_a.alias("d"))

a,b,c,d
i64,i64,i64,i64
10.0,8.0,5.0,5
,12.0,,12
22.0,19.0,19.0,19
1.0,,,1


Add a new column `d` with:
- values from column `c`
- if `c` is `null` then use the median of values in `c`
- ensure the dtype is consistent with `c`

In [21]:
# Use c where present, otherwise the median of c. The result is cast back to
# Int64 so that d has the same dtype as c.
c_or_median = pl.coalesce("c", pl.col("c").median())
df.with_columns(c_or_median.cast(pl.Int64).alias("d"))

a,b,c,d
i64,i64,i64,i64
10.0,8.0,5.0,5
,12.0,,12
22.0,19.0,19.0,19
1.0,,,12
