# Missing values

In [1]:
import polars as pl
import polars.selectors as cs

### Missing values in Polars
Missing values in Polars are represented with a `null` value for all dtypes.

In [2]:
df = pl.DataFrame(
    {
        'col1':[None,2,3,4],
        "col2":[None,None,5,6],
        "col3":[None,None,None,7]
    }
)
df

col1,col2,col3
i64,i64,i64
,,
2.0,,
3.0,5.0,
4.0,6.0,7.0


### Metadata on `null` values
Polars stores metadata about `null` values for each column in a `DataFrame`.

#### Null count
Polars stores a count of how many `null` values there are.

In [3]:
df.null_count()

col1,col2,col3
u32,u32,u32
1,2,3


### Finding `null` values

We use the `is_null` expression to find out whether each value is `null` and `is_not_null`.

In [4]:
df.select(
    pl.col("col1"),
    pl.col("col1").is_null().alias("is_null"),
    pl.col("col1").is_not_null().alias("is_not_null")
)

col1,is_null,is_not_null
i64,bool,bool
,True,False
2.0,False,True
3.0,False,True
4.0,False,True


### Filtering by `null` values

#### Filtering on a single column

In [5]:
df.filter(
    pl.col("col1").is_not_null()
)

col1,col2,col3
i64,i64,i64
2,,
3,5.0,
4,6.0,7.0


#### Filtering by `null` values in multiple columns

In [7]:
df.filter(
    pl.any_horizontal(pl.all().is_not_null())
)

col1,col2,col3
i64,i64,i64
2,,
3,5.0,
4,6.0,7.0


In [8]:
df.filter(
    pl.all_horizontal(pl.all().is_not_null())
)

col1,col2,col3
i64,i64,i64
4,6,7


### Using the `drop_nulls` method

In [9]:
df.drop_nulls()

col1,col2,col3
i64,i64,i64
4,6,7


Drop nulls according to columns.

In [10]:
df.drop_nulls(subset=["col1", "col2"])

col1,col2,col3
i64,i64,i64
3,5,
4,6,7.0


## Exercises

### Exercise 1
Count the number of `null` values in each row of the Titanic data

In [11]:
csv_file = "data/titanic.csv"
(
    pl.read_csv(csv_file)
    .null_count()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,177,0,0,0,0,687,2


Filter out the rows that are `null` from the `Cabin` column and count the null values for all columns again

In [12]:
pl.read_csv(csv_file).filter(
    pl.col("Cabin").is_not_null()
).null_count()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,19,0,0,0,0,0,2


### Exercise 2
Find all the rows for which the `Age` is `null`

In [14]:
pl.read_csv(csv_file).filter(
    pl.col("Age").is_null()
).head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
18,1,2,"""Williams, Mr. Charles Eugene""","""male""",,0,0,"""244373""",13.0,,"""S"""
20,1,3,"""Masselmani, Mrs. Fatima""","""female""",,0,0,"""2649""",7.225,,"""C"""
27,0,3,"""Emir, Mr. Farred Chehab""","""male""",,0,0,"""2631""",7.225,,"""C"""
29,1,3,"""O'Dwyer, Miss. Ellen ""Nellie""""","""female""",,0,0,"""330959""",7.8792,,"""Q"""


Find all the rows for which neither the `Age` nor the `Cabin` is `null`.

Use the Selectors API (imported above as `cs`) to select the columns

In [15]:
import polars.selectors as cs

In [16]:
pl.read_csv(csv_file).filter(
    pl.any_horizontal(cs.matches("Age|Cabin").is_null())
).select(
    cs.matches("Age|Cabin")
)

Age,Cabin
f64,str
22.0,
26.0,
35.0,
,
2.0,
…,…
25.0,
39.0,
27.0,
,
