# Selecting columns : Adding a new column based on a mapping or condition

In [1]:
import polars as pl

In [2]:
csv_file = "data/titanic.csv"

In [3]:
df = pl.read_csv(csv_file)
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Add a new column based on a mapping from another column
We can add a new column based on a Python `dict` that maps values in an existing column to a new value with the `replace` expression

In [None]:
df.with_columns(
    embarked_full=pl.col("Embarked").replace(
        {"S": "Southampton", "C": "Cherbourg"}
    )
).select(
    "Embarked", "embarked_full"
).head(2)

Embarked,embarked_full
str,str
"""S""","""Southampton"""
"""C""","""Cherbourg"""


If we want to return a column with a different dtype to the input column we use `replace_strict` and specify the `return_dtype`.

In [6]:
df.with_columns(
    class_name = pl.col("Pclass").replace_strict(
        {
            1: "first",
            2: "second",
            3: "third"
        },
        return_dtype=pl.String
    )
).select(
    "Pclass", "class_name"
).head(2)

Pclass,class_name
i64,str
3,"""third"""
1,"""first"""


## Add a new column based on a condition on another column

We use the `pl.when.then.otherwise` to define a new column based on a condition on one or more other columns.


In [7]:
df.select(
    "Pclass",
    first_class = pl.when(
        pl.col("Pclass") == 1
    ).then(1).otherwise(0)
).select(
    "Pclass", "first_class"
).head(2)

Pclass,first_class
i64,i32
3,0
1,1


## Syntax of `pl.when`

The syntax is:
```python

    pl.when(**predicate**)
    .then(**Value if True**)
    .otherwise(**Value if False)
    .alias(**New Column Name**)
```

> **Note:** if the value passed to `then` or `otherwise` is a string, Polars interprets it as a column name.

In [8]:
df.select(
    "Pclass",
    first_class = pl.when(
        pl.col("Pclass") == 1
    ).then(
        pl.lit("first")
    ).otherwise(
        pl.lit("not_first")
    )
).select(
    "Pclass", "first_class"
).head(2)

Pclass,first_class
i64,str
3,"""not_first"""
1,"""first"""


## Add a new column based on a condition on multiple other columns

In [10]:
df.select(
    pl.col("Pclass"),
    pl.col("Age"),
    pl.when(
        pl.col("Pclass") == 1,
        pl.col("Age") < 30
    ).then(1).otherwise(0).alias("young_first_class")
).tail(5)

Pclass,Age,young_first_class
i64,f64,i32
2,27.0,0
1,19.0,1
3,,0
1,26.0,1
3,32.0,0


## Add a new column based on an `if-elif-else` condition

The syntax here for a single extra condition is:
```python

    pl.when(**Boolean Expression**)
    .then(**Value if True**)
    .when(**Boolean Expression**)
    .then(**Value if True**)
    .otherwise(**Value if False)
    .alias(**New Column Name**)
    
```


In [12]:
df.select(
    pl.col("Pclass"),
    pl.col("Age"),
    age_class = pl.when(
        pl.col("Pclass") == 1,
        pl.col("Age") < 30
    ).then(1).when(
        pl.col("Pclass") == 1,
        pl.col("Age") >= 30
    ).then(2).otherwise(0)
).head(5)

Pclass,Age,age_class
i64,f64,i32
3,22.0,0
1,38.0,2
3,26.0,0
1,35.0,2
3,35.0,0


## Exercises

### Exercise 1 
Create a binary column for whether a passenger is female or male.

Add a column called `is_female` that maps rows with a female passenger to 1 and rows with a male passenger to 0. Ensure the column has an integer dtype

In [13]:
df.select(
    "Sex",
    is_female = pl.col("Sex").replace_strict(
        {"female": 1, "male": 2}, return_dtype=pl.Int64
    )
).head()

Sex,is_female
str,i64
"""male""",2
"""female""",1
"""female""",1
"""female""",1
"""male""",2


### Exercise 2 
Create a binary column called `young_female_first_class` for whether a passenger is:
- female
- in first class and
- under 30

In [16]:
df.select(
    "Pclass",
    "Sex",
    "Age",
    young_female_first_class = pl.when(
        pl.col("Sex") == "female",
        pl.col("Pclass") == 1,
        pl.col("Age") < 30
    ).then(1).otherwise(0)
).filter(
    young_female_first_class = 1
)

Pclass,Sex,Age,young_female_first_class
i64,str,f64,i32
1,"""female""",23.0,1
1,"""female""",19.0,1
1,"""female""",22.0,1
1,"""female""",26.0,1
1,"""female""",19.0,1
…,…,…,…
1,"""female""",29.0,1
1,"""female""",21.0,1
1,"""female""",17.0,1
1,"""female""",16.0,1


### Exercise 3 
Create a column called `embarked_categories` where
- female and embarked in Southampton then has value "FS"
- female and did not embark in Southampton then has value "NFS"
- male and embarked in Southampton then has value "MS"
- male and did not embark in Southampton then has value "NMS"

In [21]:
df.select(
    "Sex",
    "Embarked",
    embarked_categories = pl.when(
        pl.col("Sex") == "female",
        pl.col("Embarked") == "S"
    ).then(pl.lit("FS")).when(
        pl.col("Sex") == "female",
        pl.col("Embarked") != "S"
    ).then(pl.lit("NFS")).when(
        pl.col("Sex") == "male",
        pl.col("Embarked") == "S"
    ).then(pl.lit("MS")).otherwise(pl.lit("NMS"))
).head()

Sex,Embarked,embarked_categories
str,str,str
"""male""","""S""","""MS"""
"""female""","""C""","""NFS"""
"""female""","""S""","""FS"""
"""female""","""S""","""FS"""
"""male""","""S""","""MS"""
