## Selecting columns 6: Adding a new column based on a mapping or condition
In this lecture we learn how to:
- add a new column with a dict mapping from an existing column
- add a new column with an `if-else` condition using `pl.when`
- add a new column with a condition on multiple columns
- add a new column with multiple `if-elif` conditions

In [1]:
import polars as pl

In [2]:
csv_file = "../data/titanic.csv"

In [3]:
df = pl.read_csv(csv_file)
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Add a new column based on a mapping from another column
We can add a new column based on a Python `dict` that maps values in an existing column to a new value with the `replace` expression

In [22]:
#(22 // 10) * 10
tmp_df = (
    df
    .with_columns(
       age = ((pl.col('Age') // 10) * 10).cast(pl.Int32)
    )
    
)

print(tmp_df.head(3))

change = {
    0 : '유아',
    10 : '10대',
    20 : '20대',
    30 : '30대',
    40 : '40대',
    50 : '50대',
    60 : '60대',
    70 : '70대',
    80 : '80대',
    90 : '90대'
}

(
    tmp_df
    .with_columns(
        pl.col('age').replace_strict(change)
    )
    .head()
)

shape: (3, 13)
┌─────────────┬──────────┬────────┬─────────────────────────┬───┬─────────┬───────┬──────────┬─────┐
│ PassengerId ┆ Survived ┆ Pclass ┆ Name                    ┆ … ┆ Fare    ┆ Cabin ┆ Embarked ┆ age │
│ ---         ┆ ---      ┆ ---    ┆ ---                     ┆   ┆ ---     ┆ ---   ┆ ---      ┆ --- │
│ i64         ┆ i64      ┆ i64    ┆ str                     ┆   ┆ f64     ┆ str   ┆ str      ┆ i32 │
╞═════════════╪══════════╪════════╪═════════════════════════╪═══╪═════════╪═══════╪══════════╪═════╡
│ 1           ┆ 0        ┆ 3      ┆ Braund, Mr. Owen Harris ┆ … ┆ 7.25    ┆ null  ┆ S        ┆ 20  │
│ 2           ┆ 1        ┆ 1      ┆ Cumings, Mrs. John      ┆ … ┆ 71.2833 ┆ C85   ┆ C        ┆ 30  │
│             ┆          ┆        ┆ Bradley (Fl…            ┆   ┆         ┆       ┆          ┆     │
│ 3           ┆ 1        ┆ 3      ┆ Heikkinen, Miss. Laina  ┆ … ┆ 7.925   ┆ null  ┆ S        ┆ 20  │
└─────────────┴──────────┴────────┴─────────────────────────┴───┴─────────┴─

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""","""20대"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""","""30대"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""","""20대"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""","""30대"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""","""30대"""


In [24]:
(
    tmp_df
    .with_columns(
        age_group = pl.when(pl.col('age') < 10).then(pl.lit('유아'))
                    .when(pl.col('age') < 20).then(pl.lit('10대'))
                    .when(pl.col('age') < 30).then(pl.lit('20대'))
                    .when(pl.col('age') < 40).then(pl.lit('30대'))
                    .when(pl.col('age') < 50).then(pl.lit('40대'))
                    .when(pl.col('age') < 60).then(pl.lit('50대'))
                    .when(pl.col('age') < 70).then(pl.lit('60대'))
                    .when(pl.col('age') < 80).then(pl.lit('70대'))
                    .when(pl.col('age') < 90).then(pl.lit('80대'))
                    .otherwise(pl.lit('90대'))
    )
    .head()
)


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age,age_group
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,i32,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",20,"""20대"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",30,"""30대"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",20,"""20대"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""",30,"""30대"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""",30,"""30대"""


In [None]:
(
    df
    .with_columns(
        embarked_full = pl.col("Embarked").replace(
            {
                "S":"Southampton",
                "C":"Cherbourg"
            },
        )
    )
    .select('Embarked','embarked_full')
    .head(2)
)

In [7]:
(
    df
    .with_columns(
        pl.col("Embarked").replace(
            {
                "S" : "Southampton",
                "C" : "Cherbourg"
            }
        )
        .alias("embarked_full")
    )
    .select("Embarked", "embarked_full")
    .head(2)
)

Embarked,embarked_full
str,str
"""S""","""Southampton"""
"""C""","""Cherbourg"""


If we want to return a column with a different dtype to the input column we use `replace_strict` and specify the `return_dtype`. In this example we map the (integer) class number to a (string) class name

In [12]:
(
    pl.scan_csv(csv_file)
    .select(
        pl.col('Pclass')
    )
    .with_columns(
        class_name = pl.col('Pclass').replace_strict(
            {
                1 : 'first',
                2 : 'second',
                3 : 'third'
            }
        )
    )
    .select(
        pl.col('class_name', 'Pclass')
    )
    .head(5)
    .collect()
)

class_name,Pclass
str,i64
"""third""",3
"""first""",1
"""third""",3
"""first""",1
"""third""",3


In [13]:
(
    df
    .with_columns(
        class_name = pl.col("Pclass").replace_strict(
            {
                1:'first',
                2:'second',
                3:'third'
            },
            return_dtype=pl.String
        )
    )
    .select('Pclass','class_name')
    .head(2)
)

Pclass,class_name
i64,str
3,"""third"""
1,"""first"""


In [15]:
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [16]:
(
    df
    .with_columns(
        Sex_int = pl.col('Sex').replace_strict(
            {
                'male' : 0,
                'female' : 1
            }
        )
    )
    .select(
        pl.col('Sex', 'Sex_int')
    )
    .head()
)

Sex,Sex_int
str,i64
"""male""",0
"""female""",1
"""female""",1
"""female""",1
"""male""",0


We can think of `replace` as a left join (see the section on combining `DataFrames` if you are not familiar with joins) and in fact Polars implements this as a join by converting the mapping dictionary to a `DataFrame` under-the-hood.

## Add a new column based on a condition on another column

We use the `pl.when.then.otherwise` to define a new column based on a condition on one or more other columns.

We want to add a new binary column called `first_class` based on the `Pclass` column where the value is `1` for first class passengers and `0` for second and third class passengers:

`first_class = 1 if Pclass == 1`,`otherwise = 0`


In [20]:
(
    df
    .select(
        "Pclass",
        pl.when(
             pl.col("Pclass") == 1
        )
        .then(1)
        .otherwise(0)
        .alias("first_class")
    )
    #.select("Pclass","first_class")
    .head(2)
)

Pclass,first_class
i64,i32
3,0
1,1


In [21]:
(
    df
    .with_columns(
        first_class = pl.when(
            pl.col('Pclass') == 1
        )
        .then(1)
        .otherwise(0)
    )
    #.select('Pclass', 'first_class')
    .head(2)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,first_class
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,i32
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",0
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",1


## Syntax of `pl.when`

The syntax is:
```python

    pl.when(**predicate**)
    .then(**Value if True**)
    .otherwise(**Value if False)
    .alias(**New Column Name**)
```
so we pass:
- a predicate expression (e.g. `pl.col("Pclass") == 1`) to `pl.when`
- an expression to show the value if True to `.then`
- an expression to show the value otherwise to `.otherwise`
- a name for the output to `alias`. If we don't pass `alias` the output is named after the first expression in the predicate

Note: if the value passed to `then` or `otherwise` is a string Polars interprets it as a column name. To pass a string as a value it must be passed as a Polars literal as seen in the following example

In [24]:
(
    df
    .select(
        "Pclass",
        pl.when(
             pl.col("Pclass") == 1
        )
        .then(pl.lit("first"))
        #.then("first") Makes error!!! 
        .otherwise(pl.lit("not_first"))
        .alias("first_class")
    )
    .select("Pclass","first_class")
    .head(2)
)

Pclass,first_class
i64,str
3,"""not_first"""
1,"""first"""


In [5]:
# 의도는 오케이...내가 쓸만 열로 뚜까뚜까 select로 내가 쓸 열 고정하고 하는거 ㅇㅈ
tmp = (
    df
    .with_columns(
        pl.when(
            pl.col('Pclass') == 1
        )
        .then(pl.lit('First'))
        .otherwise(pl.lit('Non_first'))
        .alias('first_class')
    )
    .select(
        pl.col('Pclass', 'first_class')
    )
    .head(3)
)
print(tmp)

(
    tmp
    .with_columns(
        test = pl.col('first_class') == 'First'
    )
)

shape: (3, 2)
┌────────┬─────────────┐
│ Pclass ┆ first_class │
│ ---    ┆ ---         │
│ i64    ┆ str         │
╞════════╪═════════════╡
│ 3      ┆ Non_first   │
│ 1      ┆ First       │
│ 3      ┆ Non_first   │
└────────┴─────────────┘


Pclass,first_class,test
i64,str,bool
3,"""Non_first""",False
1,"""First""",True
3,"""Non_first""",False


In this example the string in `then` is interpreted as a column name - though more generally we can pass expressions inside `then` and `otherwise`

In [27]:
print((
    df
    .select(
        pl.col('Pclass'),
        pl.when(
             pl.col("Pclass") == 1
        )
        .then("Fare")
        .otherwise(
            pl.col("Fare").min()
        )
        .alias("fare_approximation")
    )
    #.select("Pclass","fare_approximation") - > 필요없는 행
    .head(2)
))

print(
    (
        df
        .select(
            pl.col('Pclass'),
            pl.when(pl.col('Pclass') == 1).then(pl.col('Age')).otherwise(pl.lit("알빠노")).alias('취향파악')
        )
        .tail(3)
    )
)

shape: (2, 2)
┌────────┬────────────────────┐
│ Pclass ┆ fare_approximation │
│ ---    ┆ ---                │
│ i64    ┆ f64                │
╞════════╪════════════════════╡
│ 3      ┆ 0.0                │
│ 1      ┆ 71.2833            │
└────────┴────────────────────┘
shape: (3, 2)
┌────────┬──────────┐
│ Pclass ┆ 취향파악 │
│ ---    ┆ ---      │
│ i64    ┆ str      │
╞════════╪══════════╡
│ 3      ┆ 알빠노   │
│ 1      ┆ 26.0     │
│ 3      ┆ 알빠노   │
└────────┴──────────┘


## Add a new column based on a condition on multiple other columns

We can base conditions on multiple other columns. Here we want to add a new binary column called `young_first_class` where first class passengers with age under 30 are `1` and all other passengers are 0. 

`young_first_class = 1 if (Pclass == 1 and Age < 30) otherwise = 0`


We need an `AND` condition to combine the two filter conditions. We can do this by passing the predicates as a comma-separated list to `pl.when`

In [35]:
(
    df
    .select(
       pl.col("Pclass"),
        pl.col("Age"),
        pl.when(
            pl.col("Pclass") == 1,
            pl.col("Age") < 30
        )
        .then(1)
        .otherwise(0)
        .alias("young_first_class")
    )
    .tail(5)
)

# 1등급이고 30세 이하 -> 아이폰 ,1등급이고 50세 이하 -> 갤럭시 노트, 1등급이고 나머지 -> 김영웅 cd
(
    df
    .select(
        pl.col('Pclass'), 
        pl.col('Age'),
        pl.when(
            (pl.col('Pclass') == 1) & (pl.col('Age') <= 30)
        )
        .then(pl.lit('아이폰 15'))
        .when(
            (pl.col('Pclass') == 1) & (pl.col('Age').is_between(30, 50))
        )
        .then(pl.lit('갤럭시 노트'))
        .when(
            (pl.col('Pclass') == 1) & (pl.col('Age') > 50)
        )
        .then(pl.lit("임영웅 cd"))
        .otherwise(pl.lit("알빠노"))
        .alias("선물대작전")
    )
    .head(5)
)



Pclass,Age,선물대작전
i64,f64,str
3,22.0,"""알빠노"""
1,38.0,"""갤럭시 노트"""
3,26.0,"""알빠노"""
1,35.0,"""갤럭시 노트"""
3,35.0,"""알빠노"""


## Add a new column based on an `if-elif-else` condition
We are not limited to a single `if-else` condition. 

For example we want to create 3 categories with:
- `1` for young first class passengers 
- `2` for older first class passengers and 
- `0` for all other passengers

`age_class == 1 if (Pclass == 1 and Age < 30) or
age_class == 2 if (Pclass == 1 and Age > 30) or
otherwise 0`

We do this by repeating the `.when.then` cycle for each sub-condition.

The syntax here for a single extra condition is:
```python

    pl.when(**Boolean Expression**)
    .then(**Value if True**)
    .when(**Boolean Expression**)
    .then(**Value if True**)
    .otherwise(**Value if False)
    .alias(**New Column Name**)
    
```


In [42]:
(
    df
    .select(
        pl.col('Age'),pl.col('Pclass'),
        pl.when(
            (pl.col('Pclass') == 1) & (pl.col('Age') < 30)
        )
        .then(pl.lit(1))
        .when(
            (pl.col('Pclass') == 1) & (pl.col('Age') > 30)
        )
        .then(pl.lit(2))
        .otherwise(pl.lit(0))
        .alias("test")
    )
    .head(3)
)

Age,Pclass,test
f64,i64,i32
22.0,3,0
38.0,1,2
26.0,3,0


In [37]:
(
    df
    .select(
        pl.col("Pclass"),
        pl.col("Age"),
        pl.when(
            pl.col("Pclass") == 1,
            pl.col("Age")<30
        )
        .then(1)
        .when(
            pl.col("Pclass") == 1,
            pl.col("Age")>=30
        )
        .then(2)
        .otherwise(0)
        .alias("age_class")
    )
    .head(5)
)

Pclass,Age,age_class
i64,f64,i32
3,22.0,0
1,38.0,2
3,26.0,0
1,35.0,2
3,35.0,0


The cycles of `.when.then` can be repeated indefinitely.

## Exercises

In the exercises you will develop your understanding of:
- adding a column based on a mapping of another column
- adding a binary column based on a condition on multiple columns
- adding a column based on a nested `if-elif` condition on another column

### Exercise 1 
Create a binary column for whether a passenger is female or male.

Add a column called `is_female` that maps rows with a female passenger to 1 and rows with a male passenger to 0. Ensure the column has an integer dtype

In [46]:
(
    pl.read_csv(csv_file)
    .select(
        pl.col('Sex'),
        pl.col('Sex').replace_strict(
            {
                'female' : 1,
                'male' : 0
            }
        ).alias("is_female")
    )
    .head()
)

Sex,is_female
str,i64
"""male""",0
"""female""",1
"""female""",1
"""female""",1
"""male""",0


### Exercise 2 
Create a binary column called `young_female_first_class` for whether a passenger is:
- female
- in first class and
- under 30

In [47]:
(
    pl.read_csv(csv_file)
    .select(
        pl.col('Pclass'), pl.col('Sex'), pl.col('Age'),
        pl.when(
            pl.col('Pclass') == 1, pl.col('Age') < 30
        )
        .then(1)
        .otherwise(0)
    )
    .head()
)

Pclass,Sex,Age,literal
i64,str,f64,i32
3,"""male""",22.0,0
1,"""female""",38.0,0
3,"""female""",26.0,0
1,"""female""",35.0,0
3,"""male""",35.0,0


### Exercise 3 
Create a column called `embarked_categories` where
- female and embarked in Southampton then has value "FS"
- female and did not embark in Southampton then has value "NFS"
- male and embarked in Southampton then has value "MS"
- male and did not embark in Southampton then has value "NMS"

In [4]:
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [None]:
(
    pl.read_csv(csv_file)
    .select(
            "Sex",
            "Embarked",
            
    )
)


## Solutions

### Solution to Exercise 1 

Add a column called `is_female` that maps rows with a female passenger to 1 and rows with a male passenger to 0. Ensure the column has an integer dtype

In [None]:
(
    pl.read_csv(csv_file)
    .select(
        'Sex',
        pl.col("Sex").replace_strict({"female":1,"male":0},return_dtype=pl.Int64).alias("is_female")
    )
    .head()
)

### Solution to Exercise 2 
Create a binary column called `young_female_first_class` for whether a passenger is:
- female
- in first class and
- under 30

In [None]:
(
    pl.read_csv(csv_file)
    .select(
        "Pclass",
        "Sex",
        "Age",
        pl.when(
            pl.col("Sex")=="female",
            pl.col("Pclass")==1,
            pl.col("Age") <30
        )
        .then(1)
        .otherwise(0)
        .alias("young_female_first_class")
    )
    .filter(
        pl.col("young_female_first_class")==1
    )
    .head()
)

### Solution to Exercise 3 

Create a column called `embarked_categories` where
- female and embarked in Southampton then has value "FS"
- female and did not embark in Southampton then has value "NFS"
- male and embarked in Southampton then has value "MS"
- male and did not embark in Southampton then has value "NMS"

In [None]:
(
    pl.read_csv(csv_file)
    .select(
            "Sex",
            "Embarked",
            pl.when(
                pl.col("Sex")=="female",
                pl.col("Embarked") == "S"
            )
            .then(pl.lit("FS"))
            .when(
                pl.col("Sex")=="female",
                pl.col("Embarked") != "S"
            )
            .then(pl.lit("NFS"))
            .when(
                pl.col("Sex")=="male",
                pl.col("Embarked") == "S"
            )
            .then(pl.lit("MS"))
            .when(
                pl.col("Sex")=="male",
                pl.col("Embarked") != "S"
            )
            .then(pl.lit("NMS"))
            .otherwise(pl.lit('O'))
            .alias("embarked_categories")
    )
    .head()
)