# Selecting columns: Transforming and adding multiple columns

In [8]:
import polars as pl
import polars.selectors as cs

In [2]:
# Relative path to the Titanic passenger CSV that is loaded in the next cell.
csv_file = "data/titanic.csv"

In [3]:
# Read the CSV into a DataFrame and preview the first three rows.
df = pl.read_csv(csv_file)
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Transforming existing columns

We can transform multiple existing columns by either passing a `list` of expressions to `with_columns` or comma-separated expressions.

In [4]:
# Comma-separated expressions: each one keeps its source column name, so the
# rounded values overwrite Age and Fare rather than adding new columns.
df.with_columns(
    pl.col("Age").round(0),
    pl.col("Fare").round(0)
).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.0,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.0,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",8.0,,"""S"""


A less verbose alternative is to pass several column names to a single `pl.col` call:

In [5]:
# One pl.col call with several names applies the same rounding to each column.
df.with_columns(
    pl.col("Age", "Fare").round(0)
).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.0,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.0,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",8.0,,"""S"""


In [6]:
# Passing a dtype to pl.col targets every column of that dtype; in this frame
# the Float64 columns are Age and Fare.
df.with_columns(
    pl.col(pl.Float64).round(0)
).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.0,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.0,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",8.0,,"""S"""


In [9]:
# Same transformation using a selector: cs.float() picks the floating-point columns.
df.with_columns(
    cs.float().round(0)
).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.0,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.0,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",8.0,,"""S"""


## Adding new columns from existing columns

In [10]:
# alias gives each expression a new output name, so the rounded values are
# added as extra columns instead of replacing Age and Fare.
df.with_columns(
    pl.col("Age").round(0).alias("Age_round"),
    pl.col("Fare").round(0).alias("Fare_round")
).select(
    # Keep only the original/rounded pairs so they can be compared side by side.
    "Age", "Age_round", "Fare", "Fare_round"
).head(3)

Age,Age_round,Fare,Fare_round
f64,f64,f64,f64
22.0,22.0,7.25,7.0
38.0,38.0,71.2833,71.0
26.0,26.0,7.925,8.0


In [11]:
# Keyword assignment: the keyword itself becomes the name of the new column.
df.with_columns(
    Age_round = pl.col("Age").round(0),
    Fare_round = pl.col("Fare").round(0)
).select(
    # Keep only the original/rounded pairs so they can be compared side by side.
    "Age", "Age_round", "Fare", "Fare_round"
).head(3)

Age,Age_round,Fare,Fare_round
f64,f64,f64,f64
22.0,22.0,7.25,7.0
38.0,38.0,71.2833,71.0
26.0,26.0,7.925,8.0


> **Note** that if you mix the `alias` and keyword assignment approach in the same `with_columns`, the keyword assignments must come after the `alias` expressions.

When should you use `alias` and when should you use the keyword approach?
- There is no performance difference between the `alias` and keyword approach
- You can use Python variables inside an `alias` but not with keyword assignment

## Creating new columns when working with multiple expressions

In [12]:
# name.suffix appends "_round" to each input column name, so every Float64
# column (Age and Fare here) gets a rounded companion column.
df.with_columns(
    pl.col(pl.Float64).round(0).name.suffix("_round")
).select(
    "Age", "Age_round", "Fare", "Fare_round"
).head(3)

Age,Age_round,Fare,Fare_round
f64,f64,f64,f64
22.0,22.0,7.25,7.0
38.0,38.0,71.2833,71.0
26.0,26.0,7.925,8.0


## Exercises

### Exercise 1
Convert the 64-bit integer and float columns to their 32-bit equivalents

In [14]:
# Exercise 1: downcast the 64-bit numeric columns to their 32-bit equivalents.
downcast_exprs = [
    pl.col(pl.Float64).cast(pl.Float32),  # f64 -> f32
    pl.col(pl.Int64).cast(pl.Int32),  # i64 -> i32
]
df.with_columns(downcast_exprs).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i32,i32,i32,str,str,f32,i32,i32,str,f32,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.283302,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


Continue by adding 
- a `family_size` column as the sum of the siblings, parents and the passenger
- a Boolean `over_thirty` column showing if a passenger is aged 30 or over

Add these columns using keyword assignment

In [15]:
# Step 1: downcast the 64-bit numeric columns to 32 bits.
downcast = df.with_columns(
    pl.col(pl.Float64).cast(pl.Float32),
    pl.col(pl.Int64).cast(pl.Int32),
)

# Step 2: derive the new columns using keyword assignment.
downcast.with_columns(
    # siblings + parents + the passenger themselves
    family_size=pl.col("SibSp") + pl.col("Parch") + 1,
    # True when the passenger is aged 30 or over
    over_thirty=pl.col("Age") >= 30,
).head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family_size,over_thirty
i32,i32,i32,str,str,f32,i32,i32,str,f32,str,str,i32,bool
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",2,False
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.283302,"""C85""","""C""",2,True
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",1,False
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.099998,"""C123""","""S""",2,True
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""",1,True


### Exercise 2
We have the following fictitious dataset with sales figures of bikes in different countries.

In [16]:
# Load the bike sales data from Parquet and preview the first rows.
dfb = pl.read_parquet("data/bike_sales.parquet")
dfb.head()

date,customer age,customer gender,country,sub category,order quantity,unit cost,unit price,cost,revenue
date,i64,str,str,str,i64,i64,i64,i64,i64
2013-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856
2015-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856
2013-07-22,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856
2015-07-22,31,"""M""","""Australia""","""Mountain Bikes""",2,1912,3400,3824,5712
2013-12-25,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856


The monetary values are in the local currency but we want to compare them in US dollars. 

In order to do this we join the following `DataFrame` with the foreign-exchange rates to US dollars:

In [17]:
# Foreign-exchange rate to US dollars for each country in the sales data.
fx_df = pl.DataFrame(
    {
        "country": [
            "Germany",
            "Canada",
            "Australia",
            "United States",
            "United Kingdom",
            "France",
        ],
        "fx_rate": [1.25, 2.0, 2.5, 1.0, 1.5, 1.25],
    }
)

# Attach the matching fx_rate to every sale.
# The guard keeps this cell safe to re-run: `dfb` is rebound here, so joining a
# second time onto a frame that already has fx_rate would add a duplicate
# fx_rate_right column instead of leaving the frame unchanged.
if "fx_rate" not in dfb.columns:
    dfb = dfb.join(fx_df, on="country", how="left", coalesce=True)
dfb.head()

date,customer age,customer gender,country,sub category,order quantity,unit cost,unit price,cost,revenue,fx_rate
date,i64,str,str,str,i64,i64,i64,i64,i64,f64
2013-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856,2.5
2015-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856,2.5
2013-07-22,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856,2.5
2015-07-22,31,"""M""","""Australia""","""Mountain Bikes""",2,1912,3400,3824,5712,2.5
2013-12-25,31,"""M""","""Australia""","""Mountain Bikes""",1,1912,3400,1912,2856,2.5


Note that some of the column names contain whitespace.

In [18]:
# List the column names; several of them contain a space (e.g. "customer age").
dfb.columns

['date',
 'customer age',
 'customer gender',
 'country',
 'sub category',
 'order quantity',
 'unit cost',
 'unit price',
 'cost',
 'revenue',
 'fx_rate']

Convert the monetary columns (`unit cost`, `unit price`, `cost` and `revenue`) to a float dtype in a single expression.

In [20]:
# Cast the four integer monetary columns to Float64 with a single pl.col expression.
monetary_cols = ["unit cost", "unit price", "cost", "revenue"]
dfb.with_columns(pl.col(monetary_cols).cast(pl.Float64)).head()

date,customer age,customer gender,country,sub category,order quantity,unit cost,unit price,cost,revenue,fx_rate
date,i64,str,str,str,i64,f64,f64,f64,f64,f64
2013-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5
2015-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5
2013-07-22,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5
2015-07-22,31,"""M""","""Australia""","""Mountain Bikes""",2,1912.0,3400.0,3824.0,5712.0,2.5
2013-12-25,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5


Continue by adding a new `with_columns` statement where, for each monetary column, we add a column that holds the equivalent amount in US dollars. We do this conversion by multiplying the monetary columns by `fx_rate`.

- Select the monetary columns using `cs.matches`
- Add `"_usd"` to the new column name
- Ensure you enclose the conversion in `()` before renaming the expressions

In [21]:
# Step 1: cast the monetary columns to Float64.
to_float = pl.col("unit cost", "unit price", "cost", "revenue").cast(pl.Float64)

# Step 2: multiply every column whose name matches cost, price or revenue by
# fx_rate. The product is wrapped in parentheses so that the "_usd" suffix is
# applied to the result of the multiplication.
to_usd = (cs.matches("cost|price|revenue") * pl.col("fx_rate")).name.suffix("_usd")

dfb.with_columns(to_float).with_columns(to_usd).head()

date,customer age,customer gender,country,sub category,order quantity,unit cost,unit price,cost,revenue,fx_rate,unit cost_usd,unit price_usd,cost_usd,revenue_usd
date,i64,str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2013-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5,4780.0,8500.0,4780.0,7140.0
2015-01-28,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5,4780.0,8500.0,4780.0,7140.0
2013-07-22,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5,4780.0,8500.0,4780.0,7140.0
2015-07-22,31,"""M""","""Australia""","""Mountain Bikes""",2,1912.0,3400.0,3824.0,5712.0,2.5,4780.0,8500.0,9560.0,14280.0
2013-12-25,31,"""M""","""Australia""","""Mountain Bikes""",1,1912.0,3400.0,1912.0,2856.0,2.5,4780.0,8500.0,4780.0,7140.0
