## Transforming a `DataFrame`

In [1]:
import polars as pl
import polars.selectors as cs
# Limit how many rows are shown when a full DataFrame is displayed (6 here)
pl.Config.set_tbl_rows(6)

polars.config.Config

In [2]:
# Path of the Titanic CSV file that the following cells read and scan
csv_file = "data/titanic.csv"

In [3]:
# Read the CSV into a DataFrame; `df` is reused by the examples below
df = pl.read_csv(csv_file)
# Preview the first 3 rows
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


In [4]:
# Scan the same CSV file; `lf` is used later in the `pipe` examples
# together with `collect_schema()` and `explain()`
lf = pl.scan_csv(csv_file)

## Renaming columns
We can rename columns by passing a `dict` that maps old names to new names.

In [5]:
# Keys are the existing column names, values are the replacement names
df.rename({"PassengerId": "Id"}).head(2)

Id,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


## Dropping columns

We can drop columns by passing a `list` of column names, or by passing the column names as separate positional arguments.

In [6]:
# Drop several columns at once by passing their names in a list
df.drop(
    ["PassengerId", "Name"]
).head(3)

Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,str,f64,i64,i64,str,f64,str,str
0,3,"""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,1,"""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
1,3,"""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


In [7]:
# Same result as the list form, with the names given as separate arguments
df.drop(
    "PassengerId",
    "Name",
).head(3)

Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,str,f64,i64,i64,str,f64,str,str
0,3,"""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,1,"""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
1,3,"""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Re-ordering columns
We can re-order columns by passing a `list` of column names, in the desired order, to `select`.

In [8]:
# Select the columns in alphabetical order of their names
df.select(sorted(df.columns)).head(3)

Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
f64,str,str,f64,str,i64,i64,i64,str,i64,i64,str
22.0,,"""S""",7.25,"""Braund, Mr. Owen Harris""",0,1,3,"""male""",1,0,"""A/5 21171"""
38.0,"""C85""","""C""",71.2833,"""Cumings, Mrs. John Bradley (Fl…",0,2,1,"""female""",1,1,"""PC 17599"""
26.0,,"""S""",7.925,"""Heikkinen, Miss. Laina""",0,3,3,"""female""",0,1,"""STON/O2. 3101282"""


## Changing dtypes

In [9]:
# Map a column name to the dtype it should be cast to
df.cast(
    {"Survived": pl.Utf8}
).head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,str,i64,str,str,f64,i64,i64,str,f64,str,str
1,"""0""",3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,"""1""",1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


We can cast every column of the `DataFrame` to the same dtype by passing a single dtype.

In [10]:
# A single dtype (instead of a dict) is applied to every column
df.cast(pl.Utf8).head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""0""","""3""","""Braund, Mr. Owen Harris""","""male""","""22.0""","""1""","""0""","""A/5 21171""","""7.25""",,"""S"""
"""2""","""1""","""1""","""Cumings, Mrs. John Bradley (Fl…","""female""","""38.0""","""1""","""0""","""PC 17599""","""71.2833""","""C85""","""C"""


We can also use a selector as the key to cast a group of columns, here all of the numeric columns.

In [None]:
# The selector picks the numeric columns; each of them is cast to a string
df.cast(
    {
        cs.numeric(): pl.Utf8
    }
).head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,str,str,str,str,str,str,str,str,str,str,str
"""1""","""0""","""3""","""Braund, Mr. Owen Harris""","""male""","""22.0""","""1""","""0""","""A/5 21171""","""7.25""",,"""S"""
"""2""","""1""","""1""","""Cumings, Mrs. John Bradley (Fl…","""female""","""38.0""","""1""","""0""","""PC 17599""","""71.2833""","""C85""","""C"""


## Transforming `DataFrames` in a function

In [12]:
def uppercase_all_strings(df: pl.DataFrame):
    """Return `df` with every string (`pl.Utf8`) column converted to upper case."""
    # Expression that targets all columns of dtype Utf8
    upper_cased = pl.col(pl.Utf8).str.to_uppercase()
    return df.with_columns(upper_cased)

We pipe the `DataFrame` to this function as follows:

In [13]:
# `pipe` calls the function with `df` as its first argument
df.pipe(uppercase_all_strings)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""BRAUND, MR. OWEN HARRIS""","""MALE""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""CUMINGS, MRS. JOHN BRADLEY (FL…","""FEMALE""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""HEIKKINEN, MISS. LAINA""","""FEMALE""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
889,0,3,"""JOHNSTON, MISS. CATHERINE HELE…","""FEMALE""",,1,2,"""W./C. 6607""",23.45,,"""S"""
890,1,1,"""BEHR, MR. KARL HOWELL""","""MALE""",26.0,0,0,"""111369""",30.0,"""C148""","""C"""
891,0,3,"""DOOLEY, MR. PATRICK""","""MALE""",32.0,0,0,"""370376""",7.75,,"""Q"""


The `pipe` method also lets us refer to the piped frame through a temporary variable inside a `lambda` function. Here the piped object is the `LazyFrame` `lf`.

In [18]:
# `temp_df` is the frame being piped (`lf`); its column names are sorted
# and used to re-order the columns, then the resulting names are listed
lf.pipe(
    lambda temp_df: temp_df.select(
        sorted(temp_df.collect_schema().names())
    )
).collect_schema().names()

['Age',
 'Cabin',
 'Embarked',
 'Fare',
 'Name',
 'Parch',
 'PassengerId',
 'Pclass',
 'Sex',
 'SibSp',
 'Survived',
 'Ticket']

In [19]:
# Take the first 3 column names, sort them, select them, and print the query plan
print(
    lf.pipe(
        lambda temp_df: temp_df.select(
            sorted(temp_df.collect_schema().names()[:3])
        )
    ).explain()
)

simple π 3/3 ["PassengerId", "Pclass", ... 1 other column]
  Csv SCAN [data/titanic.csv]
  PROJECT 3/12 COLUMNS
  ESTIMATED ROWS: 971


### Function arguments using `pipe`

We can pass additional arguments to the function as keyword arguments of `pipe`.

In [20]:
def _multiply_floats(df: pl.DataFrame, multiplication_factor: int):
    """Select the `Float64` columns of `df` and multiply them by a constant.

    The multiplication is written as part of the expression passed to
    `select` rather than as arithmetic on the resulting `DataFrame`, so the
    function also works when a `LazyFrame` is piped in.

    Args:
        df: Frame to take the `Float64` columns from.
        multiplication_factor: Number each selected value is multiplied by.

    Returns:
        A frame containing only the `Float64` columns, with every value
        multiplied by `multiplication_factor`.
    """
    return df.select(pl.col(pl.Float64) * multiplication_factor)

# Keyword arguments given to `pipe` are forwarded to the piped function
df.pipe(_multiply_floats, multiplication_factor=3).head(3)

Age,Fare
f64,f64
66.0,21.75
114.0,213.8499
78.0,23.775


## Exercises

### Exercise 1
Drop the `Age` and `Fare` columns from the `DataFrame`

In [21]:
# Column names are passed as separate positional arguments
df.drop("Age", "Fare").head(3)

PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked
i64,i64,i64,str,str,i64,i64,str,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",1,0,"""A/5 21171""",,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",1,0,"""PC 17599""","""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",0,0,"""STON/O2. 3101282""",,"""S"""


Cast all of the integer columns to 16-bit integers

In [22]:
# The selector matches the integer columns; each one is cast to Int16
df.cast({cs.integer(): pl.Int16}).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i16,i16,i16,str,str,f64,i16,i16,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


### Exercise 2
Rename the `Age` column to `age`

In [23]:
# Only the column listed in the mapping is renamed
df.rename(
    {"Age": "age"}
).head(3)

PassengerId,Survived,Pclass,Name,Sex,age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


Rename all column names to lower case

In [27]:
# Build an old-name -> lower-case-name mapping from the piped frame's columns.
# The lambda parameter is named `temp_df` so it does not shadow the global `df`.
df.pipe(
    lambda temp_df: temp_df.rename(
        {name: name.lower() for name in temp_df.columns}
    )
).head(3)

passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
