## 05. Selecting and transformations

### Selecting columns using `[]`

In [1]:
import polars as pl
csv_file = './data/titanic.csv'
df = pl.read_csv(csv_file)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


#### Choosing columns with square brackets

In [2]:
df['Age'].head()

Age
f64
22.0
38.0
26.0
35.0
35.0
""
54.0
2.0
27.0
14.0


In [3]:
df[['Survived', 'Age']].head()

Survived,Age
i64,f64
0,22.0
1,38.0
1,26.0
1,35.0
0,35.0


`[]`로 하는 indexing은 pandas와 비슷해서 패스...

### Selecting columns : using `select` and expressions

In [4]:
(
    df
    .select(
        'Age', 'Survived'
    )
    .head()
)

Age,Survived
f64,i64
22.0,0
38.0,1
26.0,1
35.0,1
35.0,0


#### Selecting and transforming a column with an expression

We can apply a transformation to a column before we output it. <p>
In this example we use the `round` expression to round the values of the `Fare` column

In [5]:
(
    df
    .select(
        pl.col('Fare')
    )
    .head()
)

Fare
f64
7.25
71.2833
7.925
53.1
8.05


In [6]:
(
    df
    .select(
        pl.col('Fare').round(0)
    )
    .head()
)

Fare
f64
7.0
71.0
8.0
53.0
8.0


In [7]:
(
    df
    .select(
        pl.col('Fare').round(0).alias('roundedFare') # .alias로 이름 변경까지...
    )
    .head()
)

roundedFare
f64
7.0
71.0
8.0
53.0
8.0


In [8]:
(
    df
    .select(
        pl.col('Fare'),
        pl.col('Fare').round(0).alias('roundedFare')
        # , 으로 여러개 선택 가능..
    )
    .head(3)
)

Fare,roundedFare
f64,f64
7.25,7.0
71.2833,71.0
7.925,8.0


In [10]:
# single value
(
    df
    .select(
        pl.col('Name').first()
    )
    .item() # 값만 출력 오직 (1, 1) shape만 가능!!!
)

'Braund, Mr. Owen Harris'

In [14]:
(
    df
    .select(
        'Name'
    )
    .to_series()
    .to_list()
)

['Braund, Mr. Owen Harris',
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
 'Heikkinen, Miss. Laina',
 'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
 'Allen, Mr. William Henry',
 'Moran, Mr. James',
 'McCarthy, Mr. Timothy J',
 'Palsson, Master. Gosta Leonard',
 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
 'Nasser, Mrs. Nicholas (Adele Achem)',
 'Sandstrom, Miss. Marguerite Rut',
 'Bonnell, Miss. Elizabeth',
 'Saundercock, Mr. William Henry',
 'Andersson, Mr. Anders Johan',
 'Vestrom, Miss. Hulda Amanda Adolfina',
 'Hewlett, Mrs. (Mary D Kingcome) ',
 'Rice, Master. Eugene',
 'Williams, Mr. Charles Eugene',
 'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)',
 'Masselmani, Mrs. Fatima',
 'Fynney, Mr. Joseph J',
 'Beesley, Mr. Lawrence',
 'McGowan, Miss. Anna "Annie"',
 'Sloper, Mr. William Thompson',
 'Palsson, Miss. Torborg Danira',
 'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)',
 'Emir, Mr. Farred Chehab',
 'Fortune, Mr. Charles Alexander',
 '

### Selecting columns : selecting multiple columns

In [15]:
import polars.selectors as cs # columns selecting 할 때 도움을 주는 API 불러오기

In [17]:
# 모든 columns 선택
(
    df
    .select(
        pl.all()
    )
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [18]:
# 특정 열 제외
(
    df
    .select(
        pl.exclude('Age', 'Survived', 'Pclass')
    )
    .head()
)

PassengerId,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,str,str,i64,i64,str,f64,str,str
1,"""Braund, Mr. Owen Harris""","""male""",1,0,"""A/5 21171""",7.25,,"""S"""
2,"""Cumings, Mrs. John Bradley (Fl…","""female""",1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,"""Heikkinen, Miss. Laina""","""female""",0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,"""Futrelle, Mrs. Jacques Heath (…","""female""",1,0,"""113803""",53.1,"""C123""","""S"""
5,"""Allen, Mr. William Henry""","""male""",0,0,"""373450""",8.05,,"""S"""


#### Selecting columns with a regex


regex starts with `^` and ends with `$` <p>
looks for columns starting with `P` and uses the regex wildcard `.*` to show `P` can be followed by any character.

In [19]:
(
    df
    .select(
        "^P.*$"
    )
    .head(3)
)

PassengerId,Pclass,Parch
i64,i64,i64
1,3,0
2,1,0
3,3,0


In [20]:
# regex can apply transformation to thees columns.
(
    df
    .select(
        pl.col("^P.*$").max()
    )
)

PassengerId,Pclass,Parch
i64,i64,i64
891,3,6


#### Selecting columns based on dtype.

In [21]:
(
    df
    .select(
        pl.col(pl.Utf8)
    )
    .head()
)

Name,Sex,Ticket,Cabin,Embarked
str,str,str,str,str
"""Braund, Mr. Owen Harris""","""male""","""A/5 21171""",,"""S"""
"""Cumings, Mrs. John Bradley (Fl…","""female""","""PC 17599""","""C85""","""C"""
"""Heikkinen, Miss. Laina""","""female""","""STON/O2. 3101282""",,"""S"""
"""Futrelle, Mrs. Jacques Heath (…","""female""","""113803""","""C123""","""S"""
"""Allen, Mr. William Henry""","""male""","""373450""",,"""S"""


In [22]:
(
    df
    .select(
        pl.col([pl.Utf8, pl.Float64])
    )
    .head()
)

Name,Sex,Age,Ticket,Fare,Cabin,Embarked
str,str,f64,str,f64,str,str
"""Braund, Mr. Owen Harris""","""male""",22.0,"""A/5 21171""",7.25,,"""S"""
"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,"""PC 17599""",71.2833,"""C85""","""C"""
"""Heikkinen, Miss. Laina""","""female""",26.0,"""STON/O2. 3101282""",7.925,,"""S"""
"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,"""113803""",53.1,"""C123""","""S"""
"""Allen, Mr. William Henry""","""male""",35.0,"""373450""",8.05,,"""S"""


### Selecting columns: Transforming and adding a column

* transform an `existing column` in place using `with_columns`
* add a columns with constant values using `pl.lit`

#### Transforming an existing column

In [24]:
(
    df
    .with_columns(
        pl.col('Fare').round(0)
    )
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.0,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.0,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",8.0,,"""S"""


* select에서 변경하면 'fare'만 나오는데
* with_columns에서 변경하면 fare와 함께 모든 컬럼 나옴.

#### Adding a new column from an existing column

We can create a new column from an existing column by renaming it with `alias`

In [25]:
(
    df
    .with_columns(
        pl.col('Fare').round(0).alias('컬럼이름변경') # 기존 컬럼은 남아있고 변경된 컬럼이 새로운 이름이 생겨서 따로 생긴다.
    )
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,컬럼이름변경
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",7.0
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",71.0
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",8.0
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""",53.0
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""",8.0


In [26]:
(
    df
    .with_columns(
        (pl.col('Age') + pl.col('Fare')).alias('AgePlusFare')
    )
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgePlusFare
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",29.25
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",109.2833
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",33.925


#### Adding a new column with a constant value

`pl.lit(값)` => 고정된 값을 모든 행에 적용할 때 사용

In [27]:
(
    df
    .with_columns(
        pl.lit("yes").alias('aboard')
    )
    .select(
        'Name', 'aboard'
    )
    .head()
)

Name,aboard
str,str
"""Braund, Mr. Owen Harris""","""yes"""
"""Cumings, Mrs. John Bradley (Fl…","""yes"""
"""Heikkinen, Miss. Laina""","""yes"""
"""Futrelle, Mrs. Jacques Heath (…","""yes"""
"""Allen, Mr. William Henry""","""yes"""


### Selecting columns: Transforming and adding multiple columns

In [31]:
(
    df
    .with_columns(
        (pl.col('Age') + pl.col('Fare')).alias('AgePlusFare'),
        pl.col('Fare').round(0)
    )
    .select(
        'Age', 'Fare', 'AgePlusFare'
    )
    .head()
)

Age,Fare,AgePlusFare
f64,f64,f64
22.0,7.0,29.25
38.0,71.0,109.2833
26.0,8.0,33.925
35.0,53.0,88.1
35.0,8.0,43.05


In [32]:
(
    df
    .with_columns(
        pl.col(pl.Float64).round(0).name.suffix("_round")
    )
    .select(
        'Age', 'Age_round', 'Fare', 'Fare_round'
    )
    .head()
)

Age,Age_round,Fare,Fare_round
f64,f64,f64,f64
22.0,22.0,7.25,7.0
38.0,38.0,71.2833,71.0
26.0,26.0,7.925,8.0
35.0,35.0,53.1,53.0
35.0,35.0,8.05,8.0


### Selecting columns: Adding a new columns based on mapping or condition

#### Add a new column based on a mapping from another column

In [37]:
(
    df
    .select(
        'Embarked'
    )
    .unique()
)

Embarked
str
"""C"""
""
"""S"""
"""Q"""


In [39]:
# replace the values

replace_dict = {
    "S" : "Southampton",
    "C" : "Cherbourg"
}

(
    df
    .with_columns(
        pl.col("Embarked").replace(replace_dict).alias("Embarked_full")
    )
    .select(
        "Embarked", "Embarked_full"
    )
    .head(5)
)

Embarked,Embarked_full
str,str
"""S""","""Southampton"""
"""C""","""Cherbourg"""
"""S""","""Southampton"""
"""S""","""Southampton"""
"""S""","""Southampton"""


In [40]:
# repalce value to other dtypes
replace_dict = {
    1:'first',
    2:'second',
    3:'third'
}

(
    df
    .with_columns(
        pl.col('Pclass').replace_strict(replace_dict, return_dtype=pl.String).alias('class_name')
    )
    .select(
        'Pclass', 'class_name'
    )
    .head()
)

Pclass,class_name
i64,str
3,"""third"""
1,"""first"""
3,"""third"""
1,"""first"""
3,"""third"""


#### Add a new column based on a condition on another column

`pl.when.then.otherwise` -> define a new column based on a condition on nore or more other columns.

In [47]:
(
    df
    .select(
        pl.all(),
        pl.when(pl.col('Pclass') == 1)
        .then(1)
        .otherwise(0)
        .alias("First_class?")
    )
    .select(
        pl.col('Pclass'), pl.col('First_class?')
    )
    .head()
)

Pclass,First_class?
i64,i32
3,0
1,1
3,0
1,1
3,0


In [48]:
(
    df
    .select(
        "Pclass",
        pl.when(
             pl.col("Pclass") == 1
        )
        .then(pl.lit("first"))
        .otherwise(pl.lit("not_first"))
        .alias("first_class")
    )
    .select("Pclass","first_class")
    .head(2)
)

Pclass,first_class
i64,str
3,"""not_first"""
1,"""first"""


In [50]:
(
    df
    .select(
        pl.col("Pclass"),
        pl.col('Age'),
        pl.when(
            pl.col('Pclass') == 1,
            pl.col('Age') < 30
        )
        .then(1)
        .otherwise(0)
        .alias("new")
    )
    .tail()
)

Pclass,Age,new
i64,f64,i32
2,27.0,0
1,19.0,1
3,,0
1,26.0,1
3,32.0,0


In [51]:
(
    df
    .select(
        pl.col("Pclass"),
        pl.col("Age"),
        pl.when(
            pl.col("Pclass") == 1,
            pl.col("Age")<30
        )
        .then(1)
        .when(
            pl.col("Pclass") == 1,
            pl.col("Age")>=30
        )
        .then(2)
        .otherwise(0)
        .alias("age_class")
    )
    .head(5)
)

Pclass,Age,age_class
i64,f64,i32
3,22.0,0
1,38.0,2
3,26.0,0
1,35.0,2
3,35.0,0


### Sorting

#### Sorting a `DataFrame`

In [2]:
df.sort('Age')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""
18,1,2,"""Williams, Mr. Charles Eugene""","""male""",,0,0,"""244373""",13.0,,"""S"""
20,1,3,"""Masselmani, Mrs. Fatima""","""female""",,0,0,"""2649""",7.225,,"""C"""
27,0,3,"""Emir, Mr. Farred Chehab""","""male""",,0,0,"""2631""",7.225,,"""C"""
29,1,3,"""O'Dwyer, Miss. Ellen ""Nellie""""","""female""",,0,0,"""330959""",7.8792,,"""Q"""
…,…,…,…,…,…,…,…,…,…,…,…
117,0,3,"""Connors, Mr. Patrick""","""male""",70.5,0,0,"""370369""",7.75,,"""Q"""
97,0,1,"""Goldschmidt, Mr. George B""","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, Mr. Ramon""","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""
852,0,3,"""Svensson, Mr. Johan""","""male""",74.0,0,0,"""347060""",7.775,,"""S"""


By default `null` values are at the start of the sort. <p>
We can move the `nulls` to the end of the sort by setting the `null_last` arg to `True`

In [5]:
df.sort('Age', nulls_last=True)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
804,1,3,"""Thomas, Master. Assad Alexande…","""male""",0.42,0,1,"""2625""",8.5167,,"""C"""
756,1,2,"""Hamalainen, Master. Viljo""","""male""",0.67,1,1,"""250649""",14.5,,"""S"""
470,1,3,"""Baclini, Miss. Helene Barbara""","""female""",0.75,2,1,"""2666""",19.2583,,"""C"""
645,1,3,"""Baclini, Miss. Eugenie""","""female""",0.75,2,1,"""2666""",19.2583,,"""C"""
79,1,2,"""Caldwell, Master. Alden Gates""","""male""",0.83,0,2,"""248738""",29.0,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
860,0,3,"""Razi, Mr. Raihed""","""male""",,0,0,"""2629""",7.2292,,"""C"""
864,0,3,"""Sage, Miss. Dorothy Edith ""Dol…","""female""",,8,2,"""CA. 2343""",69.55,,"""S"""
869,0,3,"""van Melkebeke, Mr. Philemon""","""male""",,0,0,"""345777""",9.5,,"""S"""
879,0,3,"""Laleff, Mr. Kristo""","""male""",,0,0,"""349217""",7.8958,,"""S"""


In [7]:
df.sort('Age', descending=True, nulls_last=True)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
631,1,1,"""Barkworth, Mr. Algernon Henry …","""male""",80.0,0,0,"""27042""",30.0,"""A23""","""S"""
852,0,3,"""Svensson, Mr. Johan""","""male""",74.0,0,0,"""347060""",7.775,,"""S"""
97,0,1,"""Goldschmidt, Mr. George B""","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, Mr. Ramon""","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""
117,0,3,"""Connors, Mr. Patrick""","""male""",70.5,0,0,"""370369""",7.75,,"""Q"""
…,…,…,…,…,…,…,…,…,…,…,…
860,0,3,"""Razi, Mr. Raihed""","""male""",,0,0,"""2629""",7.2292,,"""C"""
864,0,3,"""Sage, Miss. Dorothy Edith ""Dol…","""female""",,8,2,"""CA. 2343""",69.55,,"""S"""
869,0,3,"""van Melkebeke, Mr. Philemon""","""male""",,0,0,"""345777""",9.5,,"""S"""
879,0,3,"""Laleff, Mr. Kristo""","""male""",,0,0,"""349217""",7.8958,,"""S"""


#### Sort on mulitple columns

In [8]:
df.sort(['Pclass', 'Age'])

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
296,0,1,"""Lewy, Mr. Ervin G""","""male""",,0,0,"""PC 17612""",27.7208,,"""C"""
603,0,1,"""Harrington, Mr. Charles H""","""male""",,0,0,"""113796""",42.4,,"""S"""
458,1,1,"""Kenyon, Mrs. Frederick R (Mari…","""female""",,1,0,"""17464""",51.8625,"""D21""","""S"""
794,0,1,"""Hoyt, Mr. William Fisher""","""male""",,0,0,"""PC 17600""",30.6958,,"""C"""
712,0,1,"""Klaber, Mr. Herman""","""male""",,0,0,"""113028""",26.55,"""C124""","""S"""
…,…,…,…,…,…,…,…,…,…,…,…
327,0,3,"""Nysveen, Mr. Johan Hansen""","""male""",61.0,0,0,"""345364""",6.2375,,"""S"""
484,1,3,"""Turkula, Mrs. (Hedwig)""","""female""",63.0,0,0,"""4134""",9.5875,,"""S"""
281,0,3,"""Duane, Mr. Frank""","""male""",65.0,0,0,"""336439""",7.75,,"""Q"""
117,0,3,"""Connors, Mr. Patrick""","""male""",70.5,0,0,"""370369""",7.75,,"""Q"""


In [9]:
# or
df.sort('Pclass', 'Age')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
296,0,1,"""Lewy, Mr. Ervin G""","""male""",,0,0,"""PC 17612""",27.7208,,"""C"""
603,0,1,"""Harrington, Mr. Charles H""","""male""",,0,0,"""113796""",42.4,,"""S"""
458,1,1,"""Kenyon, Mrs. Frederick R (Mari…","""female""",,1,0,"""17464""",51.8625,"""D21""","""S"""
794,0,1,"""Hoyt, Mr. William Fisher""","""male""",,0,0,"""PC 17600""",30.6958,,"""C"""
712,0,1,"""Klaber, Mr. Herman""","""male""",,0,0,"""113028""",26.55,"""C124""","""S"""
…,…,…,…,…,…,…,…,…,…,…,…
327,0,3,"""Nysveen, Mr. Johan Hansen""","""male""",61.0,0,0,"""345364""",6.2375,,"""S"""
484,1,3,"""Turkula, Mrs. (Hedwig)""","""female""",63.0,0,0,"""4134""",9.5875,,"""S"""
281,0,3,"""Duane, Mr. Frank""","""male""",65.0,0,0,"""336439""",7.75,,"""Q"""
117,0,3,"""Connors, Mr. Patrick""","""male""",70.5,0,0,"""370369""",7.75,,"""Q"""


#### Sorting a columns with an expression

In [11]:
(
    df
    .select(
        pl.all().sort()
    )
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,1,"""Abbing, Mr. Anthony""","""female""",,0,0,"""110152""",0.0,,
2,0,1,"""Abbott, Mr. Rossmore Edward""","""female""",,0,0,"""110152""",0.0,,
3,0,1,"""Abbott, Mrs. Stanton (Rosa Hun…","""female""",,0,0,"""110152""",0.0,,"""C"""
4,0,1,"""Abelson, Mr. Samuel""","""female""",,0,0,"""110413""",0.0,,"""C"""
5,0,1,"""Abelson, Mrs. Samuel (Hannah W…","""female""",,0,0,"""110413""",0.0,,"""C"""


In [14]:
(
    df
    .select(
        pl.all().sort_by('Age', nulls_last=True)
    )
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
804,1,3,"""Thomas, Master. Assad Alexande…","""male""",0.42,0,1,"""2625""",8.5167,,"""C"""
756,1,2,"""Hamalainen, Master. Viljo""","""male""",0.67,1,1,"""250649""",14.5,,"""S"""
470,1,3,"""Baclini, Miss. Helene Barbara""","""female""",0.75,2,1,"""2666""",19.2583,,"""C"""
645,1,3,"""Baclini, Miss. Eugenie""","""female""",0.75,2,1,"""2666""",19.2583,,"""C"""
79,1,2,"""Caldwell, Master. Alden Gates""","""male""",0.83,0,2,"""248738""",29.0,,"""S"""


In [18]:
(
    df
    .group_by("Pclass")
    .agg(
        pl.col('Name').sort_by('Age').last(),
        pl.col('Age').sort_by('Age').last()
    )
)

Pclass,Name,Age
i64,str,f64
1,"""Barkworth, Mr. Algernon Henry …",80.0
3,"""Svensson, Mr. Johan""",74.0
2,"""Mitchell, Mr. Henry Michael""",70.0


#### Fitering for the largest/smallest values

In [19]:
# using head or tail
(
    df
    .sort('Age')
    .tail(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
494,0,1,"""Artagaveytia, Mr. Ramon""","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""
852,0,3,"""Svensson, Mr. Johan""","""male""",74.0,0,0,"""347060""",7.775,,"""S"""
631,1,1,"""Barkworth, Mr. Algernon Henry …","""male""",80.0,0,0,"""27042""",30.0,"""A23""","""S"""


In [20]:
# more faster approach
(
    df
    .top_k(
        k = 5,
        by = 'Age',
        # Return the largest records
        reverse = False
    )
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
631,1,1,"""Barkworth, Mr. Algernon Henry …","""male""",80.0,0,0,"""27042""",30.0,"""A23""","""S"""
852,0,3,"""Svensson, Mr. Johan""","""male""",74.0,0,0,"""347060""",7.775,,"""S"""
97,0,1,"""Goldschmidt, Mr. George B""","""male""",71.0,0,0,"""PC 17754""",34.6542,"""A5""","""C"""
494,0,1,"""Artagaveytia, Mr. Ramon""","""male""",71.0,0,0,"""PC 17609""",49.5042,,"""C"""
117,0,3,"""Connors, Mr. Patrick""","""male""",70.5,0,0,"""370369""",7.75,,"""Q"""


#### Taking advantage of sorted data
For some operations Polars can use a fast tack algorithm if it knows the data in a column is sorted.

In [21]:
# checking the sorted status
df['PassengerId'].flags

{'SORTED_ASC': False, 'SORTED_DESC': False}

In [22]:
df['PassengerId'].is_sorted()

True

In [23]:
df = (
    pl.read_csv(csv_file)
    .with_columns(
        pl.col("PassengerId").set_sorted()
    )
)
df["PassengerId"].flags

{'SORTED_ASC': True, 'SORTED_DESC': False}

In [24]:
df = (
    pl.read_csv(csv_file)
    .sort("PassengerId")
)
df["PassengerId"].flags

{'SORTED_ASC': True, 'SORTED_DESC': False}

In [25]:
(
    df
    .select(
        pl.col("PassengerId").set_sorted().max()
    )
)

PassengerId
i64
891


In [26]:
import numpy as np

N = 10_000_000
sorted_array = np.sort(np.random.standard_normal(N))
df_sort = pl.DataFrame({"known_sorted": sorted_array, "unknown_sorted": sorted_array})

df_sort.head(3)

known_sorted,unknown_sorted
f64,f64
-5.620574,-5.620574
-5.580196,-5.580196
-5.464528,-5.464528


In [27]:
df_sort["known_sorted"].flags

{'SORTED_ASC': False, 'SORTED_DESC': False}

In [28]:
df_sort = (
    df_sort
    .with_columns(
        pl.col('known_sorted').set_sorted()
    )
)

In [29]:
df_sort["known_sorted"].flags

{'SORTED_ASC': True, 'SORTED_DESC': False}

In [30]:
%%timeit -n1 -r5
(
    df_sort
    .select(
        pl.col("known_sorted").median()
    )
)

The slowest run took 17.59 times longer than the fastest. This could mean that an intermediate result is being cached.
138 μs ± 116 μs per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [31]:
%%timeit -n1 -r5
(
    df_sort
    .select(
        pl.col("unknown_sorted").median()
    )
)

25 ms ± 5.63 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [32]:
%%timeit -n1 -r5
(
    df_sort
    .filter(
        pl.col("known_sorted") < -2
    )
)

The slowest run took 24.58 times longer than the fastest. This could mean that an intermediate result is being cached.
1.16 ms ± 1.89 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [33]:
%%timeit -n1 -r5
(
    df_sort
    .filter(
        pl.col("unknown_sorted") < -2
    )
)

3.15 ms ± 389 μs per loop (mean ± std. dev. of 5 runs, 1 loop each)


### Transforming a `DataFrame`

#### Renaming columns

In [34]:
(
    df
    .rename(
        {"PassengerId" : "ID"}
    )
    .head(2)
)

ID,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


#### Dropping columns

In [35]:
(
    df
    .drop(
        "PassengerId", "Pclass"
    )
    .head(2)
)

Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,str,str,f64,i64,i64,str,f64,str,str
0,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


#### Re-ordering columns

In [37]:
sorted(df.columns)

['Age',
 'Cabin',
 'Embarked',
 'Fare',
 'Name',
 'Parch',
 'PassengerId',
 'Pclass',
 'Sex',
 'SibSp',
 'Survived',
 'Ticket']

In [38]:
(
    df
    .select(
        sorted(df.columns)
    )
    .head()
)

Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
f64,str,str,f64,str,i64,i64,i64,str,i64,i64,str
22.0,,"""S""",7.25,"""Braund, Mr. Owen Harris""",0,1,3,"""male""",1,0,"""A/5 21171"""
38.0,"""C85""","""C""",71.2833,"""Cumings, Mrs. John Bradley (Fl…",0,2,1,"""female""",1,1,"""PC 17599"""
26.0,,"""S""",7.925,"""Heikkinen, Miss. Laina""",0,3,3,"""female""",0,1,"""STON/O2. 3101282"""
35.0,"""C123""","""S""",53.1,"""Futrelle, Mrs. Jacques Heath (…",0,4,1,"""female""",1,1,"""113803"""
35.0,,"""S""",8.05,"""Allen, Mr. William Henry""",0,5,3,"""male""",0,0,"""373450"""


#### Changinf dtypes

We can change dtypes within an expression using `pl.col(...).cast()` <p>
but we can also call `cast` with a `dict` argument on a DataFrame.

In [41]:
(
    df
    .cast(
        {
            "Survived" : pl.Utf8
        }
    )
    .select(
        pl.col("Survived")
    )
    .head()
)

Survived
str
"""0"""
"""1"""
"""1"""
"""1"""
"""0"""


In [42]:
(
    df
    .with_columns(
        pl.col('Survived').cast(pl.Utf8).alias("Utf8_Survived")
    )
    .select(
        pl.col('Survived', 'Utf8_Survived')
    )
    .head(3)
)

Survived,Utf8_Survived
i64,str
0,"""0"""
1,"""1"""
1,"""1"""


#### Transforming `DataFrames` in a function

We may want to capture some `DataFrame` transformations in a function. This can be to:
- re-use the same transformations multiple times
- make code easier to read or
- make the transformations testable

If our function:
- takes a `DataFrame` (and some other optional arguments) as an input and
- outputs a `DataFrame`
then we can use the `pipe` method.

In this example we define a function that makes all string columns uppercaseb

In [43]:
def uppercase_all_strings(df):
    return(
        df.with_columns(
            pl.col(pl.Utf8).str.to_uppercase()
        )
    )

We can `pipe` the `DataFrame` to this function as follows

In [45]:
(
    df
    .pipe(uppercase_all_strings)
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""BRAUND, MR. OWEN HARRIS""","""MALE""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""CUMINGS, MRS. JOHN BRADLEY (FL…","""FEMALE""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""HEIKKINEN, MISS. LAINA""","""FEMALE""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""FUTRELLE, MRS. JACQUES HEATH (…","""FEMALE""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""ALLEN, MR. WILLIAM HENRY""","""MALE""",35.0,0,0,"""373450""",8.05,,"""S"""


#### Function arguments using `pipe`

In [52]:
def _multiply_floats(df : pl.DataFrame, multiplication_factor : int) -> pl.DataFrame: # -> 는 반환 타입을 의미한다
    return (
        df
        .select(
            pl.col(pl.Float64)
        ) * multiplication_factor
    )

In [53]:
(
    df
    .pipe(
        _multiply_floats, multiplication_factor = 3
    )
    .head(3)
)

Age,Fare
f64,f64
66.0,21.75
114.0,213.8499
78.0,23.775


In [59]:
def lower(df : pl.DataFrame) -> pl.DataFrame:
    return(
        df
        .rename(
            {oldcol : oldcol.lower() for oldcol in df.columns}
        )
    )

(
    df
    .pipe(lower)
    .head()
)

passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


### Iterating through a DataFrame

#### Iterating overa single column

In [61]:
ages = [age for age in df['Age']]
ages[:3]

[22.0, 38.0, 26.0]

#### Iterating over multiple columns

In [62]:
df.rows() # 모든 행을 튜플로 반환.. 각 행이 하나의 튜플로 표현

[(1,
  0,
  3,
  'Braund, Mr. Owen Harris',
  'male',
  22.0,
  1,
  0,
  'A/5 21171',
  7.25,
  None,
  'S'),
 (2,
  1,
  1,
  'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
  'female',
  38.0,
  1,
  0,
  'PC 17599',
  71.2833,
  'C85',
  'C'),
 (3,
  1,
  3,
  'Heikkinen, Miss. Laina',
  'female',
  26.0,
  0,
  0,
  'STON/O2. 3101282',
  7.925,
  None,
  'S'),
 (4,
  1,
  1,
  'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
  'female',
  35.0,
  1,
  0,
  '113803',
  53.1,
  'C123',
  'S'),
 (5,
  0,
  3,
  'Allen, Mr. William Henry',
  'male',
  35.0,
  0,
  0,
  '373450',
  8.05,
  None,
  'S'),
 (6,
  0,
  3,
  'Moran, Mr. James',
  'male',
  None,
  0,
  0,
  '330877',
  8.4583,
  None,
  'Q'),
 (7,
  0,
  1,
  'McCarthy, Mr. Timothy J',
  'male',
  54.0,
  0,
  0,
  '17463',
  51.8625,
  'E46',
  'S'),
 (8,
  0,
  3,
  'Palsson, Master. Gosta Leonard',
  'male',
  2.0,
  3,
  1,
  '349909',
  21.075,
  None,
  'S'),
 (9,
  1,
  3,
  'Johnson, Mrs. Oscar W (Elisabeth Vi

In [76]:
name_age = [(row[3], row[5]) for row in df.rows()]
name_age[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

In [77]:
name_age = [(row[3], row[5]) for row in df.iter_rows()] # reduce memory use
name_age[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

In [72]:
nameAge = [(row["Name"],row["Age"]) for row in df.rows(named=True)]
nameAge[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

In [73]:
nameAge = [(row["Name"],row["Age"]) for row in df.iter_rows(named=True)]
nameAge[:3]

[('Braund, Mr. Owen Harris', 22.0),
 ('Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0),
 ('Heikkinen, Miss. Laina', 26.0)]

### User defined functions (`map_elements` and `map_batches`)

#### Element-wise functions

In [78]:
def square(x):
    return x ** 2

In [79]:
(
    df
    .with_columns(
        pl.col('Age').map_elements(square).alias('age_squared')
    )
    .select(
        'Age', 'age_squared'
    )
    .head()
)

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("Age").map_elements(square)
with this one instead:
  + pl.col("Age") ** 2

  pl.col('Age').map_elements(square).alias('age_squared')
  .with_columns(


Age,age_squared
f64,f64
22.0,484.0
38.0,1444.0
26.0,676.0
35.0,1225.0
35.0,1225.0


In [80]:
(
    df.with_columns(
        age_squared = pl.col("Age").map_elements(square,return_dtype=pl.Float64)
    )
    .select("Age","age_squared")
    .head()
)

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("Age").map_elements(square)
with this one instead:
  + pl.col("Age") ** 2

  age_squared = pl.col("Age").map_elements(square,return_dtype=pl.Float64)


Age,age_squared
f64,f64
22.0,484.0
38.0,1444.0
26.0,676.0
35.0,1225.0
35.0,1225.0


In [83]:
def test(df : pl.DataFrame)->pl.DataFrame:
    return(
        df
        .with_columns(
            (pl.col('Age') ** 2).alias('Age_squared')
        )
    )


In [86]:
(
    df
    .pipe(
        test
    )
    .select(
        'Age', 'Age_squared'
    )
    .head()
)

Age,Age_squared
f64,f64
22.0,484.0
38.0,1444.0
26.0,676.0
35.0,1225.0
35.0,1225.0


In [87]:
(
    df.with_columns(
        age_squared = pl.col("Age").map_elements(lambda x: x**2,return_dtype=pl.Float64)
    )
    .select("Age","age_squared")
    .head()
)

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("Age").map_elements(lambda x: ...)
with this one instead:
  + pl.col("Age") ** 2

  age_squared = pl.col("Age").map_elements(lambda x: x**2,return_dtype=pl.Float64)


Age,age_squared
f64,f64
22.0,484.0
38.0,1444.0
26.0,676.0
35.0,1225.0
35.0,1225.0


In [91]:
(
    df
    .with_columns(
        pl.col("Age","Fare").map_elements(lambda x: x**2,return_dtype=pl.Float64).name.suffix("_squared")
    )
    .select("Age","Age_squared","Fare","Fare_squared")
    .head()
)

Age,Age_squared,Fare,Fare_squared
f64,f64,f64,f64
22.0,484.0,7.25,52.5625
38.0,1444.0,71.2833,5081.308859
26.0,676.0,7.925,62.805625
35.0,1225.0,53.1,2819.61
35.0,1225.0,8.05,64.8025


In [93]:
(
    df
    .select(
        pl.struct(pl.col('Age'), pl.col('Fare'))
    )
)

Age
struct[2]
"{22.0,7.25}"
"{38.0,71.2833}"
"{26.0,7.925}"
"{35.0,53.1}"
"{35.0,8.05}"
…
"{27.0,13.0}"
"{19.0,30.0}"
"{null,23.45}"
"{26.0,30.0}"


In [94]:
def sum_age_fare(struct:dict)->float:
    # We check if both values are floats
    if isinstance(struct["Age"],float) and isinstance(struct["Fare"],float):
        # If they are we add them
        return struct["Age"] + struct["Fare"]
    else:
        # If there is a null value return a null
        return None

In [95]:
(
    df
    .with_columns(
        age_fare_summed = pl.struct(pl.col("Age"),pl.col("Fare")).map_elements(sum_age_fare,return_dtype=float)
    )
    .select("Age","Fare","age_fare_summed")
    .head(6)
)

Age,Fare,age_fare_summed
f64,f64,f64
22.0,7.25,29.25
38.0,71.2833,109.2833
26.0,7.925,33.925
35.0,53.1,88.1
35.0,8.05,43.05
,8.4583,


In [98]:
(
    df
    .with_columns(
        (pl.col('Age') + pl.col('Fare')).alias('sum_age').cast(pl.Float64)
    )
    .select(
        'Age', 'Fare', 'sum_age'
    )
    .head(6)
)

Age,Fare,sum_age
f64,f64,f64
22.0,7.25,29.25
38.0,71.2833,109.2833
26.0,7.925,33.925
35.0,53.1,88.1
35.0,8.05,43.05
,8.4583,


#### Numpy ufuncs

In [100]:
(
    df
    .with_columns(
        np.power(pl.col('Age'), 2).alias('age_sq')
    )
    .select(
        'Age', 'age_sq'
    )
    .head()
)

Age,age_sq
f64,f64
22.0,484.0
38.0,1444.0
26.0,676.0
35.0,1225.0
35.0,1225.0


#### Applying functions to a `Series`
With `map_elements` the function works one-row at a time.

In other cases we want to apply user-defined functions that work on an entire `Series` at once. For this case we use `map_batches`.

In [105]:
def normallize_column(s:pl.Series)->pl.Series:
    mean = s.mean()
    std = s.std()
    return(s-mean)/std

(
    df
    .with_columns(
        pl.col('Age').map_batches(normallize_column).alias('Nor_Age')
    )
    .select(
        'Age', 'Nor_Age'
    )
    .head()
)

Age,Nor_Age
f64,f64
22.0,-0.530005
38.0,0.57143
26.0,-0.254646
35.0,0.364911
35.0,0.364911
