## 05. Selecting and transformations

### Selecting columns using `[]`

In [1]:
import polars as pl
# Path is relative to the notebook's working directory.
csv_file = './data/titanic.csv'
# Read the CSV into a DataFrame; the dtypes shown under each column name are inferred by read_csv.
df = pl.read_csv(csv_file)
# Preview the first rows to check the column names and dtypes.
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


#### Choosing columns with square brackets

In [2]:
# Square brackets with a single column name pick out that one column.
# Note: head() on a single column shows 10 values by default, whereas DataFrame.head() shows 5 rows.
df['Age'].head()

Age
f64
22.0
38.0
26.0
35.0
35.0
""
54.0
2.0
27.0
14.0


In [3]:
# A list of names inside the brackets returns a DataFrame with just those columns, in the listed order.
df[['Survived', 'Age']].head()

Survived,Age
i64,f64
0,22.0
1,38.0
1,26.0
1,35.0
0,35.0


`[]`로 하는 indexing은 pandas와 비슷해서 패스...

### Selecting columns : using `select` and expressions

In [4]:
# Choose columns by name with select(); the output follows the order given here.
df.select(['Age', 'Survived']).head()

Age,Survived
f64,i64
22.0,0
38.0,1
26.0,1
35.0,1
35.0,0


#### Selecting and transforming a column with an expression

We can apply a transformation to a column before we output it. <p>
In this example we use the `round` expression to round the values of the `Fare` column

In [5]:
# pl.col('Fare') is an expression referring to the Fare column; it is selected
# unchanged here as a baseline for the rounding cells that follow.
(
    df
    .select(
        pl.col('Fare')
    )
    .head()
)

Fare
f64
7.25
71.2833
7.925
53.1
8.05


In [6]:
# round(0) rounds Fare to zero decimal places; the output column keeps the
# name Fare and the f64 dtype.
(
    df
    .select(
        pl.col('Fare').round(0)
    )
    .head()
)

Fare
f64
7.0
71.0
8.0
53.0
8.0


In [7]:
# Round Fare and give the result its own output name.
(
    df
    .select(
        pl.col('Fare').round(0).alias('roundedFare') # .alias renames the output column
    )
    .head()
)

roundedFare
f64
7.0
71.0
8.0
53.0
8.0


In [8]:
# Show the original Fare next to its rounded version.
(
    df
    .select(
        pl.col('Fare'),
        pl.col('Fare').round(0).alias('roundedFare')
        # separate expressions with commas to select several at once
    )
    .head(3)
)

Fare,roundedFare
f64,f64
7.25,7.0
71.2833,71.0
7.925,8.0


In [10]:
# single value: reduce Name to its first entry, then unwrap it as a Python object
(
    df
    .select(
        pl.col('Name').first()
    )
    .item() # returns the bare value; only valid when the frame has shape (1, 1)
)

'Braund, Mr. Owen Harris'

In [14]:
# Pull the Name column out of the frame and convert it to a plain Python list.
df.get_column('Name').to_list()

['Braund, Mr. Owen Harris',
 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
 'Heikkinen, Miss. Laina',
 'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
 'Allen, Mr. William Henry',
 'Moran, Mr. James',
 'McCarthy, Mr. Timothy J',
 'Palsson, Master. Gosta Leonard',
 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
 'Nasser, Mrs. Nicholas (Adele Achem)',
 'Sandstrom, Miss. Marguerite Rut',
 'Bonnell, Miss. Elizabeth',
 'Saundercock, Mr. William Henry',
 'Andersson, Mr. Anders Johan',
 'Vestrom, Miss. Hulda Amanda Adolfina',
 'Hewlett, Mrs. (Mary D Kingcome) ',
 'Rice, Master. Eugene',
 'Williams, Mr. Charles Eugene',
 'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)',
 'Masselmani, Mrs. Fatima',
 'Fynney, Mr. Joseph J',
 'Beesley, Mr. Lawrence',
 'McGowan, Miss. Anna "Annie"',
 'Sloper, Mr. William Thompson',
 'Palsson, Miss. Torborg Danira',
 'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)',
 'Emir, Mr. Farred Chehab',
 'Fortune, Mr. Charles Alexander',
 '

### Selecting columns : selecting multiple columns

In [15]:
import polars.selectors as cs # load the selectors API, which provides helpers for choosing columns

In [17]:
# Select all columns: pl.all() expands to every column of the frame.
(
    df
    .select(
        pl.all()
    )
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [18]:
# Exclude specific columns: pl.exclude(...) selects every column except the ones named.
(
    df
    .select(
        pl.exclude('Age', 'Survived', 'Pclass')
    )
    .head()
)

PassengerId,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,str,str,i64,i64,str,f64,str,str
1,"""Braund, Mr. Owen Harris""","""male""",1,0,"""A/5 21171""",7.25,,"""S"""
2,"""Cumings, Mrs. John Bradley (Fl…","""female""",1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,"""Heikkinen, Miss. Laina""","""female""",0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,"""Futrelle, Mrs. Jacques Heath (…","""female""",1,0,"""113803""",53.1,"""C123""","""S"""
5,"""Allen, Mr. William Henry""","""male""",0,0,"""373450""",8.05,,"""S"""


#### Selecting columns with a regex


To select columns by regex, pass a pattern that starts with `^` and ends with `$`. <p>
The pattern below matches columns starting with `P`; the regex wildcard `.*` lets `P` be followed by any sequence of characters.

In [19]:
# Columns whose names match the regex ^P.*$ (i.e. start with "P") are selected.
(
    df
    .select(pl.col("^P.*$"))
    .head(3)
)

PassengerId,Pclass,Parch
i64,i64,i64
1,3,0
2,1,0
3,3,0


In [20]:
# A regex selection is an expression too, so a transformation (here max)
# is applied to each of the matching columns.
(
    df
    .select(
        pl.col("^P.*$").max()
    )
)

PassengerId,Pclass,Parch
i64,i64,i64
891,3,6


#### Selecting columns based on dtype.

In [21]:
# Select every column whose dtype is String.
# pl.String is used instead of the legacy alias pl.Utf8 so this cell matches
# the replace_strict(..., return_dtype=pl.String) cell later in the notebook.
(
    df
    .select(
        pl.col(pl.String)
    )
    .head()
)

Name,Sex,Ticket,Cabin,Embarked
str,str,str,str,str
"""Braund, Mr. Owen Harris""","""male""","""A/5 21171""",,"""S"""
"""Cumings, Mrs. John Bradley (Fl…","""female""","""PC 17599""","""C85""","""C"""
"""Heikkinen, Miss. Laina""","""female""","""STON/O2. 3101282""",,"""S"""
"""Futrelle, Mrs. Jacques Heath (…","""female""","""113803""","""C123""","""S"""
"""Allen, Mr. William Henry""","""male""","""373450""",,"""S"""


In [22]:
# Select every column whose dtype is String or Float64; the columns come back
# in their original frame order, not grouped by dtype.
# pl.String is used instead of the legacy alias pl.Utf8 so this cell matches
# the replace_strict(..., return_dtype=pl.String) cell later in the notebook.
(
    df
    .select(
        pl.col([pl.String, pl.Float64])
    )
    .head()
)

Name,Sex,Age,Ticket,Fare,Cabin,Embarked
str,str,f64,str,f64,str,str
"""Braund, Mr. Owen Harris""","""male""",22.0,"""A/5 21171""",7.25,,"""S"""
"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,"""PC 17599""",71.2833,"""C85""","""C"""
"""Heikkinen, Miss. Laina""","""female""",26.0,"""STON/O2. 3101282""",7.925,,"""S"""
"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,"""113803""",53.1,"""C123""","""S"""
"""Allen, Mr. William Henry""","""male""",35.0,"""373450""",8.05,,"""S"""


### Selecting columns: Transforming and adding a column

* transform an `existing column` in place using `with_columns`
* add a column with a constant value using `pl.lit`

#### Transforming an existing column

In [24]:
# with_columns keeps every column. Because the expression's output name is
# still Fare, the rounded values replace the existing Fare column in the
# result (df itself is not modified; later cells show the original fares).
(
    df
    .with_columns(
        pl.col('Fare').round(0)
    )
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.0,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.0,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",8.0,,"""S"""


* select에서 변경하면 'fare'만 나오는데
* with_columns에서 변경하면 fare와 함께 모든 컬럼 나옴.

#### Adding a new column from an existing column

We can create a new column from an existing column by renaming it with `alias`

In [25]:
# Giving the expression a new name via alias appends it instead of overwriting Fare.
(
    df
    .with_columns(
        pl.col('Fare').round(0).alias('컬럼이름변경') # the original column is kept; the transformed values are added as a separate column under the new name
    )
    .head()
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,컬럼이름변경
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",7.0
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",71.0
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",8.0
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""",53.0
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S""",8.0


In [26]:
# Row-wise sum of Age and Fare, appended as a new column; the keyword
# argument supplies the output column name.
(
    df
    .with_columns(
        AgePlusFare=pl.col('Age') + pl.col('Fare')
    )
    .head(3)
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgePlusFare
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",29.25
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",109.2833
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",33.925


#### Adding a new column with a constant value

`pl.lit(값)` => 고정된 값을 모든 행에 적용할 때 사용

In [27]:
# pl.lit("yes") is a constant expression; as a new column it holds the same
# value in every row. The select afterwards trims the output to the two
# columns of interest.
(
    df
    .with_columns(
        pl.lit("yes").alias('aboard')
    )
    .select(
        'Name', 'aboard'
    )
    .head()
)

Name,aboard
str,str
"""Braund, Mr. Owen Harris""","""yes"""
"""Cumings, Mrs. John Bradley (Fl…","""yes"""
"""Heikkinen, Miss. Laina""","""yes"""
"""Futrelle, Mrs. Jacques Heath (…","""yes"""
"""Allen, Mr. William Henry""","""yes"""


### Selecting columns: Transforming and adding multiple columns

In [31]:
# Several expressions can be passed to one with_columns call.
# Both are evaluated against the input frame, so AgePlusFare uses the
# unrounded Fare (29.25 in the first row) even though Fare itself is shown rounded.
(
    df
    .with_columns(
        (pl.col('Age') + pl.col('Fare')).alias('AgePlusFare'),
        pl.col('Fare').round(0)
    )
    .select(
        'Age', 'Fare', 'AgePlusFare'
    )
    .head()
)

Age,Fare,AgePlusFare
f64,f64,f64
22.0,7.0,29.25
38.0,71.0,109.2833
26.0,8.0,33.925
35.0,53.0,88.1
35.0,8.0,43.05


In [32]:
# Round every Float64 column (Age and Fare here) in one expression.
# name.suffix("_round") appends the suffix to each output name, so the
# rounded values are added as new columns and the originals are kept.
(
    df
    .with_columns(
        pl.col(pl.Float64).round(0).name.suffix("_round")
    )
    .select(
        'Age', 'Age_round', 'Fare', 'Fare_round'
    )
    .head()
)

Age,Age_round,Fare,Fare_round
f64,f64,f64,f64
22.0,22.0,7.25,7.0
38.0,38.0,71.2833,71.0
26.0,26.0,7.925,8.0
35.0,35.0,53.1,53.0
35.0,35.0,8.05,8.0


### Selecting columns: Adding a new column based on mapping or condition

#### Add a new column based on a mapping from another column

In [37]:
# List the distinct values of Embarked before building a mapping for them;
# the output includes a null entry for passengers with no recorded port.
(
    df
    .select(
        'Embarked'
    )
    .unique()
)

Embarked
str
"""C"""
""
"""S"""
"""Q"""


In [39]:
# Replace the single-letter Embarked codes with full names via a mapping.

# Keys are existing values, values are their replacements.
replace_dict = {
    "S" : "Southampton",
    "C" : "Cherbourg"
}

# NOTE(review): values without a key in the mapping (e.g. "Q") are expected to
# pass through unchanged with replace() - confirm against the polars docs.
(
    df
    .with_columns(
        pl.col("Embarked").replace(replace_dict).alias("Embarked_full")
    )
    .select(
        "Embarked", "Embarked_full"
    )
    .head(5)
)

Embarked,Embarked_full
str,str
"""S""","""Southampton"""
"""C""","""Cherbourg"""
"""S""","""Southampton"""
"""S""","""Southampton"""
"""S""","""Southampton"""


In [40]:
# Replace values with values of a different dtype (integer class -> string label).
replace_dict = {
    1:'first',
    2:'second',
    3:'third'
}

# return_dtype=pl.String makes the new column a string column even though
# Pclass is i64.
# NOTE(review): replace_strict presumably requires every value to have a key
# in the mapping - confirm against the polars docs.
(
    df
    .with_columns(
        pl.col('Pclass').replace_strict(replace_dict, return_dtype=pl.String).alias('class_name')
    )
    .select(
        'Pclass', 'class_name'
    )
    .head()
)

Pclass,class_name
i64,str
3,"""third"""
1,"""first"""
3,"""third"""
1,"""first"""
3,"""third"""


#### Add a new column based on a condition on another column

`pl.when.then.otherwise` -> define a new column based on a condition on one or more other columns.

In [47]:
# Flag first-class passengers: 1 where Pclass == 1, otherwise 0.
# Only the two columns that are displayed are projected; the earlier version
# selected every column with pl.all() and then discarded all but these two.
(
    df
    .select(
        pl.col('Pclass'),
        pl.when(pl.col('Pclass') == 1)
        .then(1)
        .otherwise(0)
        .alias("First_class?")
    )
    .head()
)

Pclass,First_class?
i64,i32
3,0
1,1
3,0
1,1
3,0


In [48]:
# Label each passenger "first" or "not_first" based on Pclass.
# pl.lit(...) marks the strings as literal values rather than column names.
# The first select already yields exactly Pclass and first_class, so the
# redundant second select of the same two columns has been dropped.
(
    df
    .select(
        "Pclass",
        pl.when(
             pl.col("Pclass") == 1
        )
        .then(pl.lit("first"))
        .otherwise(pl.lit("not_first"))
        .alias("first_class")
    )
    .head(2)
)

Pclass,first_class
i64,str
3,"""not_first"""
1,"""first"""


In [50]:
# Flag rows that are first class AND younger than 30 with 1; all other rows get 0.
(
    df
    .select(
        'Pclass',
        'Age',
        pl.when((pl.col('Pclass') == 1) & (pl.col('Age') < 30))
        .then(1)
        .otherwise(0)
        .alias("new")
    )
    .tail()
)

Pclass,Age,new
i64,f64,i32
2,27.0,0
1,19.0,1
3,,0
1,26.0,1
3,32.0,0


In [51]:
# Chained when/then branches are checked in order:
#   first class and under 30   -> 1
#   first class and 30 or over -> 2
#   anything else              -> 0
(
    df
    .select(
        'Pclass',
        'Age',
        pl.when((pl.col("Pclass") == 1) & (pl.col("Age") < 30))
        .then(1)
        .when((pl.col("Pclass") == 1) & (pl.col("Age") >= 30))
        .then(2)
        .otherwise(0)
        .alias("age_class")
    )
    .head(5)
)

Pclass,Age,age_class
i64,f64,i32
3,22.0,0
1,38.0,2
3,26.0,0
1,35.0,2
3,35.0,0


### Sorting