# 레시피 10 : 컬럼 선택 select 함수

In [2]:
import polars as pl 

# Load the Titanic passenger data from CSV into a DataFrame.
df = pl.read_csv('data/titanic_dataset.csv')

# Basic column selection: pass the wanted column names straight to select().
df.select('Name', 'Age', 'Survived').head(2)

Name,Age,Survived
str,f64,i64
"""Braund, Mr. Owen Harris""",22.0,0
"""Cumings, Mrs. John Bradley (Fl…",38.0,1


In [6]:
# Other ways to select several columns.
df.select(pl.col(['Name', 'Age'])).head(2)  # pl.col given a list of column names

Name,Age
str,f64
"""Braund, Mr. Owen Harris""",22.0
"""Cumings, Mrs. John Bradley (Fl…",38.0


In [7]:
df.select(pl.all()).head(2)  # pl.all() selects every column

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [7]:
df.select(pl.exclude('PassengerId')).head(2)  # every column except PassengerId

Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,str,str,f64,i64,i64,str,f64,str,str
0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [15]:
# Select columns by data type.
df.select(pl.col(pl.Float64)).head(1)  # every Float64 column (Age and Fare in this dataset)

Age,Fare
f64,f64
22.0,7.25


In [8]:
# Select columns and transform them in the same call.
df.select(
    pl.col('Age').alias('age_years'),   # output Age under the name age_years
    pl.col('Fare').round(2),            # round to 2 decimal places
    pl.col('Name').str.to_uppercase(),  # convert to upper case
).head(1)

age_years,Fare,Name
f64,f64,str
22.0,7.25,"""BRAUND, MR. OWEN HARRIS"""


# 레시피 11 : 새로운 컬럼 추가 및 변경 with_columns()

In [6]:
# Append a new column Age_Doubled holding Age * 2; all existing columns are kept.
df.with_columns((pl.col("Age") * 2).alias("Age_Doubled")).head(2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Doubled
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",44.0
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",76.0


In [9]:
# Transform columns while keeping the rest of the frame.
# with_columns keeps every existing column: an expression whose output name
# already exists replaces that column, and a new name is appended at the end.
df.with_columns(
    pl.col('Age').alias('age_years'),   # appended as a new column; Age itself stays
    pl.col('Fare').round(2),            # replaces Fare with values rounded to 2 decimals
    pl.col('Name').str.to_uppercase(),  # replaces Name with its upper-case form
).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_years
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64
1,0,3,"""BRAUND, MR. OWEN HARRIS""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",22.0


# 레시피 12 : 정규표현식을 활용한 컬럼 선택

In [10]:
# Select columns with a regular expression applied to the column name.
# ^: start of the name
# [a-zA-Z]: one ASCII letter, upper or lower case
# {0, 4}: repeated 0 to 4 times
# $: end of the name
# => selects columns whose names consist only of letters and are at most 4 characters long
# NOTE(review): the space inside "{0, 4}" is accepted here (the output lists Name, Sex, Age, Fare),
# but many regex engines require the canonical "{0,4}" - consider dropping the space.
df.select(pl.col(r'^[a-zA-Z]{0, 4}$')).head(2)

Name,Sex,Age,Fare
str,str,f64,f64
"""Braund, Mr. Owen Harris""","""male""",22.0,7.25
"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,71.2833


In [56]:
# ^S: the name starts with S
# .*: followed by any character (.) repeated zero or more times (*)
# $: end of the name
# => selects every column whose name starts with S (Sex, Survived, SibSp)
df.select(pl.col(r"^S.*$")).head(2)

Survived,Sex,SibSp
i64,str,i64
0,"""male""",1
1,"""female""",1


In [59]:
# Select every column whose name starts with S (regex ^S.*$).
df.select(pl.col(r"^S.*$")).head(2)

Survived,Sex,SibSp
i64,str,i64
0,"""male""",1
1,"""female""",1


In [60]:
# Select columns whose names end with d.
# .*: any character (.) repeated zero or more times (*) in front
# d$: the name ends with d
df.select(pl.col(r"^.*d$")).head(2)

PassengerId,Survived,Embarked
i64,i64,str
1,0,"""S"""
2,1,"""C"""


# 레시피 13 : polars selectors 

In [4]:
import polars.selectors as cs 

# Example: select only the columns with a numeric data type and show the first two rows.
df.select(cs.numeric()).head(2)

PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
i64,i64,i64,f64,i64,i64,f64
1,0,3,22.0,1,0,7.25
2,1,1,38.0,1,0,71.2833


In [5]:
# Example: cs.matches with the case-insensitive flag (?i) selects columns whose name contains 'a' or 'A'.
df.select(cs.matches(r"(?i).*a.*")).head(2)

PassengerId,Pclass,Name,Age,Parch,Fare,Cabin,Embarked
i64,i64,str,f64,i64,f64,str,str
1,3,"""Braund, Mr. Owen Harris""",22.0,0,7.25,,"""S"""
2,1,"""Cumings, Mrs. John Bradley (Fl…",38.0,0,71.2833,"""C85""","""C"""


In [6]:
# Select the columns whose names start with P.
df.select(cs.starts_with("P")).head(2)

PassengerId,Pclass,Parch
i64,i64,i64
1,3,0
2,1,0


# 레시피 14 : 조건에 맞는 행 추출 filter 함수

In [8]:
# Select passengers aged 30 or older.
# Uses >= (not >) so that passengers who are exactly 30 are included,
# matching the stated "30 or older" intent.
df.filter(pl.col("Age") >= 30).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [10]:
# Select passengers whose fare is below 20.
df.filter(pl.col("Fare") < 20).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""


In [11]:
# Select only the female passengers.
df.filter(pl.col("Sex") == "female").head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [14]:
# Select passengers who are not in 3rd class (Pclass != 3).
df.filter(pl.col("Pclass") != 3).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [17]:
# Select passengers whose Age is null (missing value).
df.filter(pl.col("Age").is_null()).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
6,0,3,"""Moran, Mr. James""","""male""",,0,0,"""330877""",8.4583,,"""Q"""


In [19]:
# Select only passengers whose Cabin is not null.
df.filter(pl.col("Cabin").is_not_null()).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [21]:
# Select passengers whose Name contains "Smith".
df.filter(pl.col("Name").str.contains("Smith")).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
175,0,1,"""Smith, Mr. James Clinch""","""male""",56.0,0,0,"""17764""",30.6958,"""A7""","""C"""


In [22]:
# Select passengers whose Embarked value is "C" or "Q".
df.filter(pl.col("Embarked").is_in(["C", "Q"])).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [23]:
# Select female passengers aged 18 or older.
# Uses >= (not >) so that passengers who are exactly 18 are included,
# matching the stated "18 or older" intent. Each comparison is parenthesised
# because & binds more tightly than > / >= / == in Python.
df.filter((pl.col("Age") >= 18) & (pl.col("Sex") == "female")).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [25]:
# Select passengers whose fare is above 100 or who travel in 1st class.
df.filter((pl.col("Fare") > 100) | (pl.col("Pclass") == 1)).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
