# 레시피 15 : 사칙연산을 활용한 데이터 생성

In [30]:
import polars as pl

# Load the Titanic passenger data and preview its first row.
titanic_csv_path = "data/titanic_dataset.csv"
df = pl.read_csv(titanic_csv_path)
df.head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""


In [3]:
# Derived columns built from plain arithmetic / comparisons on expressions.
fare = pl.col("Fare")

# Fare with a 10% surcharge applied.
fare_with_surcharge = (fare * 1.1).alias("Fare_with_surcharge")

# Fare divided by Pclass (cabin class), for comparing fares across classes.
fare_per_class = (fare / pl.col("Pclass")).alias("Fare_per_class")

# Family size: siblings (SibSp) + parents/children (Parch) + the passenger.
family_size = (pl.col("SibSp") + pl.col("Parch") + 1).alias("Family_size")

# Minor flag: True when an Age that is present is below 18.
is_minor = (pl.col("Age") < 18).alias("IsMinor")

df.with_columns(
    [fare_with_surcharge, fare_per_class, family_size, is_minor]
).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_with_surcharge,Fare_per_class,Family_size,IsMinor
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64,f64,i64,bool
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",7.975,2.416667,2,False


# 레시피 16 : 집계 함수를 활용한 새로운 컬럼 생성

In [4]:
# Aggregations are broadcast to every row when used inside with_columns.
fare = pl.col("Fare")
highest_fare = fare.max()
mean_fare = fare.mean()

df.with_columns(
    [
        # Maximum Fare stored as "max_fare".
        highest_fare.alias("max_fare"),
        # Mean Fare stored as "ave_fare".
        mean_fare.alias("ave_fare"),
        # Gap between the maximum and the mean Fare stored as "Fare_diff".
        (highest_fare - mean_fare).alias("Fare_diff"),
    ]
).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,max_fare,ave_fare,Fare_diff
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64,f64,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",512.3292,32.204208,480.124992


# 레시피 17 : 행 인덱스 추가 with_row_index

In [6]:
# Prepend a row-number column; the arguments spell out the defaults
# (a column named "index" that starts counting at 0).
df.with_row_index(name="index", offset=0).head(1)

index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
0,1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""


# 레시피 18 : 데이터 수정

In [12]:
# rename: map existing column names to their new names.
new_names = {"Pclass": "Passenger_Class", "Fare": "Ticket_Price"}
df.rename(new_names).head(1)

PassengerId,Survived,Passenger_Class,Name,Sex,Age,SibSp,Parch,Ticket,Ticket_Price,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""


In [18]:
# Using map_elements: call a Python function on each Age value and
# store the resulting label in "age_group".
age_group = (
    pl.col("Age")
    .map_elements(
        lambda age: "Child" if age < 18 else "Adult",
        return_dtype=pl.String,
    )
    .alias("age_group")
)
df.with_columns(age_group).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_group
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""","""Adult"""


In [19]:
# Faster approach: a native when/then expression instead of a Python callback.
# The adult branch tests `Age >= 18` explicitly rather than using
# `.otherwise(...)`: a null Age matches neither condition and therefore stays
# null, instead of being silently labelled "Adult" (the Age column has
# missing values in this dataset).
df.with_columns(
    pl.when(pl.col("Age") < 18)
    .then(pl.lit("Child"))
    .when(pl.col("Age") >= 18)
    .then(pl.lit("Adult"))
    .alias("age_group")
).head(1)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,age_group
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""","""Adult"""


In [23]:
# 비교 
import polars as pl
import timeit
import numpy as np 

# Build the sample data: 100 million Age values cycling through 0-99.
# It lives under its own name so the Titanic `df` used by the other cells
# is not overwritten when the notebook is run from top to bottom.
bench_df = pl.DataFrame({"Age": np.arange(0, 100_000_000) % 100})


# map_elements() approach
def test_map_elements():
    """Label every Age with a per-element Python callback."""
    bench_df.with_columns(
        pl.col("Age")
        .map_elements(lambda x: "Child" if x < 18 else "Adult", return_dtype=pl.String)
        .alias("age_group")
    ).head(1)


# when().then().otherwise() approach
def test_when_then():
    """Label every Age with a native when/then/otherwise expression."""
    bench_df.with_columns(
        pl.when(pl.col("Age") < 18)
        .then(pl.lit("Child"))
        .otherwise(pl.lit("Adult"))
        .alias("age_group")
    ).head(1)


# Measure the total wall-clock time of 10 runs of each approach.
map_elements_time = timeit.timeit(test_map_elements, number=10)
when_then_time = timeit.timeit(test_when_then, number=10)

# Print the results.
print(f"map_elements() 실행 시간: {map_elements_time:.6f}초")
print(f"when().then().otherwise() 실행 시간: {when_then_time:.6f}초")

map_elements() 실행 시간: 131.874113초
when().then().otherwise() 실행 시간: 9.131302초


In [31]:
# Count the NULL values in every column.
null_counts = pl.all().is_null().sum()
df.select(null_counts)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,177,0,0,0,0,687,2


In [32]:
# Replace missing Age values with the column mean, using the expression API
# rather than eagerly indexed Series, then re-check the per-column null counts.
df2 = df.with_columns(
    pl.col("Age").fill_null(pl.col("Age").mean())
)
df2.select(pl.all().is_null().sum())

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,687,2


# 레시피 19 : 컬럼 삭제

In [33]:
# Drop a single column by name and preview the result.
without_age = df.drop("Age")
without_age.head(1)

PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",1,0,"""A/5 21171""",7.25,,"""S"""


In [34]:
# Drop several columns at once by passing a list of names.
dropped_columns = ["PassengerId", "Ticket", "Cabin"]
df.drop(dropped_columns).head(1)

Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
i64,i64,str,str,f64,i64,i64,f64,str
0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,7.25,"""S"""


In [38]:
import polars.selectors as cs

# cs.exclude: select every column except the named one.
all_but_age = cs.exclude("Age")
df.select(all_but_age).head(1)

PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",1,0,"""A/5 21171""",7.25,,"""S"""


In [40]:
# cs.exclude with a list: leave out several columns at once.
excluded_columns = ["Age", "Fare"]
df.select(cs.exclude(excluded_columns)).head(1)

PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked
i64,i64,i64,str,str,i64,i64,str,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",1,0,"""A/5 21171""",,"""S"""
