# 데이터 가져오기

In [2]:
# Import Polars and echo the installed version (the bare expression is the cell's displayed value).
import polars as pl
pl.__version__

'1.22.0'

In [5]:
# Read the Contoso sales CSV into a Polars DataFrame, then preview its first row.
contoso_sales = pl.read_csv(source="data/ch08/contoso_sales.csv")
contoso_sales.head(n=1)


Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
284806,1,"""2017-10-18""","""2017-10-20""","""Eric Kennedy""","""Male""","""United States""",47,"""Online store""","""Contoso 512MB MP3 Player E51 S…","""Silver""","""Contoso""","""Audio""","""MP4&MP3""",7,11.691,10.288,5.958,"""USD""",1.0


# 집계 함수 기초

In [6]:
from polars import selectors as cs
# Sum every numeric column: the selector picks the numeric columns and the
# aggregation is applied to each of them, giving a single-row frame of totals.
contoso_sales.select(cs.numeric().sum())



Order Number,Line Number,Customer Age,Quantity,Unit Price,Net Price,Unit Cost,Exchange Rate
i64,i64,i64,i64,f64,f64,f64,f64
4466019052,16195,725757,43517,4178500.0,3928600.0,1735600.0,14124.4597


In [9]:
# Total of the Quantity column, computed two ways:
#   1. pull the column out as a Series and sum it (prints a plain number);
#   2. aggregate inside select(), which keeps the result as a 1x1 DataFrame.
print(contoso_sales["Quantity"].sum())
print(contoso_sales.select(pl.col("Quantity").sum()))

43517
shape: (1, 1)
┌──────────┐
│ Quantity │
│ ---      │
│ i64      │
╞══════════╡
│ 43517    │
└──────────┘


In [16]:
# Frequency of each "Store Name" value using Polars' Series.value_counts()
# (the Polars counterpart of pandas' value_counts()).
print(contoso_sales["Store Name"].value_counts())

# Same frequencies, with sort=True so the rows are ordered by count.
print("\n정렬된 결과:")
print(contoso_sales["Store Name"].value_counts(sort=True))

# normalize=True reports relative frequencies (proportions) instead of raw counts.
print("\n정규화된 결과:")
print(contoso_sales["Store Name"].value_counts(normalize=True))


shape: (62, 2)
┌─────────────────────────────────┬───────┐
│ Store Name                      ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ Contoso Store Brandenburg       ┆ 96    │
│ Contoso Store Western Australi… ┆ 72    │
│ Contoso Store Corse             ┆ 21    │
│ Contoso Store Arkansas          ┆ 125   │
│ Contoso Store Delaware          ┆ 100   │
│ …                               ┆ …     │
│ Contoso Store Hawaii            ┆ 132   │
│ Contoso Store Ayrshire          ┆ 74    │
│ Contoso Store Limousin          ┆ 25    │
│ Contoso Store Mayotte           ┆ 30    │
│ Contoso Store Oregon            ┆ 167   │
└─────────────────────────────────┴───────┘

정렬된 결과:
shape: (62, 2)
┌───────────────────────────────┬───────┐
│ Store Name                    ┆ count │
│ ---                           ┆ ---   │
│ str                           ┆ u32   │
╞═══════════════════════════════╪═══════╡
│ O

In [18]:
# Polars version: keep only the "Contoso Store Corse" rows, then sum Quantity.
# The printed label names the store that is actually filtered on; it used to
# read "시애틀 스토어" ("Seattle store"), which did not match the filter.
store_quantity_pl = (
    contoso_sales
    .filter(pl.col("Store Name") == "Contoso Store Corse")
    .select("Quantity")
    .sum()
)
print("Contoso Store Corse 수량 합계 (Polars):")
print(store_quantity_pl)

시애틀 스토어 수량 합계 (Polars):
shape: (1, 1)
┌──────────┐
│ Quantity │
│ ---      │
│ i64      │
╞══════════╡
│ 56       │
└──────────┘


In [19]:
# Polars version: conditional sum built with when()/then()/otherwise() instead
# of filter(). Rows from other stores contribute 0, so the total only covers
# "Contoso Store Corse".
store_quantity_when = contoso_sales.select(
    pl.when(pl.col("Store Name") == "Contoso Store Corse")
    .then(pl.col("Quantity"))
    .otherwise(0)
    .sum()
)

# The label names the store used in the condition; it used to read
# "시애틀 스토어" ("Seattle store"), which did not match the condition.
print("Contoso Store Corse 수량 합계 (when 사용):")
print(store_quantity_when)


시애틀 스토어 수량 합계 (when 사용):
shape: (1, 1)
┌──────────┐
│ Quantity │
│ ---      │
│ i64      │
╞══════════╡
│ 56       │
└──────────┘


In [22]:
# Pandas version: convert the Polars frame to pandas, then sum Quantity for the
# "Contoso Store Corse" rows.
import pandas as pd
contoso_sales_pd = contoso_sales.to_pandas()
# One .loc[rows, column] lookup replaces the chained df[mask]["Quantity"] indexing.
store_quantity_pd = contoso_sales_pd.loc[
    contoso_sales_pd["Store Name"] == "Contoso Store Corse",
    "Quantity",
].sum()
# The label names the store that is actually filtered on; it used to read
# "시애틀 스토어" ("Seattle store"), which did not match the filter.
print("\nContoso Store Corse 수량 합계 (Pandas):")
print(store_quantity_pd)


시애틀 스토어 수량 합계 (Pandas):
56


## polars vs pandas 데이터 변환 비교

In [45]:
# Polars: project three columns with select() and preview the first row.
print("\nPolars select를 활용한 데이터 조회:")
contoso_sales.select("Store Name", "Quantity", "Net Price").head(n=1)



Polars select를 활용한 데이터 조회:


Store Name,Quantity,Net Price
str,i64,f64
"""Online store""",7,10.288


In [44]:
# Polars: keep only the rows whose "Store Name" is "Contoso Store Corse",
# then preview the first matching row.
filtered_data_pl = contoso_sales.filter(
    pl.col("Store Name").eq("Contoso Store Corse")
)
print("\n특정 스토어 데이터:")
filtered_data_pl.head(n=1)


특정 스토어 데이터:


Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
357705,0,"""2019-10-17""","""2019-10-17""","""Tabor Daigneault""","""Male""","""France""",35,"""Contoso Store Corse""","""Contoso 16GB Mp5 Player M1600 …","""White""","""Contoso""","""Audio""","""MP4&MP3""",1,199.9,173.913,91.93,"""EUR""",0.8998


In [43]:
# Polars: filter on two conditions at once. Predicates passed as separate
# arguments to filter() are combined with AND, so a row must be from
# "Contoso Store Corse" and have a Net Price above 400.
multi_condition_pl = contoso_sales.filter(
    pl.col("Store Name") == "Contoso Store Corse",
    pl.col("Net Price") > 400,
)
print("\n다중 조건 필터링 결과:")
multi_condition_pl.head(n=1)


다중 조건 필터링 결과:


Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
i64,i64,str,str,str,str,str,i64,str,str,str,str,str,str,i64,f64,f64,f64,str,f64
349303,0,"""2019-07-25""","""2019-07-25""","""Fanette Robitaille""","""Female""","""France""",41,"""Contoso Store Corse""","""Adventure Works Desktop PC1.80…","""White""","""Adventure Works""","""Computers""","""Desktops""",2,499.9,439.912,254.86,"""EUR""",0.8997


In [42]:
# pandas: the same projection with .loc (all rows, three named columns),
# previewing the first row.
print("\nPandas loc를 활용한 데이터 조회:")
contoso_sales_pd.loc[
    :,
    ["Store Name", "Quantity", "Net Price"],
].head(n=1)


Pandas loc를 활용한 데이터 조회:


Unnamed: 0,Store Name,Quantity,Net Price
0,Online store,7,10.288


In [41]:

# pandas: boolean-mask row selection with .loc for the "Contoso Store Corse"
# rows, then preview the first match.
filtered_data = contoso_sales_pd.loc[
    contoso_sales_pd["Store Name"].eq("Contoso Store Corse")
]
print("\n특정 스토어 데이터:")
filtered_data.head(n=1)


특정 스토어 데이터:


Unnamed: 0,Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
68,357705,0,2019-10-17,2019-10-17,Tabor Daigneault,Male,France,35,Contoso Store Corse,Contoso 16GB Mp5 Player M1600 White,White,Contoso,Audio,MP4&MP3,1,199.9,173.913,91.93,EUR,0.8998


In [40]:
# pandas: combine two boolean masks with & inside .loc — the row must be from
# "Contoso Store Corse" and have a Net Price above 400.
multi_condition = contoso_sales_pd.loc[
    contoso_sales_pd["Store Name"].eq("Contoso Store Corse")
    & contoso_sales_pd["Net Price"].gt(400)
]
print("\n다중 조건 필터링 결과:")
multi_condition


다중 조건 필터링 결과:


Unnamed: 0,Order Number,Line Number,Order Date,Delivery Date,Customer Name,Customer Gender,Customer Country,Customer Age,Store Name,Product Name,Color,Brand,Category,Subcategory,Quantity,Unit Price,Net Price,Unit Cost,Currency Code,Exchange Rate
2634,349303,0,2019-07-25,2019-07-25,Fanette Robitaille,Female,France,41,Contoso Store Corse,Adventure Works Desktop PC1.80 ED182 White,White,Adventure Works,Computers,Desktops,2,499.9,439.912,254.86,EUR,0.8997
3824,272204,3,2017-06-14,2017-06-14,Malagigi Lespérance,Male,France,84,Contoso Store Corse,Proseware Screen 113in X1609 Silver,Silver,Proseware,Computers,Projectors & Screens,3,448.5,448.5,148.59,EUR,0.8926
13075,272204,1,2017-06-14,2017-06-14,Malagigi Lespérance,Male,France,84,Contoso Store Corse,WWI Floor Lamp X115 Grey,Grey,Wide World Importers,Home Appliances,Lamps,3,572.391,503.704,189.648,EUR,0.8926
13225,357705,2,2019-10-17,2019-10-17,Tabor Daigneault,Male,France,35,Contoso Store Corse,Proseware Air conditioner 25000BTU L167 Silver,Silver,Proseware,Home Appliances,Air Conditioners,3,635.99,635.99,210.72,EUR,0.8998
