# CSV

## CSV 파일 읽기

In [1]:
import polars as pl 
df_read = pl.read_csv('data/ch06/input/customer_shopping_data.csv')
df_read.head(1)

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""


In [11]:
import polars as pl 
df_scan = pl.scan_csv('data/ch06/input/customer_shopping_data.csv')
df_scan.head(1).collect()

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""


## CSV 파일 쓰기

In [14]:
df_read.write_csv('data/ch06/output/customer_shopping_data(read).csv')


In [15]:
df_scan.collect().write_csv('data/ch06/output/customer_shopping_data(scan).csv')

## 내보낸 파일 확인

In [2]:
import os
import glob

def print_file_info(directory_path: str) -> None:
    """
    지정된 디렉토리의 CSV 파일들의 정보를 출력하는 함수
    
    매개변수:
        directory_path: CSV 파일들이 있는 디렉토리 경로
    """
    # Get all CSV files in output directory
    output_files = glob.glob(f'{directory_path}/*')
    
    for file in output_files:
        size = os.path.getsize(file) / (1024 * 1024)  # Convert to MB
        print(f"File: {file}")
        print(f"Size: {size:.2f} MB")
        print(f"Absolute path: {os.path.abspath(file)}")
        print()

# 사용자 정의 함수 호출
print_file_info('data/ch06/output')


File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.parquet

File: data/ch06/output\customer_shopping_data_output.parquet
Size: 1.80 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output.parquet

File: data/ch06/output\customer_shopping_data_output_lazy.parquet
Size: 0.89 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output_lazy.parquet

File: data/ch06/output\customer_shopping_data_output_partitioned.parquet
S

# Parquet 
- pyarrow 설치 

In [20]:
!pip install pyarrow


Collecting pyarrow
  Using cached pyarrow-19.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Using cached pyarrow-19.0.0-cp312-cp312-win_amd64.whl (25.2 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.0



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## CSV 파일에서 parquet으로 내보내기

In [27]:
import polars as pl 
df_read = pl.read_csv('data/ch06/input/customer_shopping_data.csv')

# Parquet 파일로 내보내기
df_read.write_parquet('data/ch06/input/customer_shopping_data.parquet')

## parquet으로 파일 불러오기

In [37]:

# 일부 컬럼만 선택해서 가져오기 
df_read_parquet = pl.read_parquet(
    'data/ch06/input/customer_shopping_data.parquet',
    columns=['invoice_no', 'customer_id', 'gender', 'price'],
    row_index_name='row_cnt'
)
df_read_parquet.head(1)


invoice_no,customer_id,gender,price
str,str,str,f64
"""I138884""","""C241288""","""Female""",1500.4


In [36]:
# scan_parquet 사용
df_scan_parquet = pl.scan_parquet(
    'data/ch06/input/customer_shopping_data.parquet',
    row_index_name='row_cnt'
).select(['invoice_no', 'customer_id', 'gender', 'price'])
df_scan_parquet.head(1).collect()

invoice_no,customer_id,gender,price
str,str,str,f64
"""I138884""","""C241288""","""Female""",1500.4


In [9]:
# Hive 형태의 파티션된 parquet 파일 읽기
df_read_parquet_hive = pl.read_parquet(
    'data/ch06/input/venture_funding_deals_partitioned',
    use_pyarrow=True,
    pyarrow_options={'partitioning': 'hive'}
)
df_read_parquet_hive.head(5)

Company,Amount,Lead investors,Valuation,Date reported,Industry
str,str,str,str,str,cat
"""Restaurant365""","""$135,000,000""","""KKR, L Catterton""","""$1,000,000,000""","""5/19/23""","""Accounting"""
"""Madhive""","""$300,000,000""","""Goldman Sachs Asset Management""","""$1,000,000,000""","""6/13/23""","""Advertising"""
"""Ursa Major,""","""$100,000,000""","""BlackRock, Space Capital""","""n/a""","""4/26/23""","""Aerospace"""
"""Indigo""","""$250,000,000""","""Flagship Pioneering, State of …","""na""","""9/15/23""","""Agriculture"""
"""Chronosphere""","""$115,000,000""","""GV""","""n/a""","""1/9/23""","""Analytics"""


## 파일 내보내기 옵션

In [None]:
# compression 옵션을 사용하여 parquet 파일로 내보내기
# compression='lz4'
# compression_level=10
df_read.write_parquet(
    'data/ch06/output/customer_shopping_data_output.parquet',
    compression='lz4',
    compression_level=10
)

print_file_info('data/ch06/output')

File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.parquet

File: data/ch06/output\customer_shopping_data_output.parquet
Size: 1.80 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output.parquet



In [39]:
# LazyFrame Parquet 파일로 내보내기
df_scan_parquet.sink_parquet(
    'data/ch06/output/customer_shopping_data_output_lazy.parquet',
    maintain_order=False
)
print_file_info('data/ch06/output')


File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.parquet

File: data/ch06/output\customer_shopping_data_output.parquet
Size: 1.80 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output.parquet

File: data/ch06/output\customer_shopping_data_output_lazy.parquet
Size: 0.89 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output_lazy.parquet




More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  df_scan_parquet.sink_parquet(


In [5]:
# partitioned parquet 파일 내보내기
df_read.write_parquet(
    'data/ch06/output/customer_shopping_data_output_partitioned.parquet',
    partition_by='gender',
)

print_file_info('data/ch06/output')

File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.parquet

File: data/ch06/output\customer_shopping_data_output.parquet
Size: 1.80 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output.parquet

File: data/ch06/output\customer_shopping_data_output_lazy.parquet
Size: 0.89 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output_lazy.parquet

File: data/ch06/output\customer_shopping_data_output_partitioned.parquet
S

# Delta Lake
- Delta Lake는 데이터 레이크를 위한 오픈소스 스토리지 계층입니다.
# Delta Lake의 주요 특징:
- ACID 트랜잭션 보장
- 스키마 진화 (Schema Evolution)
- 시간 여행 (Time Travel)
- 데이터 품질 관리
- 대규모 메타데이터 처리
- 스트리밍과 배치 처리 통합

## DataFrame에서 Delta로 내보내기

In [7]:
!pip install deltalake




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [7]:
df_read.head(1)

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""


In [10]:
df_read.write_delta(
    'data/ch06/output/customer_shopping_data_partitioned', 
    mode='overwrite', 
    delta_write_options={
        'partition_by' : 'category'
    }
)

print_file_info('data/ch06/output')

File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.delta
Size: 0.00 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.delta

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.parquet

File: data/ch06/output\customer_shopping_data_output.parquet
Size: 1.80 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data_output.parquet

File: data/ch06/output\customer_shopping_data_output_lazy.parquet
Size: 0.89 MB
Absolute path: c:\User

## 파일 불러오기

In [11]:
# pl.read_delta
df_read_delta = pl.read_delta('data/ch06/input/venture_funding_deals_delta')
df_read_delta.head(1)


Company,Amount,Lead investors,Valuation,Industry,Date reported
str,str,str,str,str,str
"""OpenAI""","""$10,000,000,000""","""Microsoft""","""n/a""","""Artificial intelligence""","""1/23/23"""


In [12]:
# pl.scan_delta
df_scan_delta = pl.scan_delta('data/ch06/input/venture_funding_deals_delta')
df_scan_delta.collect().head(1)

Company,Amount,Lead investors,Valuation,Industry,Date reported
str,str,str,str,str,str
"""OpenAI""","""$10,000,000,000""","""Microsoft""","""n/a""","""Artificial intelligence""","""1/23/23"""


## 파일 내보내기

In [35]:
df_read_delta.write_delta(
    'data/ch06/input/venture_funding_deals_partitioned', 
    mode='overwrite', 
    delta_write_options={'partition_by':'Industry', 'schema_mode' : 'overwrite'}
)

## 파일 읽어오기

In [39]:
df_delta_partitioned = pl.read_delta(
    'data/ch06/input/venture_funding_deals_partitioned',
    use_pyarrow=False, 
)

df_delta_partitioned.head(1)

Company,Amount,Lead investors,Valuation,Date reported,Industry
str,str,str,str,str,str
"""Neuralink""","""$280,000,000""","""Founders Fund""","""n/a""","""8/7/23""","""Neuroscience"""


# JSON

In [47]:
df_read.head(1)

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""


## DataFrame에서 JSON으로 내보내기

In [52]:
df_read.write_json('data/ch06/output/customer_shopping_data.json')

print_file_info('data/ch06/output')

File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.delta
Size: 0.00 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.delta

File: data/ch06/output\customer_shopping_data.json
Size: 20.01 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.json

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.parquet

File: data/ch06/output\customer_shopping_data_output.parquet
Size: 1.80 MB
Absolute path: c:\Users\campus3S043\Desktop\po

In [70]:
# NDJSON
df_read.write_ndjson('data/ch06/output/customer_shopping_data.jsonl')

print_file_info('data/ch06/output')

File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.delta
Size: 0.00 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.delta

File: data/ch06/output\customer_shopping_data.json
Size: 20.01 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.json

File: data/ch06/output\customer_shopping_data.jsonl
Size: 20.01 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.jsonl

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recip

## JSON 파일 읽기

In [57]:
df_json_read = pl.read_json('data/ch06/input/world_population.json')

df_json_read.head(1)

place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area,landAreaKm,cca2,cca3,netChange,growthRate,worldPercentage,density,densityMi,rank
i64,f64,f64,f64,f64,i64,f64,f64,str,i64,f64,str,str,f64,f64,f64,f64,f64,i64
356,696828385.0,1059600000.0,1240600000.0,1417200000.0,1428627663,1515000000.0,1670500000.0,"""India""",3287590,2973190.0,"""IN""","""IND""",0.4184,0.0081,0.1785,480.5033,1244.5036,1


In [58]:
df_jsonl_read = pl.read_ndjson('data/ch06/input/world_population.jsonl')

df_jsonl_read.head(1)

place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area,landAreaKm,cca2,cca3,netChange,growthRate,worldPercentage,density,densityMi,rank
i64,f64,f64,f64,f64,i64,f64,f64,str,i64,f64,str,str,f64,f64,f64,f64,f64,i64
356,696828385.0,1059600000.0,1240600000.0,1417200000.0,1428627663,1515000000.0,1670500000.0,"""India""",3287590,2973190.0,"""IN""","""IND""",0.4184,0.0081,0.1785,480.5033,1244.5036,1


In [66]:
df_ndjson_scan = pl.scan_ndjson('data/ch06/input/world_population.jsonl')

df_ndjson_scan.head(1).collect()

place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area,landAreaKm,cca2,cca3,netChange,growthRate,worldPercentage,density,densityMi,rank
i64,f64,f64,f64,f64,i64,f64,f64,str,i64,f64,str,str,f64,f64,f64,f64,f64,i64
356,696828385.0,1059600000.0,1240600000.0,1417200000.0,1428627663,1515000000.0,1670500000.0,"""India""",3287590,2973190.0,"""IN""","""IND""",0.4184,0.0081,0.1785,480.5033,1244.5036,1


# Excel

In [75]:
!pip install xlsx2csv xlsxwriter openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 파일 내보내기

In [72]:
df_read.write_excel(
    'data/ch06/output/customer_shopping_data.xlsx', 
    worksheet='Sheet1', 
    header_format={'bold': True}
)

print_file_info('data/ch06/output')

File: data/ch06/output\customer_shopping_data(read).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(read).csv

File: data/ch06/output\customer_shopping_data(scan).csv
Size: 7.11 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data(scan).csv

File: data/ch06/output\customer_shopping_data.delta
Size: 0.00 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.delta

File: data/ch06/output\customer_shopping_data.json
Size: 20.01 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.json

File: data/ch06/output\customer_shopping_data.jsonl
Size: 20.01 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recipes\data\ch06\output\customer_shopping_data.jsonl

File: data/ch06/output\customer_shopping_data.parquet
Size: 1.17 MB
Absolute path: c:\Users\campus3S043\Desktop\polars_recip

In [78]:
df_read_excel = pl.read_excel(
    'data/ch06/output/customer_shopping_data.xlsx', 
    sheet_name='Sheet1', 
    engine='openpyxl', # xlsx2csv, xlsxwriter
    read_options={'try_parse_dates' : True}
)

df_read_excel.head(1)

invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
str,str,str,i64,str,i64,f64,str,str,str
"""I138884""","""C241288""","""Female""",28,"""Clothing""",5,1500.4,"""Credit Card""","""5/8/2022""","""Kanyon"""
