# SparkSession으로 S3 CSV -> S3 DeltaTable 저장

In [6]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Connection settings.
# NOTE(review): the credential values are blank in this notebook; fill them in
# locally and avoid committing real keys alongside the code.
AWS_ACCESS_KEY_ID =""
AWS_SECRET_ACCESS_KEY = ""
# NOTE(review): not referenced by any other cell visible in this notebook.
AWS_DEFAULT_REGION = "ap-northeast-2"

# Wire up AWS (S3A) access while building the Spark session.
def aws_connect_spark(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    """Return a local SparkSession configured for Delta Lake and S3A access.

    Args:
        AWS_ACCESS_KEY_ID: Value stored under ``spark.hadoop.fs.s3a.access.key``.
        AWS_SECRET_ACCESS_KEY: Value stored under
            ``spark.hadoop.fs.s3a.secret.key``.

    Returns:
        The session produced by ``SparkSession.builder.config(...).getOrCreate()``.
    """
    settings = [
        # Delta Lake and hadoop-aws jars to pull in for this session.
        ("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0,org.apache.hadoop:hadoop-aws:3.3.2"),
        # Delta Lake catalog and SQL extension.
        ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        # S3A credentials supplied by the caller.
        ("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID),
        ("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY),
        # The default of 200 shuffle partitions is too many for a local run.
        ("spark.sql.shuffle.partitions", "4"),
    ]

    spark_conf = SparkConf()
    spark_conf.setAppName("MY_APP")  # replace with your desired name
    spark_conf.setAll(settings)
    # Replace the * with the desired number of cores; * uses all of them.
    spark_conf.setMaster("local[*]")

    return SparkSession.builder.config(conf=spark_conf).getOrCreate()

---
### CSV 읽기

In [7]:
spark = aws_connect_spark(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

# Signature V4 setting: set the corresponding system property to "true".
spark.sparkContext.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

s3_data_path = "s3a://donghee-s3-alldata/csv/sf-fire-calls.csv"

# Example of reading through an s3a:// path: treat the first row as the header
# and let Spark infer the column types.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
df = csv_reader.load(s3_data_path)

                                                                                

In [8]:
# Print the loaded DataFrame as a text table (see the cell output below).
df.show()

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|      UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+

---
### DeltaTable 형식으로 저장하기

In [6]:
# Destination for the Delta table on S3.
s3_delta_path = "s3a://donghee-s3-alldata/delta-table/sf-fire-calls-table"

# Write the DataFrame out in Delta format at the destination path.
delta_writer = df.write.format("delta")
delta_writer.save(s3_delta_path)

[Stage 3:>                                                        (0 + 11) / 11]

24/02/16 12:30:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/02/16 12:30:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/02/16 12:30:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/02/16 12:30:22 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/02/16 12:30:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/02/16 12:30:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/02/16 12:30:25 WARN MemoryManager: Total allocation exceeds 95.0

                                                                                