# Minio 라이브러리를 이용한 Bucket 확인 및 생성

In [2]:
from minio import Minio
from glob import glob
import os

BUCKET_NAME = "savepaint-bucket"

client = Minio(
    "localhost:9000",
    access_key="admin", secret_key="changeme", secure=False
)

client.bucket_exists(BUCKET_NAME)

True

In [2]:
if not client.bucket_exists(BUCKET_NAME):
    client.make_bucket(BUCKET_NAME)

client.bucket_exists(BUCKET_NAME)

True

In [3]:
buckets = client.list_buckets()

for bucket in buckets:
    print(bucket.name)

savepaint-bucket


---
# Spark Session 생성 (Minio <-> Delta Lake)

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

minio_access_key = "admin"
minio_secret_key = "changeme"

# spark session 생성시 aws와 연동하기
def s3_connect_spark(minio_access_key, minio_secret_key):
    # 설정
    conf = (
        SparkConf()
        .setAppName("MY_APP") # replace with your desired name
        .set("spark.hadoop.fs.s3a.access.key", minio_access_key)
        .set("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
        .set("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
        .set("spark.jars.packages", "io.delta:delta-core_2.12:2.1.1,org.apache.hadoop:hadoop-aws:3.3.1")
        .set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
        .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") # Deltalake로 Apache Spark 설정
        .set("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog") # Deltalake로 Apache Spark 설정
        .set("spark.databricks.delta.properties.defaults.columnMapping.mode","name") # header 공백 및 특수문자 인식
        .set('spark.sql.parquet.columnarReaderBatchSize',100) # ?
        .set("spark.executor.memory", "8g") # 각 Spark worker의 memory 크기
        .set("spark.driver.memory", "2g") # Spark Driver의 크기
    )

    # spark 생성
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    
    return spark

In [2]:
spark = s3_connect_spark(minio_access_key, minio_secret_key)

24/03/11 09:40:19 WARN Utils: Your hostname, donghee-MS-7C94 resolves to a loopback address: 127.0.1.1; using 192.168.75.133 instead (on interface enp42s0)
24/03/11 09:40:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/donghee/work/deltalakeenv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/donghee/.ivy2/cache
The jars for the packages stored in: /home/donghee/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-76530e26-eace-4eb6-9463-4d08594b57c4;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.1 in central
	found io.delta#delta-storage;2.1.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 163ms :: artifacts dl 8ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	io.delta#delta-core_2.12;2.1.1 from central in [default]
	io.delta#delta-storage;2.1.1 from central in [default]
	org.antlr#antl

24/03/11 09:40:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
spark

---
# CSV Convert to pyspark.sql.dataframe

In [7]:
# local에서 spark로 csv 읽기
local_data_path = "/home/donghee/work/complaints.csv"

df = spark.read.format("csv").option("header",True).load(local_data_path)

df.show()
type(df)
df.count()

+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------------+--------------------+--------------------+----------------------------+--------------------+--------------------+------------+
|       Date received|             Product|         Sub-product|               Issue|           Sub-issue|Consumer complaint narrative|Company public response|             Company|               State|            ZIP code|                Tags|Consumer consent provided?|       Submitted via|Date sent to company|Company response to consumer|    Timely response?|  Consumer disputed?|Complaint ID|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----------------------+--------------------+--------------------+-----

                                                                                

6762146

---
# Save Delta Table to Minio

In [8]:
# minio에 csv -> delta table로 넣기
save_path = "s3a://savepaint-bucket/complaints-table"

df.write.format("delta").save(save_path)

24/03/10 01:02:58 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

---

# Minio에서 다시 Delta Table 불러오기

In [3]:
from delta.tables import *
from pyspark.sql.functions import *

s3table_path = "s3a://savepaint-bucket/complaints-table"

# delta Table load
raw_deltaTable = DeltaTable.forPath(spark, s3table_path)
print(raw_deltaTable)

# convert to DataFrame
test = raw_deltaTable.toDF()
print(type(test))

test.show()

24/03/11 09:40:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

<delta.tables.DeltaTable object at 0x7fcec5f39f30>
<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------------+-------------+--------------------+----------------------------+----------------+------------------+--------------------+
|       Date received|             Product|         Sub-product|               Issue|           Sub-issue|Consumer complaint narrative|Company public response|             Company|               State|            ZIP code|                Tags|Consumer consent provided?|Submitted via|Date sent to company|Company response to consumer|Timely response?|Consumer disputed?|        Complaint ID|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------

---

## Version 별 optimize 하기

- Version 1 : Initial Data
- Version 2 : optimize.compaction()
- Version 3 : optimize.ZOrderBy("Columnname")

In [4]:
# verison 0 -> Raw Data
df0 = spark.read.format("delta").option("versionAsOf", 0).load(s3table_path)

In [5]:
# version 1 -> optimize.compaction()
(
    delta.DeltaTable.forPath(spark, s3table_path)
    .optimize()
    .executeCompaction()
)

df1 = spark.read.format("delta").option("versionAsOf", 1).load(s3table_path)

In [6]:
# version 2 -> optimize.compaction()
(
    delta.DeltaTable.forPath(spark, s3table_path)
    .optimize()
    .executeZOrderBy("Product")
)

                                                                                

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>>]

---

# Compare Query Time

#### 1. Version0 : Not Compaction

In [7]:
(
    spark.read.format("delta")
    .option("versionAsOf","0")
    .load(s3table_path)
    .createOrReplaceTempView("x0")
)

In [18]:
%%time

spark.sql(
    "select * from x0 where Product = 'Mortgage'"
).collect()

                                                                                

CPU times: user 2.74 s, sys: 236 ms, total: 2.97 s
Wall time: 5.35 s


[Row(Date received='2019-08-08', Product='Mortgage', Sub-product='Conventional home mortgage', Issue='Struggling to pay mortgage', Sub-issue=None, Consumer complaint narrative='Hollo, My mortgage is with Select Portfolio Servicing and my loan # is XXXX. On XX/XX/XXXX I filed a complaint with your office, complaint ID XXXX, you contacted them and they provided you with their response. I then talked to one of your representative who told me I could refile if I did not agree with their response. I apologize for bothering your office so much but I have no one else to help me keep my home. ', Company public response=None, Company=None, State=None, ZIP code=None, Tags=None, Consumer consent provided?=None, Submitted via=None, Date sent to company=None, Company response to consumer=None, Timely response?=None, Consumer disputed?=None, Complaint ID=None),
 Row(Date received='2023-03-07', Product='Mortgage', Sub-product='Conventional home mortgage', Issue='Trouble during payment process', Sub-i

#### 2. Version1 : Compaction

In [11]:
(
    spark.read.format("delta")
    .option("versionAsOf","1")
    .load(s3table_path)
    .createOrReplaceTempView("x1")
)

In [24]:
%%time

spark.sql(
    "select * from x1 where Product = 'Mortgage'"
).collect()

                                                                                

CPU times: user 697 ms, sys: 272 ms, total: 969 ms
Wall time: 3.97 s


[Row(Date received='2019-08-08', Product='Mortgage', Sub-product='Conventional home mortgage', Issue='Struggling to pay mortgage', Sub-issue=None, Consumer complaint narrative='Hollo, My mortgage is with Select Portfolio Servicing and my loan # is XXXX. On XX/XX/XXXX I filed a complaint with your office, complaint ID XXXX, you contacted them and they provided you with their response. I then talked to one of your representative who told me I could refile if I did not agree with their response. I apologize for bothering your office so much but I have no one else to help me keep my home. ', Company public response=None, Company=None, State=None, ZIP code=None, Tags=None, Consumer consent provided?=None, Submitted via=None, Date sent to company=None, Company response to consumer=None, Timely response?=None, Consumer disputed?=None, Complaint ID=None),
 Row(Date received='2023-03-07', Product='Mortgage', Sub-product='Conventional home mortgage', Issue='Trouble during payment process', Sub-i

#### 3. Version2 : Compaction

In [20]:
(
    spark.read.format("delta")
    .option("versionAsOf","1")
    .load(s3table_path)
    .createOrReplaceTempView("x2")
)

In [25]:
%%time

spark.sql(
    "select * from x2 where Product = 'Mortgage'"
).collect()

                                                                                

CPU times: user 748 ms, sys: 218 ms, total: 966 ms
Wall time: 3.63 s


[Row(Date received='2019-08-08', Product='Mortgage', Sub-product='Conventional home mortgage', Issue='Struggling to pay mortgage', Sub-issue=None, Consumer complaint narrative='Hollo, My mortgage is with Select Portfolio Servicing and my loan # is XXXX. On XX/XX/XXXX I filed a complaint with your office, complaint ID XXXX, you contacted them and they provided you with their response. I then talked to one of your representative who told me I could refile if I did not agree with their response. I apologize for bothering your office so much but I have no one else to help me keep my home. ', Company public response=None, Company=None, State=None, ZIP code=None, Tags=None, Consumer consent provided?=None, Submitted via=None, Date sent to company=None, Company response to consumer=None, Timely response?=None, Consumer disputed?=None, Complaint ID=None),
 Row(Date received='2023-03-07', Product='Mortgage', Sub-product='Conventional home mortgage', Issue='Trouble during payment process', Sub-i

---

#### 실험의 결과가 항상 더 빠를 수는 없지만 대략적으로 더 빠르다는 것을 optimize에 따라확인 가능하였다.