# Minio 라이브러리 이용

In [None]:
from minio import Minio
from glob import glob
import os

BUCKET_NAME = "savepaint-bucket"

minio_access_key = "admin" # MINIO_ROOT_USER
minio_secret_key = "changeme" #MINIO_ROOT_PASSWORD
minio_endpoint_url = "http://127.0.0.1:9000" # MINIO_ENDPOINT_URL

client = Minio(
    "localhost:9000",
    access_key="admin", secret_key="changeme", secure=False
)

client.bucket_exists(BUCKET_NAME)

True

---

# Minio Bucket 생성

In [2]:
if not client.bucket_exists(BUCKET_NAME):
    client.make_bucket(BUCKET_NAME)

client.bucket_exists(BUCKET_NAME)

True

In [3]:
buckets = client.list_buckets()

for bucket in buckets:
    print(bucket.name)

savepaint-bucket


---

# Spark S3 API를 활용하여 Minio 연동된 Cluster 생성

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# spark session 생성시 aws와 연동하기
def aws_connect_spark(minio_access_key, minio_secret_key):
    # 설정
    conf = (
        SparkConf()
        .setAppName("MY_APP") # replace with your desired name
        .set("spark.hadoop.fs.s3a.access.key", minio_access_key) # MINIO_ROOT_USER
        .set("spark.hadoop.fs.s3a.secret.key", minio_secret_key) # MINIO_ROOT_PASSWORD
        .set("spark.hadoop.fs.s3a.endpoint", minio_endpoint_url) # Minio endpoint URL
        .set("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1")
        .set("spark.databricks.delta.retentionDurationCheck.enabled", "false")
        .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") # Deltalake로 Apache Spark 설정
        .set("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog") # Deltalake로 Apache Spark 설정
    )

    # spark 생성
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    
    return spark

In [2]:
spark = aws_connect_spark(minio_access_key, minio_secret_key)

your 131072x1 screen size is bogus. expect trouble


24/02/23 12:43:53 WARN Utils: Your hostname, DESKTOP-JJQA3IT resolves to a loopback address: 127.0.1.1; using 172.25.190.30 instead (on interface eth0)
24/02/23 12:43:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/donghee/work/deltalake1/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/donghee/.ivy2/cache
The jars for the packages stored in: /home/donghee/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-44869091-b836-4c26-a248-2ce996faea4f;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.2.1 in central
	found io.delta#delta-storage;1.2.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 209ms :: artifacts dl 16ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.901 from central in [default]
	io.delta#delta-core_2.12;1.2.1 from central in [default]
	io.delta#delta-storage;1.2.1 from central in [default]
	org.antlr#ant

24/02/23 12:43:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
spark

In [7]:
local_data_path = "/home/donghee/work/deltalakeproject/data/totaldf3.csv"

df = spark.read.csv(local_data_path)
df.show()

+------------------------------+----------------------------------+------------+------------+------------+--------------+------+-----------+--------+--------+----------+-----------------------+
|                           _c0|                               _c1|         _c2|         _c3|         _c4|           _c5|   _c6|        _c7|     _c8|     _c9|      _c10|                   _c11|
+------------------------------+----------------------------------+------------+------------+------------+--------------+------+-----------+--------+--------+----------+-----------------------+
|                      상가이름|                          업종이름|업종대분류Cd|업종중분류Cd|업종소분류Cd|업종소소분류Cd|시군구|     읍면동|상세주소|종업원수|매출액등급|             roadNmAddr|
|                          대하|        가금류 가공 및 저장 처리업|          10|           1|           2|             1|경산시|     가일길|   133-2|       8|        16|    경산시 가일길 133-2|
|                    주식******|        가금류 가공 및 저장 처리업|          10|           1|           2|          

---

# Minio Bucket에 Data Write

In [3]:
spark.range(500).write.format("delta").save("s3a://savepaint-bucket/demo1")

24/02/23 12:45:22 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


[Stage 0:>                                                        (0 + 12) / 12]

24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/02/23 12:45:24 WARN MemoryManager: Total allocation exceeds 95.

                                                                                