In [1]:
import os 
import logging
# Attach a stream handler to the root logger. Without any handler, records
# below WARNING are silently dropped by logging's last-resort handler, so the
# INFO messages emitted throughout this notebook would never be displayed.
# The root level is left at its default so third-party libraries stay quiet.
logging.basicConfig(format="%(asctime)s %(levelname)s %(name)s: %(message)s")

# Notebook-level logger; INFO and above are emitted for this logger only.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("Application started successfully.")

In [2]:
# All settings below are required: a missing environment variable raises
# KeyError immediately instead of failing later inside Spark.

# Git-like reference name; the only value here that is not read from the
# environment.
REF = "main"

# Azure storage account name and its access key.
storage_account = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = os.environ["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]

# Nessie endpoint and warehouse location (not used by the cells visible
# here; presumably consumed further down the notebook).
NESSIE_URI = os.environ["NESSIE_URI"]
FULL_PATH_TO_WAREHOUSE = os.environ["WAREHOUSE"]

# S3-compatible object store (MinIO) endpoint and credentials.
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

In [3]:
from pyspark import SparkConf

# Base Spark settings: local mode on all cores with fixed memory sizes.
conf = (
    SparkConf()
    .setAppName("Read_csv_from_azure_blob")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
)

# Azure: file-system class and the storage account access key.
# NOTE(review): the data below is read through an abfss:// URI; confirm the
# "fs.azure" entry is still required.
azure_key_option = f"fs.azure.account.key.{storage_account}.dfs.core.windows.net"
conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem").set(
    azure_key_option, access_key
)

# MinIO (S3A): credentials, endpoint, SSL disabled, path-style bucket
# addressing. The chain is the last expression of the cell, so the SparkConf
# repr is displayed as before.
(
    conf.set("fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("fs.s3a.endpoint", AWS_S3_ENDPOINT)
    .set("fs.s3a.connection.ssl.enabled", "false")
    .set("fs.s3a.path.style.access", "true")
)


<pyspark.conf.SparkConf at 0x7144cd9684c0>

In [4]:
from pyspark.sql import SparkSession

# Build the session from the SparkConf assembled above. getOrCreate() reuses
# an already-running session if one exists; in that case only runtime SQL
# options from `conf` are applied (see the warning in the cell output).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

# Display the session summary as the cell result.
spark

25/06/12 11:41:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# MinIO: Save and Load data 

In [5]:
# Container inside the Azure storage account that holds the source files.
container = "datalake"

# Root abfss:// URI of that container; folder names are appended to it when
# reading.
endpoint = "abfss://{}@{}.dfs.core.windows.net".format(container, storage_account)

## Read CSV from Azure Blob Storage

In [6]:
from pyspark.sql.functions import input_file_name, current_timestamp
# Folder (relative to the container root) that holds the sales CSV files.
folder = "sales"

# logging formats lazily with %-style placeholders; passing the value without
# a "%s" in the message triggers a formatting error when the record is emitted.
logger.info("read csv from: %s", folder)

# Read only the files matching *2010.csv (semicolon-separated, with header).
# No schema is supplied, so every column is loaded as a string. Two lineage
# columns are added: the originating file path and the ingestion timestamp.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("pathGlobFilter", "*2010.csv")
    .option("sep", ";")
    .load(f"{endpoint}/{folder}")
    .withColumn("source_filepath", input_file_name())
    .withColumn("ingestion_datetime", current_timestamp())
)

# Preview a single row as the cell result.
df.head(1)

[Row(SalesOrderNumber='SO43701', SalesOrderLineNumber='1', ProductId='BK-M82S-44', CustomerUsername='christy12', OrderDate='2010-12-29', DueDate='2011-01-10', ShipDate='2011-01-05', SalesTerritoryRegion='Australia', SalesTerritoryCountry='Australia', SalesTerritoryContinent='Pacific', OrderQuantity='1', SalesAmount='5266.92', TaxAmt='421.35', Freight='84,9998', Currency='AUD', AverageRate='0,645536117745788', source_filepath='abfss://datalake@storagesii.dfs.core.windows.net/sales/sales_2010.csv', ingestion_datetime=datetime.datetime(2025, 6, 12, 11, 41, 35, 971750))]

## Save data into MinIO

In [13]:
# Target bucket and object key prefix in MinIO.
bucket = "seed"
save_path = "sales/sales.csv"

# Build the URI with plain string formatting: os.path.join uses the host OS
# path separator and would produce backslashes in the URI on Windows.
save_dir = f"s3a://{bucket}/{save_path}"

# logging formats lazily with %-style placeholders; the value needs a "%s".
logger.info("save csv to: %s", save_dir)

# Write the DataFrame as CSV with a header row, replacing any previous output
# at this location.
(
    df.write.format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save(save_dir)
)

25/06/12 04:22:53 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/12 04:22:53 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.


## Load data from MinIO

In [14]:
# Location of the CSV data previously written to MinIO.
bucket = "seed"
filepath = "sales/sales.csv"

# Build the URI with plain string formatting: os.path.join uses the host OS
# path separator and would produce backslashes in the URI on Windows.
# NOTE: this rebinds `df`, replacing the DataFrame read from Azure above.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"s3a://{bucket}/{filepath}")
)

# Print the first rows to verify the round trip.
df.show()

+----------------+--------------------+----------+----------------+----------+----------+----------+--------------------+---------------------+-----------------------+-------------+-----------+-------+-------+--------+-----------------+--------------------+--------------------+
|SalesOrderNumber|SalesOrderLineNumber| ProductId|CustomerUsername| OrderDate|   DueDate|  ShipDate|SalesTerritoryRegion|SalesTerritoryCountry|SalesTerritoryContinent|OrderQuantity|SalesAmount| TaxAmt|Freight|Currency|      AverageRate|     source_filepath|  ingestion_datetime|
+----------------+--------------------+----------+----------------+----------+----------+----------+--------------------+---------------------+-----------------------+-------------+-----------+-------+-------+--------+-----------------+--------------------+--------------------+
|         SO43701|                   1|BK-M82S-44|       christy12|2010-12-29|2011-01-10|2011-01-05|           Australia|            Australia|                Paci