In [1]:
import os 
import logging
# Configure the root logger once: timestamp, level name and message.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Notebook-level logger used by every cell below.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("Application started successfully.")

2025-06-16 08:49:25,458 - INFO - Application started successfully.


In [2]:
# load environment variables
# Each lookup raises KeyError when the variable is not set.

# Azure storage account name and access key.
storage_account = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = os.environ["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]

# Nessie / warehouse settings (by name; not used in the cells shown here -
# TODO confirm they are still needed).
NESSIE_URI = os.environ["NESSIE_URI"]
REF = "main"
FULL_PATH_TO_WAREHOUSE = os.environ["WAREHOUSE"]

# Endpoint and credentials applied to the fs.s3a.* settings further down.
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

In [3]:
# set pyspark configuration
from pyspark import SparkConf

# Application basics: local master on all cores, fixed memory sizes.
conf = (
    SparkConf()
    .setAppName("Read_csv_from_azure_blob")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
)

# set azure config
conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
# NOTE(review): the account name "storagesii" is hard-coded in this key even
# though AZURE_STORAGE_ACCOUNT_NAME is loaded into `storage_account` - confirm
# whether the key should be built from that variable instead.
conf.set("fs.azure.account.key.storagesii.dfs.core.windows.net", access_key)

# set MinIO config
# setAll applies the pairs in order and returns the conf, so the cell still
# ends on an expression that evaluates to the SparkConf object.
conf.setAll([
    ("fs.s3a.access.key", AWS_ACCESS_KEY),
    ("fs.s3a.secret.key", AWS_SECRET_KEY),
    ("fs.s3a.endpoint", AWS_S3_ENDPOINT),
    ("fs.s3a.connection.ssl.enabled", "false"),
    ("fs.s3a.path.style.access", "true"),
])


<pyspark.conf.SparkConf at 0x7fe1b40ff130>

In [4]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when there is one. The
# recorded output of this cell shows that case: Spark warns that only runtime
# SQL configurations from `conf` take effect.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
spark

25/06/16 08:49:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
def read_csv_from_azure_blob_storage(endpoint, directory_path, sep, pathGlobFilter):
    """Load the CSV files of one storage directory into a single DataFrame.

    Uses the notebook-level ``spark`` session and ``logger``.

    Args:
        endpoint: Base URI of the storage location, e.g.
            ``abfss://<container>@<account>.dfs.core.windows.net``.
        directory_path: Directory below ``endpoint`` to read from.
        sep: Field separator of the CSV files.
        pathGlobFilter: Glob pattern passed to the reader's ``pathGlobFilter``
            option to select which files are loaded.

    Returns:
        The DataFrame read with the first line as header (no schema inference
        is requested), extended with two columns: ``source_filepath`` from
        ``input_file_name()`` and ``ingestion_datetime`` from
        ``current_timestamp()``.
    """
    from pyspark.sql.functions import input_file_name, current_timestamp

    # Join with an explicit "/" instead of os.path.join: this is a URI, not a
    # local path. os.path.join uses the OS separator and drops `endpoint`
    # entirely when `directory_path` starts with "/".
    file_uri = f"{endpoint.rstrip('/')}/{directory_path.lstrip('/')}"
    logger.info(f"read csv from: {file_uri}")

    reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("pathGlobFilter", pathGlobFilter)
        .option("sep", sep)
    )
    df = reader.load(file_uri)

    # Lineage columns: originating file and time of ingestion.
    df = (
        df.withColumn("source_filepath", input_file_name())
        .withColumn("ingestion_datetime", current_timestamp())
    )
    return df

In [6]:
# read directory list
# All sources share the same container URI, destination and CSV format; only
# the directory name (and the enabled flag) varies, so the entries are built
# from (directory, enabled) pairs. Each entry gets its own "extended" dict.
source_list = [
    {
        "source_uri": "abfss://datalake@storagesii.dfs.core.windows.net",
        "directory_path": directory,
        "dest_path": "s3a://seed/adventureWorks",
        "extended": {"format": "csv", "extension": ".csv"},
        "enabled": enabled,
    }
    for directory, enabled in (
        ("sales", True),
        ("product", True),
        ("customer", True),
        ("currency_rate", True),
        ("currency", True),
    )
]

In [7]:
# Only sources flagged as enabled are copied.
enabled_sources = [item for item in source_list if item["enabled"]]

for ingestion in enabled_sources:
    # Read every file with the configured extension from the source directory.
    df = read_csv_from_azure_blob_storage(
        endpoint=ingestion["source_uri"],
        directory_path=ingestion["directory_path"],
        sep=";",
        pathGlobFilter="*" + ingestion["extended"]["extension"],
    )

    # Destination is <dest_path>/<directory><extension>, e.g. ".../sales.csv".
    target_name = ingestion["directory_path"] + ingestion["extended"]["extension"]
    save_dir = os.path.join(ingestion["dest_path"], target_name)
    logger.info(f"save csv to: {save_dir}")

    # Replace whatever a previous run wrote at the destination.
    csv_writer = df.write.format("csv").mode("overwrite").option("header", "true")
    csv_writer.save(save_dir)

2025-06-16 08:49:26,497 - INFO - read csv from: abfss://datalake@storagesii.dfs.core.windows.net/sales
2025-06-16 08:49:33,764 - INFO - save csv to: s3a://seed/adventureWorks/sales.csv
25/06/16 08:49:33 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/06/16 08:49:36 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/16 08:49:36 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/16 08:49:36 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
25/06/16 08:49:37 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
2025-06-16 08:49:42,916 - INFO - read csv from: abfss://datalake@storagesii.dfs.core.windows.net/product
2025-06

In [10]:
# Stop the Spark session held in `spark` now that all sources are processed.
spark.stop()