In [1]:
import os 
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("Application started successfully.")

In [2]:
# load environment variables

storage_account = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = os.environ["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]
NESSIE_URI = os.environ['NESSIE_URI']
REF = "main"
FULL_PATH_TO_WAREHOUSE = os.environ['WAREHOUSE']
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_KEY  = os.environ['AWS_SECRET_ACCESS_KEY']

In [3]:
# set pyspark configuration
from pyspark import SparkConf

conf = SparkConf() \
    .setAppName("Read_csv_from_azure_blob") \
    .setMaster("local[*]") \
    .set("spark.executor.memory", "4g") \
    .set("spark.driver.memory", "2g") \
    
# set azure config
conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
conf.set(f"fs.azure.account.key.storagesii.dfs.core.windows.net", access_key) 

# set MinIO config
conf.set("fs.s3a.access.key", AWS_ACCESS_KEY) 
conf.set("fs.s3a.secret.key", AWS_SECRET_KEY) 
conf.set("fs.s3a.endpoint", AWS_S3_ENDPOINT) 
conf.set("fs.s3a.connection.ssl.enabled", "false") 
conf.set("fs.s3a.path.style.access", "true") 

<pyspark.conf.SparkConf at 0x7fd51851f370>

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark

25/06/12 13:18:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [73]:
from datetime import datetime
from urllib.parse import urlparse
from azure.storage.filedatalake import *

def download_from_azure(file_uri, file_extension):
    # Parse the URI
    parsed = urlparse(file_uri)

    # Extract components
    container_name = parsed.netloc.split("@")[0]
    storage_account_name = parsed.netloc.split("@")[1].split(".")[0] 
    directory_name = "/".join(parsed.path.split("/")[:-1]) 
    file_name = parsed.path.split("/")[-1] 

    # Azure Data Lake Gen2 Configuration
    connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={access_key};EndpointSuffix=core.windows.net"

    # Azure Data Lake Gen2 Configuration
    file_path = os.path.join(directory_name, file_name)

    # Connect to Azure Data Lake
    service_client = DataLakeServiceClient(
        account_url=f"https://{storage_account_name}.dfs.core.windows.net",
        credential=access_key
    )

    # Get file system client
    file_system_client = service_client.get_file_system_client(container_name)

    # List all files in the directory
    paths = file_system_client.get_paths(path=directory_name, recursive=False)

    # filter files to match the extension
    filtered_paths = [path.name for path in paths if path.name.endswith(file_extension)]

    return_files = []
    for path in filtered_paths:
        print(path)
        dir, file_name = os.path.split(path)
        local_file_path = f'/tmp/{datetime.now().strftime("%Y%m%d%H%M%S")}_{file_name}'  # Local path to save file
        # Create DataLakeFileClient
        file_client = DataLakeFileClient.from_connection_string(connection_string, container_name, path)

        if file_client.exists():
            try:
                print("Downloading file:", file_path)
                # Download the file
                with open(local_file_path, "wb") as local_file:
                    local_file.write(file_client.download_file().readall())
                print(f"Downloaded {file_name} successfully from Azure Data Lake.")
                return_files.append(local_file_path)
            except Exception as e:
                print("Error while downloading file:", str(e))
        
    return return_files

In [74]:
# read directory list
source_list = [
    {
        "source_path": "abfss://datalake@storagesii.dfs.core.windows.net/sales/",
        "dest_path": "s3a://seed/adventureWorks/",
        "extended": {
            "format": "binaryFile",
            "extension": ".csv"
        }
    }
]

In [75]:
for ingestion in source_list:
    download_filepath = download_from_azure(ingestion["source_path"], ingestion["extended"]["extension"])
    print(download_filepath)

sales/sales_2010.csv
Downloading file: /sales/
Downloaded sales_2010.csv successfully from Azure Data Lake.
sales/sales_2011.csv
Downloading file: /sales/
Downloaded sales_2011.csv successfully from Azure Data Lake.
sales/sales_2012.csv
Downloading file: /sales/
Downloaded sales_2012.csv successfully from Azure Data Lake.
sales/sales_2013.csv
Downloading file: /sales/
Downloaded sales_2013.csv successfully from Azure Data Lake.
sales/sales_2014.csv
Downloading file: /sales/
Downloaded sales_2014.csv successfully from Azure Data Lake.
['/tmp/20250612135223_sales_2010.csv', '/tmp/20250612135223_sales_2011.csv', '/tmp/20250612135223_sales_2012.csv', '/tmp/20250612135223_sales_2013.csv', '/tmp/20250612135223_sales_2014.csv']


In [87]:
from minio import Minio

# MinIO Configuration
minio_client = Minio(
    "http://172.18.0.5:9000",
    access_key=os.environ["AWS_ACCESS_KEY_ID"],
    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    secure=False
)

# Define file and bucket
bucket_name = "seed"
directory = "sales"
for file_path in download_filepath:
    object_name = os.path.join(directory,file_path[20:])
    
    # Upload file to MinIO
    minio_client.fput_object(bucket_name, object_name, file_path)

    print(f"Uploaded {object_name} to MinIO successfully.")



ValueError: path in endpoint is not allowed

In [None]:
from pyspark.sql.functions import input_file_name, current_timestamp
import shutil

for ingestion in source_list:
    format_type = ingestion["extended"]["format"]
    if format_type == "binaryFile":
        download_from_azure(storage_account, \
                            container_name='datalake', \
                            directory_name=ingestion["source_path"], \
                            file_name)
    elif format_type == "csv":
        df = spark.read.format(format_type) \
            .option("header", "true") \
            .option("pathGlobFilter", ingestion["format"]["pathGlobFilter"]) \
            .option("sep", ingestion["format"]["sep"]) \
            .load(ingestion["source_path"])
        df.write.format(format_type) \
            .save(ingestion["dest_path"])

