In [None]:
import os 
import logging
# Install a root handler so INFO records from this notebook are actually
# emitted. Without any handler, logging falls back to its last-resort handler,
# which only prints WARNING and above, so every logger.info() call is dropped.
# basicConfig() is a no-op if the root logger already has handlers, and it
# leaves the root level untouched so third-party loggers stay at WARNING.
logging.basicConfig()

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Log an info message (doubles as a smoke test of the logging setup)
logger.info("Application started successfully.")


In [None]:
# Every value below is required: a missing environment variable raises KeyError.

# Azure storage account name and key (used for the fs.azure.account.key setting).
storage_account = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
access_key = os.environ["AZURE_STORAGE_ACCOUNT_ACCESS_KEY"]

# Nessie catalog location, branch/reference name and warehouse path.
NESSIE_URI = os.environ["NESSIE_URI"]
REF = "main"
FULL_PATH_TO_WAREHOUSE = os.environ["WAREHOUSE"]

# S3-compatible object store endpoint and credentials (used for the fs.s3a.* settings).
AWS_S3_ENDPOINT = os.environ["AWS_S3_ENDPOINT"]
AWS_ACCESS_KEY = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]

In [None]:
from pyspark import SparkConf

# Base Spark settings: local master using all cores, fixed executor/driver memory.
conf = (
    SparkConf()
    .setAppName("Read_csv_from_azure_blob")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "2g")
)

# Azure settings: filesystem implementation and the storage account key.
conf.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
conf.set(f"fs.azure.account.key.{storage_account}.dfs.core.windows.net", access_key)

# Nessie catalog settings for the S3-compatible store (MinIO), currently disabled.
# NOTE(review): later cells write to the `nessie` catalog; presumably it is
# configured outside this notebook - confirm.
# conf.set("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT)
# conf.set("spark.sql.catalog.nessie.ref", REF)
# conf.set("spark.sql.catalog.nessie.authentication.type", "NONE")
# conf.set("spark.sql.catalog.nessie.s3.access-key-id", AWS_ACCESS_KEY)
# conf.set("spark.sql.catalog.nessie.s3.secret-access-key", AWS_SECRET_KEY)
# conf.set("spark.sql.catalog.nessie.s3.path-style-access", "true")

# S3A connector settings: credentials, custom endpoint, SSL disabled,
# path-style bucket addressing.
(
    conf.set("fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("fs.s3a.endpoint", AWS_S3_ENDPOINT)
    .set("fs.s3a.connection.ssl.enabled", "false")
    .set("fs.s3a.path.style.access", "true")
)


In [None]:
from pyspark.sql import SparkSession

# Start a session with the configuration built above, or reuse the one that
# already exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

# Read sales from Azure Blob Storage

In [None]:
# Name of the storage container that holds the source folders.
container = "datalake"
# Root abfss:// URI of that container; the read cells append "/<folder>" to it.
endpoint = f"abfss://{container}@{storage_account}.dfs.core.windows.net"

In [None]:
from pyspark.sql.functions import input_file_name, current_timestamp
def extract_csv(source_folder, save_path):
    """Copy the CSV files of one source folder to the S3A object store.

    Reads every ``*.csv`` file (header row, ``;`` separator) found under
    ``{endpoint}/{source_folder}``, adds two lineage columns and writes the
    result as CSV with a header to ``s3a://{save_path}``.

    Relies on the notebook-level names ``spark``, ``endpoint`` and ``logger``.

    Args:
        source_folder: Folder to read, relative to ``endpoint``.
        save_path: Destination without the scheme; ``"s3a://"`` is prepended.

    Returns:
        None.
    """
    # Lazy %-style arguments: passing a value without a placeholder makes the
    # logging module report a formatting error instead of logging the value.
    logger.info("read csv from: %s", source_folder)

    # Read from the folder passed in by the caller, not from a notebook global
    # that merely happens to be called `folder`.
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("pathGlobFilter", "*.csv")
        .option("sep", ";")
        .load(f"{endpoint}/{source_folder}")
    )

    # Lineage columns: originating file and time of ingestion.
    df = (
        df.withColumn("source_filepath", input_file_name())
        .withColumn("ingestion_datetime", current_timestamp())
    )

    save_dir = "s3a://" + save_path
    logger.info("save csv to: %s", save_dir)
    (
        df.write.format("csv")
        .option("header", "true")
        .save(save_dir)
    )


In [None]:
# Ad-hoc read of a single CSV object from the S3A store, using the reader's
# default options. The resulting DataFrame is the cell's displayed value.
(
    spark.read
    .format("csv")
    .load("s3a://seed/sales/sales_2010.csv")
)


In [None]:
# Source folder (relative to `endpoint`) and destination (relative to "s3a://")
# for the sales extract. `folder` is also read by the following read cell.
folder = "sales"
save = "seed/sales/sales.csv"

extract_csv(source_folder=folder, save_path=save)

In [None]:
from pyspark.sql.functions import input_file_name, current_timestamp
# Load every *.csv file (header row, ';' separator) from the current `folder`.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("pathGlobFilter", "*.csv")
    .option("sep", ";")
    .load(f"{endpoint}/{folder}")
)

# Add lineage columns: originating file path and ingestion time.
df = (
    df.withColumn("source_filepath", input_file_name())
    .withColumn("ingestion_datetime", current_timestamp())
)

df.show()


In [None]:
# Create the table from the current `df`, or replace it if it already exists.
df.writeTo("nessie.adventureWorks.sales").createOrReplace()

In [None]:
%%sql
SELECT *
FROM nessie.adventureWorks.sales
LIMIT 10

# Read Product

In [None]:
# Source folder (relative to `endpoint`) that the next read cell loads.
folder = "product"

In [None]:
# Load every *.csv file (header row, ';' separator) from the current `folder`.
csv_reader = spark.read.format("csv")
df = (
    csv_reader
    .option("header", "true")
    .option("pathGlobFilter", "*.csv")
    .option("sep", ";")
    .load(f"{endpoint}/{folder}")
)

# Add lineage columns: originating file path and ingestion time.
df = df.withColumn("source_filepath", input_file_name())
df = df.withColumn("ingestion_datetime", current_timestamp())

df.show()

In [None]:
# Create the table from the current `df`, or replace it if it already exists.
df.writeTo("nessie.adventureWorks.product").createOrReplace()

# Read currency_rate

In [None]:
# Source folder (relative to `endpoint`) that the next read cell loads.
folder = "currency_rate"

In [None]:
# Load every *.csv file (header row, ';' separator) from the current `folder`,
# then add the lineage columns (originating file path and ingestion time).
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("pathGlobFilter", "*.csv")
    .option("sep", ";")
    .load(f"{endpoint}/{folder}")
    .withColumn("source_filepath", input_file_name())
    .withColumn("ingestion_datetime", current_timestamp())
)

df.show()

In [None]:
# Create the table from the current `df`, or replace it if it already exists.
df.writeTo("nessie.adventureWorks.currency_rate").createOrReplace()

In [None]:
%%sql
SELECT *
FROM nessie.adventureWorks.currency_rate
LIMIT 10