# Data Ingestion Notebook

This notebook handles the ingestion of raw data from AWS S3 into the data lake.

## Steps:
1. Load customer data from multiple CSV files
2. Load transaction data
3. Load fiscal calendar data
4. Perform initial data validation
5. Store raw data in Delta Lake format

In [0]:
# Fetch the AWS key pair from the "aws-keys" Databricks secret scope so the
# credentials never appear in the notebook source.
access = dbutils.secrets.get(scope="aws-keys", key="aws-access-key")
secret = dbutils.secrets.get(scope="aws-keys", key="aws-secret-key")

# Apply the S3A settings to the active Spark session in one pass.
for conf_key, conf_value in (
    ("fs.s3a.access.key", access),
    ("fs.s3a.secret.key", secret),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
):
    spark.conf.set(conf_key, conf_value)


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging

# Retrieve AWS credentials from Databricks Secrets (scope "aws-keys").
access = dbutils.secrets.get("aws-keys", "aws-access-key")
secret = dbutils.secrets.get("aws-keys", "aws-secret-key")

# Build the Spark session with Delta Lake and S3 support.
# The credential settings must use the `access` / `secret` variables defined
# just above; any other name is undefined on a fresh kernel.
spark = (
    SparkSession.builder
    .appName("PsychoBunny-DataIngestion")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.access.key", access)
    .config("spark.hadoop.fs.s3a.secret.key", secret)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)

# Also set the S3A options on the session's runtime configuration.
# NOTE(review): if a session already exists (presumably the case on a
# Databricks cluster), getOrCreate() returns it, so these runtime settings are
# likely the ones that take effect - confirm before removing either copy.
spark.conf.set("fs.s3a.access.key", access)
spark.conf.set("fs.s3a.secret.key", secret)
spark.conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# Notebook logger. basicConfig(INFO) configures the root logger, which also
# enables py4j's per-command INFO records; raise those loggers to WARNING so
# cell outputs are not flooded with "Received command c on object id p0".
logging.basicConfig(level=logging.INFO)
for noisy_logger in ("py4j", "py4j.clientserver"):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
logger.info("Spark session initialized with Delta Lake and AWS S3 support")


INFO:__main__:Spark session initialized with Delta Lake and AWS S3 support


In [0]:
# Configuration: bucket name and the S3 prefixes derived from it.
S3_BUCKET = "psycho-bunny-data-lake"
LANDING_ZONE_PATH = f"s3://{S3_BUCKET}/landing-zone/"  # incoming source files
RAW_DATA_PATH = f"s3://{S3_BUCKET}/raw-data/"  # Delta output location


INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Read every customer CSV in the landing zone into a single DataFrame.
# "mergeSchema" is requested so files with differing columns are combined.
# NOTE(review): confirm the runtime honours this option for CSV sources.
customers_csv_glob = f"s3://{S3_BUCKET}/landing-zone/customers/*.csv"
customer_reader = spark.read.option("header", "true").option("mergeSchema", "true")

# Derived key: lower-cased first name, last name and email joined with "_".
customer_key = concat_ws(
    "_",
    lower(col("first_name")),
    lower(col("last_name")),
    lower(col("email")),
)

customers_raw = (
    customer_reader.csv(customers_csv_glob)
    .withColumn("source_file", input_file_name())       # originating file per row
    .withColumn("ingestion_date", current_timestamp())  # load timestamp
    .withColumn("customer_id", customer_key)
    .dropDuplicates(["customer_id"])                     # one row per derived key
)

print(f"Loaded {customers_raw.count()} unique customers")
customers_raw.show(10, False)

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clie

Loaded 2000 unique customers


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


+----------+---------+-----------------------------+-------------------------+---------------+--------------+-------+------------+------------+--------------------------------+---------------------------------------+--------+-----+-----+----+---------------------------------------------------------------------------------+--------------------------+-------------------------------------------------+
|first_name|last_name|company_name                 |address                  |city           |county        |postal |phone1      |phone2      |email                           |web                                    |province|state|zip  |post|source_file                                                                      |ingestion_date            |customer_id                                      |
+----------+---------+-----------------------------+-------------------------+---------------+--------------+-------+------------+------------+--------------------------------+--------------------

INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Load the shop transactions extract from the landing zone.
# The file name is defined once so the read path and the "source_file"
# lineage column cannot drift apart.
transactions_file = "de_shop_transactions_20230821.csv"
transactions_path = "s3://{}/landing-zone/transactions/{}".format(S3_BUCKET, transactions_file)

try:
    transactions_raw = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(transactions_path)
        .withColumn("source_file", lit(transactions_file))
        .withColumn("ingestion_date", current_timestamp())
    )

    logger.info(f"Loaded transaction data: {transactions_raw.count()} records")
    transactions_raw.show(5)
except Exception as e:
    # Log with traceback, then re-raise: swallowing the failure would let later
    # cells run against an undefined (or stale, from a previous run)
    # `transactions_raw`.
    logger.exception(f"Error loading transactions: {str(e)}")
    raise

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Loaded transaction data: 2823 records
INFO:py4j.clientserver:Received command c on object id p0


+-----------+---------------+---------------+------------+---------------+------+--------+-------+-----------+--------------------+----------+--------------------+------------+-------------+-----+----------+-------+---------+---------------+----------------+--------+--------------------+--------------------+
|ORDERNUMBER|QUANTITYORDERED|ORDERLINENUMBER|TOTAL_AMOUNT|      ORDERDATE|QTR_ID|MONTH_ID|YEAR_ID|PRODUCTCODE|        CUSTOMERNAME|     PHONE|        ADDRESSLINE1|ADDRESSLINE2|         CITY|STATE|POSTALCODE|COUNTRY|TERRITORY|CONTACTLASTNAME|CONTACTFIRSTNAME|DEALSIZE|         source_file|      ingestion_date|
+-----------+---------------+---------------+------------+---------------+------+--------+-------+-----------+--------------------+----------+--------------------+------------+-------------+-----+----------+-------+---------+---------------+----------------+--------+--------------------+--------------------+
|      10107|             30|              2|       -1993| 2/24/2003 0

In [0]:
# Load the fiscal calendar extract from the landing zone.
# The file name is defined once so the read path and the "source_file"
# lineage column cannot drift apart.
calendar_file = "de_dates.csv"
calendar_path = "s3://{}/landing-zone/calendar/{}".format(S3_BUCKET, calendar_file)

try:
    calendar_raw = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(calendar_path)
        .withColumn("source_file", lit(calendar_file))
        .withColumn("ingestion_date", current_timestamp())
    )

    logger.info(f"Loaded calendar data: {calendar_raw.count()} records")
    calendar_raw.show(5)
except Exception as e:
    # Log with traceback, then re-raise: swallowing the failure would let later
    # cells run against an undefined (or stale, from a previous run)
    # `calendar_raw`.
    logger.exception(f"Error loading calendar: {str(e)}")
    raise

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Loaded calendar data: 6944 records
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


+-------------+--------------+------------+--------------------+-------------------+-------------------+-----------------+------------------------+-----------------------+-------------------------+------------------------+-------------------+--------------+-----------+----------------+-----------------+-------------------+----------------------+---------------------+---------------------------+--------------------------+--------------------------+-------------------------+-------+------------+--------------------+
|CALENDAR_DATE|WEEKDAY_NUMBER|WEEKDAY_NAME|FISCAL_WEEK_OF_MONTH|FISCAL_WEEK_OF_YEAR|FISCAL_MONTH_NUMBER|FISCAL_MONTH_NAME|FISCAL_FIRST_DAY_OF_WEEK|FISCAL_LAST_DAY_OF_WEEK|FISCAL_FIRST_DAY_OF_MONTH|FISCAL_LAST_DAY_OF_MONTH|FISCAL_DAY_OF_MONTH|FISCAL_QUARTER|FISCAL_YEAR|FISCAL_YEAR_WEEK|FISCAL_YEAR_MONTH|FISCAL_YEAR_QUARTER|CALENDAR_WEEK_OF_MONTH|CALENDAR_WEEK_OF_YEAR|CALENDAR_FIRST_DAY_OF_MONTH|CALENDAR_LAST_DAY_OF_MONTH|CALENDAR_FIRST_DAY_OF_YEAR|CALENDAR_LAST_DAY_OF_YEAR|WEE

In [0]:
# Store raw data in Delta Lake format.
# Each DataFrame is written (overwrite mode) to its own folder under
# RAW_DATA_PATH; the folder name is the dictionary key.
raw_outputs = {
    "customers": customers_raw,
    "transactions": transactions_raw,
    "calendar": calendar_raw,
}

try:
    for dataset_name, dataset_df in raw_outputs.items():
        (
            dataset_df.write
            .format("delta")
            .mode("overwrite")
            .save(f"{RAW_DATA_PATH}{dataset_name}")
        )

    logger.info("Raw data successfully stored in Delta Lake format")
except Exception as e:
    # A failed write must stop the notebook: if it were only logged, the run
    # would continue and later report a successful ingestion even though some
    # datasets were never stored.
    logger.exception(f"Error storing raw data: {str(e)}")
    raise

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:

In [0]:
# Summary of the run: row count of each in-memory DataFrame and the output
# location. Each count() triggers a Spark action.
print(f"Customers loaded: {customers_raw.count()}")
print(f"Transactions loaded: {transactions_raw.count()}")
print(f"Calendar loaded: {calendar_raw.count()}")
print(f"Data stored at: {RAW_DATA_PATH}")

logger.info("Data ingestion completed successfully")

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Customers loaded: 2000


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Transactions loaded: 2823


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Data ingestion completed successfully


Calendar loaded: 6944
Data stored at: s3://psycho-bunny-data-lake/raw-data/
