# Data Ingestion Notebook

This notebook handles the ingestion of raw data from AWS S3 into the data lake.

## Steps:
1. Load customer data from multiple CSV files
2. Load transaction data
3. Load fiscal calendar data
4. Perform initial data validation
5. Store raw data in Delta Lake format

In [0]:
access = dbutils.secrets.get("aws-keys", key="aws-access-key")
secret = dbutils.secrets.get("aws-keys", key="aws-secret-key")

spark.conf.set("fs.s3a.access.key", access)
spark.conf.set("fs.s3a.secret.key", secret)
spark.conf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")   


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import logging

# Retrieve AWS credentials from Databricks Secrets
access = dbutils.secrets.get("aws-keys", "aws-access-key")
secret = dbutils.secrets.get("aws-keys", "aws-secret-key")

# Build the Spark session with Delta Lake and S3 support
spark = (
    SparkSession.builder
    .appName("PsychoBunny-DataIngestion")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.hadoop.fs.s3a.access.key", access)
    .config("spark.hadoop.fs.s3a.secret.key", secret)
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .getOrCreate()
)

spark.conf.set("fs.s3a.access.key", access)
spark.conf.set("fs.s3a.secret.key", secret)
spark.conf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")   

# logger code
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Spark session initialized with Delta Lake and AWS S3 support")


INFO:__main__:Spark session initialized with Delta Lake and AWS S3 support
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Configuration
S3_BUCKET = "psycho-bunny-data-lake"
LANDING_ZONE_PATH = "s3://{}/landing-zone/".format(S3_BUCKET)
RAW_DATA_PATH = "s3://{}/raw-data/".format(S3_BUCKET)


In [0]:
# Column mapping function to fix schema differences
def customer_schema_validation(df):
    """Map different column names to standard names"""
    column_mapping = {
        "zip": "postal_code", 
        "post": "postal_code", 
        "postal": "postal_code",
        "province": "state"
    }
    
    for old_col, new_col in column_mapping.items():
        if old_col in df.columns:
            df = df.withColumnRenamed(old_col, new_col)
    
    return df


In [0]:
# Step 1: Get list of all customer CSV files
file_list = spark.read.option("header", "true").csv(f"s3://{S3_BUCKET}/landing-zone/customers/*.csv") \
    .withColumn("source_file", input_file_name()) \
    .select("source_file").distinct().collect()

file_paths = [row.source_file for row in file_list]

# Step 2: Read each file, apply schema validation, and collect
all_customers = []
for file_path in file_paths:
    df = spark.read.option("header", "true").csv(file_path) \
        .withColumn("source_file", lit(file_path))
    
    df_mapped = customer_schema_validation(df)
    all_customers.append(df_mapped)

# Step 3: Union all dataframes
customers_raw = all_customers[0]
for df in all_customers[1:]:
    customers_raw = customers_raw.unionByName(df,allowMissingColumns=True)

# Step 4: Add final processing
customers_raw = customers_raw \
    .withColumn("ingestion_date", current_timestamp()) \
    .withColumn("customer_id", concat_ws("_", lower(col("first_name")), lower(col("last_name")), lower(col("email")))) \
    .dropDuplicates(["customer_id"])

print(f"Loaded {customers_raw.count()} unique customers")
display(customers_raw.limit(30))

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Loaded 2000 unique customers


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


first_name,last_name,company_name,address,city,county,postal_code,phone1,phone2,email,web,source_file,state,ingestion_date,customer_id
Aaron,Kloska,"Radecker, H Philip Jr",423 S Navajo St #56,Brookhill,,4816,07-9896-4827,0473-600-733,aaron_kloska@kloska.net.au,http://www.radeckerhphilipjr.com.au,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20240701.csv,QLD,2025-07-01T21:00:07.216645Z,aaron_kloska_aaron_kloska@kloska.net.au
Abel,Maclead,Rangoni Of Florence,37275 St Rt 17m M,Middle Island,Suffolk,11953,631-335-3414,631-677-3675,amaclead@gmail.com,http://www.rangoniofflorence.com,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20240614.csv,NY,2025-07-01T21:00:07.216645Z,abel_maclead_amaclead@gmail.com
Abraham,Cratch,"Cavuto, John A",41 Benedict St,Aldborough Ward,Greater London,IG2 7QG,01599-245408,01695-305111,acratch@gmail.com,http://www.cavutojohna.co.uk,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20230619.csv,,2025-07-01T21:00:07.216645Z,abraham_cratch_acratch@gmail.com
Adaline,Galagher,Debbie Reynolds Hotel,32716 N Michigan Ave #82,Barooga,,3644,02-3225-1954,0416-156-336,adaline.galagher@galagher.com.au,http://www.debbiereynoldshotel.com.au,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20240701.csv,NSW,2025-07-01T21:00:07.216645Z,adaline_galagher_adaline.galagher@galagher.com.au
Adela,Echegoyen,Fpa Corp,128 W Kellogg Dr,Burnaby,,V5B 4L5,604-571-8392,604-693-8094,adela.echegoyen@echegoyen.org,http://www.fpacorp.com,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20230901.csv,BC,2025-07-01T21:00:07.216645Z,adela_echegoyen_adela.echegoyen@echegoyen.org
Adelaide,Ender,Williams Design Group,175 N Central Ave,Greenslopes,,4120,07-7538-5504,0473-505-816,aender@gmail.com,http://www.williamsdesigngroup.com.au,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20240701.csv,QLD,2025-07-01T21:00:07.216645Z,adelaide_ender_aender@gmail.com
Adelina,Nabours,Courtyard By Marriott,80 Pittsford Victor Rd #9,Cleveland,Cuyahoga,44103,216-230-4892,216-937-5320,adelina_nabours@gmail.com,http://www.courtyardbymarriott.com,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20240614.csv,OH,2025-07-01T21:00:07.216645Z,adelina_nabours_adelina_nabours@gmail.com
Adell,Lipkin,Systems Graph Inc Ab Dick Dlr,65 Mountain View Dr,Whippany,Morris,07981,973-654-1561,973-662-8988,adell.lipkin@lipkin.com,http://www.systemsgraphincabdickdlr.com,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20240614.csv,NJ,2025-07-01T21:00:07.216645Z,adell_lipkin_adell.lipkin@lipkin.com
Adelle,Nitcher,John J Mccarthy Agency Inc,10 Midway Dr,Thorold,,L2V 5C7,905-998-3758,905-848-6892,anitcher@aol.com,http://www.johnjmccarthyagencyinc.com,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20230901.csv,ON,2025-07-01T21:00:07.216645Z,adelle_nitcher_anitcher@aol.com
Adelle,Schantini,Creative Photography Inc,162 Grayson St,Greenlands Ward,Lancashire,FY2 0TD,01624-595660,01938-262356,adelle_schantini@yahoo.com,http://www.creativephotographyinc.co.uk,s3://psycho-bunny-data-lake/landing-zone/customers/de_shop_customers_20230619.csv,,2025-07-01T21:00:07.216645Z,adelle_schantini_adelle_schantini@yahoo.com


In [0]:
from pyspark.sql import functions as F

df=customers_raw.filter(F.col('first_name') == 'Kwai')
display(df)

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


first_name,last_name,company_name,address,city,county,postal,phone1,phone2,email,web,province,state,zip,post,source_file,ingestion_date,customer_id


INFO:py4j.clientserver:Received command c on object id p0


In [0]:
#  check transaction data
transactions_path = "s3://{}/landing-zone/transactions/de_shop_transactions_20230821.csv".format(S3_BUCKET)

try:
    transactions_raw = spark.read.option("header", "true").option("inferSchema", "true").csv(transactions_path)
    transactions_raw = transactions_raw.withColumn("source_file", lit("de_shop_transactions_20230821.csv"))
    transactions_raw = transactions_raw.withColumn("ingestion_date", current_timestamp())
    
    logger.info(f"Loaded transaction data: {transactions_raw.count()} records")
    display(transactions_raw)
except Exception as e:
    logger.error(f"Error loading transactions: {str(e)}")

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Loaded transaction data: 2823 records
INFO:py4j.clientserver:Received command c on object id p0


ORDERNUMBER,QUANTITYORDERED,ORDERLINENUMBER,TOTAL_AMOUNT,ORDERDATE,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE,source_file,ingestion_date
10107,30,2,-1993,2/24/2003 0:00,1,2,2021,S10_1678,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Small,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10121,34,5,-92,5/7/2003 0:00,2,5,2021,S10_1678,Reims Collectables,26.47.1555,59 rue de l'Abbaye,,Reims,,51100,France,EMEA,Henriot,Paul,Small,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10134,41,2,-574,7/1/2003 0:00,3,7,2024,S10_1678,Lyon Souveniers,#ERROR!,27 rue du Colonel Pierre Avia,,Paris,,75508,France,EMEA,Da Cunha,Daniel,Medium,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10145,45,6,-623,8/25/2003 0:00,3,8,2020,S10_1678,Toys4GrownUps.com,6265557265,78934 Hillside Dr.,,Pasadena,CA,90003,USA,,Young,Julie,Medium,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10159,49,14,-442,10/10/2003 0:00,4,10,2022,S10_1678,Corporate Gift Ideas Co.,6505551386,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10168,36,1,1540,10/28/2003 0:00,4,10,2022,S10_1678,Technics Stores Inc.,6505556809,9408 Furth Circle,,Burlingame,CA,94217,USA,,Hirano,Juri,Medium,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10180,29,9,-1669,11/11/2003 0:00,4,11,2023,S10_1678,Daedalus Designs Imports,20.16.1555,"184, chausse de Tournai",,Lille,,59000,France,EMEA,Rance,Martine,Small,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10188,48,1,803,11/18/2003 0:00,4,11,2020,S10_1678,Herkku Gifts,#ERROR!,"Drammen 121, PR 744 Sentrum",,Bergen,,N 5804,Norway,EMEA,Oeztan,Veysel,Medium,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10201,22,2,1341,12/1/2003 0:00,4,12,2022,S10_1678,Mini Wheels Co.,6505555787,5557 North Pendale Street,,San Francisco,CA,,USA,,Murphy,Julie,Small,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z
10211,41,14,-741,1/15/2004 0:00,1,1,2023,S10_1678,Auto Canal Petit,(1) 47.55.6555,"25, rue Lauriston",,Paris,,75016,France,EMEA,Perrier,Dominique,Medium,de_shop_transactions_20230821.csv,2025-07-01T21:17:06.993825Z


In [0]:
# Testing one example
from pyspark.sql import functions as F

df=transactions_raw.filter(F.col('CONTACTFIRSTNAME') == 'Kwai')
display(df)

INFO:py4j.clientserver:Received command c on object id p0


ORDERNUMBER,QUANTITYORDERED,ORDERLINENUMBER,TOTAL_AMOUNT,ORDERDATE,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE,source_file,ingestion_date
10107,30,2,-1993,2/24/2003 0:00,1,2,2021,S10_1678,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Small,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10329,42,1,-608,11/15/2004 0:00,4,11,2021,S10_1678,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10107,39,5,1671,2/24/2003 0:00,1,2,2021,S10_2016,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10329,20,2,-603,11/15/2004 0:00,4,11,2021,S10_2016,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10107,27,4,1764,2/24/2003 0:00,1,2,2023,S10_4698,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10329,26,3,-1802,11/15/2004 0:00,4,11,2020,S10_4698,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10248,20,3,-629,5/7/2004 0:00,2,5,2024,S10_4757,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Small,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10329,41,5,644,11/15/2004 0:00,4,11,2021,S12_1099,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Small,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10107,21,1,759,2/24/2003 0:00,1,2,2020,S12_2823,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z
10329,24,6,1279,11/15/2004 0:00,4,11,2021,S12_2823,Land of Toys Inc.,2125557818,897 Long Airport Avenue,,NYC,NY,10022,USA,,Yu,Kwai,Medium,de_shop_transactions_20230821.csv,2025-06-23T16:15:51.292647Z


In [0]:
#  fiscal calendar data
calendar_path = "s3://{}/landing-zone/calendar/de_dates.csv".format(S3_BUCKET)

try:
    calendar_raw = spark.read.option("header", "true").option("inferSchema", "true").csv(calendar_path)
    calendar_raw = calendar_raw.withColumn("source_file", lit("de_dates.csv"))
    calendar_raw = calendar_raw.withColumn("ingestion_date", current_timestamp())
    
    logger.info(f"Loaded calendar data: {calendar_raw.count()} records")
    display(calendar_raw)
except Exception as e:
    logger.error(f"Error loading calendar: {str(e)}")

INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Loaded calendar data: 6944 records
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


CALENDAR_DATE,WEEKDAY_NUMBER,WEEKDAY_NAME,FISCAL_WEEK_OF_MONTH,FISCAL_WEEK_OF_YEAR,FISCAL_MONTH_NUMBER,FISCAL_MONTH_NAME,FISCAL_FIRST_DAY_OF_WEEK,FISCAL_LAST_DAY_OF_WEEK,FISCAL_FIRST_DAY_OF_MONTH,FISCAL_LAST_DAY_OF_MONTH,FISCAL_DAY_OF_MONTH,FISCAL_QUARTER,FISCAL_YEAR,FISCAL_YEAR_WEEK,FISCAL_YEAR_MONTH,FISCAL_YEAR_QUARTER,CALENDAR_WEEK_OF_MONTH,CALENDAR_WEEK_OF_YEAR,CALENDAR_FIRST_DAY_OF_MONTH,CALENDAR_LAST_DAY_OF_MONTH,CALENDAR_FIRST_DAY_OF_YEAR,CALENDAR_LAST_DAY_OF_YEAR,WEEKEND,source_file,ingestion_date
2010-01-31,1,Sun,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,1,1,2010,201001,201001,20101,5,4,2010-01-01,2010-01-31,2010-01-01,2010-12-31,True,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-01,2,Mon,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,2,1,2010,201001,201001,20101,1,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-02,3,Tue,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,3,1,2010,201001,201001,20101,1,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-03,4,Wed,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,4,1,2010,201001,201001,20101,1,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-04,5,Thu,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,5,1,2010,201001,201001,20101,1,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-05,6,Fri,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,6,1,2010,201001,201001,20101,2,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-06,7,Sat,1,1,1,Feb,2010-01-31,2010-02-06,2010-01-31,2010-02-27,7,1,2010,201001,201001,20101,2,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,True,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-07,1,Sun,2,2,1,Feb,2010-02-07,2010-02-13,2010-01-31,2010-02-27,8,1,2010,201002,201001,20101,2,5,2010-02-01,2010-02-28,2010-01-01,2010-12-31,True,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-08,2,Mon,2,2,1,Feb,2010-02-07,2010-02-13,2010-01-31,2010-02-27,9,1,2010,201002,201001,20101,2,6,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z
2010-02-09,3,Tue,2,2,1,Feb,2010-02-07,2010-02-13,2010-01-31,2010-02-27,10,1,2010,201002,201001,20101,2,6,2010-02-01,2010-02-28,2010-01-01,2010-12-31,False,de_dates.csv,2025-07-01T21:17:16.931474Z


In [0]:
# Store raw data in Delta Lake format
try:
    # Store customers
    customers_raw.withColumn("partition_date", date_format(col("ingestion_date"), "yyyy-MM-dd")) \
        .write \
        .format("delta") \
        .mode("overwrite") \
        .partitionBy("partition_date") \
        .save(f"{RAW_DATA_PATH}customers")
    
    # Store transactions
    transactions_raw.withColumn("partition_date", date_format(col("ingestion_date"), "yyyy-MM-dd")) \
        .write \
        .format("delta") \
        .mode("overwrite") \
        .partitionBy("partition_date") \
        .save(f"{RAW_DATA_PATH}transactions")
    
    # Store calendar
    calendar_raw.withColumn("partition_date", date_format(col("ingestion_date"), "yyyy-MM-dd")) \
        .write \
        .format("delta") \
        .mode("overwrite") \
        .partitionBy("partition_date") \
        .save(f"{RAW_DATA_PATH}calendar")
    
    logger.info("Raw data successfully stored in Delta Lake format")
except Exception as e:
    logger.error(f"Error storing raw data: {str(e)}")

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Raw data successfully stored in Delta Lake format


In [0]:
# Data validation summary
print("Customers loaded:", customers_raw.count())
print("Transactions loaded:", transactions_raw.count())
print("Calendar loaded:", calendar_raw.count())
print("Data stored at:", RAW_DATA_PATH)


logger.info("Data ingestion completed successfully")                                       

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Customers loaded: 2000


INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


Transactions loaded: 2823


INFO:py4j.clientserver:Received command c on object id p0
INFO:__main__:Data ingestion completed successfully


Calendar loaded: 6944
Data stored at: s3://psycho-bunny-data-lake/raw-data/
