# Data Ingestion Notebook
This notebook loads transaction and product data from ADLS Gen2, joins them, and performs basic analytics.

In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() returns the session that is
# already active if there is one, otherwise it starts a new one with this app name.
spark = (
    SparkSession.builder
    .appName("DataIngestion")
    .getOrCreate()
)

In [None]:
# ADLS Gen2 location of the input files.
# Replace the two placeholders below with your actual container and storage
# account. Both dataset paths are derived from them, so they only need to be
# changed in one place and cannot drift apart.
container_name = "<your-container-name>"
storage_account_name = "<your-storage-account-name>"
adls_root = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"

transaction_path = f"{adls_root}/transactions.csv"
product_path = f"{adls_root}/products.csv"

In [None]:
# Read both CSV files, using the first line of each file as the column names.
# NOTE: no schema is supplied and schema inference is not enabled, so every
# column is loaded as a string.
df_transactions = spark.read.csv(transaction_path, header=True)
df_products = spark.read.csv(product_path, header=True)

# Preview the first five rows of each DataFrame
df_transactions.show(5)
df_products.show(5)

In [None]:
# Inner-join transactions to products on the shared product_id key.
# The key is passed by name rather than as an equality expression: the
# expression form keeps BOTH product_id columns in the result, which makes any
# later reference to "product_id" (groupBy, col(...)) ambiguous. Joining by
# name yields a single product_id column.
df_joined = df_transactions.join(df_products, on="product_id", how="inner")
df_joined.show(5)

In [None]:
# Calculate average order value for each customer
# NOTE(review): this averages the "price" column per customer_id; it assumes
# price is the value of one order line - confirm against the input schema.
from pyspark.sql.functions import avg, col

# The CSV inputs are read without a schema, so "price" arrives as a string.
# Cast it explicitly instead of relying on implicit string-to-number coercion.
avg_order_value = df_joined.groupBy("customer_id").agg(
    avg(col("price").cast("double")).alias("avg_order_value")
)
avg_order_value.show(5)

In [None]:
# Rank products by the number of joined rows that reference them
from pyspark.sql.functions import count

popular_products = (
    df_joined
    .groupBy("product_id", "product_name")
    .agg(count("*").alias("orders"))
    # Highest order count first
    .sort("orders", ascending=False)
)
popular_products.show(5)

In [None]:
# Data quality checks
from pyspark.sql.functions import col, count, when

# Null count per column over the whole joined DataFrame.
# when() without otherwise() yields null for rows that do not match, and
# count() skips nulls, so each aggregate counts exactly the rows in which that
# column is null (and reports 0 for an empty DataFrame).
null_counts = df_joined.select(
    [count(when(col(c).isNull(), 1)).alias(c + "_null_count") for c in df_joined.columns]
)
null_counts.show()

# Row-level null flags for a five-row sample (useful for eyeballing, but not a
# substitute for the full counts above)
df_joined.select([col(c).isNull().alias(c + "_is_null") for c in df_joined.columns]).show(5)