# Check that source CSV files exist in ADLS

In [0]:
# ADLS Gen2 folder that is expected to hold the product-feed CSV files
csv_silver_path = "abfss://silver@qydatalake.dfs.core.windows.net/dynamicYeild/productfeed_csv/"

# Keep only the names of the folder entries that end with '.csv'
files = [
    name
    for name in (entry.name for entry in dbutils.fs.ls(csv_silver_path))
    if name.endswith(".csv")
]

# Nothing to upload: stop here with an explicit error
if not files:
    raise FileNotFoundError(f"No CSV files found in {csv_silver_path}")

# Upload the CSV file from ADLS to S3

In [0]:
# Fetch the AWS credentials from the "azure_key_vault" secret scope
aws_key = dbutils.secrets.get("azure_key_vault", "DY-AWS-S3-KEY")
aws_secret = dbutils.secrets.get("azure_key_vault", "DY-AWS-S3-SECRET")

# Hand the credentials to the S3A connector through the Spark session config
spark.conf.set("fs.s3a.access.key", aws_key)
spark.conf.set("fs.s3a.secret.key", aws_secret)
# Endpoint is set explicitly; adjust it if a different one is required
spark.conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

# ADLS folder the feed is read from
source_dir = "abfss://silver@qydatalake.dfs.core.windows.net/dynamicYeild/productfeed_csv/"

# Load the folder as one DataFrame, using the first line of each file as the header
df = spark.read.option("header", "true").csv(source_dir)

# Write the DataFrame to the S3 target, replacing whatever is already there.
# NOTE(review): Spark's CSV writer normally creates a directory of part files
# at this path rather than a single "productfeed.csv" object - confirm this is
# what the consumer of the feed expects.
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv("s3a://com.dynamicyield.feeds/8776216/productfeed.csv")
)

# Validate upload status

In [0]:
from pyspark.sql.functions import col

# 1) Load whatever now sits under the S3 target path
s3_path = "s3a://com.dynamicyield.feeds/8776216/productfeed.csv/"
df_s3 = (
    spark.read
    .option("header", "true")
    .csv(s3_path)
)

# 2) The source and the S3 copy must contain the same number of rows
count_src = df.count()
count_s3 = df_s3.count()

# A count mismatch aborts the notebook
if count_src != count_s3:
    raise ValueError(
        f"Row count mismatch: source={count_src}, s3={count_s3}"
    )
print(f"✅ Row counts match: {count_src} rows")

# 3) Compare schemas; a difference is only reported, it does not abort the run
if df.schema == df_s3.schema:
    print("✅ Schemas match exactly")
else:
    print("⚠️ Schemas differ:")
    print(" Source schema:", df.schema.simpleString())
    print(" S3    schema:", df_s3.schema.simpleString())

✅ Row counts match: 384 rows
✅ Schemas match exactly
