# AdWiseAI – Data Ingestion (Sprint 1)





## Azure ADLS2 Ingestion

In [0]:
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient
from pyspark.sql import SparkSession
import os

# Ingest the ads dataset: download the CSV from ADLS Gen2 to DBFS, load it
# into a Spark DataFrame, and persist it as a Delta table.

# Initialize Spark session (getOrCreate reuses an existing session if present)
spark = SparkSession.builder.getOrCreate()

# ADLS configuration
ACCOUNT_NAME = "llmops"
FILE_SYSTEM_NAME = "raw"
ADS_FILE_PATH = "ads_dataset.csv"
LOCAL_TEMP_PATH = "/dbfs/tmp/ads_dataset.csv"  # Use /dbfs so Spark can access it
# Same file expressed as a Spark URI ("dbfs:/tmp/ads_dataset.csv"); derived from
# LOCAL_TEMP_PATH so the write path and the read path cannot drift apart.
SPARK_TEMP_PATH = LOCAL_TEMP_PATH.replace("/dbfs", "dbfs:", 1)

# Target Delta table
DATABASE_NAME = "adwiseai"
TABLE_NAME = f"{DATABASE_NAME}.ads_dataset_raw"

# Authenticate with DefaultAzureCredential. It tries a chain of credential
# sources (environment variables, managed identity, ...) and uses the first
# one that works, so the cluster must expose at least one of them.
credential = DefaultAzureCredential()
service_client = DataLakeServiceClient(
    account_url=f"https://{ACCOUNT_NAME}.dfs.core.windows.net",
    credential=credential
)

# Get the file system client
file_system_client = service_client.get_file_system_client(FILE_SYSTEM_NAME)

# Optional: List files to verify the dataset path
print("Listing files in container:")
for path in file_system_client.get_paths():
    print(" -", path.name)

# Download the CSV file from ADLS, save it to DBFS, and load it with Spark
try:
    ads_file_client = file_system_client.get_file_client(ADS_FILE_PATH)
    downloaded = ads_file_client.download_file()
    ads_csv_data = downloaded.readall()

    with open(LOCAL_TEMP_PATH, "wb") as f:
        f.write(ads_csv_data)

    # Load CSV into Spark DataFrame
    df = spark.read.option("header", True).option("inferSchema", True).csv(SPARK_TEMP_PATH)
except Exception as e:
    print("Error downloading or reading file:")
    print(str(e))
    # Re-raise so the cell/job is marked as failed; swallowing the error would
    # let later cells run against a missing or stale `df`.
    raise

display(df)
df.printSchema()

# The database must exist BEFORE it is selected: setCurrentDatabase raises on a
# database that does not exist, so create it first (a no-op when it already exists).
spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")
spark.catalog.setCurrentDatabase(DATABASE_NAME)

# Save as Delta table, replacing any previous contents
df.write.format("delta").mode("overwrite").saveAsTable(TABLE_NAME)
print(f"Ingestion complete. Delta table saved: {TABLE_NAME}")



## Local Ingestion

In [0]:

# # Path to the workspace CSV file
# # Copy file from workspace path to DBFS
# source_path = "file:/Workspace/Users/njdejong99@gmail.com/adwiseai/data/ads_dataset.csv"
# target_path = "dbfs:/tmp/ads_dataset.csv"

# dbutils.fs.cp(source_path, target_path)

# file_path = "file:/Workspace/Users/njdejong99@gmail.com/adwiseai/data/ads_dataset.csv"

# # Read into Spark DataFrame
# df = spark.read.option("header", True).option("inferSchema", True).csv(file_path)

# display(df)

# # Show schema
# df.printSchema()


# #spark.sql("CREATE DATABASE IF NOT EXISTS adwiseai") ## USE WHEN INITIALIZING DB

# spark.catalog.setCurrentDatabase("adwiseai")

# df.write.format("delta").mode("overwrite").saveAsTable("adwiseai.ads_dataset_raw")

