In [0]:
import os

# Service-principal credentials are read from environment variables so that
# no secret is stored in the notebook itself.
client_id = os.environ.get('client_id')
tenant_id = os.environ.get('tenant_id')
client_secret = os.environ.get('secret_value')
storage_account = "project1azure1"

# Fail fast with a readable message. Without this check a missing tenant_id
# would be formatted into the token endpoint URL as the literal text "None",
# and the problem would only surface later as an authentication failure.
missing_env_vars = [
    env_name
    for env_name, env_value in (
        ('client_id', client_id),
        ('tenant_id', tenant_id),
        ('secret_value', client_secret),
    )
    if not env_value
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# Every setting is scoped to this storage account's DFS endpoint.
adls_host = f"{storage_account}.dfs.core.windows.net"
oauth_settings = {
    f"fs.azure.account.auth.type.{adls_host}": "OAuth",
    f"fs.azure.account.oauth.provider.type.{adls_host}": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    f"fs.azure.account.oauth2.client.id.{adls_host}": client_id,
    f"fs.azure.account.oauth2.client.secret.{adls_host}": client_secret,
    f"fs.azure.account.oauth2.client.endpoint.{adls_host}": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for conf_key, conf_value in oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

Reading data from ADLS

In [0]:
# Load the category tree CSV from the bronze container. The first row is
# treated as the header and Spark is asked to infer the column types.
category_tree_csv_path = "abfss://bronze@project1azure1.dfs.core.windows.net/inventory/category_tree.csv"

df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(category_tree_csv_path)
)

df_csv.display()

categoryid,parentid
1016,213.0
809,169.0
570,9.0
1691,885.0
536,1691.0
231,
542,378.0
1146,542.0
1140,542.0
1479,1537.0


Checking duplicates and multiple parents

In [0]:
from pyspark.sql.functions import countDistinct

# Category ids that appear on more than one row.
duplicates = (
    df_csv
    .groupBy("categoryid")
    .count()
    .where("count>1")
)
duplicates.display()


# Category ids that are linked to more than one distinct parent id.
distinct_parents = countDistinct("parentid").alias("parent_count")
duplicates1 = (
    df_csv
    .groupBy("categoryid")
    .agg(distinct_parents)
    .where("parent_count>1")
)
duplicates1.display()



categoryid,count


categoryid,parent_count


Checking circular hierarchy

In [0]:
from pyspark.sql.functions import col


# Treat the category table as a directed edge list: child -> parent.
edges = (
    df_csv
    .withColumnRenamed("categoryid", "child")
    .withColumnRenamed("parentid", "parent")
)
print("STEP 2: Renamed Columns")
edges.show()

# Rows without a parent cannot be extended upwards, so drop them.
edges = edges.filter(col("parent").isNotNull())

# Initialize direct parent-child paths (depth = 1)
paths = edges.select("child", "parent")
print("STEP 3: Initial Paths")
paths.show()

# Upper bound on how many times the paths are extended by one level.
# Each extension re-joins against the edge list, so this also bounds the cost.
max_expansions = 10

# Traverse deeper to find circular paths
for i in range(max_expansions):
    print(f"\nITERATION {i+1}: Expanding paths to depth {i+2}")

    # Join to go one level deeper: replace each path's current ancestor
    # with that ancestor's own parent.
    paths = paths.alias("p1").join(
        edges.alias("p2"),
        col("p1.parent") == col("p2.child"),
        how="inner"
    ).select(
        col("p1.child").alias("child"),
        col("p2.parent").alias("parent")
    )

    # stop if there are no more paths to explore
    if paths.rdd.isEmpty():
        print(f"✅ No more paths to explore at depth {i + 2}. Tree is clean.")
        break

    print(f"Paths after depth {i + 2}:")
    paths.orderBy("child").show()

    # A path that ends at its own starting node is a circular reference.
    cycles = paths.filter(col("child") == col("parent"))
    if cycles.count() > 0:
        print(f"🔴 Cycle detected at depth {i + 2}")
        cycles.show()
        break
else:
    # Reached only when every expansion ran without a break, i.e. there were
    # still unexplored paths at the depth limit. That is not proof of a clean
    # tree: a cycle longer than the limit would look exactly the same.
    print(
        f"⚠️ Depth limit {max_expansions + 1} reached with paths still unexplored; "
        "no cycle found so far, but the check is inconclusive. Increase max_expansions."
    )

STEP 2: Renamed Columns
+-----+------+
|child|parent|
+-----+------+
| 1016|   213|
|  809|   169|
|  570|     9|
| 1691|   885|
|  536|  1691|
|  231|  NULL|
|  542|   378|
| 1146|   542|
| 1140|   542|
| 1479|  1537|
|   83|  1621|
|  688|   893|
|  257|   312|
| 1640|   622|
|  963|  1281|
|  412|  1110|
|  948|  1110|
|  934|  1110|
|  148|  1110|
|   12|  1110|
+-----+------+
only showing top 20 rows

STEP 3: Initial Paths
+-----+------+
|child|parent|
+-----+------+
| 1016|   213|
|  809|   169|
|  570|     9|
| 1691|   885|
|  536|  1691|
|  542|   378|
| 1146|   542|
| 1140|   542|
| 1479|  1537|
|   83|  1621|
|  688|   893|
|  257|   312|
| 1640|   622|
|  963|  1281|
|  412|  1110|
|  948|  1110|
|  934|  1110|
|  148|  1110|
|   12|  1110|
| 1459|  1604|
+-----+------+
only showing top 20 rows


ITERATION 1: Expanding paths to depth 2
Paths after depth 2:
+-----+------+
|child|parent|
+-----+------+
|    0|  1482|
|    1|  1482|
|    2|   653|
|    3|   312|
|    4|   293|


Saving data into ADLS

In [0]:
# Destination folder in the silver container.
# NOTE(review): the variable name says "parquet", but the write below uses
# the Delta format.
output_path_parquet = "abfss://silver@project1azure1.dfs.core.windows.net/inventory/category_tree"

# Overwrite mode: whatever is already stored at the destination is replaced.
(
    df_csv.write
    .mode("overwrite")
    .format("delta")
    .save(output_path_parquet)
)

In [0]:
# Read the freshly written Delta table back and display it as a sanity check.
df = spark.read.load(output_path_parquet, format="delta")
df.display()

categoryid,parentid
1016,213.0
809,169.0
570,9.0
1691,885.0
536,1691.0
231,
542,378.0
1146,542.0
1140,542.0
1479,1537.0


In [0]:
from pyspark.sql.functions import col

# Rows of the source CSV frame whose category id is missing.
# NOTE(review): this rebinds `df`, which an earlier cell used for the Delta
# read-back; the checks here run against df_csv, not the saved table.
df = df_csv.where(col('categoryid').isNull())
df.display()

# Rows of the source CSV frame where a category is listed as its own parent.
df1 = df_csv.where(col('categoryid') == col('parentid'))
df1.display()

categoryid,parentid


categoryid,parentid
