# Create an asset hierarchy
Ingest an Asset hierarchy from raw to clean

Fill in the parameters in Cmd 2, then run the whole notebook. It builds an asset hierarchy in CLEAN from a RAW table that holds parent/child relationships. Every column of that RAW table is copied into the metadata of the corresponding asset in CLEAN.

In [2]:
# ---------------------------------------------------------------------------
# Notebook parameters: edit these values before running the notebook.
# ---------------------------------------------------------------------------

# Credentials: the secret scope, and the key inside that scope to read.
secret_scope = "your scope"
project = "project key"

# Location in RAW of the table that holds the asset hierarchy.
raw_db = "databasename"
raw_table = "tablename"

# Columns of the RAW table.
column_parent = "parent ID"         # key of the row's parent
column_child = "ID"                 # key of the row itself
column_description = "Description"  # shown as the description of the clean asset

# Value found in the `column_child` column for the root node.
root_child = "ROOT"

# Unique name of the tree being ingested.
treename = "Tree"

In [3]:
# Existing assets, exposed to Spark SQL as the temp view "assets_clean".
# The API key is read from the configured secret scope.
assets_clean = (
    spark.read.format("com.cognite.spark.datasource")
    .options(
        type="assets",
        apiKey=dbutils.secrets.get(secret_scope, project),
        inferSchema="true",
    )
    .load()
)
assets_clean.createOrReplaceTempView("assets_clean")


# The RAW table holding the parent/child rows, exposed as the temp view "input".
input = (
    spark.read.format("com.cognite.spark.datasource")
    .options(
        type="raw",
        database=raw_db,
        table=raw_table,
        apiKey=dbutils.secrets.get(secret_scope, project),
        inferSchema="true",
    )
    .load()
)
input.createOrReplaceTempView("input")

In [4]:
import pyspark.sql.functions as F

# Flat [key, value, key, value, ...] argument list for create_map: for every
# column of the input table, the column name as a literal key followed by the
# backtick-quoted column name that supplies the value.
metadata_map_columns = [
    entry
    for name in input.schema.names
    for entry in (F.lit(name), "`{}`".format(name))
]

# One row per input row: the child key plus a metadata map of all input columns,
# extended with a "treename" entry used later to pick out assets of this tree.
lookup_metadata = input.select(
    F.col(column_child),
    F.create_map(
        *(metadata_map_columns + [F.lit("treename"), F.lit(treename)])
    ).alias("metadata"),
)
lookup_metadata.createOrReplaceTempView("lookup_metadata")

In [5]:
import pyspark.sql.functions as F

# Walk the parent/child table one layer at a time, starting at the root, and
# insert each layer into "assets_clean" before resolving the next one, because
# a child row looks up its parent's id in "assets_clean" by name.

# NOTE(review): this list is not referenced again in this cell; it is kept only
# so the notebook-level name holds the same value as before.
metadata_map_columns = ["input." + f.name for f in input.schema]

# The loop below runs while cnt <= max_depth, i.e. at most max_depth + 1 layers
# below the root are ingested.
max_depth = 20

# Layer 0: the row whose key equals `root_child`.
# (Alternative root selection: filter on `column_parent` is null instead.)
df_base = spark.sql(
    "select `" + column_child + "` as uniqueChild, "
    "cast(null as string) as uniqueParent,  "
    "cast(null as string) as parent,  `" + column_child + "` as child , "
    "0 as depth from input "
    "where `" + column_child + "` = \"" + root_child + "\""
)
df_last_it = df_base
df_last_it.createOrReplaceTempView("t_last_it")

cnt = 0
cnt_df_tmp_new = 1

# Insert the root asset; it has no parent, so parentId is null.
df_write = spark.sql(
    "SELECT child as id, null as path, null as element, "
    "t_last_it.child as name, null as parentId, "
    "`" + column_description + "` as description, "
    "lookup_metadata.metadata as metadata, null as source, "
    "null as sourceId, null as createdTime , null   as lastUpdatedTime "
    "from input, t_last_it, lookup_metadata "
    "where input.`" + column_child + "` == t_last_it.child "
    "AND lookup_metadata.`" + column_child + "` == t_last_it.child"
)
df_write.printSchema()
df_write.write.insertInto("assets_clean")

# Plain (parent, child) pairs used to step from one layer to the next.
relationships = spark.sql(
    "select `" + column_parent + "`  as parent, "
    "`" + column_child + "` as child from input"
)
relationships.createOrReplaceTempView("relationships")

# Children of the previous layer ("t_last_it"). uniqueChild is the child key
# followed by '_' and the parent's uniqueChild.
next_layer_sql = (
    "(select concat(relationships.child,concat('_',t_last_it.uniqueChild)) as uniqueChild, "
    "t_last_it.uniqueChild as uniqueParent, "
    "relationships.parent as parent, "
    "relationships.child as child  "
    "from t_last_it cross JOIN relationships "
    "ON t_last_it.child = relationships.parent )"
)

# Asset rows for the current layer; parentId is the id of the asset in this
# tree (metadata.treename) whose name equals the row's parent key.
layer_assets_sql = (
    "SELECT child as id, null as path, null as element, "
    "t_last_it.child as name, assets_clean.id as parentId, "
    "`" + column_description + "` as description, "
    "lookup_metadata.metadata as metadata, null as source, "
    "null as sourceId, null as createdTime , null   as lastUpdatedTime "
    "from input, t_last_it, assets_clean, lookup_metadata "
    "where input.`" + column_child + "` == t_last_it.child "
    "AND assets_clean.name = t_last_it.parent "
    "AND lookup_metadata.`" + column_child + "` == t_last_it.child "
    "AND assets_clean.metadata.treename = '" + treename + "'"
)

while cnt <= max_depth:
    df_tmp_new = spark.sql(next_layer_sql)
    cnt = cnt + 1
    cnt_df_tmp_new = df_tmp_new.count()
    print("layer: {}".format(cnt))
    print("number of new Assets to be added this layer: {}".format(cnt_df_tmp_new))
    df_base = df_base.union(df_tmp_new.withColumn("depth", F.lit(cnt)))
    df_last_it = df_tmp_new
    df_last_it.createOrReplaceTempView("t_last_it")
    if cnt_df_tmp_new == 0:
        # Nothing left to add: skip the join against "assets_clean" and the
        # empty insert that would otherwise run for this last pass.
        break
    df_write = spark.sql(layer_assets_sql)
    df_write.write.insertInto("assets_clean")
    print("--in clean--")

if cnt_df_tmp_new != 0:
    # The depth cap was reached while the last layer still had rows, so
    # anything below it (or a parent/child cycle) was not ingested.
    print(
        "WARNING: stopped after {} layers but the last layer still had {} "
        "assets; deeper levels were not ingested".format(cnt, cnt_df_tmp_new)
    )
            