In [2]:
#Get relationship count
spark.read.format("org.neo4j.spark.DataSource")\
  .option("url", "bolt://137.138.226.101:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("query", "MATCH (:Dataset)<-[r:CONSITS_OF]-(:Container) RETURN count(r)")\
  .load().show()

+--------+
|count(r)|
+--------+
| 8957122|
+--------+



In [3]:
#Get node counts
spark.read.format("org.neo4j.spark.DataSource")\
  .option("url", "bolt://137.138.226.101:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("query", "MATCH (n:Rule) RETURN count(n)")\
  .load().show()

+--------+
|count(n)|
+--------+
| 7997498|
+--------+



In [4]:
from datetime import datetime
from pyspark.sql.functions import col, lower, hex as _hex

In [5]:
TODAY = datetime.today().strftime('%Y-%m-%d')
TODAY = '2022-06-21'
HDFS_RUCIO_DIDS = f"/project/awg/cms/rucio/{TODAY}/dids/part*.avro"
HDFS_RUCIO_CONTENTS = f"/project/awg/cms/rucio/{TODAY}/contents/part*.avro"
HDFS_RUCIO_DATASET_LOCKS = f"/project/awg/cms/rucio/{TODAY}/dataset_locks/part*.avro"
HDFS_RUCIO_RULES = f"/project/awg/cms/rucio/{TODAY}/rules/part*.avro"
HDFS_RUCIO_RSES = f"/project/awg/cms/rucio/{TODAY}/rses/part*.avro"

In [6]:
def get_df_dids(spark):
    return spark.read.format("avro").load(HDFS_RUCIO_DIDS) \
        .withColumnRenamed("BYTES", "SIZE") \
        .select(["name", "account", "did_type", "size"])

def get_df_contents(spark):
    return spark.read.format("avro").load(HDFS_RUCIO_CONTENTS) \
        .select(["name", "did_type", "child_name", "child_type"])
    
#should I also consider replicating ones?    
def get_df_rules(spark):
    return spark.read.format("avro").load(HDFS_RUCIO_RULES)\
        .filter(col('did_type')!='F')\
        .filter(col('state')=='O') \
        .withColumn("id", lower(_hex(col("id")))) \
        .select(["id", "name", "account", "did_type", "rse_expression"])


def get_dataset_locks(spark):
    return spark.read.format("avro").load(HDFS_RUCIO_DATASET_LOCKS)\
        .withColumn("rse_id", lower(_hex(col("rse_id")))) \
        .withColumn("rule_id", lower(_hex(col("rule_id")))) \
        .filter(col('state')=='O') \
        .select(["rule_id", "rse_id", "name", "account"])

def get_rses(spark):
    return spark.read.format("avro").load(HDFS_RUCIO_RSES)\
        .withColumn("id", lower(_hex(col("id")))) \
        .select(["id", "rse"])

In [7]:
did_df = get_df_dids(spark)

In [5]:
contents_df = get_df_contents(spark).filter(col("did_type")=="C").filter(col("child_type")=="D")

In [6]:
containers_df = did_df.filter(col("did_type")=="C").select("name", "account", "size").withColumnRenamed("name", "cname")
containers_df.limit(5).toPandas()

Unnamed: 0,cname,account,size
0,/LQLQToTopEl_M-400_TuneCP5_13TeV_pythia8/RunII...,sync_t1_us_fnal_disk,
1,/LambdabToJpsiSigma1383Phi_BMuonFilter_DGamma0...,sync_t1_us_fnal_disk,
2,/LambdabToJpsiLambdaPhi_BMuonFilter_DGamma0_Tu...,sync_t1_us_fnal_disk,
3,/LQToBMu_M-1800_pair_TuneCUETP8M1_13TeV-madgra...,sync_t1_us_fnal_disk,
4,/LQLQToTopElTopMu_M-1700_TuneCP5_13TeV_pythia8...,sync_t1_us_fnal_disk,


In [7]:
datasets_df = did_df.filter(col("did_type")=='D').select("name", "account", "size").withColumnRenamed("name", "dname")
datasets_df.limit(5).toPandas()

Unnamed: 0,dname,account,size
0,/LQToBMu_M-500_single_TuneCUETP8M1_13TeV-madgr...,sync_t1_us_fnal_disk,8325376
1,/LQLQToTopMu_M-2000_TuneCP5_13TeV_pythia8/RunI...,sync_t1_us_fnal_disk,6600614
2,/LQLQToTopMu_M-700_TuneCP5_13TeV_pythia8/RunII...,sync_t1_us_fnal_disk,10686788
3,/LQLQToTopMu_M-1400_TuneCP5_13TeV_pythia8/RunI...,sync_t1_us_fnal_disk,4804370
4,/LQToBEle_M-400_single_TuneCUETP8M1_13TeV-madg...,sync_t1_us_fnal_disk,15283018


In [8]:
print(f"{containers_df.count()=}")
print(f"{datasets_df.count()=}")

containers_df.count()=739156
datasets_df.count()=8738589


In [8]:
def write_nodes(df, labels, keys, batch_size=1000):
    df.write\
      .format("org.neo4j.spark.DataSource")\
      .mode("Overwrite")\
      .option("batch.size", batch_size)\
      .option("url", "bolt://137.138.226.101:7687")\
      .option("authentication.type", "basic")\
      .option("authentication.basic.username", "neo4j")\
      .option("authentication.basic.password", "mypass")\
      .option("labels", labels)\
      .option("node.keys", keys)\
      .option("schema.optimization.type", "INDEX")\
      .option("schema.optimization.type", "NODE_CONSTRAINTS")\
      .save()

In [13]:
def write_relationship(df, relationship, source_col, target_col, source_labels, target_labels, source_key, target_key, batch_size=1000, property_columns="") :
    renamed_source = f"source.{source_col}"
    renamed_target = f"target.{target_col}"
    df.withColumnRenamed(source_col, renamed_source).withColumnRenamed(target_col, renamed_target)\
      .write\
      .format("org.neo4j.spark.DataSource")\
      .mode("Append")\
      .option("batch.size", batch_size)\
      .option("url", "bolt://137.138.226.101:7687")\
      .option("authentication.type", "basic")\
      .option("authentication.basic.username", "neo4j")\
      .option("authentication.basic.password", "mypass")\
      .option("relationship", relationship)\
      .option("relationship.save.strategy", "keys")\
      .option("relationship.properties", property_columns)\
      .option("relationship.source.labels", source_labels)\
      .option("relationship.source.save.mode", "Match")\
      .option("relationship.source.node.keys", f"{renamed_source}:{source_key}")\
      .option("relationship.target.labels", target_labels)\
      .option("relationship.target.save.mode", "Match")\
      .option("relationship.target.node.keys", f"{renamed_target}:{target_key}")\
      .save()

In [32]:
write_nodes(containers_df, ":Container", "cname", 100)

In [None]:
write_nodes(datasets_df, ":Dataset", "dname", 100)

### Container Dataset relationship

In [11]:
container_node_map = containers_df.join(contents_df, containers_df["cname"]==contents_df["name"]).select(["name", "child_name"]).withColumnRenamed("name", "cname").withColumnRenamed("child_name", "dname")

In [12]:
container_node_map.limit(5).toPandas()

Unnamed: 0,cname,dname
0,/ADDGravToGG_MS-10000_NED-2_KK-1_M-500To1000_1...,/ADDGravToGG_MS-10000_NED-2_KK-1_M-500To1000_1...
1,/ADDGravToGG_MS-10000_NED-2_KK-1_M-500To1000_1...,/ADDGravToGG_MS-10000_NED-2_KK-1_M-500To1000_1...
2,/ADDGravToGG_MS-11000_NED-4_KK-1_M-4000To11000...,/ADDGravToGG_MS-11000_NED-4_KK-1_M-4000To11000...
3,/ADDGravToGG_MS-3000_NED-4_KK-1_M-2000To3000_1...,/ADDGravToGG_MS-3000_NED-4_KK-1_M-2000To3000_1...
4,/ADDGravToGG_MS-3000_NED-4_KK-1_M-2000To3000_1...,/ADDGravToGG_MS-3000_NED-4_KK-1_M-2000To3000_1...


In [13]:
container_node_map.count()

8957122

In [52]:
write_relationship(container_node_map, "CONSISTS_OF", "cname", "dname", ":Container", ":Dataset", "cname", "dname", 100)

22/06/30 21:12:18 WARN BlockManagerMasterEndpoint: Error trying to remove shuffle 6 from block manager BlockManagerId(77, ithdp2117.cern.ch, 5102, None)
java.io.IOException: Connection from /188.185.5.191:18925 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:146)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241)
	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundHandlerAdapter.java:81)
	at io.netty.handler.timeout.IdleStateHandler.channelInactive(IdleStateHandler.java:277)
	at io.netty.channel.AbstractC

	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundHandlerAdapter.java:81)
	at org.apache.spark.network.util.TransportFrameDecoder.channelInactive(TransportFrameDecoder.java:225)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.channelInactive(DefaultChannelPipeline.java:1405)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.DefaultChannelPipeline.fireChannelInactive(DefaultChannelPipeline.java:901)
	at io.netty.channel.Abstr

22/06/30 21:12:19 WARN BlockManagerMasterEndpoint: Error trying to remove broadcast 47 from block manager BlockManagerId(73, ithdp1110.cern.ch, 5101, None)
java.io.IOException: Connection from /188.185.5.191:17772 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:146)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241)
	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundHandlerAdapter.java:81)
	at io.netty.handler.timeout.IdleStateHandler.channelInactive(IdleStateHandler.java:277)
	at io.netty.channel.Abstra

22/06/30 21:12:20 WARN BlockManagerMasterEndpoint: Error trying to remove broadcast 47 from block manager BlockManagerId(68, ithdp2118.cern.ch, 5101, None)
java.io.IOException: Connection from /188.185.5.191:64488 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:146)
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:262)
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:248)
	at io.netty.channel.AbstractChannelHandlerContext.fireChannelInactive(AbstractChannelHandlerContext.java:241)
	at io.netty.channel.ChannelInboundHandlerAdapter.channelInactive(ChannelInboundHandlerAdapter.java:81)
	at io.netty.handler.timeout.IdleStateHandler.channelInactive(IdleStateHandler.java:277)
	at io.netty.channel.Abstra









### Rule Nodes

In [18]:
rules_df = get_df_rules(spark)

In [15]:
rules_df.columns

['id', 'name', 'account', 'did_type', 'rse_expression']

In [None]:
rules_df.select("did_type").distinct().show()

In [16]:
rules_df.limit(5).toPandas()

22/07/01 08:26:37 ERROR TransportClient: Failed to send RPC RPC 8597788510675780448 to /188.185.5.191:38010: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(AbstractChannel.java:865)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.write(DefaultChannelPipeline.java:1367)
	at io.netty.channel.AbstractChannelHandlerContext.invokeWrite0(AbstractChannelHandlerContext.java:717)
	at io.netty.channel.AbstractChannelHandlerContext.invokeWriteAndFlush(AbstractChannelHandlerContext.java:764)
	at io.netty.channel.AbstractChannelHandlerContext$WriteTask.run(AbstractChannelHandlerContext.java:1071)
	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472)
	at io

22/07/01 08:26:37 ERROR TransportClient: Failed to send RPC RPC 4911781791887248827 to /188.185.5.191:11713: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957)
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(AbstractChannel.java:865)
	at io.netty.channel.DefaultChannelPipeline$HeadContext.write(DefaultChannelPipeline.java:1367)
	at io.netty.channel.AbstractChannelHandlerContext.invokeWrite0(AbstractChannelHandlerContext.java:717)
	at io.netty.channel.AbstractChannelHandlerContext.invokeWriteAndFlush(AbstractChannelHandlerContext.java:764)
	at io.netty.channel.AbstractChannelHandlerContext$WriteTask.run(AbstractChannelHandlerContext.java:1071)
	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472)
	at io

Unnamed: 0,id,name,account,did_type,rse_expression
0,4f6d3d0e28b9496384626357fa0e128e,/SMS-TChiWH_HToGG_TuneCP2_13TeV-madgraphMLM-py...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
1,8218b61fb6264968b7b5d320548b5f2a,/SMS-T1tbs_RPV_mGluino1700_TuneCUETP8M1_13TeV-...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
2,99980be19ebb45308b4b9a39746b4712,/SMS-T1tbs_RPV_mGluino1100_TuneCP2_13TeV-madgr...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
3,7b00849ecefc4a8ab392cd08df2c0e88,/SMS-T5tttt_dM175_TuneCP2_13TeV-madgraphMLM-py...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
4,585e14643cf447c99f0264159fd1fd84,/SMS-TStauStau-Ewkino_lefthanded_dM-10to40_gen...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape


In [61]:
rules_df.count()

7997498

In [62]:
write_nodes(rules_df, ":Rule", "id", 100)

### RSE Nodes


In [17]:
rses = get_rses(spark)

22/07/01 08:27:43 WARN BlockReaderFactory: I/O error constructing remote block reader.
org.apache.hadoop.net.ConnectTimeoutException: 60000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=/10.116.5.200:1004]
	at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:534)
	at org.apache.hadoop.hdfs.DFSClient.newConnectedPeer(DFSClient.java:2939)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.nextTcpPeer(BlockReaderFactory.java:821)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:746)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)
	at org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:644)
	at org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:575)
	at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:757)
	at org.apache.hadoop.

In [22]:
rses.limit(5).toPandas()

Unnamed: 0,id,rse
0,7582b6c39de54370887563869b8a06fd,T3_CH_CERN_OpenData
1,6f3a77a467df43c4a30458a6f5d9e668,T3_US_UMD
2,fa8a1412a31d4e6a85439be3a7861cd9,T3_US_Rutgers
3,14366bf923854ad39ebad6d6b19644c4,T0_CH_CERN_Tape_Test
4,9e2c577c976742d286fcb116f877980a,T3_US_OSU


In [48]:
(rses.select("id"))

DataFrame[id: string]

In [23]:
write_nodes(rses, ":Rse", "id")

### Locked at Relationship

In [10]:
dataset_locks = get_dataset_locks(spark).withColumnRenamed("rse_id", "id").withColumnRenamed("name", "dname")

In [11]:
dataset_locks.limit(5).toPandas()

Unnamed: 0,rule_id,id,dname,account
0,57d48bcc1418427181da96c18b05dc3b,0ad9b7eb296849219cb3af243b59c334,/DMsimp_t-S3D_uR_JChiChi_Mphi-2000_Mchi-350_La...,transfer_ops
1,d4c1f5cdb00345a68da54527e1b99516,2694b6bd279f4baa890f5abbe66351d2,/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8...,transfer_ops
2,5100aa7da04b4458a6327dd196b87bc3,2694b6bd279f4baa890f5abbe66351d2,/RSGravitonToGammaGamma_kMpl01_M_1750_TuneCP2_...,transfer_ops
3,8f827ab2f98e4b419245b6756cc49118,be0c1696016e4297a1573425d4a9b0a6,/SMS-TSlepSlep_TuneCP2_13TeV-madgraphMLM-pythi...,transfer_ops
4,575967e1fc1746fcbe54cc7ccbbff9e4,16eef8c8359d47c985c5cc48dd1a1a56,/ST_t-channel_top_4f_InclusiveDecays_TuneCP5do...,transfer_ops


In [12]:
dataset_locks.count()

19917960

In [None]:
write_relationship(dataset_locks, "LOCKED_AT", "dname", "id", ":Dataset", ":Rse", "dname", "id", 1000, "rule_id:rule_id, account:account")

### Account Nodes



In [31]:
import pandas as pd
accounts_df =pd.read_csv("account.csv")


In [32]:
accounts_df = spark.createDataFrame(accounts_df).withColumnRenamed("ACCOUNT", "account").withColumnRenamed("ACCOUNT_TYPE", "account_type").withColumnRenamed("STATUS", "status")

In [33]:
write_nodes(accounts_df, ":Account", "account")

In [45]:
# accounts_df.write.format("avro").save("accounts.avro")

### CREATED_RULE relationship

In [19]:
rules_df.limit(5).toPandas()

Unnamed: 0,id,name,account,did_type,rse_expression
0,4f6d3d0e28b9496384626357fa0e128e,/SMS-TChiWH_HToGG_TuneCP2_13TeV-madgraphMLM-py...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
1,8218b61fb6264968b7b5d320548b5f2a,/SMS-T1tbs_RPV_mGluino1700_TuneCUETP8M1_13TeV-...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
2,99980be19ebb45308b4b9a39746b4712,/SMS-T1tbs_RPV_mGluino1100_TuneCP2_13TeV-madgr...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
3,7b00849ecefc4a8ab392cd08df2c0e88,/SMS-T5tttt_dM175_TuneCP2_13TeV-madgraphMLM-py...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape
4,585e14643cf447c99f0264159fd1fd84,/SMS-TStauStau-Ewkino_lefthanded_dM-10to40_gen...,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape


In [20]:
created_rule_rel = rules_df.select(["id", "account"])

In [22]:
created_rule_rel.count()

7997498

In [21]:
write_relationship(created_rule_rel, "CREATED_RULE", "account", "id", ":Account", ":Rule", "account", "id", 500, "")

### Container has rule relationship


In [23]:
has_rule_rel_container = rules_df.filter(col("did_type")=='C').select(["id", "name"]).withColumnRenamed("name", "cname")

In [24]:
has_rule_rel_container.count()

1043215

In [25]:
write_relationship(has_rule_rel_container, "HAS_RULE", "cname", "id", ":Container", ":Rule", "cname", "id", 500, "")

### Dataset has_rule relationship

In [26]:
has_rule_rel_dataset = rules_df.filter(col("did_type")=='D').select(["id", "name"]).withColumnRenamed("name", "dname")

In [27]:
has_rule_rel_dataset.count()

6954283

In [28]:
write_relationship(has_rule_rel_dataset, "HAS_RULE", "dname", "id", ":Dataset", ":Rule", "dname", "id", 500, "")

### Testing dataset count consistency

In [36]:
child_datasets = containers_df.join(contents_df, containers_df["cname"]==contents_df["name"]).select("child_name")

In [37]:
child_datasets.distinct().count()

8709796

In [42]:
8738589 - 8709796

28793

In [38]:
dataset_df.select("dname").withColumnRenamed("dname", "name").subtract(child_datasets.select("child_name").withColumnRenamed("child_name", "name")).limit(5).collect()

[Row(name='/RelValProdMinBias/DMWM_Test-RECOPROD1_TaskChain_ProdMinBias_HG1806_Validation_Alan_v2-v11/GEN-SIM-RECO#e83ade5a-a202-4eb7-ad15-c28479fa688d'),
 Row(name='/SingleMuon/CMSSW_10_6_2-SiStripCalMinBias-106X_dataRun2_v20_RelVal_2017C-v1/ALCARECO#1a938dbb-be14-45eb-96cf-74ad900760a0'),
 Row(name='/BTag/Integ_Test-ReRecoSkim_reqmgr2_test-v1/DQM#a572e858-ed04-11e5-885b-02163e00efd5'),
 Row(name='/DYJetsToLL_Pt-50To100_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/DMWM_Test-StepChain_MC_Agent136_Val_Alanv41-v20/GEN-SIM#e95b4633-9148-44f6-9c45-f4cc81933bb7'),
 Row(name='/SingleElectron/CMSSW_10_6_2-EcalESAlign-106X_dataRun2_v20_RelVal_2017E-v1/ALCARECO#04c95967-716f-4e3e-8c2d-7dbfdd1a814d')]

In [41]:
dataset_df.select("dname").withColumnRenamed("dname", "name").subtract(child_datasets.select("child_name").withColumnRenamed("child_name", "name")).count()

28793

In [None]:
#Also test container that do not have any datasets under them