In [1]:
from datetime import datetime

import click
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, collect_list, collect_set, concat_ws, first, format_number, from_unixtime, greatest, lit, lower, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    split as _split,
    sum as _sum,
)

from pyspark.sql.types import (
    LongType,
    DecimalType
)

In [2]:
import pandas as pd
import pyspark.sql.functions as func

In [3]:
# Snapshot date (local time, YYYY-MM-DD); it is interpolated into every HDFS path below.
TODAY = f"{datetime.today():%Y-%m-%d}"

# Avro part files for that date, one glob per table.
HDFS_RUCIO_CONTENTS = "/project/awg/cms/rucio/" + TODAY + "/contents/part*.avro"
HDFS_RUCIO_LOCKS = "/project/awg/cms/rucio/" + TODAY + "/locks/part*.avro"
HDFS_RUCIO_RULES = "/project/awg/cms/rucio/" + TODAY + "/rules/part*.avro"
# The RSES files are read from a different root than the three tables above.
HDFS_RUCIO_RSES = "/tmp/cmsmonit/rucio_daily_stats-" + TODAY + "/RSES/part*.avro"

In [4]:
# Pandas display options: show every row and column, and never truncate cell contents.
# Fully-qualified option names are used on purpose: short patterns are resolved by
# regex matching, and in recent pandas releases 'max_rows' matches more than one
# option (e.g. 'styler.render.max_rows'), which makes set_option raise OptionError.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [5]:
def get_df_locks(spark):
    """Read the locks avro files (HDFS_RUCIO_LOCKS) into a trimmed DataFrame.

    RSE_ID and RULE_ID are converted to lower-case hex strings, BYTES is cast
    to a long and exposed as `file_size`, and NAME is renamed to `file_name`.

    :param spark: Spark session used for reading
    :return: DataFrame with columns file_name, rse_id, rule_id, account, file_size
    """
    locks_raw = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)
    locks = (
        locks_raw
        .withColumn('rse_id', lower(_hex(col('RSE_ID'))))
        .withColumn('rule_id', lower(_hex(col('RULE_ID'))))
        .withColumn('file_size', col('BYTES').cast(LongType()))
        .withColumnRenamed('NAME', 'file_name')
    )
    return locks.select('file_name', 'rse_id', 'rule_id', 'account', 'file_size')


def get_df_rses(spark):
    """Read the RSES avro files (HDFS_RUCIO_RSES) and derive per-RSE attributes.

    Rows with a non-null DELETED_AT are discarded. The RSE name is split on '_'
    to obtain `rse_tier` (first token) and `rse_country` (second token), and
    `rse_kind` is 'temp' / 'test' / 'prod' depending on the name's suffix.

    The built-in 'avro' source is used, like the other readers in this notebook,
    instead of the legacy "com.databricks.spark.avro" alias.

    :param spark: Spark session used for reading
    :return: DataFrame with columns rse_id, rse_name, rse_type, rse_tier,
             rse_country, rse_kind
    """
    # Only these three spellings of each suffix are recognised (not a general
    # case-insensitive match).
    is_temp = col("rse").endswith('Temp') | col("rse").endswith('temp') | col("rse").endswith('TEMP')
    is_test = col("rse").endswith('Test') | col("rse").endswith('test') | col("rse").endswith('TEST')

    name_tokens = _split(col('RSE'), '_')

    df_rses = (
        spark.read.format('avro').load(HDFS_RUCIO_RSES)
        .filter(col('DELETED_AT').isNull())
        # lower-case hex id, the same encoding get_df_locks applies to RSE_ID
        .withColumn('rse_id', lower(_hex(col('ID'))))
        .withColumn('rse_tier', name_tokens.getItem(0))
        .withColumn('rse_country', name_tokens.getItem(1))
        .withColumn('rse_kind', when(is_temp, 'temp').when(is_test, 'test').otherwise('prod'))
        .withColumnRenamed('RSE', 'rse_name')
        .withColumnRenamed('RSE_TYPE', 'rse_type')
        .select(['rse_id', 'rse_name', 'rse_type', 'rse_tier', 'rse_country', 'rse_kind'])
    )
    return df_rses


def get_df_contents(spark):
    """Read the contents avro files (HDFS_RUCIO_CONTENTS), keeping scope 'cms' only.

    :param spark: Spark session used for reading
    :return: DataFrame with columns name, child_name, did_type, child_type
    """
    contents_raw = spark.read.format('avro').load(HDFS_RUCIO_CONTENTS)
    cms_contents = contents_raw.where(col("scope") == "cms")
    return cms_contents.select('name', 'child_name', 'did_type', 'child_type')


#TODO: Do we need to consider replicating rules here?
def get_df_rules(spark):
    """Read the rules avro files (HDFS_RUCIO_RULES), keeping rules whose state is 'O'.

    The rule id is exposed as a lower-case hex string in `rule_id`.

    :param spark: Spark session used for reading
    :return: DataFrame with columns rule_id, account, did_type, rse_expression, copies
    """
    rules_raw = spark.read.format("avro").load(HDFS_RUCIO_RULES)
    rules_in_state_o = rules_raw.where(col('state') == 'O')
    with_hex_id = rules_in_state_o.withColumn("rule_id", lower(_hex(col("id"))))
    return with_hex_id.select("rule_id", "account", "did_type", "rse_expression", "copies")

In [6]:
# Build the mapping from actual dataset names to their blocks and files.
# Datasets whose names end with /CONTAINER are dropped - these are created for data transfer challenges and deletion campaigns.
# If needed, we deal with them separately.
def get_dataset_file_map(spark):
    """Build the dataset -> block -> file map from the contents table.

    File rows (child_type 'F') give the block -> file relation; parents whose
    name ends with '#DATASET' or '#DATASET2' are excluded. Rows with child_type
    'D' give the dataset -> block relation; parents whose name ends with
    '/CONTAINER' are excluded. Files whose block has no dataset are kept and
    assigned the placeholder dataset '/UnknownDataset'.

    :param spark: Spark session used for reading
    :return: DataFrame with columns dataset, block, file, data_tier, where
             data_tier is the last '/'-separated token of the dataset name
    """
    contents = get_df_contents(spark)

    files_per_block = (
        contents
        .where(col("child_type") == "F")
        .where(~col("name").endswith("#DATASET"))
        .where(~col("name").endswith("#DATASET2"))
        .withColumnRenamed("child_name", "file")
        .withColumnRenamed("name", "block")
    )

    blocks_per_dataset = (
        contents
        .where(col("child_type") == "D")
        .where(~col("name").endswith("/CONTAINER"))
        .withColumnRenamed("child_name", "block")
        .withColumnRenamed("name", "dataset")
    )

    # Right join: the file side drives, so files that do not map to any dataset
    # survive and are later grouped under the placeholder dataset.
    joined = blocks_per_dataset.alias("dbm").join(
        files_per_block.alias("bfm"),
        col("dbm.block") == col("bfm.block"),
        "right",
    )

    with_tier = (
        joined
        .na.fill({"dataset": "/UnknownDataset"})
        .withColumn("data_tier", func.element_at(func.split("dataset", "/"), -1))
    )
    return with_tier.select("dataset", "bfm.block", "bfm.file", "data_tier")
                                

In [7]:
# `spark` is not created anywhere in this notebook (presumably it was injected by the
# hosting environment). getOrCreate() hands back the already-active session when there
# is one and only builds a new session otherwise, so a fresh kernel does not fail with
# a NameError here.
spark = SparkSession.builder.getOrCreate()

# Base frames shared by the child-table and parent-table sections below.
df_map = get_dataset_file_map(spark)
df_locks = get_df_locks(spark)
df_rses = get_df_rses(spark)
df_rules = get_df_rules(spark)

# Child Table
#### Rules and their associated metrics with the mentioned rse  (and dataset? - TODO)

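- Start from the rule - RSE - file relation (i.e. the locks table), then for each rule:
     1. collect the set of RSEs that hold locks for that rule
     2. sum the file sizes locked by that rule across all RSEs
     3. sum the file sizes locked by that rule at each individual RSE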

In [8]:
# Preview five lock rows to check the columns produced by get_df_locks.
df_locks.limit(5).toPandas()

Unnamed: 0,file_name,rse_id,rule_id,account,file_size
0,/store/mc/Run3Summer21DRPremix/DYToLL_M-50_TuneCP5_14TeV-pythia8/GEN-SIM-DIGI-RAW/120X_mcRun3_2021_realistic_v6-v2/80011/9757e58e-e12e-4e98-97f9-131b206bee10.root,f44c866a264d4da9972969e9f3b5bb52,13b62f7b9d0840ec8288d2b060bd0ea3,wmcore_output,3098398779
1,/store/mc/RunIISummer16NanoAODv6/ST_FCNC-TH_Thadronic_HToaa_Ctcphi_CP5_13TeV-mcatnlo-madspin-pythia8/NANOAODSIM/PUMoriond17_Nano25Oct2019_102X_mcRun2_asymptotic_v7-v1/2810000/B1F4A0C2-96CB-284C-BC89-8B9C02DFA654.root,751eb808adf54eada3be77dd40c9c399,e9de5a667a254d5fb33218a038aa17a3,transfer_ops,2855979699
2,/store/mc/RunIISummer16NanoAODv5/GluGluToBulkGravitonToHHTo4B_M-300_narrow_13TeV-madgraph/NANOAODSIM/PUMoriond17_Nano1June2019_102X_mcRun2_asymptotic_v7-v1/250000/93D8DE1D-29CD-F54F-8367-204A172436AA.root,5175b9ddaed0484cbe9678c9cd260d28,edb0cf2bccc64955a654e17e884a57ef,transfer_ops,12667642
3,/store/mc/RunIIFall17NanoAODv5/GluGluToBulkGravitonToHHTo2B2G_M-2500_narrow_TuneCP5_13TeV-madgraph-pythia8_correctedcfg/NANOAODSIM/PU2017_12Apr2018_Nano1June2019_102X_mc2017_realistic_v7-v1/00000/50253EB9-DDDA-4847-91B9-1E078022F112.root,be0c1696016e4297a1573425d4a9b0a6,500d1410ded24c3ba04d8a71b5b4f57f,transfer_ops,22292601
4,/store/mc/RunIIAutumn18NanoAODv5/ST_t-channel_top_4f_InclusiveDecays_TuneCP5down_13TeV-powheg-madspin-pythia8/NANOAODSIM/Nano1June2019_102X_upgrade2018_realistic_v19-v1/250000/FAEE0636-0E8D-D94C-9152-C3B0AAAC0113.root,16eef8c8359d47c985c5cc48dd1a1a56,575967e1fc1746fcbe54cc7ccbbff9e4,transfer_ops,1596499945


In [9]:
# Attach the RSE name to every lock row (inner join on rse_id).
df_locks_r = (
    df_locks
    .join(df_rses.alias("rse"), df_rses["rse_id"] == df_locks["rse_id"])
    .select('file_name', 'rse.rse_id', 'rule_id', 'file_size', 'rse_name')
)


In [10]:
# Per rule: total locked bytes, number of distinct RSEs, and the set of those RSE names.
rule_rse_map = (
    df_locks_r.alias("locks1")
    .groupby("rule_id")
    .agg(
        func.sum("file_size").alias("size_locked_total"),
        func.countDistinct("rse_name").alias("count_rses"),
        func.collect_set("rse_name").alias("other_rses"),
    )
)

In [11]:
# Per (RSE, rule) pair: bytes locked by that rule at that RSE.
rule_size_rse = (
    df_locks_r.alias("locks1")
    .groupby("rse_name", "rule_id")
    .agg(func.sum("file_size").alias("size_locked_rse"))
)

In [12]:
# One row per (rule, RSE): the rule-wide aggregates repeated next to the per-RSE size.
df_rule_agg_info = rule_rse_map.join(rule_size_rse, on="rule_id", how="inner")
# Optional: flatten the RSE array into a string
# .withColumn("other_rses", concat_ws(", ", "other_rses"))

In [13]:
# df_other_rses = df_locks.alias("lock1").join(df_locks.alias("lock"), col("map.file")==col("lock.file_name"), "right")\
#     .na.fill({"dataset":"/UnknownBlock", "block":"/UnknownBlock#unknown", "data_tier":"UnknownBlock"})\
#     .filter(col("data_tier").isin(["UnknownBlock", "UnknownDataset"])==False)\
#     .groupby(["dataset", "rse_id", "data_tier"])\
#     .agg(func.countDistinct("file_name").alias("distinct_file_count"), func.count("file_name").alias("file_count"), func.sum("file_size").alias("file_sum"),
#          func.countDistinct("rule_id").alias("distinct_ruleid_count"), collect_set("rule_id").alias("ruleid_set"),
#          func.countDistinct("account").alias("distinct_account_count"), collect_set("account").alias("account_set"),
#         )

In [14]:
# Show a sample of rules whose locks span more than 10 RSEs.
df_rule_agg_info.where(col("count_rses") > 10).limit(10).toPandas()

Unnamed: 0,rule_id,size_locked_total,count_rses,other_rses,rse_name,size_locked_rse
0,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_UK_SGrid_RALPP,5052303575
1,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_HU_Budapest,1727613698
2,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T1_UK_RAL_Disk,10930111288
3,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_UK_London_IC,39761925477
4,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T1_DE_KIT_Disk,9736438996
5,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_DE_DESY,14109063127
6,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_FR_GRIF_LLR,1594326162
7,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_UK_London_Brunel,1104304192
8,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_FR_IPHC,4750933842
9,0173828e3a984bc6abca956dabb63b20,127408116390,13,"[T2_FR_GRIF_LLR, T1_DE_KIT_Disk, T2_DE_DESY, T2_UK_SGrid_RALPP, T2_CH_CSCS, T2_IT_Bari, T2_IT_Rome, T1_FR_CCIN2P3_Disk, T2_UK_London_Brunel, T2_UK_London_IC, T1_UK_RAL_Disk, T2_FR_IPHC, T2_HU_Budapest]",T2_CH_CSCS,346538697


In [15]:
# Add the rule attributes (account, did_type, rse_expression, copies) to the aggregates.
df_rule_combined = df_rule_agg_info.join(df_rules, on="rule_id", how="inner")

In [16]:
# Preview ten rows of the combined rule table.
df_rule_combined.limit(10).toPandas()

Unnamed: 0,rule_id,size_locked_total,count_rses,other_rses,rse_name,size_locked_rse,account,did_type,rse_expression,copies
0,00020c8ee75442069f4ca41075cd6361,7625918328,1,[T1_US_FNAL_Disk],T1_US_FNAL_Disk,7625918328,transfer_ops,C,ddm_quota>0&rse_type=DISK&country=US,1
1,0002243149214bd7888c5d37e3442871,633728008,1,[T0_CH_CERN_Tape],T0_CH_CERN_Tape,633728008,sync_t0_ch_cern_tape,D,rse=T0_CH_CERN_Tape,1
2,00055ba22f4d491fb1b58ac6310360a5,772368,1,[T1_IT_CNAF_Tape],T1_IT_CNAF_Tape,772368,sync_t1_it_cnaf_tape,D,rse=T1_IT_CNAF_Tape,1
3,00062def5e194202847bd7e18a6f2e15,1818274926,1,[T1_US_FNAL_Tape],T1_US_FNAL_Tape,1818274926,sync_t1_us_fnal_tape,D,rse=T1_US_FNAL_Tape,1
4,000aedb86a8c4292b2229988891d54e5,16106916597,1,[T0_CH_CERN_Tape],T0_CH_CERN_Tape,16106916597,sync_t0_ch_cern_tape,D,rse=T0_CH_CERN_Tape,1
5,000d2984f99242b5a48b884c305fdbd0,2917284657,1,[T1_UK_RAL_Tape],T1_UK_RAL_Tape,2917284657,sync_t1_uk_ral_tape,D,rse=T1_UK_RAL_Tape,1
6,000d8d2265a44e11b90e14af350db130,51222423302,1,[T1_DE_KIT_Tape],T1_DE_KIT_Tape,51222423302,sync_t1_de_kit_tape,D,rse=T1_DE_KIT_Tape,1
7,00106e9392ea4ec28e4d0304263a756e,9842718643,1,[T1_ES_PIC_Tape],T1_ES_PIC_Tape,9842718643,sync_t1_es_pic_tape,D,rse=T1_ES_PIC_Tape,1
8,0010ea0f9f57478baaac1ab1e90cab95,370126718,1,[T0_CH_CERN_Tape],T0_CH_CERN_Tape,370126718,sync_t0_ch_cern_tape,D,rse=T0_CH_CERN_Tape,1
9,0011a735dcba4600bfd35414be7228d7,175390716,1,[T1_IT_CNAF_Disk],T1_IT_CNAF_Disk,175390716,transfer_ops,C,region=C&cms_type=real,1


In [17]:
# df_rule_combined.sort(col("count_rses").desc()).limit(10).toPandas()

In [18]:
# df_rule_combined.write.format("parquet").save("rule_combined3.parquet")

In [19]:
# df_rule_combined2 = spark.read.format('parquet').load("rule_combined2.parquet")
# df_rule_combined2.write.format("com.mongodb.spark.sql.DefaultSource").option("ordered", "false").option("database", "rchauhan").option("collection", "rule_summary2").save()

In [20]:
# df_rule_combined.write.format("com.mongodb.spark.sql.DefaultSource").option("ordered", "false").option("database", "rchauhan").option("collection", "rule_summary3").save()

# Parent Table

#### Datasets at RSEs with set of rules that bind them there! 

In [21]:
# The locks are the preserved side of the right join. It behaves like an inner join here,
# because all files in locks are available in the map.
# Rows with an unknown data tier are dropped so that a dataset does not erroneously map to
# different data tiers just because one of its files is also present in other datasets
# (Unknown and CONTAINER).
df_filtered = (
    df_map.alias("map")
    .join(df_locks.alias("lock"), col("map.file") == col("lock.file_name"), "right")
    .na.fill({"dataset": "/UnknownBlock", "block": "/UnknownBlock#unknown", "data_tier": "UnknownBlock"})
    .where(~col("data_tier").isin(["UnknownBlock", "UnknownDataset"]))
    .join(df_rses.alias("rse"), col("lock.rse_id") == col("rse.rse_id"))
    .select("dataset", "rse_name", "data_tier", "file_name", "file_size", "block", "rule_id", "account")
    .groupby("dataset", "rse_name", "data_tier")
    .agg(
        func.countDistinct("file_name").alias("file_count"),
        func.count("file_name").alias("total_file_locks_count"),
        func.sum("file_size").alias("file_sum"),
        func.countDistinct("block").alias("block_count"),
        func.countDistinct("rule_id").alias("ruleid_count"),
        func.collect_set("rule_id").alias("ruleid_set"),
        func.countDistinct("account").alias("account_count"),
        func.collect_set("account").alias("account_set"),
    )
)


# .withColumn("data_tier", func.element_at(func.split("dataset","/"),-1))\

In [22]:
# df_joined =  df_filtered_saved.groupby(["dataset", "rse_id", "data_tier"]).agg(func.countDistinct("file_name").alias("distinct_file_count"), func.count("file_name").alias("file_count"), func.sum("file_size").alias("file_sum"),
#          func.countDistinct("rule_id").alias("distinct_ruleid_count"), collect_set("rule_id").alias("ruleid_set"),
#          func.countDistinct("account").alias("distinct_account_count"), collect_set("account").alias("account_set"),
#         )

In [23]:
# df_grouped = df_joined.groupby(["dataset", "rse_id", "data_tier"])\
#                      .agg(func.countDistinct("file_name").alias("distinct_file_count"), func.count("file_name").alias("file_count"), func.sum("file_size").alias("file_sum"),
#                           func.countDistinct("rule_id").alias("distinct_ruleid_count"), collect_set("rule_id").alias("ruleid_set"),
#                           func.countDistinct("account").alias("distinct_account_count"), collect_set("account").alias("account_set"),
#                          )\
#                      .persist()

#   func.countDistinct("data_tier").alias("distinct_datatier_count"), collect_set("data_tier").alias("datatier_set"),

In [24]:
# df_filtered.write.format("avro").save("dataset-rule.avro")
# df_filtered_saved = spark.read.format('avro').load("dataset-rule.avro")

In [25]:
# df_filtered_saved.write.format("json").save("dataset-rule.json")

In [27]:
# df = df_filtered_saved

In [28]:
# Dataset used for spot checks. Alternatives tried previously:
#   "/QCD_Pt-15to7000_TuneCP5_Flat_14TeV-pythia8/Phase2HLTTDRWinter20DIGI-FlatPU0To200_castor_110X_mcRun4_realistic_v3_ext1-v1/GEN-SIM-DIGI-RAW"
#   "/EGamma/Run2018A-v1/RAW"
mydataset = "/ExpressCosmics/Commissioning2021-Express-v1/FEVT"

In [29]:

# df.filter(col("dataset")==mydataset).toPandas()

In [30]:
# Display the Spark session object.
spark

In [31]:
# df_grouped_saved.filter(col("dataset").contains("Unknown")).toPandas().sort_values("file_sum")

In [32]:
# df.write.format("com.mongodb.spark.sql.DefaultSource").option("ordered", "false").option("database", "rchauhan").option("collection", "dataset-rule-connector").save()

# # !hdfs dfs -rm -R -f grouped.parquet

# Sanity Checks



### Unique and total file counts

In [33]:
# Sanity checks
# The number of files in the container-file map should be greater than (accounting for files that
# belong to multiple datasets) or equal to the number of file entries in the contents table.
df_contents = get_df_contents(spark)
# df_contents.filter(col("child_type")=='F').count()

In [34]:
# Compare the number of distinct files in the map with the number in the contents table.
map_file_count = df_map.select("file").distinct().count()
contents_file_count = (
    df_contents
    .where(col("child_type") == 'F')
    .select("child_name")
    .distinct()
    .count()
)
print(f"{map_file_count=}, {contents_file_count=}")

map_file_count=76930490, contents_file_count=76931233


### Unknown and CONTAINER data_tier sources

In [35]:
# Sanity check: per file, count its mappings and list the data tiers, datasets and blocks.
# The aggregates are left un-aliased on purpose: later cells refer to the auto-generated
# column names such as "count(data_tier)".
res = df_map.groupby("file").agg(
    func.count("data_tier"),
    func.collect_list("data_tier"),
    func.collect_list("dataset"),
    func.collect_list("block"),
)


In [36]:
# Are there any multi-mapped files left after removing the campaign- and challenge-specific collections?
# Nope
res.where(col("count(data_tier)") > 1).count()

0

In [37]:
# Check whether files with multiple mappings exist only because of the CONTAINER data tier
# (part of tests, challenges, etc.) or the Unknown data tier: list any that involve neither.
(
    res
    .where(col("count(data_tier)") > 1)
    .where(~func.array_contains("collect_list(data_tier)", "CONTAINER"))
    .where(~func.array_contains("collect_list(data_tier)", "Unknown"))
    .limit(10)
    .toPandas()
)

Unnamed: 0,file,count(data_tier),collect_list(data_tier),collect_list(dataset),collect_list(block)


In [38]:
# Collect the names of datasets and containers created for deletion campaigns and data transfer challenges.
challenge_datasets = (
    df_contents.select("name")
    .where(col("name").contains("DATASET"))
    .distinct()
    .collect()
)
challenge_containers = (
    df_contents.select("name")
    .where(col("did_type") == 'C')
    .where(col("name").contains("/CONTAINER"))
    .distinct()
    .collect()
)

# Check whether any multi-mapped file comes from an unknown block-file map other than the
# TapeDeletionCampaign blocks listed here.
(
    res
    .where(col("count(data_tier)") > 1)
    .where(~func.array_contains("collect_list(data_tier)", "CONTAINER"))
    .where(~func.array_contains("collect_list(block)", "/Test/TapeDeletionCampaign_Fall2021_FNAL_Lock/CONTAINER#DATASET"))
    .where(~func.array_contains("collect_list(block)", "/Test/TapeDeletionCampaign_Fall2021_PIC_Lock/CONTAINER#DATASET"))
    .where(~func.array_contains("collect_list(block)", "/Test/TapeDeletionCampaign_Fall2021_CCIN2P3_Lock/CONTAINER#DATASET2"))
    .where(~func.array_contains("collect_list(block)", "/Test/TapeDeletionCampaign_Fall2021_CNAF_Lock/CONTAINER#DATASET2"))
    .where(~func.array_contains("collect_list(block)", "/Test/TapeDeletionCampaign_Fall2021_PIC_Lock/CONTAINER#DATASET2"))
    .count()
)

0