In [5]:
from datetime import datetime

In [6]:
TODAY = datetime.today().strftime('%Y-%m-%d')
HDFS_RUCIO_DIDS = f"/project/awg/cms/rucio/{TODAY}/dids/part*.avro"
HDFS_RUCIO_CONTENTS = f"/project/awg/cms/rucio/{TODAY}/contents/part*.avro"

In [7]:
def get_df_dids_files(spark):
    """Create spark dataframe for DIDS table by selecting only Files.

    Filters:
        - DELETED_AT not null
        - HIDDEN = 0
        - SCOPE = cms
        - DID_TYPE = F

    Columns selected:
        - file: file name
        - fsize_dids: represents size of a file in DIDS table
        - accessed_at: file last access time

    df_dids_files: All files catalog, their sizes and last access times
    """
    return spark.read.format("avro").load(HDFS_RUCIO_DIDS) \
        .withColumnRenamed("BYTES", "SIZE") \
        .select(["name", "scope", "account", "did_type", "is_open", "monotonic", "availability", "size", "length"])




def get_df_contents(spark):
     return spark.read.format("avro").load(HDFS_RUCIO_CONTENTS) \
        .select(["name", "did_type", "child_name", "child_type"])

In [8]:
df_did = get_df_dids_files(spark)

In [11]:
df_did.limit(2).toPandas()

Unnamed: 0,name,scope,account,did_type,is_open,monotonic,availability,size,length
0,/LQLQToTopEl_M-400_TuneCP5_13TeV_pythia8/RunII...,cms,sync_t1_us_fnal_disk,C,1,0,A,,9
1,/LQToBMu_M-500_single_TuneCUETP8M1_13TeV-madgr...,cms,sync_t1_us_fnal_disk,D,0,0,A,8325376.0,1


In [12]:
df_container  = df_did.filter(df_did.did_type == 'C')
df_dataset = df_did.filter(df_did.did_type == 'D')
df_file = df_did.filter(df_did.did_type == 'F')

In [15]:
df_contents = get_df_contents(spark)

In [14]:
df_1000_container = df_container.limit(1000)

Unnamed: 0,name,scope,account,did_type,is_open,monotonic,availability,size,length
0,/LQLQToTopEl_M-400_TuneCP5_13TeV_pythia8/RunII...,cms,sync_t1_us_fnal_disk,C,1,0,A,,9
1,/LambdabToJpsiSigma1383Phi_BMuonFilter_DGamma0...,cms,sync_t1_us_fnal_disk,C,1,0,A,,2
2,/LambdabToJpsiLambdaPhi_BMuonFilter_DGamma0_Tu...,cms,sync_t1_us_fnal_disk,C,1,0,A,,45
3,/LQToBMu_M-1800_pair_TuneCUETP8M1_13TeV-madgra...,cms,sync_t1_us_fnal_disk,C,1,0,A,,3
4,/LQLQToTopElTopMu_M-1700_TuneCP5_13TeV_pythia8...,cms,sync_t1_us_fnal_disk,C,1,0,A,,10


In [52]:
container_names = [row.name for row in df_1000_container.select('name').collect()]

In [30]:
dataset_names = df_1000_container.join(df_contents, df_1000_container["name"]==df_contents["name"], "inner").select('child_name').collect()

In [38]:
dataset_names = [row.child_name for row in dataset_names]

In [39]:
df_1000_datasets = df_dataset.filter(df_did.name.isin(dataset_names))

In [40]:
df_1000_datasets.limit(5).toPandas()

Unnamed: 0,name,scope,account,did_type,is_open,monotonic,availability,size,length
0,/ZPhi_2e_M-20_PS_TuneCP5_madgraph-pythia8/RunI...,cms,wma_prod,D,0,0,A,287594249,1
1,/ZPhi_2e_M-20_PS_TuneCP5_madgraph-pythia8/RunI...,cms,wma_prod,D,0,0,A,31817302,1
2,/SinglePiPlusPt30/RunIIFall15DR76-PU25nsData20...,cms,sync_t1_uk_ral_tape,D,0,0,A,1746844579,1
3,/SeesawTypeIII_SIGMAplusSIGMA0ZH_M-300_13TeV-m...,cms,sync_t1_uk_ral_tape,D,0,0,A,881016579,2
4,/SeesawTypeIII_SIGMAplusSIGMA0WZ_M-1060_13TeV-...,cms,sync_t1_uk_ral_tape,D,0,0,A,1698070672,1


In [41]:
file_names = df_1000_datasets.join(df_contents, df_1000_datasets["name"]==df_contents["name"], "inner").select('child_name').collect()

22/06/20 08:51:36 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB


In [42]:
file_names = [row.child_name for row in file_names]


['/store/mc/RunIIAutumn18NanoAODv7/GluGluToBulkGravitonToHHTo2B4L_M-1750_narrow_TuneCP5_PSWeights_13TeV-madgraph-pythia8/NANOAODSIM/Nano02Apr2020_102X_upgrade2018_realistic_v21-v1/130000/561B5E76-9671-EB45-897D-3A174F20B3E5.root',
 '/store/mc/RunIIFall17NanoAODv6/HAHMHToAA_AToGG_MA-60GeV_TuneCP5_PSweights_13TeV-madgraph_pythia8/NANOAODSIM/PU2017_12Apr2018_Nano25Oct2019_102X_mc2017_realistic_v7-v1/120000/173715C3-CE7C-9F45-AB1C-B4C15CFE2992.root',
 '/store/mc/RunIISummer16MiniAODv3/LQToDEle_M-1400_single_TuneCUETP8M1_13TeV-madgraph-pythia8/MINIAODSIM/PUMoriond17_94X_mcRun2_asymptotic_v3-v1/260000/B2AB3B5A-B3FD-EA11-B54F-001E67DBE435.root',
 '/store/mc/RunIIFall18GS/ST_t-channel_muDecays_anomwtbLVRT_LV1RT4_13TeV-comphep-pythia8_TuneCP5/GEN-SIM/102X_upgrade2018_realistic_v11-v1/250000/2429D08A-7A5D-4546-9651-8655A1C4E53D.root']

In [45]:
len(file_names)

21387

In [46]:
df_1000_files = df_file.filter(df_file.name.isin(file_names))

In [47]:
df_1000_files.limit(5).toPandas()

22/06/20 08:56:36 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB


Unnamed: 0,name,scope,account,did_type,is_open,monotonic,availability,size,length
0,/store/mc/RunIIFall17NanoAODv6/ZPhi_2e_M-20_PS...,cms,wma_prod,F,,0,A,287594249,
1,/store/mc/RunIIFall17NanoAODv6/ZPhi_2e_M-20_PS...,cms,wma_prod,F,,0,A,31817302,
2,/store/mc/RunIIFall15DR76/SinglePiPlusPt30/AOD...,cms,sync_t1_uk_ral_tape,F,,0,A,1746844579,
3,/store/mc/RunIIWinter15wmLHE/SeesawTypeIII_SIG...,cms,sync_t1_uk_ral_tape,F,,0,A,386671137,
4,/store/mc/RunIISummer16DR80Premix/SeesawTypeII...,cms,sync_t1_uk_ral_tape,F,,0,A,3215222030,


In [48]:
#Write container nodes to neo4j
results = df_1000_container.write\
  .format("org.neo4j.spark.DataSource")\
  .mode("Overwrite")\
  .option("batch.size", 500)\
  .option("url", "bolt://r-neo4j.cern.ch:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("labels", ":Container")\
  .option("node.keys", "name")\
  .option("schema.optimization.type", "INDEX")\
  .save()

print(results)

None


In [49]:
#Write dataset nodes to neo4j
results = df_1000_datasets.write\
  .format("org.neo4j.spark.DataSource")\
  .mode("Overwrite")\
  .option("batch.size", 500)\
  .option("url", "bolt://r-neo4j.cern.ch:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("labels", ":Dataset")\
  .option("node.keys", "name")\
  .option("schema.optimization.type", "INDEX")\
  .save()

print(results)

None


In [51]:
#Write file nodes to neo4j
results = df_1000_files.limit(10000000).write\
  .format("org.neo4j.spark.DataSource")\
  .mode("Overwrite")\
  .option("batch.size", 500)\
  .option("url", "bolt://r-neo4j.cern.ch:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("labels", ":File")\
  .option("node.keys", "name")\
  .option("schema.optimization.type", "NODE_CONSTRAINTS")\
  .save()

print(results)

22/06/20 09:05:51 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB


None


In [82]:
df_container_dataset_relation_1000 = df_contents\
    .filter(df_contents.name.isin(container_names))\
    .filter(df_contents.did_type=='C')\
    .filter(df_contents.child_type=='D')\
    .select(['name', 'child_name'])\
    .withColumnRenamed("name", "source.name")\
    .withColumnRenamed("child_name", "target.name")

In [80]:
df_dataset_file_relation_1000 = df_contents\
    .filter(df_contents.name.isin(dataset_names))\
    .filter(df_contents.did_type=='D')\
    .filter(df_contents.child_type=='F')\
    .select(['name', 'child_name'])\
    .withColumnRenamed("name", "source.name")\
    .withColumnRenamed("child_name", "target.name")

In [61]:
df_container_container_relation_1000 = df_contents\
    .filter(df_contents.name.isin(container_names))\
    .filter(df_contents.did_type=='C')\
    .filter(df_contents.child_type=='C')\
    .select(['name', 'child_name'])\
    .withColumnRenamed("name", "source.name")\
    .withColumnRenamed("child_name", "target.name")

In [75]:
df_container_dataset_relation_1000.limit(2).collect()


[Row(source.name='/SMS-T1tbs_RPV_mGluino1700_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/RunIISummer16NanoAODv4-PUMoriond17_Nano14Dec2018_102X_mcRun2_asymptotic_v6_ext1-v1/NANOAODSIM', target.name='/SMS-T1tbs_RPV_mGluino1700_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/RunIISummer16NanoAODv4-PUMoriond17_Nano14Dec2018_102X_mcRun2_asymptotic_v6_ext1-v1/NANOAODSIM#e1cf4e36-36f3-4a0d-b314-5ec8f973a13f'),
 Row(source.name='/MSSMD_mH_125_mN1_60_mGammaD_35_cT_0p5_TuneCP5_13TeV-madgraph-pythia8/RunIIAutumn18NanoAODv5-Nano1June2019_102X_upgrade2018_realistic_v19-v1/NANOAODSIM', target.name='/MSSMD_mH_125_mN1_60_mGammaD_35_cT_0p5_TuneCP5_13TeV-madgraph-pythia8/RunIIAutumn18NanoAODv5-Nano1June2019_102X_upgrade2018_realistic_v19-v1/NANOAODSIM#e0d930d4-4a19-4ab0-811f-bda0b1b0933d')]

In [76]:
df_dataset_file_relation_1000.limit(2).collect()


[Row(source.name='/WprimeToENu_M_1600_TuneCP5_13TeV_pythia8/RunIIFall18GS-102X_upgrade2018_realistic_v11-v1/GEN-SIM#8f4e92bc-29de-49c5-b560-cad593cb23b1', target.name='/store/mc/RunIIFall18GS/WprimeToENu_M_1600_TuneCP5_13TeV_pythia8/GEN-SIM/102X_upgrade2018_realistic_v11-v1/250000/C12BE4B0-C633-BE4B-880F-D4753B0966D7.root'),
 Row(source.name='/WtoTauNu_TuneP0_7TeV-pythia6-tauola/Fall11-PU_S6_START44_V9B-v1/AODSIM#59e2367e-414a-11e1-93ae-003048f02c8a', target.name='/store/mc/Fall11/WtoTauNu_TuneP0_7TeV-pythia6-tauola/AODSIM/PU_S6_START44_V9B-v1/0000/263BF9AC-2441-E111-9A75-003048D4DFA6.root')]

In [77]:
df_container_container_relation_1000.limit(2).collect()

[]

In [None]:
#Writing CONSISTS_OF relation

In [9]:
df_contents = get_df_contents(spark)

In [10]:
# df_container_dataset_relation = df_contents.filter(df_contents.did_type=='C').filter(df_contents.child_type=='D').select(['name', 'child_name']).withColumnRenamed("name", "source.name").withColumnRenamed("child_name", "target.name")
# df_dataset_file_relation = df_contents.filter(df_contents.did_type=='D').filter(df_contents.child_type=='F').select(['name', 'child_name']).withColumnRenamed("name", "source.name").withColumnRenamed("child_name", "target.name")
# df_container_container_relation = df_contents.filter(df_contents.did_type=='C').filter(df_contents.child_type=='C').select(['name', 'child_name']).withColumnRenamed("name", "source.name").withColumnRenamed("child_name", "target.name")

In [58]:
print(f'{df_container_dataset_relation_1000.count()=}')
print(f'{df_dataset_file_relation_1000.count()=}')
print(f'{df_container_container_relation_1000.count()=}')

df_container_dataset_relation.count()=34624
df_dataset_file_relation.count()=21387
df_container_container_relation.count()=0


In [84]:
# initial_executors = spark.conf.get("spark.dynamicAllocation.maxExecutors")
# spark.conf.set("spark.dynamicAllocation.maxExecutors", 1)
df_container_dataset_relation_1000.write\
  .format("org.neo4j.spark.DataSource")\
  .mode("Overwrite")\
  .option("url", "bolt://r-neo4j.cern.ch:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("relationship", "CONSISTS_OF_DATASET")\
  .option("relationship.source.labels", ":Container")\
  .option("relationship.source.save.mode", "Overwrite")\
  .option("relationship.source.node.keys", "source.name:name")\
  .option("relationship.target.labels", ":Dataset")\
  .option("relationship.target.save.mode", "Overwrite")\
  .option("relationship.target.node.keys", "target.name:name")\
  .save()

# spark.conf.set("spark.dynamicAllocation.maxExecutors", initial_executors)

In [66]:
# df_container_container_relation_1000.write\
#   .format("org.neo4j.spark.DataSource")\
#   .mode("Overwrite")\
#   .option("url", "bolt://r-neo4j.cern.ch:7687")\
#   .option("authentication.type", "basic")\
#   .option("authentication.basic.username", "neo4j")\
#   .option("authentication.basic.password", "mypass")\
#   .option("relationship", "CONSISTS_OF")\
#   .option("relationship.source.labels", ":Container")\
#   .option("relationship.source.save.mode", "Overwrite")\
#   .option("relationship.source.node.keys", "source.name:name")\
#   .option("relationship.target.labels", ":Container")\
#   .option("relationship.target.save.mode", "Overwrite")\
#   .option("relationship.target.node.keys", "target.name:name")\
#   .save()

In [None]:
df_dataset_file_relation_1000.write\
  .format("org.neo4j.spark.DataSource")\
  .mode("Overwrite")\
  .option("url", "bolt://r-neo4j.cern.ch:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("relationship", "CONSISTS_OF_FILE")\
  .option("relationship.source.labels", ":Dataset")\
  .option("relationship.source.save.mode", "Overwrite")\
  .option("relationship.source.node.keys", "source.name:name")\
  .option("relationship.target.labels", ":File")\
  .option("relationship.target.save.mode", "Overwrite")\
  .option("relationship.target.node.keys", "target.name:name")\
  .save()

In [5]:
df = spark.read.format("org.neo4j.spark.DataSource")\
  .option("url", "bolt://rahul-neo4j.cern.ch:7687")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("labels", "Person")\
  .load()

22/06/19 12:56:20 WARN SchemaService: Switching to query schema resolution


In [6]:
df.show()

+----+--------+------+---+
|<id>|<labels>|  name| id|
+----+--------+------+---+
|   0|[Person]|  John|  1|
|   1|[Person]|Thomas|  2|
+----+--------+------+---+



In [23]:
df = spark.createDataFrame(
 [(1, "John"),(2, "Thomas")],
 ["id", "name"]
)

In [25]:
df.toPandas()

Unnamed: 0,id,name
0,1,John
1,2,Thomas


In [4]:
df.write\
  .format("org.neo4j.spark.DataSource")\
  .mode("Append")\
  .option("url", "bolt://rahul-neo4j.cern.ch:7687")\
  .option("authentication.type", "basic")\
  .option("authentication.basic.username", "neo4j")\
  .option("authentication.basic.password", "mypass")\
  .option("labels", ":Person")\
  .save()

In [None]:
spark.sparkContext.version