In [1]:
from pyarrow import parquet as pq
from matplotlib import pyplot as plt
from datetime import datetime as dt
from datetime import timezone
from dateutil import parser
from random import randint
import pandas as pd
import numpy as np

# Sanity Checks: XRootD vs. ClassAds
***

## Get XRootD Dataset
New Column Notes:
 - the `app_info` column from XRootD is formatted like: `[crab_id]_https://glidein.cern.ch/[workflow_idFront]:[workflow_idBack]`
 - we make the column `job_id` = `crab_id`/`workflow_id` so that we may simultaneously compare the two IDs for each job.
 
Requirement Notes:
 - there is a filter that restricts the site to UCSD or CalTech in the HDFS fetching script
 - some jobs do not come from CRAB, so they have a null entry in the `crab_id` column and are promptly filtered out

In [2]:
# Load the XRootD records and derive the ID/time columns used by the checks below.
# Get parquet files
dataset_xRootD = pq.ParquetDataset("parquet/XRootD_06-02-2019")
table_xRootD = dataset_xRootD.read()

# Convert to pandas dataframe
df_xRootD = table_xRootD.to_pandas()

# Make some columns
# crab_id is everything before the first '_' of app_info
df_xRootD["crab_id"] = df_xRootD.app_info.str.split('_').str.get(0)
# The text after the last '/' of app_info holds the ':'-separated workflow pieces;
# split it once and reuse it for both halves.
workflow_parts = df_xRootD.app_info.str.split('/').str[-1].str.split(':')
workflow_idFront = workflow_parts.str[:2].str.join('_') # Front half of workflow_id
workflow_idBack = workflow_parts.str[2:].str.join('_') # Back half of workflow_id
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the trailing "_<digit>" in place.
df_xRootD["workflow_id"] = (
    (workflow_idFront.map(str) + ":" + workflow_idBack)
    .str.replace(r"_[0-9]$", "", regex=True)
)
df_xRootD["job_id"] = df_xRootD.crab_id.map(str) + "/" + df_xRootD.workflow_id
# start_time is converted as a millisecond timestamp
df_xRootD["start_datetime"] = pd.to_datetime(df_xRootD.start_time, unit="ms")

# Require real crab jobs ("/:" is the job_id built from an empty app_info)
df_xRootD = df_xRootD[(~df_xRootD.crab_id.isna()) & (df_xRootD.job_id != "/:")]
# Require only read jobs
# df_xRootD = df_xRootD[df_xRootD.operation == "read"]
# Require that job started on June 2nd
# NOTE: only the lower bound (June 2nd 00:00 UTC) is enforced here.
date = parser.parse("Jun 2 00:00:00 UTC 2019")
# .copy() detaches the filtered frame so that columns added to it later are
# written to a real DataFrame rather than a slice (avoids SettingWithCopyWarning).
df_xRootD = df_xRootD[df_xRootD.start_time/1e3 >= date.timestamp()].copy()

## Get ClassAds Dataset
New Column Notes:
 - the `workflow_id` column from ClassAds has an extra component at the end that must be removed (i.e. `[workflow_id]_[0-9]`)
 - when we extract a timestamp from the pandas datetime object, the result is given in **nanoseconds**, so we must convert it immediately or remember to do it later (this is done for consistency with XRootD, since the `start_date` ClassAds column is already a timestamp)

In [3]:
# Load the ClassAds records and derive the ID/time columns used by the checks below.
# Get parquet files
dataset_classAds = pq.ParquetDataset("parquet/ClassAds_06-02-2019")
table_classAds = dataset_classAds.read()

# Convert to pandas dataframe
df_classAds = table_classAds.to_pandas()

# Make some columns
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the trailing "_<digit>" in place.
df_classAds["workflow_id"] = df_classAds.workflow_id.str.replace(r"_[0-9]$", "", regex=True)
df_classAds["job_id"] = df_classAds.crab_id.map(str) + "/" + df_classAds.workflow_id
df_classAds["start_datetime"] = pd.to_datetime(df_classAds.start_date, unit="ms") # 'start_date' column is a timestamp
df_classAds["start_time"] = df_classAds.start_datetime.dt.tz_localize('UTC').values.astype(np.int64) # in ns

# Require at UCSD or CalTech exclusively
df_classAds = df_classAds[df_classAds.site.isin(["T2_US_UCSD", "T2_US_Caltech"])]
# Require real crab jobs
df_classAds = df_classAds[(~df_classAds.crab_id.isna()) & (df_classAds.job_id != "/")]
# Require that job started on June 2nd
# NOTE: only the lower bound (June 2nd 00:00 UTC) is enforced here.
date = parser.parse("Jun 2 00:00:00 UTC 2019")
# start_time is in ns here, hence the 1e9 divisor. .copy() detaches the filtered
# frame so that columns added to it later are written to a real DataFrame rather
# than a slice (avoids SettingWithCopyWarning).
df_classAds = df_classAds[df_classAds.start_time/1e9 >= date.timestamp()].copy()

## Preliminary Checks

#### Is everything happening on the right day?

In [4]:
# Sanity check: show the first ClassAds start time next to the June 2nd cutoff
first_start_s = df_classAds["start_time"].values[0] / 1e9  # ns -> s
cutoff_s = date.timestamp()
print(dt.fromtimestamp(first_start_s, tz=timezone.utc))
print(dt.fromtimestamp(cutoff_s, tz=timezone.utc))

2019-06-02 08:05:27+00:00
2019-06-02 00:00:00+00:00


In [5]:
# Tally XRootD records per UTC calendar day (expect only June 2nd)
start_dates_xRootD = df_xRootD["start_datetime"].dt.tz_localize("UTC").dt.date
start_dates_xRootD.value_counts()

2019-06-02    50102
Name: start_datetime, dtype: int64

In [6]:
# Tally ClassAds records per UTC calendar day (expect only June 2nd)
start_dates_classAds = df_classAds["start_datetime"].dt.tz_localize("UTC").dt.date
start_dates_classAds.value_counts()

2019-06-02    25435
Name: start_datetime, dtype: int64

#### How many unique job IDs are there?

In [7]:
# Collect the distinct job IDs of each dataset and report how many there are
ids_xRootD, ids_classAds = (
    frame["job_id"].unique() for frame in (df_xRootD, df_classAds)
)
print(len(ids_xRootD), len(ids_classAds))

35473 24182


## Coverage Checks

#### How many ClassAds jobs _are_ recorded in XRootD?

In [8]:
# Check coverage of ClassAds job IDs by XRootD
# Flag every ClassAds record whose job_id also appears in the XRootD records.
df_classAds["in_xRootD"] = df_classAds.job_id.isin(df_xRootD.job_id)
inOrOut_counts = df_classAds.in_xRootD.value_counts()
# The mean of a boolean column is the fraction of True entries. Unlike indexing
# value_counts() with [True]/[False], it does not raise KeyError when one of the
# two outcomes never occurs (e.g. 100% or 0% coverage).
print("{0:.2f}%".format(df_classAds.in_xRootD.mean()*100),
      "of ClassAds jobs are recorded in XRootD")

93.29% of ClassAds jobs are recorded in XRootD


In [9]:
# Spot check: pick one ClassAds job ID at random and look it up in XRootD
job_ids_classAds = df_classAds["job_id"].values
i = randint(0, len(job_ids_classAds)-1)
print("Is {} in XRootD?".format(job_ids_classAds[i]))
job_ids_classAds[i] in df_xRootD["job_id"].values

Is 2590/190602_063338:bianjg_crab_JpsiDMuon_v28h_test-Run2017D-31Mar2018-v1 in XRootD?


True

#### How many XRootD jobs _are_ recorded in ClassAds?

In [10]:
# Check coverage of XRootD job IDs by ClassAds
# Flag every XRootD record whose job_id also appears in the ClassAds records.
df_xRootD["in_classAds"] = df_xRootD.job_id.isin(df_classAds.job_id)
inOrOut_counts1 = df_xRootD.in_classAds.value_counts()
# The mean of a boolean column is the fraction of True entries. Unlike indexing
# value_counts() with [True]/[False], it does not raise KeyError when one of the
# two outcomes never occurs (e.g. 100% or 0% coverage).
print("{0:.2f}%".format(df_xRootD.in_classAds.mean()*100),
      "of XRootD jobs are recorded in ClassAds")

62.21% of XRootD jobs are recorded in ClassAds


In [11]:
# Spot check: pick one XRootD job ID at random and look it up in ClassAds
job_ids_xRootD = df_xRootD["job_id"].values
j = randint(0, len(job_ids_xRootD)-1)
print("Is {} in ClassAds?".format(job_ids_xRootD[j]))
job_ids_xRootD[j] in df_classAds["job_id"].values

Is 207/190602_063338:bianjg_crab_JpsiDMuon_v28h_test-Run2017D-31Mar2018-v1 in ClassAds?


True

#### Ok, so what about the ClassAds jobs that _aren't_ recorded in XRootD?

In [12]:
# Distinct schedd names present in the ClassAds records
df_classAds["schedd_name"].unique()

array(['crab3@vocms0195.cern.ch', 'crab3@vocms0196.cern.ch',
       'crab3@vocms0120.cern.ch', 'crab3@vocms0137.cern.ch',
       'crab3@vocms0197.cern.ch', 'crab3@vocms0119.cern.ch',
       'crab3@vocms0107.cern.ch', 'crab3@vocms0144.cern.ch',
       'crab3@vocms0198.cern.ch', 'crab3@vocms0121.cern.ch',
       'crab3@vocms0155.cern.ch', 'crab3@vocms0194.cern.ch'], dtype=object)

In [13]:
# Per-schedd tally of the ClassAds records that have no match in XRootD
missing_from_xRootD = ~df_classAds["in_xRootD"]
df_classAds.loc[missing_from_xRootD, "schedd_name"].value_counts()

crab3@vocms0121.cern.ch    551
crab3@vocms0195.cern.ch    196
crab3@vocms0107.cern.ch    185
crab3@vocms0137.cern.ch    143
crab3@vocms0119.cern.ch    129
crab3@vocms0198.cern.ch    119
crab3@vocms0196.cern.ch    106
crab3@vocms0144.cern.ch     72
crab3@vocms0155.cern.ch     68
crab3@vocms0197.cern.ch     66
crab3@vocms0194.cern.ch     41
crab3@vocms0120.cern.ch     30
Name: schedd_name, dtype: int64

In [14]:
# Workflows that contain at least one ClassAds job NOT recorded in XRootD
workflowsNotInXRootD = df_classAds.loc[~df_classAds["in_xRootD"], "workflow_id"].unique()
# Count ClassAds records that do / do not belong to one of those workflows
shares_unrecorded_workflow = df_classAds["workflow_id"].isin(workflowsNotInXRootD)
shares_unrecorded_workflow.value_counts()

True     24579
False      856
Name: workflow_id, dtype: int64

In [15]:
# Reference numbers: ClassAds records with / without a match in XRootD
df_classAds["in_xRootD"].value_counts()

True     23729
False     1706
Name: in_xRootD, dtype: int64

In [16]:
# Count(ClassAds in the same workflow as jobs NOT recorded in XRootD) recorded/not recorded in XRootD
df_classAds_workflowsNotInXRootD = df_classAds[df_classAds.workflow_id.isin(workflowsNotInXRootD)]
recorded_in_shared_workflow = df_classAds_workflowsNotInXRootD.job_id.isin(df_xRootD.job_id)
c = recorded_in_shared_workflow.value_counts()
# Sum the boolean masks instead of indexing value_counts() with [True]: the sum
# is simply 0 when there are no True entries, whereas the lookup raises KeyError.
n_shared = int(recorded_in_shared_workflow.sum())
n_recorded = int(df_classAds.in_xRootD.sum())
# Guard the ratio so an empty "recorded" set reports nan instead of crashing.
pct_shared = n_shared/n_recorded*100 if n_recorded else float("nan")
print("{0:.2f}%".format(pct_shared),
      "of jobs that ARE recorded in XRootD share the same workflow as the jobs that ARE NOT recorded.")

96.39% of jobs that ARE recorded in XRootD share the same workflow as the jobs that ARE NOT recorded.


In [17]:
print(len(workflowsNotInXRootD), "unique workflows of jobs NOT recorded in XRootD\n")

# Per workflow: is at least one of its ClassAds jobs recorded in XRootD?
byWorkflow_classAds_anyJobInXRootD = (
    df_classAds_workflowsNotInXRootD
    .groupby("workflow_id")["in_xRootD"]
    .any()
)
print("True/False counts for the following statement:",
      "\nANY job for a given, unique workflow is recorded in XRootD")
print(byWorkflow_classAds_anyJobInXRootD.value_counts())

# Display the workflows for which no job at all shows up in XRootD
byWorkflow_classAds_anyJobInXRootD[~byWorkflow_classAds_anyJobInXRootD]

134 unique workflows of jobs NOT recorded in XRootD

True/False counts for the following statement: 
ANY job for a given, unique workflow is recorded in XRootD
True     91
False    43
Name: in_xRootD, dtype: int64


workflow_id
190520_203443:abdatta_crab_ttH_Analyzer_zjets_ll_mll_50_inf_NLO    False
190520_204725:abdatta_crab_ttH_Analyzer_ttgjets                    False
190524_211559:gmestdac_crab_MiniAOD2017v2-v1_2017-v1               False
190524_212941:gmestdac_crab_MiniAOD2017v2-v1_2017-v1               False
190525_003435:gmestdac_crab_MiniAOD2017v2-v1_2017-v1               False
190531_102921:yuanc_crab_Data13TeV_SingleElectron2017B             False
190531_103256:yuanc_crab_Data13TeV_SingleElectron2017C             False
190531_103636:yuanc_crab_Data13TeV_SingleElectron2017D             False
190531_104019:yuanc_crab_Data13TeV_SingleElectron2017E             False
190531_104748:yuanc_crab_Data13TeV_SingleMuon2017B                 False
190531_110237:yuanc_crab_Data13TeV_SingleMuon2017F                 False
190531_114317:yuanc_crab_Data13TeV_DoubleMuon2017F                 False
190531_120454:yuanc_crab_Data13TeV_DoubleEle2017C                  False
190531_122929:yuanc_crab_Data13TeV_Doub

In [18]:
# Pick, at random, one workflow with a mix of recorded and unrecorded jobs
partly_recorded = byWorkflow_classAds_anyJobInXRootD[byWorkflow_classAds_anyJobInXRootD.values == True]
k = randint(0, byWorkflow_classAds_anyJobInXRootD.value_counts()[True]-1)
r = partly_recorded.keys()[k]

# Row mask selecting the ClassAds records of that workflow
w = (df_classAds.workflow_id == r)
recorded_ids = df_classAds[w & (df_classAds.in_xRootD)].job_id.unique()
unrecorded_ids = df_classAds[w & (~df_classAds.in_xRootD)].job_id.unique()
print("ClassAds IN XRootD:\n{}".format("\n".join(list(recorded_ids))))
print("ClassAds NOT IN XRootD:\n{}".format("\n".join(list(unrecorded_ids))))

ClassAds IN XRootD:
17/190525_003645:gmestdac_crab_MiniAOD2017v2-v1_2017-v1
152/190525_003645:gmestdac_crab_MiniAOD2017v2-v1_2017-v1
114/190525_003645:gmestdac_crab_MiniAOD2017v2-v1_2017-v1
ClassAds NOT IN XRootD:
120/190525_003645:gmestdac_crab_MiniAOD2017v2-v1_2017-v1


In [19]:
# Show the job ID (crab_id/workflow_id) of a random ClassAds job that is NOT recorded in XRootD
classAds_ids_not_in_xRootD = df_classAds.loc[~df_classAds["in_xRootD"], "job_id"].values
l = randint(0, len(classAds_ids_not_in_xRootD)-1)
classAds_ids_not_in_xRootD[l]

'635/190520_203443:abdatta_crab_ttH_Analyzer_zjets_ll_mll_50_inf_NLO'

#### ... and the XRootD jobs that _aren't_ recorded in ClassAds?

In [20]:
# Show the job ID (crab_id/workflow_id) of a random XRootD job that is NOT recorded in ClassAds
xRootD_ids_not_in_classAds = df_xRootD.loc[~df_xRootD["in_classAds"], "job_id"].values
l = randint(0, len(xRootD_ids_not_in_classAds)-1)
xRootD_ids_not_in_classAds[l]

'3911/190515_095853:rasharma_crab_SingleMuon_Run2017F-31Mar2018-v1'