In [39]:
from datetime import datetime, timedelta, timezone
import os
import time
import pandas as pd

from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_user,
    col, collect_list, concat_ws, greatest, lit, lower, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    sum as _sum,
    split as _split,
    regexp_extract
)
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)

In [2]:
# Build the session for this notebook; getOrCreate() returns an already running
# session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('crab-taskdb')
    .getOrCreate()
)
spark

24/10/02 14:02:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [16]:
# Drop everything cached in this Spark session so later cells recompute from source.
spark.catalog.clearCache()

In [4]:
# arguments
# Path to the OpenSearch secret file, taken from OPENSEARCH_SECRET_PATH or
# defaulting to ../workdir relative to the current working directory.
secretpath = os.environ.get('OPENSEARCH_SECRET_PATH', f'{os.getcwd()}/../workdir/secret_opensearch.txt')
if not os.path.isfile(secretpath):
    # Fail fast. FileNotFoundError is still an Exception subclass, so any
    # caller that catches Exception keeps working.
    raise FileNotFoundError(f'OS secrets file {secretpath} does not exists')
# if PROD, index prefix will be `crab-*`, otherwise `crab-test-*`
PROD = os.environ.get('PROD', 'false').lower() in ('true', '1', 't')
# Optional date-window overrides read from START_DATE / END_DATE, expected in
# strptime("%Y-%m-%d") format; both are None when the variable is not set.
START = os.environ.get('START_DATE')
END = os.environ.get('END_DATE')

In [5]:
# try to import osearch from current directory, fallback to $PWD/../workdir if not found
try:
    import osearch
except ModuleNotFoundError:
    # Not importable as-is: put ../workdir (relative to the current working
    # directory) at the front of sys.path and retry. If the retry fails too,
    # the ModuleNotFoundError propagates.
    import sys
    sys.path.insert(0, f'{os.getcwd()}/../workdir')
    import osearch

In [6]:
# Default date window (YYYY-MM-DD) used when the notebook is run interactively.
# A later cell replaces both values when the START_DATE and END_DATE
# environment variables are set.
START_DATE = "2020-01-01"
END_DATE = "2024-10-31"

In [7]:
# Target index name. It is always written here with the `crab-test-` prefix;
# a later cell rewrites it to the `crab-*` form when PROD is set.
# NOTE(review): 'ruio' looks like a typo for 'rucio' — confirm against the
# existing index before renaming, since the name is used at runtime.
index_name = 'crab-test-ruio-rules' # always put test index prefix

In [8]:
# if cronjob, replace constant with value from env
# (the override applies only when both START_DATE and END_DATE are provided)
if START and END:
    START_DATE = START
    END_DATE = END
# use prod index pattern if this execution is for production
# Strip the test prefix only when it is still present, so that re-running this
# cell in a live kernel does not mangle an already-converted name
# (e.g. 'crab-ruio-rules' would otherwise become 'crab-rules').
if PROD and index_name.startswith('crab-test-'):
    index_name = 'crab-' + index_name[len('crab-test-'):]
# datetime object (midnight UTC of the given day)
start_datetime = datetime.strptime(START_DATE, "%Y-%m-%d").replace(tzinfo=timezone.utc)
end_datetime = datetime.strptime(END_DATE, "%Y-%m-%d").replace(tzinfo=timezone.utc)
if end_datetime < start_datetime:
    # ValueError matches what strptime raises for a malformed date and is
    # still an Exception subclass.
    raise ValueError(f"end date ({END_DATE}) is less than start date ({START_DATE})")

In [9]:
# debug: effective date window and target index, one value per line
print('\n'.join(
    str(value)
    for value in (START_DATE, start_datetime, END_DATE, end_datetime, index_name)
))

2020-01-01
2020-01-01 00:00:00+00:00
2024-10-31
2024-10-31 00:00:00+00:00
crab-test-ruio-rules


In [17]:
def get_df_rses(spark, snapshot_date=None):
    """Get Spark dataframe of RSES

    Reads the Rucio ``rses`` Avro dump from HDFS, keeps rows whose
    ``DELETED_AT`` is null, and derives helper columns from the RSE name.

    Args:
        spark: SparkSession used to read the dump.
        snapshot_date: dump date as a ``YYYY-MM-DD`` string. Defaults to
            today's date, which is what this function always read before.
            Pinning the date lets a caller re-read a specific dump and avoids
            picking up a different day's dump when a run crosses midnight.

    Returns:
        DataFrame with columns ``rse_id`` (lower-case hex of ``ID``), ``RSE``,
        ``RSE_TYPE``, ``rse_tier`` and ``rse_country`` (first and second
        ``_``-separated tokens of ``RSE``), and ``rse_kind``
        (``'temp'``, ``'test'`` or ``'prod'``).
    """
    if snapshot_date is None:
        snapshot_date = datetime.today().strftime('%Y-%m-%d')
    hdfs_rses_path = f'/project/awg/cms/rucio/{snapshot_date}/rses/part*.avro'

    rse_name = col('RSE')
    rse_tokens = _split(rse_name, '_')
    # Only these three exact spellings of each suffix are recognised.
    is_temp = rse_name.endswith('Temp') | rse_name.endswith('temp') | rse_name.endswith('TEMP')
    is_test = rse_name.endswith('Test') | rse_name.endswith('test') | rse_name.endswith('TEST')

    df_rses = (
        spark.read.format("avro").load(hdfs_rses_path)
        .filter(col('DELETED_AT').isNull())
        .withColumn('rse_id', lower(_hex(col('ID'))))
        .withColumn('rse_tier', rse_tokens.getItem(0))
        .withColumn('rse_country', rse_tokens.getItem(1))
        # 'temp' is checked first; anything matching neither suffix is 'prod'
        .withColumn('rse_kind', when(is_temp, 'temp').when(is_test, 'test').otherwise('prod'))
        .select(['rse_id', 'RSE', 'RSE_TYPE', 'rse_tier', 'rse_country', 'rse_kind'])
    )
    return df_rses
def get_df_locks(spark, snapshot_date=None):
    """Get Spark dataframe of Locks

    Reads the Rucio ``locks`` Avro dump from HDFS and keeps rows in the
    ``cms`` scope whose ``STATE`` is ``'O'`` or ``'R'``.

    Args:
        spark: SparkSession used to read the dump.
        snapshot_date: dump date as a ``YYYY-MM-DD`` string. Defaults to
            today's date, which is what this function always read before.
            Pinning the date lets a caller re-read a specific dump and avoids
            picking up a different day's dump when a run crosses midnight.

    Returns:
        DataFrame with columns ``rse_id`` (lower-case hex of ``RSE_ID``),
        ``f_name`` (from ``NAME``), ``f_size`` (from ``BYTES``), ``r_id``
        (lower-case hex of ``RULE_ID``) and ``account_name`` (from ``ACCOUNT``).
    """
    if snapshot_date is None:
        snapshot_date = datetime.today().strftime('%Y-%m-%d')
    locks_path = f'/project/awg/cms/rucio/{snapshot_date}/locks/part*.avro'
    locks = (
        spark.read.format('avro').load(locks_path)
        .filter(col('SCOPE') == 'cms')
        .filter(col('STATE').isin(['O', 'R']))
        .withColumn('rse_id', lower(_hex(col('RSE_ID'))))
        .withColumnRenamed('NAME', 'f_name')
        .withColumnRenamed('ACCOUNT', 'account_name')
        .withColumnRenamed('BYTES', 'f_size')
        .withColumn('r_id', lower(_hex(col('RULE_ID'))))
        .select(['rse_id', 'f_name', 'f_size', 'r_id', 'account_name'])
    )
    return locks
def get_df_accounts(spark, snapshot_date=None):
    """Get Spark dataframe of Accounts

    Reads the Rucio ``accounts`` Avro dump from HDFS and keeps rows whose
    ``DELETED_AT`` is null.

    Args:
        spark: SparkSession used to read the dump.
        snapshot_date: dump date as a ``YYYY-MM-DD`` string. Defaults to
            today's date, which is what this function always read before.
            Pinning the date lets a caller re-read a specific dump and avoids
            picking up a different day's dump when a run crosses midnight.

    Returns:
        DataFrame with columns ``account_name`` (from ``ACCOUNT``) and
        ``account_type`` (from ``ACCOUNT_TYPE``).
    """
    if snapshot_date is None:
        snapshot_date = datetime.today().strftime('%Y-%m-%d')
    hdfs_rucio_accounts = f'/project/awg/cms/rucio/{snapshot_date}/accounts/part*.avro'
    df_accounts = (
        spark.read.format("avro").load(hdfs_rucio_accounts)
        .filter(col('DELETED_AT').isNull())
        .withColumnRenamed('ACCOUNT', 'account_name')
        .withColumnRenamed('ACCOUNT_TYPE', 'account_type')
        .select(['account_name', 'account_type'])
    )
    return df_accounts
def get_df_rules(spark, snapshot_date=None):
    """Get Spark dataframe of rules

    Reads the Rucio ``rules`` Avro dump from HDFS and keeps rows in the
    ``cms`` scope.

    Args:
        spark: SparkSession used to read the dump.
        snapshot_date: dump date as a ``YYYY-MM-DD`` string. Defaults to
            today's date, which is what this function always read before.
            Pinning the date lets a caller re-read a specific dump and avoids
            picking up a different day's dump when a run crosses midnight.

    Returns:
        DataFrame with columns ``r_name``, ``r_id`` (lower-case hex of ``ID``),
        ``s_id`` (lower-case hex of ``SUBSCRIPTION_ID``), ``activity``,
        ``rule_state`` (from ``STATE``) and ``rse_expression``.
    """
    if snapshot_date is None:
        snapshot_date = datetime.today().strftime('%Y-%m-%d')
    hdfs_rules_path = f'/project/awg/cms/rucio/{snapshot_date}/rules/part*.avro'
    df_rules = (
        spark.read.format('avro').load(hdfs_rules_path)
        .filter(col('SCOPE') == 'cms')
        # NOTE(review): the other source columns are referenced in upper case;
        # the lower-case 'name' presumably relies on case-insensitive column
        # resolution — confirm before changing.
        .withColumnRenamed('name', 'r_name')
        .withColumn('r_id', lower(_hex(col('ID'))))
        .withColumn('s_id', lower(_hex(col('SUBSCRIPTION_ID'))))
        .withColumnRenamed('ACTIVITY', 'activity')
        .withColumnRenamed('STATE', 'rule_state')
        .withColumnRenamed('RSE_EXPRESSION', 'rse_expression')
        .select(['r_name', 'r_id', 's_id', 'activity', 'rule_state', 'rse_expression'])
    )
    return df_rules



In [57]:
# modified: locked volume per RSE / account / activity, additionally split by
# the third path segment of the rule name (stored as `data_tier`)
df_rses = get_df_rses(spark)
df_locks = get_df_locks(spark)
df_accounts = get_df_accounts(spark)
df_rules = get_df_rules(spark)
tb_denominator = 10 ** 12  # divisor applied to the summed f_size before rounding

# Locks joined to their RSE, restricted to RSEs classified as 'prod'.
locks = (
    df_locks.join(df_rses, ['rse_id'], how='left')
    .filter(col('rse_kind') == 'prod')
    .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'r_id'])
)

# Attach the rule's activity and extract capture group 3 of the rule name.
locks_with_activity = (
    locks.join(df_rules, ['r_id'], how='leftouter')
    .withColumn('data_tier', regexp_extract('r_name', r'^\/([\w-]+)\/([\w-]+)\/([\w-]+)(#[\w-]+)?', 3))
    .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity', 'data_tier'])
)

timestamp = int(time.time())

# A File locked by the user for two activities is accounted to both activities
# A File locked by two users for the same activity is accounted to both Users
user_aggreagated = (
    locks_with_activity
    .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'activity', 'data_tier'])
    .distinct()
    .groupby(['RSE', 'rse_type', 'account_name', 'activity', 'data_tier'])
    .agg(_round(_sum(col('f_size')) / tb_denominator, 5).alias('total_locked'))
    .join(df_accounts, ['account_name'], how='left')
    .withColumnRenamed('RSE', 'rse_name')
    .withColumn('timestamp', lit(timestamp))
    .select(['total_locked', 'rse_name', 'rse_type', 'account_name', 'account_type', 'activity', 'data_tier', 'timestamp'])
    .cache()
)



In [58]:
# Preview the first 10 aggregated rows without truncating column values.
user_aggreagated.show(10, False)

24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_194 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_74 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_22 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_96 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_53 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_84 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_48 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_196 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_285_10 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_22 !
24/10/02 18:56:34 WARN BlockManagerMasterEndpoin

24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_49 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_285_3 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_49 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_80 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_97 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_75 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_18 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_23 !
24/10/02 18:57:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_111 !
24/10/02 18:57:24 ERROR YarnScheduler: Lost executor 108 on ithdp3107.cern.ch: Container from a bad node: container_e289_1722958100713_123141_01_

24/10/02 18:57:33 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 110 for reason Container from a bad node: container_e289_1722958100713_123141_01_000127 on host: ithdp7016.cern.ch. Exit status: 137. Diagnostics: [2024-10-02 18:57:33.662]Container killed on request. Exit code is 137
[2024-10-02 18:57:33.702]Container exited with a non-zero exit code 137. 
[2024-10-02 18:57:33.703]Killed by external signal
.
24/10/02 18:57:33 ERROR YarnScheduler: Lost executor 110 on ithdp7016.cern.ch: Container from a bad node: container_e289_1722958100713_123141_01_000127 on host: ithdp7016.cern.ch. Exit status: 137. Diagnostics: [2024-10-02 18:57:33.662]Container killed on request. Exit code is 137
[2024-10-02 18:57:33.702]Container exited with a non-zero exit code 137. 
[2024-10-02 18:57:33.703]Killed by external signal
.
24/10/02 18:57:33 WARN TaskSetManager: Lost task 132.0 in stage 175.0 (TID 4982) (ithdp7016.cern.ch executor 110): ExecutorLostFailure (execut

+------------+-------------------+--------+--------------------+------------+------------------+------------------------+----------+
|total_locked|rse_name           |rse_type|account_name        |account_type|activity          |data_tier               |timestamp |
+------------+-------------------+--------+--------------------+------------+------------------+------------------------+----------+
|351.10199   |T0_CH_CERN_Tape    |TAPE    |sync_t0_ch_cern_tape|USER        |Data Consolidation|USER                    |1727888173|
|119.3932    |T2_IT_Legnaro      |DISK    |transfer_ops        |SERVICE     |Data Consolidation|MINIAODSIM              |1727888173|
|4.93308     |T2_BE_IIHE         |DISK    |wma_prod            |SERVICE     |Production Output |ALCARECO                |1727888173|
|3.33033     |T3_KR_KISTI        |DISK    |geonmo              |USER        |Data Consolidation|NANOAODSIM              |1727888173|
|9.82887     |T2_RU_IHEP         |DISK    |transfer_ops        |SERVI

In [59]:
# Number of aggregated rows produced by the query that includes data_tier.
user_aggreagated.count()

6578

In [60]:
# original: locked volume per RSE / account / activity (no data_tier split).
# Rebinds the same variable names as the "modified" cell.
tb_denominator = 10 ** 12  # divisor applied to the summed f_size before rounding

df_rses = get_df_rses(spark)
df_locks = get_df_locks(spark)
df_accounts = get_df_accounts(spark)
df_rules = get_df_rules(spark)

# Locks joined to their RSE, restricted to RSEs classified as 'prod'.
locks = (
    df_locks.join(df_rses, ['rse_id'], how='left')
    .filter(col('rse_kind') == 'prod')
    .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'r_id'])
)

# Attach the activity of the rule that owns each lock.
locks_with_activity = (
    locks.join(df_rules, ['r_id'], how='leftouter')
    .select(['f_name', 'account_name', 'RSE', 'rse_type', 'f_size', 'activity'])
)

timestamp = int(time.time())

# A File locked by the user for two activities is accounted to both activities
# A File locked by two users for the same activity is accounted to both Users
user_aggreagated = (
    locks_with_activity
    .select(['f_name', 'f_size', 'RSE', 'rse_type', 'account_name', 'activity'])
    .distinct()
    .groupby(['RSE', 'rse_type', 'account_name', 'activity'])
    .agg(_round(_sum(col('f_size')) / tb_denominator, 5).alias('total_locked'))
    .join(df_accounts, ['account_name'], how='left')
    .withColumnRenamed('RSE', 'rse_name')
    .withColumn('timestamp', lit(timestamp))
    .select(['total_locked', 'rse_name', 'rse_type', 'account_name', 'account_type', 'activity', 'timestamp'])
    .cache()
)



In [61]:
# Number of aggregated rows produced by the query without data_tier.
user_aggreagated.count()

24/10/02 19:35:19 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /188.184.195.43:34778 is closed
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_134 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_30 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_98 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_124 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_198 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_238_126 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_327_27 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_3 !
24/10/02 19:35:19 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_229_199 !
24/1

24/10/02 19:36:02 ERROR YarnScheduler: Lost executor 124 on ithdp7007.cern.ch: Container from a bad node: container_e289_1722958100713_123141_01_000141 on host: ithdp7007.cern.ch. Exit status: 137. Diagnostics: [2024-10-02 19:36:02.629]Container killed on request. Exit code is 137
[2024-10-02 19:36:02.669]Container exited with a non-zero exit code 137. 
[2024-10-02 19:36:02.670]Killed by external signal
.
24/10/02 19:36:02 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 124 for reason Container from a bad node: container_e289_1722958100713_123141_01_000141 on host: ithdp7007.cern.ch. Exit status: 137. Diagnostics: [2024-10-02 19:36:02.629]Container killed on request. Exit code is 137
[2024-10-02 19:36:02.669]Container exited with a non-zero exit code 137. 
[2024-10-02 19:36:02.670]Killed by external signal
.
24/10/02 19:36:02 WARN TaskSetManager: Lost task 126.0 in stage 209.0 (TID 5846) (ithdp7007.cern.ch executor 124): ExecutorLostFailure (execut

1546

In [44]:
# Inspect the first 10 lock rows without truncating column values.
df_locks.show(10, False)

+--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------+------------+
|rse_id                          |f_name                                                                                                                                                                                                           |f_size    |r_id                            |account_name|
+--------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------+------------+
|5017683aea934c46b94bac226f086e53|/store/mc/RunIIFall18wmLHEGS/SUSYGluGluToBBHToBB_M-400_TuneC

In [28]:
# Count the rules whose name does not contain a '#'.
df_rules.filter(~col('r_name').contains('#')).count()

2363240

In [43]:
# Check the regex on real rule names: `newcol` holds capture group 3,
# i.e. the third '/'-separated segment of r_name.
(
    df_rules
    .withColumn('newcol', regexp_extract('r_name', r'^\/([\w-]+)\/([\w-]+)\/([\w-]+)(#[\w-]+)?', 3))
    .show(10, False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------+----+------------------+----------+-------------------+----------+
|r_name                                                                                                                                                                                                   |r_id                            |s_id|activity          |rule_state|rse_expression     |newcol    |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------+----+------------------+----------+-------------------+----------+
|/BulkGravTohhTohtatahbb_narrow_M-1800_13TeV-madgraph/RunIISummer16DR80Premix-PUMoriond17_8