In [1]:
# Echo the SparkSession object. It is not created anywhere in this notebook,
# so it is presumably pre-defined by the hosting environment — TODO confirm.
spark

In [2]:
# !hdfs dfs -stat /project/awg/cms/rucio/2023-07-24/

In [3]:
# check available files
!hdfs dfs -ls /project/awg/cms/rucio/2023-07-25

23/08/09 12:12:50 WARN ipc.Client: Exception encountered while connecting to the server 
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category READ is not supported in state standby. Visit https://s.apache.org/sbnn-error
	at org.apache.hadoop.security.SaslRpcClient.saslConnect(SaslRpcClient.java:376)
	at org.apache.hadoop.ipc.Client$Connection.setupSaslConnection(Client.java:622)
	at org.apache.hadoop.ipc.Client$Connection.access$2300(Client.java:413)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:822)
	at org.apache.hadoop.ipc.Client$Connection$2.run(Client.java:818)
	at java.base/java.security.AccessController.doPrivileged(Native Method)
	at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1845)
	at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:818)
	at org.apache.hadoop.ipc.Client$Connection.access$3800(Client.

In [1]:
import pickle
from datetime import datetime, timedelta

import click
import os
import pandas as pd
import pprint
import time
from dateutil.relativedelta import relativedelta
from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, collect_list, concat_ws, greatest, lit, lower, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    sum as _sum,
)

from pyspark.sql.types import (
    LongType,
)

#from CMSSpark.src.python.CMSSpark import schemas as cms_schemas

## Load datasets

In [2]:
# Day of the Rucio snapshot to read, formatted YYYY-MM-DD (defaults to today).
wa_date = datetime.now().strftime('%Y-%m-%d')
# wa_date = "2023-08-08"

# Every dump for that day lives under the same HDFS directory.
_rucio_day_dir = f'/project/awg/cms/rucio/{wa_date}'

HDFS_RUCIO_DATASET_LOCKS = f'{_rucio_day_dir}/dataset_locks/part*.avro'
# HDFS_RUCIO_LOCKS = f'{_rucio_day_dir}/locks'
HDFS_RUCIO_RSES = f'{_rucio_day_dir}/rses/part*.avro'
HDFS_RUCIO_RULES = f'{_rucio_day_dir}/rules'
# HDFS_RUCIO_RULES_HISTORY = f'{_rucio_day_dir}/rules_history'
# HDFS_RUCIO_REPLICAS = f'{_rucio_day_dir}/replicas'

In [3]:
# Dataset-level locks owned by the crab_tape_recall account.
# RULE_ID / RSE_ID are rewritten as lowercase hex strings so they can be
# compared with the ID columns of the rules / rses frames, which receive the
# same conversion when they are loaded.
rucio_dataset_locks = (
    spark.read.format('avro')
    .load(HDFS_RUCIO_DATASET_LOCKS)
    .withColumn('BYTES', col('BYTES').cast(LongType()))
    .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))
    .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))
    .filter("ACCOUNT IN ('crab_tape_recall')")
    .cache()
)
rucio_dataset_locks.createOrReplaceTempView("dataset_locks")

In [4]:
# RSE table; ID becomes a lowercase hex string so it lines up with
# dataset_locks.RSE_ID in the join further down.
rucio_rses = (
    spark.read.format('avro')
    .load(HDFS_RUCIO_RSES)
    .withColumn('ID', lower(_hex(col('ID'))))
)
rucio_rses.createOrReplaceTempView("rses")


In [5]:
# Rules table; ID becomes a lowercase hex string so it lines up with
# dataset_locks.RULE_ID in the join further down.
rucio_rules = (
    spark.read.format('avro')
    .load(HDFS_RUCIO_RULES)
    .withColumn('ID', lower(_hex(col('ID'))))
)
rucio_rules.createOrReplaceTempView("rules")
#spark.sql("SELECT * FROM rules").count()

23/08/09 12:37:11 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
# rucio_locks = spark.read.format('avro').load(HDFS_RUCIO_LOCKS)\
#     .withColumn('BYTES', col('BYTES').cast(LongType()))\
#     .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))\
#     .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))
# rucio_locks.createOrReplaceTempView("locks")

In [7]:
# rucio_rules_history = spark.read.format('avro').load(HDFS_RUCIO_RULES_HISTORY)\
#     .withColumn('ID', lower(_hex(col('ID'))))
#     #.persist(StorageLevel.DISK_ONLY)
# rucio_rules_history.createOrReplaceTempView("rules_history")
# #spark.sql("SELECT * FROM rules_history").count()

In [8]:
# rucio_replicas = spark.read.format('avro').load(HDFS_RUCIO_REPLICAS)\
#     .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))
# rucio_replicas.createOrReplaceTempView("replicas")
# #spark.sql("SELECT * FROM replicas").count()

## Query

In [9]:
# rucio_dataset_locks.count()

In [10]:
# rucio_dataset_locks.printSchema()
# rucio_rses.printSchema()
# rucio_rules.printSchema()

In [11]:
# rucio_dataset_locks = rucio_dataset_locks.select('')
# Trim both lookup tables down to the columns used by the join and cache them.
rucio_rses = rucio_rses.select(col('ID'), col('RSE'), col('RSE_TYPE')).cache()
rucio_rules = rucio_rules.select(
    col('ID'), col('ACCOUNT'), col('DID_TYPE'), col('EXPIRES_AT')
).cache()

In [12]:
# Attach RSE and rule attributes to every lock (inner joins on the hex ids),
# then discard the join keys together with ACCESSED_AT and ACCOUNT.
result_df = (
    rucio_dataset_locks
    .join(rucio_rses, rucio_dataset_locks["RSE_ID"] == rucio_rses["ID"])
    .join(rucio_rules, rucio_dataset_locks["RULE_ID"] == rucio_rules["ID"])
    .drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')
)

In [13]:
# result_df.show(100)

In [14]:
# result_df.printSchema()

In [15]:
# result_df.count()

In [16]:
# Pull the joined result onto the driver as a list of one-dict-per-row records.
docs = result_df.toPandas().to_dict(orient='records')

In [38]:
# Number of lock records collected from the join.
len(docs)

17770

In [18]:
# Replace the raw BYTES field by the size in TiB
# (1 TiB = 1024**4 = 1099511627776 bytes).
# The membership guard keeps this cell re-runnable: once BYTES has been
# converted and removed, running the cell again is a no-op instead of a KeyError.
for doc in docs:
    if "BYTES" in doc:
        doc['SIZE_TiB'] = doc.pop("BYTES") / 1099511627776

In [19]:
# Stamp every document with 00:00:00 of the snapshot day, in epoch
# milliseconds. The parsed datetime is naive, so .timestamp() interprets it in
# the local time zone of the machine running the notebook.
_snapshot_midnight = datetime.strptime(f"""{wa_date} 00:00:00""", "%Y-%m-%d %H:%M:%S")
TIME = _snapshot_midnight.timestamp() * 1000
for doc in docs:
    doc['TIMESTAMP'] = TIME

In [41]:
# Derive dataset-level fields from each lock NAME. In the sample records the
# name looks like '/<primary dataset>/<middle part>/<data tier>#<suffix>':
#   NAME_      -> everything before the first '#'
#   PriDataset -> first path component
#   DataTier   -> last path component
for doc in docs:
    full_name = doc['NAME']
    dataset_name = full_name.split('#')[0]
    doc['NAME_'] = dataset_name
    parts = dataset_name.split('/')
    if len(parts) != 4:
        print("YO HOO !!, something wrong.", full_name)
    # A name without any '/' yields a single component; record None rather
    # than aborting the whole cell with an IndexError after the warning.
    doc['PriDataset'] = parts[1] if len(parts) > 1 else None
    doc['DataTier'] = parts[-1]

In [42]:
# Peek at the first few enriched documents.
docs[:5]

[{'SCOPE': 'cms',
  'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd',
  'STATE': 'O',
  'LENGTH': '1',
  'UPDATED_AT': 1689164433000,
  'CREATED_AT': 1689096938000,
  'RSE': 'T2_UK_SGrid_RALPP',
  'RSE_TYPE': 'DISK',
  'DID_TYPE': 'C',
  'EXPIRES_AT': 1691719252000,
  'SIZE_TiB': 0.0003293267427579849,
  'TIMESTAMP': 1691532000000.0,
  'NAME_': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM',
  'PriDataset': 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',
  'DataTier': 'MINIAODSIM'},
 {'SCOPE': 'cms',
  'NAME': '/ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8/RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3/MINIAODSIM#4e06c095-6b19-46a1-a6a6-321e6692a086'

In [27]:
# Scratch check of the NAME layout. `test_str` was never assigned in the
# notebook (it only existed as leftover kernel state); the first document's
# NAME reproduces the output recorded for this cell.
test_str = docs[0]['NAME']
split_str = test_str.split('/')
split_str

['',
 'ZprimeToA0hToA0chichihbb_2HDM_MZp1700_MA0900_TuneCP2_13TeV_madgraph-pythia8',
 'RunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v3',
 'MINIAODSIM#c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']

In [29]:
# The last path component still carries the '#<suffix>' part; split it off.
split_str[3].split('#')

['MINIAODSIM', 'c7b37e2d-77d8-40b9-b8c9-cdf7658406bd']

In [23]:
import osearch

In [24]:
def get_index_schema():
    """Return the OpenSearch index definition for the tape-recall lock documents.

    The mapping lists one entry per key present in ``docs``: identifiers and
    categorical fields are ``keyword``, the ``*_AT`` fields and ``TIMESTAMP``
    are ``date`` values in epoch milliseconds, and ``SIZE_TiB`` is a floating
    point number.

    Returns:
        dict: a fresh ``{"settings": ..., "mappings": ...}`` dictionary.
    """
    return {
        "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
        "mappings": {
            "properties": {
                'SCOPE': {"ignore_above": 2048, "type": "keyword"},
                'NAME': {"ignore_above": 2048, "type": "keyword"},
                'STATE': {"ignore_above": 1024, "type": "keyword"},
                'LENGTH': {"ignore_above": 1024, "type": "keyword"},
                # SIZE_TiB is BYTES / 1024**4, i.e. usually a small fraction
                # (0.0003... in the sample records). An integer type cannot
                # represent that, so it is mapped as a double.
                # NOTE: a mapping only applies when an index is created; an
                # index that already exists keeps its previous field type.
                'SIZE_TiB': {"type": "double"},
                'UPDATED_AT': {"format": "epoch_millis", "type": "date"},
                'CREATED_AT': {"format": "epoch_millis", "type": "date"},
                'RSE': {"ignore_above": 2048, "type": "keyword"},
                'RSE_TYPE': {"ignore_above": 2048, "type": "keyword"},
                'DID_TYPE': {"ignore_above": 1024, "type": "keyword"},
                'EXPIRES_AT': {"format": "epoch_millis", "type": "date"},
                'TIMESTAMP': {"format": "epoch_millis", "type": "date"},
                'NAME_': {"ignore_above": 2048, "type": "keyword"},
                'PriDataset': {"ignore_above": 2048, "type": "keyword"},
                'DataTier': {"ignore_above": 2048, "type": "keyword"},
            }
        }
    }

In [1]:
# _index_template = 'crab-tape-recall-rules-ekong'
# client = osearch.get_es_client("es-cms1.cern.ch/es", 'secret_opensearch.txt', get_index_schema())
# # index_mod="": 'test-foo', index_mod="Y": 'test-foo-YYYY', index_mod="M": 'test-foo-YYYY-MM', index_mod="D": 'test-foo-YYYY-MM-DD',
# idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod="M")
# client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)

In [1]:
from datetime import datetime, timedelta
import os
import pandas as pd
import time
from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, collect_list, concat_ws, greatest, lit, lower, when,
    avg as _avg,
    count as _count,
    hex as _hex,
    max as _max,
    min as _min,
    round as _round,
    sum as _sum,
)

from pyspark.sql.types import (
    LongType,
)

import numpy as np
import osearch
from pyspark.sql import SparkSession

## Multiple Day Upload

In [2]:
def multi_upload(start_date, end_date):
    """Upload crab_tape_recall dataset locks to OpenSearch, one snapshot per day.

    Both bounds are shifted forward by one day to obtain the date of the HDFS
    snapshot folder, then every day in the half-open range
    [start_date + 1 day, end_date + 1 day) is processed:

    1. load that day's dataset_locks, rses and rules Avro dumps,
    2. join the locks with their RSE and rule,
    3. convert the rows to dictionaries and add TIMESTAMP, SIZE_TiB, NAME_,
       PriDataset and DataTier,
    4. send the documents to the monthly 'crab-tape-recall-rules-ekong' index.

    Uses the notebook-global ``spark`` session and the ``osearch`` module.

    Args:
        start_date: datetime of the first day to upload.
        end_date: datetime of the (exclusive) last day to upload.

    Returns:
        None. The snapshot date and a summary are printed for every day.
    """
    # change to the date of collected data
    first_snapshot = start_date + timedelta(days=1)
    last_snapshot = end_date + timedelta(days=1)

    for day_offset in range((last_snapshot - first_snapshot).days):
        wa_date = str(first_snapshot + timedelta(days=day_offset))[:10]
        print(wa_date)

        # Import data into database form
        HDFS_RUCIO_DATASET_LOCKS = f'/project/awg/cms/rucio/{wa_date}/dataset_locks/part*.avro'
        HDFS_RUCIO_RSES =          f'/project/awg/cms/rucio/{wa_date}/rses/part*.avro'
        HDFS_RUCIO_RULES =         f'/project/awg/cms/rucio/{wa_date}/rules'

        rucio_dataset_locks = (
            spark.read.format('avro').load(HDFS_RUCIO_DATASET_LOCKS)
            .withColumn('BYTES', col('BYTES').cast(LongType()))
            .withColumn('RULE_ID', lower(_hex(col('RULE_ID'))))
            .withColumn('RSE_ID', lower(_hex(col('RSE_ID'))))
            .filter(f"""ACCOUNT IN ('crab_tape_recall')""")
            .cache()
        )
        rucio_dataset_locks.createOrReplaceTempView("dataset_locks")

        rucio_rses = spark.read.format('avro').load(HDFS_RUCIO_RSES)\
            .withColumn('ID', lower(_hex(col('ID'))))
        rucio_rses.createOrReplaceTempView("rses")

        rucio_rules = spark.read.format('avro').load(HDFS_RUCIO_RULES)\
            .withColumn('ID', lower(_hex(col('ID'))))
        rucio_rules.createOrReplaceTempView("rules")

        # filter and query
        rucio_rses = rucio_rses.select('ID', 'RSE', 'RSE_TYPE').cache()
        rucio_rules = rucio_rules.select('ID', 'ACCOUNT', 'DID_TYPE', 'EXPIRES_AT').cache()

        try:
            result_df = (
                rucio_dataset_locks
                .join(rucio_rses, rucio_rses["ID"] == rucio_dataset_locks["RSE_ID"])
                .join(rucio_rules, rucio_rules["ID"] == rucio_dataset_locks["RULE_ID"])
                .drop('ID', 'RULE_ID', 'RSE_ID', 'ACCESSED_AT', 'ACCOUNT')
            )

            # Convert database to dictionary
            docs = result_df.toPandas().to_dict('records')
        finally:
            # The rows now live on the driver; release this day's cached
            # frames so cached data does not accumulate across the loop.
            for cached_df in (rucio_dataset_locks, rucio_rses, rucio_rules):
                cached_df.unpersist()

        # Add TIMESTAMP column, convert to TiB and break down the name.
        # TIME is 00:00:00 of the snapshot day (naive, hence local time) in ms.
        TIME = datetime.strptime(f"""{wa_date} 00:00:00""", "%Y-%m-%d %H:%M:%S").timestamp()*1000
        for doc in docs:
            _enrich_tape_recall_doc(doc, TIME)

        # Send data to Opensearch
        # NOTE(review): the index is chosen from the upload time (time.time()),
        # not from the snapshot date — confirm this is intended for backfills.
        _index_template = 'crab-tape-recall-rules-ekong'
        client = osearch.get_es_client("es-cms1.cern.ch/es", 'secret_opensearch.txt', _tape_recall_index_schema())
        idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod="M")
        no_of_fail_saved = client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)

        print("========================================================================", "FINISHED : ", len(docs), "ROWS ARE SENT", no_of_fail_saved, "ROWS ARE FAILED", "========================================================================", sep='\n')


def _enrich_tape_recall_doc(doc, timestamp_ms):
    """Add the derived fields to one lock record, in place.

    Sets TIMESTAMP, replaces BYTES by SIZE_TiB (BYTES / 1024**4) and derives
    NAME_ (NAME up to the first '#'), PriDataset (first path component) and
    DataTier (last path component). Returns the same dict.
    """
    doc['TIMESTAMP'] = timestamp_ms
    doc['SIZE_TiB'] = doc.pop("BYTES") / 1099511627776

    full_name = doc['NAME']
    dataset_name = full_name.split('#')[0]
    doc['NAME_'] = dataset_name
    parts = dataset_name.split('/')
    if len(parts) != 4:
        print("YO HOO !!, something wrong.", full_name)
    # A name without any '/' has a single component; store None instead of
    # raising IndexError and aborting the whole upload.
    doc['PriDataset'] = parts[1] if len(parts) > 1 else None
    doc['DataTier'] = parts[-1]
    return doc


def _tape_recall_index_schema():
    """Return a fresh index definition matching the documents that are sent.

    The documents carry SIZE_TiB (a float) and no BYTES field, so the mapping
    declares SIZE_TiB as a double rather than the removed BYTES.
    """
    return {
        "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
        "mappings": {
            "properties": {
                'SCOPE': {"ignore_above": 2048, "type": "keyword"},
                'NAME': {"ignore_above": 2048, "type": "keyword"},
                'STATE': {"ignore_above": 1024, "type": "keyword"},
                'LENGTH': {"ignore_above": 1024, "type": "keyword"},
                'SIZE_TiB': {"type": "double"},
                'UPDATED_AT': {"format": "epoch_millis", "type": "date"},
                'CREATED_AT': {"format": "epoch_millis", "type": "date"},
                'RSE': {"ignore_above": 2048, "type": "keyword"},
                'RSE_TYPE': {"ignore_above": 2048, "type": "keyword"},
                'DID_TYPE': {"ignore_above": 1024, "type": "keyword"},
                'EXPIRES_AT': {"format": "epoch_millis", "type": "date"},
                'TIMESTAMP': {"format": "epoch_millis", "type": "date"},
                'NAME_': {"ignore_above": 2048, "type": "keyword"},
                'PriDataset': {"ignore_above": 2048, "type": "keyword"},
                'DataTier': {"ignore_above": 2048, "type": "keyword"},
            }
        }
    }


In [5]:
# Upload each day from start_date up to, but not including, end_date.
start_date = datetime(year=2023, month=7, day=23)
end_date = datetime(year=2023, month=7, day=24)

multi_upload(start_date=start_date, end_date=end_date)

2023-07-23




FINISHED : 
40190
ROWS ARE SENT
0
ROWS ARE FAILED


