In [1]:
spark

In [2]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import time
# from utils import (
#     _to_dict,
#     _donut,
#     _pie,
#     _line_graph,
#     _other_fields,
#     _exitcode_info,
#     _better_label
# )
from datetime import datetime, date, timedelta
from pyspark.sql.functions import (
    col,
    lit,
    when,
    sum as _sum,
    count as _count,
    first,
    date_format,
    from_unixtime
)
import numpy as np
import pandas as pd
from pyspark.sql.types import (
    StructType,
    LongType,
    StringType,
    StructField,
    DoubleType,
    IntegerType,
)
# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")


### Prepare condor file name/configuration

In [3]:
def _get_schema():
    return StructType(
        [
            StructField(
                "data",
                StructType(
                    [
                        StructField("RecordTime", LongType(), nullable=False),
                        StructField("CMSPrimaryDataTier", StringType(), nullable=True),
                        StructField("Status", StringType(), nullable=True),
                        StructField("WallClockHr", DoubleType(), nullable=True),
                        StructField("CoreHr", DoubleType(), nullable=True),
                        StructField("CpuTimeHr", DoubleType(), nullable=True),
                        StructField("Type", StringType(), nullable=True),
                        StructField("CRAB_DataBlock", StringType(), nullable=True),
                        StructField("GlobalJobId", StringType(), nullable=False),
                        StructField("ExitCode", LongType(), nullable=True),
                        StructField("CRAB_Workflow", StringType(), nullable=True),
                        StructField("CommittedCoreHr", StringType(), nullable=True),
                        StructField("CommittedWallClockHr", StringType(), nullable=True),
                    ]
                ),
            ),
        ]
    )

In [4]:
_DEFAULT_HDFS_FOLDER = "/project/monitoring/archive/condor/raw/metric"

In [5]:
# # Check available files 
# !hdfs dfs -ls /project/monitoring/archive/condor/raw/metric/2023/07/08

In [6]:
def get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER):
    st_date = start_date - timedelta(days=0)
    ed_date = end_date + timedelta(days=0)
    days = (ed_date - st_date).days
    pre_candidate_files = [
        "{base}/{day}{{,.tmp}}".format(
            base=base, day=(st_date + timedelta(days=i)).strftime("%Y/%m/%d")
        )
        for i in range(0, days)
    ]
    sc = spark.sparkContext
    
    candidate_files = [
        f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}"
        for i in range(0, days)
    ]
    FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    URI = sc._gateway.jvm.java.net.URI
    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    fs = FileSystem.get(URI("hdfs:///"), sc._jsc.hadoopConfiguration())
    # FIXME
    candidate_files = [url for url in candidate_files if fs.globStatus(Path(url))]
    print("No. of Consisted files:", len(candidate_files))
    return candidate_files

#     all_candidate_files = []
#     candidate_files = [
#         f"{base}/{(st_date + timedelta(days=i)).strftime('%Y/%m/%d')}"
#         for i in range(0, days)
#     ]
    
#     URI           = sc._gateway.jvm.java.net.URI
#     Path          = sc._gateway.jvm.org.apache.hadoop.fs.Path
#     FileSystem    = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
#     Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration
#     fs = FileSystem.get(URI("hdfs:///"), Configuration())

#     for fileNames in candidate_files:
#         status = fs.listStatus(Path(fileNames))
#         candidate_files_day_i = [
#             str(fileStatus.getPath()).replace('hdfs://analytix', '')
#             for fileStatus in status
#         ]
#         all_candidate_files.extend(candidate_files_day_i)
#     print("Files Directory:", candidate_files, "\nNo. of Consisted files:", len(all_candidate_files))
#     return all_candidate_files

def group_files(files, n=16):
    # Yield successive n-sized
    # chunks from files
    all_group = []
    for i in range(0, len(files), n):
        all_group.append(files[i:i+n])
    print("There are", len(all_group), "chunks of files")
    return all_group

## load dataset

In [7]:
schema = _get_schema()
start_date = datetime(2023, 8, 10)
end_date = datetime(2023, 8, 11)

In [8]:
candidate_files = get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
candidate_files

No. of Consisted files: 1


['/project/monitoring/archive/condor/raw/metric/2023/08/10']

### Prepare CRAB data file name

In [9]:
TODAY = str(end_date)[:10]
wa_date = TODAY
HDFS_CRAB_part = f'/project/awg/cms/crab/tasks/{wa_date}/'

### Get raw data from condor raw

In [10]:
spark.conf.set("spark.sql.session.timeZone", "UTC")

crab_df = spark.read.format('avro').load(HDFS_CRAB_part)
crab_df = crab_df.select('TM_TASKNAME', 'TM_IGNORE_LOCALITY')

In [25]:
condor_df = (
        spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
        .json(
            candidate_files,
            schema=schema,
        ).select("data.*")
        .filter(
            f"""Status IN ('Completed')
            AND Type IN ('analysis')
            AND RecordTime >= {start_date.timestamp() * 1000}
            AND RecordTime < {end_date.timestamp() * 1000}
            """
        )
        .drop_duplicates(["GlobalJobId"]).cache()
    ) 
condor_df.write.mode('overwrite').parquet("/cms/users/eatthaph/condor_vir_data" ,compression='zstd')
condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')
# condor_df.count()

23/08/16 13:48:02 WARN CacheManager: Asked to cache already cached data.


In [27]:
result_df = condor_df.join(crab_df, crab_df["TM_TASKNAME"] == condor_df["CRAB_Workflow"])\
    .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'
            , "CRAB_DataBlock", "TM_IGNORE_LOCALITY", "GlobalJobId", "CommittedCoreHr", "CommittedWallClockHr")
docs = result_df.toPandas()

In [None]:
len(docs)

In [19]:
# def spark_exec(candidate_files):
#     condor_df = (
#             spark.read.option("basePath", _DEFAULT_HDFS_FOLDER)
#             .json(
#                 candidate_files,
#                 schema=schema,
#             ).select("data.*")
#             .filter(
#                 f"""Status IN ('Completed')
#                 AND Type IN ('analysis')
#                 AND RecordTime >= {start_date.timestamp() * 1000}
#                 AND RecordTime < {end_date.timestamp() * 1000}
#                 """
#             )
#             .drop_duplicates(["GlobalJobId"]).cache()
#         ) 
#     condor_df.write.mode('overwrite').parquet("/cms/users/eatthaph/condor_vir_data" ,compression='zstd')
#     condor_df = spark.read.format('parquet').load('/cms/users/eatthaph/condor_vir_data')
#     result_df = condor_df.join(crab_df, crab_df["TM_TASKNAME"] == condor_df["CRAB_Workflow"])\
#         .select('RecordTime', 'CMSPrimaryDataTier', 'WallClockHr', 'CoreHr', 'CpuTimeHr', 'ExitCode'
#                 , "CRAB_DataBlock", "TM_IGNORE_LOCALITY", "GlobalJobId", "CommittedCoreHr", "CommittedWallClockHr")
#     sub_docs = result_df.toPandas()
#     return sub_docs

# def loop_excute(candidate_files, initial_n=len(candidate_files)):
#     r = 0
#     n = initial_n
#     df_list = []
#     file_chunk = group_files(candidate_files, n)
#     while len(file_chunk)!=0 and r<10:
#         print("=================================\n round :", r+1, "\n=================================")
#         df_err_list = []
#         for i, chunk in enumerate(file_chunk):
#             print("=================================\n", i+1, "out of", len(file_chunk), "\n=================================")
#             try:
#                 df_list.append(spark_exec(chunk))
#             except Exception as ex:
#                 print("=====", ex)
#                 df_err_list.extend(chunk)
# #         if n != 1:
# #             n = n//2
#         file_chunk = group_files(df_err_list, n)
#         r += 1
#         print("")
#     print("Fail excuted files :", df_err_list)
#     return df_list

In [20]:
# useful_df = loop_excute(candidate_files)
# df_list = spark_exec(candidate_files)

23/08/15 18:56:45 WARN TaskSetManager: Lost task 39.0 in stage 3.0 (TID 1044) (ithdp6017.cern.ch executor 53): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=1, mapIndex=7, mapId=911, reduceId=39, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Com

23/08/15 18:56:45 WARN TaskSetManager: Lost task 19.0 in stage 3.0 (TID 1024) (ithdp2113.cern.ch executor 44): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=1, mapIndex=7, mapId=911, reduceId=19, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Com

23/08/15 18:56:45 WARN TaskSetManager: Lost task 45.0 in stage 3.0 (TID 1050) (ithdp2102.cern.ch executor 50): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=1, mapIndex=7, mapId=911, reduceId=45, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Com

23/08/15 18:56:45 WARN TaskSetManager: Lost task 9.0 in stage 3.0 (TID 1014) (ithdp2120.cern.ch executor 37): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=1, mapIndex=7, mapId=911, reduceId=9, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Compl

23/08/15 18:57:36 WARN TaskSetManager: Lost task 79.0 in stage 3.1 (TID 1190) (ithdp3102.cern.ch executor 48): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=1, mapIndex=7, mapId=1108, reduceId=83, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Co

23/08/15 18:58:28 WARN TaskSetManager: Lost task 31.0 in stage 3.2 (TID 1241) (ithdp5003.cern.ch executor 41): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=1, mapIndex=7, mapId=1207, reduceId=35, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Co

23/08/15 18:59:31 WARN TaskSetManager: Lost task 8.0 in stage 9.0 (TID 1674) (ithdp6016.cern.ch executor 35): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=5, mapId=1517, reduceId=11, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Com

23/08/15 18:59:32 WARN TaskSetManager: Lost task 32.0 in stage 9.0 (TID 1695) (ithdp3102.cern.ch executor 49): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=0, mapId=1495, reduceId=47, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1183)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Co

23/08/15 18:59:32 WARN TaskSetManager: Lost task 50.0 in stage 9.0 (TID 1712) (ithdp2122.cern.ch executor 55): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=0, mapId=1495, reduceId=74, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1183)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Co

23/08/15 18:59:32 WARN TaskSetManager: Lost task 92.0 in stage 9.0 (TID 1752) (ithdp2105.cern.ch executor 47): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=0, mapId=1495, reduceId=137, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1183)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.C

23/08/15 18:59:33 WARN TaskSetManager: Lost task 7.0 in stage 9.1 (TID 1784) (ithdp3102.cern.ch executor 49): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=5, mapId=1763, reduceId=11, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Com

23/08/15 18:59:33 WARN TaskSetManager: Lost task 44.0 in stage 9.1 (TID 1770) (ithdp2102.cern.ch executor 50): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=0, mapId=1762, reduceId=69, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1183)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.Co

23/08/15 18:59:33 WARN TaskSetManager: Lost task 92.0 in stage 9.2 (TID 1976) (ithdp6017.cern.ch executor 53): FetchFailed(BlockManagerId(32, ithdp2115.cern.ch, 7337, None), shuffleId=2, mapIndex=0, mapId=1881, reduceId=147, message=
org.apache.spark.shuffle.FetchFailedException
	at org.apache.spark.errors.SparkCoreErrors$.fetchFailedError(SparkCoreErrors.scala:312)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:1180)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:918)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:85)
	at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.util.C

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
docs = docs.to_dict('records')

In [16]:
for i in range(len(docs)):
    if docs[i]['CRAB_DataBlock'] == 'MCFakeBlock':
        docs[i]['CRAB_Type'] = 'PrivateMC'
    else:
        docs[i]['CRAB_Type'] = 'Analysis'

In [17]:
docs[:5]

[{'RecordTime': 1692101192000,
  'CMSPrimaryDataTier': 'MINIAODSIM',
  'WallClockHr': 0.12361111111111112,
  'CoreHr': 0.12361111111111112,
  'CpuTimeHr': 0.0022222222222222222,
  'ExitCode': 8020,
  'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',
  'TM_IGNORE_LOCALITY': 'F',
  'GlobalJobId': 'crab3@vocms0155.cern.ch#98631116.0#1692100543',
  'CommittedCoreHr': '0.12361111111111112',
  'CommittedWallClockHr': '0.12361111111111112',
  'CRAB_Type': 'Analysis'},
 {'RecordTime': 1692099933000,
  'CMSPrimaryDataTier': 'MINIAODSIM',
  'WallClockHr': 0.12166666666666667,
  'CoreHr': 0.12166666666666667,
  'CpuTimeHr': 0.004722222222222222,
  'ExitCode': 8020,
  'CRAB_DataBlock': '/WWTo4Q_4f_TuneCP5_13TeV-amcatnloFXFX-pythia8/RunIISummer20UL18MiniAODv2-106X_upgrade2018_realistic_v16_L1v1-v3/MINIAODSIM#eb5a0cbd-6c43-492c-ac21-4318775aee3b',
  'TM_IGNORE_LOCALITY':

In [18]:
import osearch

In [19]:
def get_index_schema():
    return {
        "settings": {"index": {"number_of_shards": "1", "number_of_replicas": "1"}},
        "mappings": {
            "properties": {
                "RecordTime": {"format": "epoch_millis", "type": "date"},
                "CMSPrimaryDataTier": {"ignore_above": 2048, "type": "keyword"},
                "GlobalJobId": {"ignore_above": 2048, "type": "keyword"},
                "WallClockHr": {"type": "long"},
                "CoreHr": {"type": "long"},
                "CpuTimeHr": {"type": "long"},
                "ExitCode": {"ignore_above": 2048, "type": "keyword"},
                "TM_IGNORE_LOCALITY": {"ignore_above": 2048, "type": "keyword"},
                "CRAB_Type": {"ignore_above": 2048, "type": "keyword"},
                "CRAB_DataBlock": {"ignore_above": 2048, "type": "keyword"},
                "CommittedCoreHr": {"type": "long"}, 
                "CommittedWallClockHr": {"type": "long"},
            }
        }
    }

In [20]:
_index_template = 'crab-condor-ekong'
client = osearch.get_es_client("es-cms1.cern.ch/es", 'secret_opensearch.txt', get_index_schema())
# index_mod="": 'test-foo', index_mod="Y": 'test-foo-YYYY', index_mod="M": 'test-foo-YYYY-MM', index_mod="D": 'test-foo-YYYY-MM-DD',
idx = client.get_or_create_index(timestamp=time.time(), index_template=_index_template, index_mod="M")
client.send(idx, docs, metadata=None, batch_size=10000, drop_nulls=False)



