In [1]:
from pyarrow import parquet as pq
from matplotlib import pyplot as plt
from datetime import datetime as dt
import pandas as pd
import numpy as np

## Data

In [2]:
# Get parquet files
dataset = pq.ParquetDataset("parquet/ClassAds_06-23to29-2019")
table = dataset.read()

# Convert to pandas dataframe
df = table.to_pandas()
# Make some columns
df["workflow_id"] = df.workflow_id.map(str)+"_"+df.retries.astype(str)
df["job_id"] = df.crab_id.map(str)+"/"+df.workflow_id
df["start_datetime"] = pd.to_datetime(df.start_time, unit="ms")

In [3]:
df.columns

Index(['site', 'dataset', 'workflow_id', 'crab_id', 'retries', 'start_time',
       'schedd_name', 'primaryDataset_primary', 'primaryDataset_processed',
       'primaryDataTier', 'task', 'user_hn', 'walltime', 'cpuTime', 'exitCode',
       'cpus', 'read_bytes', 'read_ops', 'read_segs', 'read_time', 'read_vops',
       'write_bytes', 'write_time', 'job_id', 'start_datetime'],
      dtype='object')

## Definitions
$J \rightarrow$ a job
<br/><br/>
$W(J) =$ total walltime for a job $J$
<br/>
$C(J) =$ total CPU time for a job $J$

## Number of Unique Jobs

In [23]:
N_jobs = df.job_id.nunique()
print("Number of Unique Jobs: {}".format(N_jobs))

Number of Unique Jobs: 118669


## Number of Unique Users

In [6]:
N_unique_users = df.user_hn.nunique()
print("Number of Unique Users: {}".format(N_unique_users))

Number of Unique Users: 95


## Total Walltime
$$W_{tot} = \sum_i W(J_i)$$

In [11]:
total_walltime = df.walltime.sum()
print("Total Walltime: {0:.2f}".format(total_walltime))

Total Walltime: 376396.30


## Total CPU Time
$$C_{tot} = \sum_i C(J_i)$$

In [14]:
total_cpu_time = df.cpuTime.sum()
print("Total CPU Time: {0:.2f}".format(total_cpu_time))

Total CPU Time: 299695.74


## CPU Efficiency
$$\epsilon = \frac{C_{tot}}{\sum_i W(J_i) \times N_{cores}(J_i)}$$

In [37]:
cpu_eff_numer = (df.walltime*df.cpus).sum()
cpu_eff_denom = (df.cpuTime).sum()

cpu_eff = cpu_eff_numer/cpu_eff_denom
print("CPU Efficiency: {0:.2f}".format(cpu_eff))

CPU Efficiency: 2.39


## Fraction of Jobs with Exit Code 0

In [22]:
exit_code_frac = np.sum(df.exitCode == 0)/df.shape[0]
print("Fraction of Jobs with Exit Code 0: {0:.2f}".format(exit_code_frac))

Fraction of Jobs with Exit Code 0: 0.83
