In [1]:
from pyarrow import parquet as pq
from matplotlib import pyplot as plt
from datetime import datetime as dt
import pandas as pd
import numpy as np

## Data

In [2]:
# Load the XRootD parquet dataset from disk
dataset = pq.ParquetDataset("parquet/XRootD_06-23to29-2019")
table = dataset.read()

# Materialize the Arrow table as a pandas DataFrame
df = table.to_pandas()

# Derived columns.
# The last '/'-separated segment of app_info is split on ':'; its first two
# pieces make up the front of workflow_id and the remaining pieces the back.
_app_tail_parts = df["app_info"].str.split('/').str[-1].str.split(':')
workflow_idFront = _app_tail_parts.str[:2].str.join('_')  # Front half of workflow_id
workflow_idBack = _app_tail_parts.str[2:].str.join('_')  # Back half of workflow_id
df["workflow_id"] = workflow_idFront.map(str) + ":" + workflow_idBack
# crab_id is whatever precedes the first '_' in app_info
df["crab_id"] = df["app_info"].str.split('_').str[0]
df["job_id"] = df["crab_id"].map(str) + "/" + df["workflow_id"]
# start_time is interpreted as milliseconds since the epoch
df["start_datetime"] = pd.to_datetime(df["start_time"], unit="ms")

# One row per distinct (file_name, file_size) pair seen in "read" operations
is_read = df["operation"] == "read"
file_size_lookup = df.loc[is_read, ["file_name", "file_size"]].drop_duplicates()

In [3]:
# Show the column index of df, including the derived columns added above
df.columns

Index(['operation', 'app_info', 'file_name', 'file_size', 'server_host',
       'client_host', 'client_domain', 'start_time', 'read_bytes',
       'workflow_id', 'crab_id', 'job_id', 'start_datetime'],
      dtype='object')

## Definitions
$J \rightarrow$ a job
<br/>
$f \rightarrow$ a file
<br/><br/>
$S(f) =$ size of file $f$
<br/>
$B(f) =$ bytes read from file $f$
<br/>
$N_J(f) =$ number of jobs that read from file $f$
<br/>
$\mathcal{N}_f =$ number of _unique_ files

## Working Set
$$w = \sum_{\mathcal{N}_f} S(f_i)$$

In [4]:
# Working set: count each distinct (file_name, file_size) pair once, restricted
# to "read" operations, then scale by 1e12 for the TB figure printed below.
read_records = df[df["operation"] == "read"]
unique_read_files = read_records.drop_duplicates(["file_name", "file_size"])
working_set = unique_read_files["file_size"].sum() / 1e12
print("Working Set: {0:.2f} TB".format(working_set))

Working Set: 152.96 TB


## Total Naive Reads
$$r_{naive} = \sum_i S(f_i)$$

In [5]:
# Naive reads: add up file_size over every record in df (no de-duplication),
# scaled by 1e12 for the TB figure printed below.
naive_size_sum = df["file_size"].sum()
total_naive_reads = naive_size_sum / 1e12
print("Total Naive Reads: {0:.2f} TB".format(total_naive_reads))

Total Naive Reads: 771.07 TB


## Total Actual Reads
$$r_{actual} = \sum_i B(f_i)$$

In [6]:
# Actual reads: add up read_bytes over every record in df,
# scaled by 1e12 for the TB figure printed below.
read_bytes_sum = df["read_bytes"].sum()
total_actual_reads = read_bytes_sum / 1e12
print("Total Actual Reads: {0:.2f} TB".format(total_actual_reads))

Total Actual Reads: 103.19 TB


## Reuse Multiplier

In [15]:
# Group all records by file name; the reuse-multiplier cells below aggregate per file
df_by_file = df.groupby(by="file_name")

#### Definition 1:
$$\mathcal{R}_1 = \frac{\sum_{\mathcal{N}_f} N_{J}(f_i)}{\mathcal{N}_f}$$

In [23]:
# Reuse multiplier, definition 1: mean number of distinct app_info values
# (used here as the job identifier) per unique file name.
jobs_per_file = df_by_file["app_info"].nunique()
rmult_numer_1 = jobs_per_file.sum()
rmult_denom_1 = df["file_name"].nunique()

rmult_1 = rmult_numer_1 / rmult_denom_1
print("Reuse Multplier 1: {0:.2f}".format(rmult_1))

Reuse Multplier 1: 2.47


#### Definition 2:
$$\mathcal{R}_2 = \frac{\sum_{\mathcal{N}_f} N_{J}(f_i) \times S(f_i)}{\sum_{\mathcal{N}_f} S(f_i)}$$

In [22]:
# Reuse multiplier, definition 2: size-weighted mean of the number of distinct
# app_info values (used here as the job identifier) per file.
jobs_per_file = df_by_file.app_info.nunique()
# One size per file: the first non-null file_size recorded for each file_name.
size_per_file = df_by_file.file_size.first()

rmult_numer_2 = (jobs_per_file*size_per_file).sum()
# Sum the sizes of unique *files*. Summing df["file_size"].unique() would add
# each distinct size value only once, so different files that happen to share
# a size would collapse into a single term and shrink the denominator.
rmult_denom_2 = size_per_file.sum()

rmult_2 = rmult_numer_2/rmult_denom_2
print("Reuse Multplier 2: {0:.2f}".format(rmult_2))

Reuse Multplier 2: 2.58


#### Definition 3:
$$\mathcal{R}_3 = \frac{\sum_{\mathcal{N}_f} N_{J}(f_i) \times B(f_i)}{\sum_{\mathcal{N}_f} S(f_i)}$$

In [25]:
# Reuse multiplier, definition 3: per file, (distinct app_info count) times
# (total read_bytes), summed over files and divided by the total size of the
# unique files.
jobs_per_file = df_by_file.app_info.nunique()
bytes_per_file = df_by_file.read_bytes.sum()
rmult_numer_3 = (jobs_per_file*bytes_per_file).sum()
# Compute the denominator here rather than borrowing it from another cell:
# take one size per file (first non-null file_size for each file_name) and sum
# over files, so files that share a size are each counted and this cell does
# not depend on the execution order of the definition-2 cell.
rmult_denom_3 = df_by_file.file_size.first().sum()

rmult_3 = rmult_numer_3/rmult_denom_3
print("Reuse Multplier 3: {0:.2f}".format(rmult_3))

Reuse Multplier 3: 1.53
