In [29]:
from pyarrow import parquet as pq
from matplotlib import pyplot as plt
from datetime import datetime as dt
import pandas as pd
import numpy as np

## Data

In [30]:
# Directory holding the parquet files to analyse; edit this to point the
# notebook at a different dataset.
PARQUET_PATH = "parquet/XRootD_06-23to29-2019"

# Get parquet files
dataset = pq.ParquetDataset(PARQUET_PATH)
table = dataset.read()

# Convert to pandas dataframe
df = table.to_pandas()

# Make some columns
# Parse app_info once: keep the text after the last '/' and split it on ':'.
# Both halves of workflow_id are built from these same pieces, so the row-wise
# string splitting is not repeated for each half.
app_info_parts = df.app_info.str.split('/').str[-1].str.split(':')
workflow_idFront = app_info_parts.str[:2].str.join('_')  # Front half of workflow_id
workflow_idBack = app_info_parts.str[2:].str.join('_')   # Back half of workflow_id
df["workflow_id"] = workflow_idFront.map(str)+":"+workflow_idBack
# crab_id is the part of app_info before its first '_'
df["crab_id"] = df.app_info.str.split('_').str[0]
df["job_id"] = df.crab_id.map(str)+"/"+df.workflow_id
# start_time is interpreted as milliseconds since the Unix epoch
df["start_datetime"] = pd.to_datetime(df.start_time, unit="ms")

# One row per distinct (file_name, file_size) pair seen among the read operations
file_size_lookup = df[df.operation == "read"].drop_duplicates(["file_name","file_size"])[["file_name", "file_size"]]

In [31]:
# Show the column labels of df, including the columns derived in the data cell
df.columns

Index(['operation', 'app_info', 'file_name', 'file_size', 'server_host',
       'client_host', 'client_domain', 'start_time', 'read_bytes',
       'workflow_id', 'crab_id', 'job_id', 'start_datetime'],
      dtype='object')

## Definitions
$J \rightarrow$ a job
<br/>
$f \rightarrow$ a file
<br/><br/>
$S(f) =$ size of file $f$
<br/>
$B(f) =$ bytes read from file $f$
<br/>
$\mathcal{N}_J(f) =$ number of unique jobs that read from file $f$
<br/>
$\mathcal{N}_f =$ number of _unique_ files

## Working Set
$$w = \sum_{i=1}^{\mathcal{N}_f} S(f_i)$$

In [32]:
# Total size of every file that was read at least once.
# file_size_lookup (built in the data cell) already holds one row per distinct
# (file_name, file_size) pair among the read operations, so it is reused here
# instead of repeating the same filter + de-duplication.
# NOTE(review): a file_name recorded with more than one file_size contributes
# once per distinct size — confirm that this is acceptable.
# Dividing by 1e12 reports the total in TB (assuming file_size is in bytes).
working_set = file_size_lookup.file_size.sum()/1e12
print("Working Set: {0:.2f} TB".format(working_set))

Working Set: 152.96 TB


## Total Naive Reads
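$$r_{naive} = \sum_i S(f_i)$$
where $i$ runs over every access record (not over unique files), so a file that appears in $k$ records contributes $k \cdot S(f)$.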

In [33]:
# Naive total: add up file_size over every row of df, so each recorded access
# is charged the full size of the file it touched.
# 1e12 converts the sum to TB (assuming file_size is in bytes).
naive_bytes = df["file_size"].sum()
total_naive_reads = naive_bytes / 1e12
print("Total Naive Reads: {0:.2f} TB".format(total_naive_reads))

Total Naive Reads: 771.07 TB


## Total Actual Reads
$$r_{actual} = \sum_i B(f_i)$$
where $i$ runs over every access record and $B(f_i)$ is the number of bytes read in that record.

In [34]:
# Actual total: add up read_bytes over every row of df.
# 1e12 converts the sum to TB (assuming read_bytes is in bytes).
actual_bytes = df["read_bytes"].sum()
total_actual_reads = actual_bytes / 1e12
print("Total Actual Reads: {0:.2f} TB".format(total_actual_reads))

Total Actual Reads: 103.19 TB


## Unique Accesses per File

In [35]:
# For each file, count the distinct app_info values that touched it, then add
# those per-file counts together into a single total.
# NOTE(review): app_info is used as the job identifier here rather than the
# derived job_id column — confirm that this is intended.
distinct_app_info_per_file = df.groupby("file_name")["app_info"].nunique()
num_unique_file_accesses = distinct_app_info_per_file.sum()

## Number of Unique Files

In [36]:
# Number of distinct file_name values across all rows of df
num_unique_files = df["file_name"].nunique()

## Reuse Multiplier

In [37]:
# Group the access records by file name so per-file statistics can be derived
df_by_file = df.groupby(by="file_name")

#### Definition 1:
$$\mathcal{R}_1 = \frac{\sum_{i=1}^{\mathcal{N}_f} \mathcal{N}_{J}(f_i)}{\mathcal{N}_f}$$

In [38]:
# Reuse multiplier 1: total of the per-file distinct-access counts divided by
# the number of unique files, i.e. the mean number of distinct accesses per file.
rmult_1 = num_unique_file_accesses/num_unique_files
# Label spelling corrected ("Multplier" -> "Multiplier")
print("Reuse Multiplier 1: {0:.2f}".format(rmult_1))

Reuse Multplier 1: 2.47


#### Definition 2:
$$\mathcal{R}_2 = \frac{r_{naive}}{w}$$

In [39]:
# Reuse multiplier 2: naive read volume relative to the working set (both in TB)
rmult_2 = total_naive_reads/working_set
# Label spelling corrected ("Multplier" -> "Multiplier")
print("Reuse Multiplier 2: {0:.2f}".format(rmult_2))

Reuse Multplier 2: 5.04


#### Definition 3:
$$\mathcal{R}_3 = \frac{r_{actual}}{w}$$

In [40]:
# Reuse multiplier 3: actual bytes read relative to the working set (both in TB).
# A value below 1 means fewer bytes were read in total than the combined size
# of the files in the working set.
rmult_3 = total_actual_reads/working_set
# Label spelling corrected ("Multplier" -> "Multiplier")
print("Reuse Multiplier 3: {0:.2f}".format(rmult_3))

Reuse Multplier 3: 0.67
