In [None]:
# Work in progress
# We want to calculate the union, the intersection, (WMStats - union) and (CMSSW - union),
# then try to find a pattern that shows how those subsets are created

from __future__ import print_function
import datetime
from functools import reduce
import os

import pandas as pd
import numpy as np
%matplotlib nbagg
import matplotlib.pyplot as plt
import time

# Do not truncate values: show the full content of every column when a
# DataFrame is displayed.
# NOTE(review): newer pandas releases expect None instead of -1 for
# 'display.max_colwidth' - confirm against the pandas version in use.
pd.set_option('display.max_colwidth', -1)

In [298]:
# Validates that the union of the dataset ids accessed on every day of a week
# is the same as the set of dataset ids recorded for that week
def validate_days_in_weeks(week_ts, weeks_ds, days_ds):
    """Check that a week's dataset ids match the union of its days' dataset ids.

    @week_ts: timestamp identifying the week to check
    @weeks_ds: DataFrame with 'week_ts' and 'datasets_set' columns; the first
               row matching 'week_ts' is used
    @days_ds: DataFrame with 'week_ts' and 'datasets_set' columns; every row
              matching 'week_ts' contributes to the union
    @return: True when both sets hold exactly the same ids, False otherwise.
             When the sizes differ, "<week size> != <days size>" is printed.
    """
    # Dataset ids recorded for the week as a whole
    week_rows = weeks_ds[weeks_ds['week_ts'] == week_ts]
    ids_in_week = set(week_rows.datasets_set.values[0])

    # Dataset ids gathered from every day tagged with this week
    day_rows = days_ds[days_ds['week_ts'] == week_ts]
    ids_in_days = set().union(*[set(day_ids) for day_ids in day_rows['datasets_set']])

    # Cheap check first: different sizes can never be the same set
    if len(ids_in_week) != len(ids_in_days):
        print(str(len(ids_in_week)) + " != " + str(len(ids_in_days)))
        return False

    # Same size, so compare the actual members
    return ids_in_days == ids_in_week


# Receives a weeks DataFrame ('weeks_df') and returns a list of datasets sets
# sorted by 'week_ts'
# @weeks_df: a pandas DataFrame of the form:
#    ------------------------------------------------
#    |week_ts       |datasets_set                   |
#    ------------------------------------------------
#    |1.561594e+09  |[12686513, 13766504, 14689984] |
#    |1.361594e+09  |[15686513, 16766504]           |
#    |1.761594e+09  |[17686513, 18766504, 13689984] |
#    ------------------------------------------------
#    where:
#     'week_ts' is a Linux timestamp that identifies the week and
#     'datasets_set' is an array of dataset IDs that were accessed in that week
#    @return: [{15686513, 16766504},{12686513, 13766504, 14689984},{17686513, 18766504, 13689984}]
#
def get_sorted_list_of_datasets_sets(weeks_df):
    """Return the weekly dataset-id sets ordered by 'week_ts'.

    @weeks_df: DataFrame with a 'week_ts' column and a 'datasets_set' column
               holding an iterable of dataset ids per row
    @return: list of sets, one per week, in ascending 'week_ts' order
    """
    # Order the weeks chronologically and renumber the rows 0..n-1 so that a
    # row label is also its position in the chronological order
    ordered = weeks_df.sort_values('week_ts').reset_index(drop=True)

    # count() only counts rows whose 'week_ts' is not null
    n_weeks = ordered['week_ts'].count()

    # Turn each week's array of ids into a set, keeping the chronological order
    return [set(ordered.datasets_set[pos]) for pos in range(n_weeks)]

def get_freed_recalled_and_ws_sizes(weeks_list, policy, datasets_size):
    """Simulate a 'keep for <policy> weeks' delete policy over a list of weeks.

    The working set of week i is the union of the datasets accessed in the
    last 'policy' weeks (weeks i-policy+1 .. i).

    @weeks_list: chronologically ordered list of sets of dataset ids, one per week
    @policy: number of weeks a dataset stays in the working set after its last
             access (the code is written for policy >= 1)
    @datasets_size: DataFrame passed through to get_dataset_set_bytes() to
                    translate a set of dataset ids into Bytes
    @return: (freed_per_week, recalled_per_week, working_set_size_per_week),
             three lists with one entry per week:
               - set of datasets that left the working set that week
               - set of datasets that re-entered it after having been freed
               - size in Bytes of the working set
    """
    freed = set()
    recalled_per_week = []
    freed_per_week = []
    working_set_size_per_week = []

    # During the first 'policy' weeks nothing can have been freed nor recalled,
    # and the working set is simply everything accessed so far.
    # The warm-up is capped at the number of weeks available so that a policy
    # longer than the data does not index past the end of 'weeks_list'.
    warmup_weeks = min(policy, len(weeks_list))
    current_working_set = set()
    for i in range(0, warmup_weeks):
        # A fresh empty set per week, so that the entries are independent objects
        recalled_per_week.append(set())
        freed_per_week.append(set())
        current_working_set = current_working_set.union(weeks_list[i])
        working_set_size_per_week.append(
            get_dataset_set_bytes(current_working_set, datasets_size))

    # For each week after the warm-up
    for i in range(policy, len(weeks_list)):
        # Intermediate working set: datasets accessed in the weeks strictly
        # between the week leaving the window (old_week) and the current
        # week (new_week)
        int_ws_from = i - policy + 1
        int_ws = set()
        for j in range(int_ws_from, i):
            int_ws.update(weeks_list[j])
        new_week = weeks_list[i]
        old_week = weeks_list[int_ws_from - 1]

        current_working_set = int_ws.union(new_week)
        current_working_set_size = get_dataset_set_bytes(current_working_set, datasets_size)

        # Freed: only accessed in the week that just left the window
        to_free = old_week - current_working_set
        # Recalled: new to the window AND freed at some point in the past
        to_recall = (new_week - (int_ws.union(old_week))).intersection(freed)

        working_set_size_per_week.append(current_working_set_size)
        freed.update(to_free)
        recalled_per_week.append(to_recall)
        freed_per_week.append(to_free)

    return freed_per_week, recalled_per_week, working_set_size_per_week

def get_size_of_datasets_sets(datasets_set, datasets_size):
    """Return the total size of every collection of dataset ids in a list.

    @datasets_set: iterable of collections (e.g. one set per week) of dataset ids
    @datasets_size: DataFrame with 'd_dataset_id' and 'dataset_size' columns.
                    If an id appears in more than one row, the first row wins.
    @return: list with, for each collection, the sum of the sizes of its
             datasets (0 for an empty collection)
    @raise IndexError: when a dataset id is not present in 'datasets_size'
    """
    # Build the id -> size lookup once instead of scanning the whole DataFrame
    # for every single dataset id. The columns are walked backwards so that,
    # for a duplicated id, the FIRST row is the one left in the dict.
    ids = datasets_size['d_dataset_id'].values
    sizes = datasets_size['dataset_size'].values
    size_by_id = dict(zip(ids[::-1], sizes[::-1]))

    week_sizes = []
    for week in datasets_set:
        total_size = 0
        for dataset_id in week:
            if dataset_id not in size_by_id:
                raise IndexError("dataset id " + repr(dataset_id) + " not found in datasets_size")
            total_size = total_size + size_by_id[dataset_id]
        week_sizes.append(total_size)
    return week_sizes


# Get the set of datasets recalled in every day of a given week
def get_datasets_recalled_per_day(recalled_set, week_ts, days_df):
    """Assign every dataset recalled in a week to the first day it was accessed.

    @recalled_set: set of dataset ids recalled during the week 'week_ts'
    @week_ts: timestamp identifying the week
    @days_df: DataFrame with 'week_ts', 'day_ts' and 'datasets_set' columns,
              expected to hold one row per day
    @return: dict {day_ts: set of dataset ids recalled on that day} with one
             key per day of the week (days without recalls map to an empty set).
             A recalled dataset that shows up in none of the days is left out.
    """
    # Select the days of the week once; this does not change inside the loops
    week_days = days_df[days_df['week_ts'] == week_ts]

    datasets_recalled_per_day = dict()
    for day_ts in week_days['day_ts'].values:
        datasets_recalled_per_day[day_ts] = set()

    if not recalled_set:
        return datasets_recalled_per_day

    # Walk the days in chronological order: the rows of 'days_df' are not
    # guaranteed to be sorted, and the recall must be charged to the EARLIEST
    # day the dataset was accessed. The stable sort keeps row order for ties.
    week_days = week_days.sort_values('day_ts', kind='mergesort')
    days_in_order = list(zip(
        week_days['day_ts'].values,
        [set(accessed) for accessed in week_days['datasets_set'].values],
    ))

    for recalled_dataset in recalled_set:
        for day_ts, accessed in days_in_order:
            if recalled_dataset in accessed:
                datasets_recalled_per_day[day_ts].add(recalled_dataset)
                # If a dataset was accessed more than once within the same week
                # it was only recalled the first time, since the minimum delete
                # policy is 1 week
                break

    return datasets_recalled_per_day


def get_dataset_set_bytes(datasets_set, datasets_size):
    """Return the total size of a collection of dataset ids.

    @datasets_set: iterable of dataset ids
    @datasets_size: DataFrame with 'd_dataset_id' and 'dataset_size' columns.
                    If an id appears in more than one row, the first row wins.
    @return: sum of the 'dataset_size' of every id (0 for an empty collection)
    @raise IndexError: when a dataset id is not present in 'datasets_size'
    """
    # One pass to build the id -> size lookup instead of one full DataFrame
    # scan per dataset id. Walking the columns backwards leaves the FIRST row
    # of a duplicated id in the dict.
    ids = datasets_size['d_dataset_id'].values
    sizes = datasets_size['dataset_size'].values
    size_by_id = dict(zip(ids[::-1], sizes[::-1]))

    total_size = 0
    for dataset_id in datasets_set:
        if dataset_id not in size_by_id:
            raise IndexError("dataset id " + repr(dataset_id) + " not found in datasets_size")
        total_size = total_size + size_by_id[dataset_id]
    return total_size

In [310]:
## Testing dataset ------------------------------------------------------------------
# Small hand-made frames that make the results easy to verify by hand.
# 'days_df' is built straight from the list of rows: each row mixes scalars with
# lists of different lengths, which np.array() rejects as ragged input on recent
# NumPy versions.
days_df = pd.DataFrame(
        [
            [1, 1, [1,2,3]],
            [1, 2, [1,2]],
            [1, 3, [1,5]],
            [2, 1, [2,5,6]],
            [2, 2, [1]],
            [2, 3, [0,9]],
            [3, 1, [3]],
            [3, 2, [2,4,8]],
            [3, 3, [8,0]],
            [4, 1, [2]],
            [4, 2, [1,0]],
            [4, 3, [9,5]],
            [5, 1, [1,5,7]],
            [5, 2, []],
            [5, 3, [1]],
        ],
        columns=['week_ts', 'day_ts', 'datasets_set'])

# In the test dataset the size of a dataset is equal to its id
datasets_size = pd.DataFrame(np.array(
    [
       [0,0],
       [1,1],
       [2,2],
       [3,3],
       [4,4],
       [5,5],
       [6,6],
       [7,7],
       [8,8],
       [9,9],
    ]
    ), columns=['d_dataset_id','dataset_size'])

#datasets_size = pd.DataFrame(np.array(
#    [
#       [0,11000000],
#       [1,14000000],
#       [2,9000000],
#       [3,18000000],
#       [4,1000000],
#       [5,4000000],
#       [6,9000000],
#       [7,15000000],
#       [8,20000000],
#       [9,11000000],
#    ]
#    ), columns=['d_dataset_id','dataset_size'])

# Calculate 'weeks_df' out of 'days_df': concatenate the lists of every day of
# a week and turn the result into a set
weeks_df = days_df.groupby('week_ts').agg({'datasets_set': 'sum'})
weeks_df['datasets_set'] = weeks_df['datasets_set'].apply(set)

# Shuffle the rows so that they are not sorted by the week_ts as it happens
# on the real dataset
weeks_df = weeks_df.sample(frac=1)

# Insert an index so that 'week_ts' can be accessed as a field and not as an index
weeks_df = weeks_df.reset_index()

##---------------------------------------------------------------------------------------
# Real dataset. It replaces the testing frames defined above; set
# USE_TEST_DATA to True to keep working with the testing frames instead.
# The location can be overridden through the DOMA_DATA_DIR environment variable.
USE_TEST_DATA = False
DATA_DIR = os.environ.get("DOMA_DATA_DIR", "/Users/ddavila/projects/DOMA/data/model")

if not USE_TEST_DATA:
    datasets_df = pd.read_parquet(os.path.join(DATA_DIR, "dataset.parquet/"))
    days_df = pd.read_parquet(os.path.join(DATA_DIR, "days_201906.parquet/"))
    weeks_df = pd.read_parquet(os.path.join(DATA_DIR, "weeks_201906.parquet/"))
    datasets_size = datasets_df[['d_dataset_id', 'dataset_size']]

In [311]:
# Sanity check of the input data: for every week, the union of the dataset ids
# of the days that belong to it must be equal to the set of dataset ids of the
# whole week. Prints one "<week_ts>: <True|False>" line per week.
# Note: this check is trivially true when the 'weeks' dataset is calculated
# from the 'days' dataset.
for week_ts in weeks_df['week_ts']:
    print(str(week_ts) + ": " + str(validate_days_in_weeks(week_ts, weeks_df, days_df)))

1561593600.0: True
1559174400.0: True
1559779200.0: True
1560988800.0: True
1560384000.0: True


In [312]:
# Get the list of week sets sorted by 'week_ts', so that the first element of
# the list is the set of dataset IDs accessed in the first week, the second
# element the set of the second week, and so on
weeks_list = get_sorted_list_of_datasets_sets(weeks_df)

In [313]:
# For every week compute the working set size plus the sets of datasets freed
# and recalled under the chosen delete policy (number of weeks a dataset is kept)
policy=1
datasets_freed, datasets_recalled, ws_sizes = get_freed_recalled_and_ws_sizes(weeks_list, policy, datasets_size)

# Largest working set over all the weeks (0 when there are no weeks at all)
max_ws_size = max([0] + list(ws_sizes))

#print(datasets_recalled)
print("max working set size: " + str(max_ws_size) + " Bytes")

max working set size: 1.0491340410239376e+16 Bytes


In [314]:
# Get the week timestamps in ascending order, i.e. the same order used to build
# 'weeks_list', so that weeks_ts[i] identifies the week of datasets_recalled[i]
weeks_ts = weeks_df.sort_values('week_ts')['week_ts'].values

In [315]:
# Look for the day with the most Bytes recalled, remembering the day and the
# week it belongs to
max_recall = 0
mr_day_ts = 0
mr_week_ts = 0
for week_pos, current_week_ts in enumerate(weeks_ts):
    # Spread the datasets recalled this week over the days they were first accessed
    recalled_by_day = get_datasets_recalled_per_day(datasets_recalled[week_pos], current_week_ts, days_df)
    for current_day_ts in recalled_by_day:
        day_recall_bytes = get_dataset_set_bytes(recalled_by_day[current_day_ts], datasets_size)
        # Strictly greater: on a tie the first day found is kept
        if day_recall_bytes > max_recall:
            max_recall, mr_day_ts, mr_week_ts = day_recall_bytes, current_day_ts, current_week_ts

print("max recall per day: "+str(max_recall)+" Bytes, on week: "+str(mr_week_ts)+" on day: "+ str(mr_day_ts))
 

max recall per day: 455283116162785.0 Bytes, on week: 1560988800.0 on day: 1561420800.0


In [316]:
# Bytes freed and Bytes recalled in each week
datasets_freed_sizes = get_size_of_datasets_sets(datasets_freed, datasets_size)
datasets_recalled_sizes = get_size_of_datasets_sets(datasets_recalled, datasets_size)

# Totals over all the weeks
total_freed_bytes = sum(datasets_freed_sizes)
total_recalled_bytes = sum(datasets_recalled_sizes)

#print(datasets_freed)
#print(datasets_freed_sizes)
print("total freed Bytes: " + str(total_freed_bytes))
#print(datasets_recalled)
#print(datasets_recalled_sizes)
print("total recalled Bytes: " + str(total_recalled_bytes))



total freed Bytes: 1.0307301195785932e+16
total recalled Bytes: 2467554212135155.0
