In [None]:
# Work in progress
# We want to calculate the union, the intersection, (WMStats - union) and (CMSSW - union)
# Then try to find a pattern that explains how the subsets above are formed

from __future__ import print_function
import datetime
from functools import reduce
import os

import pandas as pd
import numpy as np
%matplotlib nbagg
import matplotlib.pyplot as plt
import time

# Do not truncate values when displaying DataFrames
# NOTE(review): recent pandas versions expect None instead of -1 for this
# option -- confirm against the installed pandas version before changing it
pd.set_option('display.max_colwidth', -1)

In [9]:
def validate_days_in_weeks(week_ts, weeks_ds, days_ds):
    """Check that a week's dataset ids equal the union of its days' dataset ids.

    Args:
        week_ts: value of the 'week_ts' column identifying the week to check.
        weeks_ds: DataFrame with 'week_ts' and 'datasets_set' columns; the
            first row matching 'week_ts' supplies the weekly ids.
        days_ds: DataFrame with 'week_ts' and 'datasets_set' columns; every
            row matching 'week_ts' contributes its ids to the daily union.

    Returns:
        True when both id sets are equal, False otherwise. When the sets
        differ in size, the two sizes are printed before returning False.

    Raises:
        IndexError: if 'weeks_ds' has no row for 'week_ts'.
    """
    weekly_ids = set(weeks_ds.loc[weeks_ds['week_ts'] == week_ts, 'datasets_set'].values[0])

    # Union of the ids seen on each day that belongs to the requested week
    daily_ids = set()
    for day_ids in days_ds.loc[days_ds['week_ts'] == week_ts, 'datasets_set']:
        daily_ids |= set(day_ids)

    # A size mismatch is reported explicitly; sets of equal size still need
    # a member-by-member comparison
    if len(weekly_ids) != len(daily_ids):
        print(str(len(weekly_ids)) + " != " + str(len(daily_ids)))
        return False
    return daily_ids == weekly_ids


def get_sorted_list_of_datasets_sets(weeks_df):
    """Return the weekly dataset-id sets ordered chronologically by 'week_ts'.

    Args:
        weeks_df: a pandas DataFrame of the form:
            ------------------------------------------------
            |week_ts       |datasets_set                   |
            ------------------------------------------------
            |1.561594e+09  |[12686513, 13766504, 14689984] |
            |1.361594e+09  |[15686513, 16766504]           |
            |1.761594e+09  |[17686513, 18766504, 13689984] |
            ------------------------------------------------
            where 'week_ts' is the timestamp that identifies the week and
            'datasets_set' is an array of the dataset IDs accessed in that week.

    Returns:
        A list with one set per week, in ascending 'week_ts' order. For the
        example above:
        [{15686513, 16766504},
         {12686513, 13766504, 14689984},
         {17686513, 18766504, 13689984}]
    """
    chronological = weeks_df.sort_values('week_ts')

    # Only as many rows as there are non-null 'week_ts' values are converted
    n_weeks = chronological['week_ts'].count()

    # Positional slicing walks the rows in sorted order; arrays become sets
    return [set(ids) for ids in chronological['datasets_set'].iloc[:n_weeks]]

def get_freed_and_recalled(weeks_list, policy):
    """Count the dataset ids that get freed and recalled under a retention policy.

    For every week index starting at 'policy', the working set is the union
    of the 'policy' most recent weeks (current week included). Ids of the
    week that has just left that window and that are absent from the working
    set are marked as freed. Ids of the current week that are not in the
    leaving week but were freed at an earlier step are marked as recalled.

    The week index and the ids freed/recalled at each step are printed.

    Args:
        weeks_list: chronologically ordered list with one set of dataset ids
            per week.
        policy: number of weeks in the sliding window.

    Returns:
        A tuple (number of distinct freed ids, number of distinct recalled ids).
    """
    freed_ids = set()
    recalled_ids = set()

    for current in range(policy, len(weeks_list)):
        print(current)

        # Everything accessed in the window that ends on the current week
        window = range(current - policy + 1, current + 1)
        working_ids = set().union(*(weeks_list[pos] for pos in window))

        entering_week = weeks_list[current]
        leaving_week = weeks_list[current - policy]

        released_now = leaving_week - working_ids
        # Only ids freed at a previous step count, so this is evaluated
        # before the ids released at this step are recorded
        recalled_now = (entering_week - leaving_week) & freed_ids

        freed_ids |= released_now
        recalled_ids |= recalled_now

        print("to free: "+ str(released_now))
        print("to recall: "+ str(recalled_now))

    return len(freed_ids), len(recalled_ids)


In [211]:
# Input data of the model. The three parquet datasets live in the same
# directory, so its location is spelled out only once.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
DATA_DIR = "/Users/ddavila/projects/DOMA/data/model/"
datasets_df = pd.read_parquet(DATA_DIR + "dataset.parquet/")
days_df = pd.read_parquet(DATA_DIR + "days_201906.parquet/")
weeks_df = pd.read_parquet(DATA_DIR + "weeks_201906.parquet/")

## Testing dataset ------------------------------------------------------------------
# Small synthetic dataset: one row per (week, day) holding the dataset IDs
# accessed on that day. The rows are handed to pandas directly because the ID
# lists have different lengths and therefore cannot form a regular numpy array.
my_days_df = pd.DataFrame(
    [
        (1, 1, [1, 2, 3]),
        (1, 2, [1, 2]),
        (1, 3, [1, 5]),
        (2, 1, [2, 5, 6]),
        (2, 2, [1]),
        (2, 3, [0, 9]),
        (3, 1, [1, 3]),
        (3, 2, [2, 4, 8]),
        (3, 3, [8, 9]),
        (4, 1, [2]),
        (4, 2, [3, 5]),
        (4, 3, [9, 0]),
        (5, 1, [1, 5, 7]),
        (5, 2, []),
        (5, 3, [8]),
    ],
    columns=['week_ts', 'day_ts', 'datasets_set'],
)

# Calculate the weeks DataFrame out of the days one: the set of a week is the
# union of the ID lists of all its days.
# The rows are then shuffled so that they are not sorted by week_ts, as it
# happens on the real dataset (the seed is fixed so that re-running the cell
# yields the same order), and the index is reset so that 'week_ts' can be
# accessed as a regular column and not as the index.
my_weeks_df = (
    my_days_df
    .groupby('week_ts')['datasets_set']
    .apply(lambda day_lists: set().union(*day_lists))
    .sample(frac=1, random_state=42)
    .reset_index()
)

##---------------------------------------------------------------------------------------

In [213]:
# Validate that the union of the sets of the days that belong to a week is the
# same as the set of the whole week.
# Note. This check is trivially true when the 'weeks' dataset is calculated
# from the 'days' one, as is the case for the synthetic testing dataset.
for week_ts in weeks_df['week_ts']:
    print(str(week_ts) + ": " + str(validate_days_in_weeks(week_ts, weeks_df, days_df)))

# The synthetic weeks must be validated against the synthetic DataFrames, not
# against the real ones loaded from parquet
for my_week_ts in my_weeks_df['week_ts']:
    print(str(my_week_ts) + ": " + str(validate_days_in_weeks(my_week_ts, my_weeks_df, my_days_df)))

1561593600.0: True
1559174400.0: True
1559779200.0: True
1560988800.0: True
1560384000.0: True
5: True
2: True
1: True
3: True
4: True


In [221]:
# List with one set of dataset IDs per synthetic week, ordered by week_ts
my_weeks_list = get_sorted_list_of_datasets_sets(my_weeks_df)

In [222]:
# Run the freed/recalled computation on the synthetic weeks for policies of
# 1, 2 and 3 weeks and print the two counts obtained for each policy
for i in range(1,4):
    f,r = get_freed_and_recalled(my_weeks_list, i)
    print("freed: "+str(f))
    print("recalled: "+str(r))

freed: 9
recalled: 5
freed: 2
recalled: 0
freed: 1
recalled: 0


In [7]:
# Minimal input that can be checked by hand: five weeks, each with a single
# dataset ID, where ID 1 is accessed again three weeks after its first access
my_list=[{1},{2},{0},{1},{3}]

In [10]:
# Policy of 2 weeks on the hand-made list; the returned tuple is
# (number of freed IDs, number of recalled IDs)
get_freed_and_recalled(my_list, 2)

2
to free: {1}
to recall: set()
3
to free: {2}
to recall: {1}
4
to free: {0}
to recall: set()


(3, 1)