In [1]:
from __future__ import print_function
import datetime
from functools import reduce
import os

import pandas as pd
import numpy as np
%matplotlib nbagg
%matplotlib inline
import matplotlib.pyplot as plt
import time

# Home made functions
from algorithmUtils import *
import glob

# Do not truncate values: show the full content of every DataFrame cell.
# pandas >= 1.0 spells "no limit" as None (-1 is deprecated there and is
# rejected by pandas 2.x); older releases only accept the legacy -1, so fall
# back to it when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [2]:
def add_record_report(report, policy, max_ws_size, total_recalled_bytes, total_freed_bytes,
                      max_recall, mr_week_ts, mr_day_ts,
                      weeks_ts, ws_size_per_week_per_policy, recalled_size_per_week_per_policy,
                      freed_size_per_week_per_policy,
                      total_time_s):
    """Append the summary record of one policy run to ``report``.

    ``report`` is mutated in place (one dict is appended) and nothing is
    returned. The four byte quantities (``max_ws_size``,
    ``total_recalled_bytes``, ``total_freed_bytes`` and ``max_recall``) are
    stored as whatever ``format_bytes`` returns for them; every other
    argument is stored unchanged under its record key.
    """
    # Keys are inserted in a fixed order: it becomes the column order when the
    # report is later turned into a DataFrame.
    entry = {}
    entry['policy'] = policy
    entry['max_workingset_size'] = format_bytes(max_ws_size)
    entry['total_recalled'] = format_bytes(total_recalled_bytes)
    entry['total_freed'] = format_bytes(total_freed_bytes)
    entry['max_recalled_per_day'] = format_bytes(max_recall)
    entry['max_recalled_week_ts'] = mr_week_ts
    entry['max_recalled_day_ts'] = mr_day_ts
    entry['weeks_ts'] = weeks_ts
    entry['ws_size_per_week_per_policy'] = ws_size_per_week_per_policy
    entry['recalled_size_per_week_per_policy'] = recalled_size_per_week_per_policy
    entry['freed_size_per_week_per_policy'] = freed_size_per_week_per_policy
    entry['total_time_s'] = total_time_s
    report.append(entry)



def print_short_report(report):
    """Print the headline columns of the report, one row per record.

    ``report`` is a list of record dicts such as the ones appended by
    ``add_record_report``. Only the policy, the maximum recalled per day, the
    maximum working-set size and the recalled/freed totals are shown.

    The timestamp columns are not part of this view, so they are neither
    required nor converted here (the previous version derived two date
    columns and then dropped them before printing).

    Raises ``KeyError`` if one of the displayed columns is missing.
    """
    short_columns = ['policy', 'max_recalled_per_day', 'max_workingset_size',
                     'total_recalled', 'total_freed']
    print(pd.DataFrame(report)[short_columns])
    
def print_full_report(report):
    """Print every column of the report plus two derived date columns.

    ``report`` is a list of record dicts. The values under
    ``max_recalled_week_ts`` and ``max_recalled_day_ts`` are read as seconds
    since the Unix epoch and their calendar dates are appended as
    ``max_recalled_week`` and ``max_recalled_day`` before printing.
    """
    table = pd.DataFrame(report)

    # (new date column, source timestamp column), in the order they are appended.
    derived_dates = (
        ('max_recalled_week', 'max_recalled_week_ts'),
        ('max_recalled_day', 'max_recalled_day_ts'),
    )
    for date_column, ts_column in derived_dates:
        table[date_column] = pd.to_datetime(table[ts_column], unit='s').dt.date

    print(table)


In [None]:
# Root directory of the model inputs and outputs. It defaults to the original
# author's local checkout; set the DOMA_MODEL_BASEPATH environment variable to
# run the notebook on another machine without editing this cell.
basepath = os.environ.get("DOMA_MODEL_BASEPATH", "/Users/ddavila/projects/DOMA/data/model/")

# Input files. os.path.join keeps the paths valid whether or not basepath ends
# with a separator. days_path contains a '*' wildcard -- presumably expanded
# inside get_input_data (confirm in algorithmUtils).
datasets_size_path = os.path.join(basepath, "dataset.parquet")
datasets_creation_path = os.path.join(basepath, "dataset_dates_NANO.parquet")
days_path = os.path.join(basepath, "datatiers/data_tier_days_*_NANO.parquet")

# Destination of the report that is written at the end of the notebook.
outputfile = os.path.join(basepath, "reports/072018_062019_p1_12_NANO-v3.0")
#outputfile = os.path.join(basepath, "reports/test_NANO")

# Policies to evaluate; each run is called with (policy, policy + deltaT).
policy_range = range(1, 13)
deltaT = 7

# REAL data
days_df, weeks_df, datasets_creation_df, datasets_size = get_input_data(days_path, datasets_creation_path, datasets_size_path)

# TEST data (get_test_data must already be defined when this line is enabled)
#days_df, weeks_df, datasets_creation_df, datasets_size = get_test_data()

In [None]:
# Get the timestamps of each of the weeks in the time range we are analyzing,
# in ascending order
weeks_ts = weeks_df.sort_values('week_ts')['week_ts'].values
if len(weeks_ts) == 0:
    # Fail with a clear message instead of an IndexError on weeks_ts[0]
    raise ValueError("weeks_df contains no weeks, there is nothing to analyze")
start_date = weeks_ts[0]
end_date = weeks_ts[-1]

# Get a sorted list of the week sets so that the first element in the list
# would be the set of dataset IDs accessed in the first week and so on
weeks_list_accesses = get_sorted_list_of_datasets_setsX(weeks_df, "week_ts", "datasets_set", start_date, end_date)
weeks_list_creation = get_sorted_list_of_datasets_setsX(datasets_creation_df, "creation_week_ts", "d_dataset_id", start_date, end_date)

# Make sure that the 3 lists: weeks_ts, weeks_list_accesses and weeks_list_creation have the same length
# Otherwise something went wrong.
# Raise instead of print + exit(1): exit() is a helper meant for the interactive
# shell, while an exception stops the run with the message and a traceback.
if not (len(weeks_ts) == len(weeks_list_accesses) == len(weeks_list_creation)):
    raise RuntimeError(
        "ERROR, lists: weeks_ts, weeks_list_accesses and/or weeks_list_creation have different lengths"
        " (%d, %d, %d)" % (len(weeks_ts), len(weeks_list_accesses), len(weeks_list_creation))
    )

In [None]:
# Accumulators for the per-policy results. Only the two working-set lists are
# filled by this cell.
report = []
ws_size_created_per_week_per_policy = []
ws_size_accessed_per_week_per_policy = []
recalled_size_per_week_per_policy = []
freed_size_per_week_per_policy = []

# Run STEP 1 of the general algorithm once for every delete policy
for policy in policy_range:
    time1 = time.time()

    # STEP 1.
    # For each week, get the working-set sizes plus the sets of datasets freed
    # and recalled under the current delete policy
    (datasets_freed,
     datasets_recalled,
     working_set_size_created_per_week,
     working_set_size_accessed_per_week) = get_freed_recalled_and_ws_sizes(
        weeks_list_accesses,
        weeks_list_creation,
        policy,
        policy + deltaT,
        datasets_size,
    )

    # Keep only the largest weekly working-set size of each series, converted
    # with to_petabytes
    max_ws_accessed_size = max(working_set_size_accessed_per_week)
    max_ws_created_size = max(working_set_size_created_per_week)
    ws_size_created_per_week_per_policy.append(to_petabytes(max_ws_created_size))
    ws_size_accessed_per_week_per_policy.append(to_petabytes(max_ws_accessed_size))

    time2 = time.time()
    print('STEP1 took %0.2f s' % (time2-time1))
    

In [None]:
    
    ## NOTE: STEP 2 and STEP 3 are currently disabled. Every line of the
    ## disabled calls, continuation lines included, is kept commented so that
    ## this cell stays valid Python. The disabled code still uses the names
    ## max_ws_size and ws_sizes, which the active STEP 1 code does not define:
    ## update them before re-enabling these steps.

    ## STEP 2.
    ## Calculate the day with more Bytes recalled
    #max_recalled, max_recalled_week_ts, max_recalled_day_ts= get_day_with_max_bytes_recalled(datasets_recalled, weeks_ts,
    #                                                                                         days_df, datasets_size)
    #time3 = time.time()
    #print('STEP2 took %0.2f s' % (time3-time2))

    ## STEP 3.
    #recalled_size_per_week, freed_size_per_week, total_recalled, total_freed = get_recalled_and_freed_sizes(datasets_recalled,
    #                                                                                                        datasets_freed,
    #                                                                                                        datasets_size)
    #recalled_size_per_week_per_policy.append(to_petabytes(recalled_size_per_week))
    #freed_size_per_week_per_policy.append(to_petabytes(freed_size_per_week))
    #time4 = time.time()
    #print('STEP3 took %0.2f s' % (time4-time3))
    #total_time = (time4-time1)
    #print('Total time: %0.2f s' % total_time)

    ## Add the result to the report
    #add_record_report(report, policy, max_ws_size, total_recalled, total_freed,\
    #                  max_recalled, max_recalled_week_ts, max_recalled_day_ts,\
    #                  weeks_ts, ws_sizes, recalled_size_per_week, freed_size_per_week,\
    #                  total_time)

# Save the report
report_df = pd.DataFrame(report)
if report_df.empty:
    # Records are only appended by the disabled add_record_report call above,
    # so the report can be empty; do not overwrite a previously saved report
    # with an empty table.
    print("WARNING: report is empty, nothing was written to %s" % outputfile)
else:
    report_df.to_parquet(outputfile)

In [None]:
def get_test_data():
    """Build a small, hand-written data set for trying out the algorithm.

    Returns a 4-tuple ``(0, weeks_df, datasets_creation_df, datasets_size_df)``
    meant to be unpacked like the result of ``get_input_data`` (see the
    commented-out call in the input cell). The first item is a plain ``0``
    standing where the days DataFrame would be.

    * ``weeks_df``: columns ``week_ts`` and ``datasets_set``, one row per week
      with the set of dataset ids of that week.
    * ``datasets_creation_df``: columns ``creation_week_ts`` and
      ``d_dataset_id``, one row per creation week with the list of dataset ids
      created in it.
    * ``datasets_size_df``: columns ``d_dataset_id`` and ``dataset_size``.

    Week indices 0..9 are converted to epoch timestamps spaced one week apart.
    Creation weeks outside that range (10, 20, ...) have no mapping and are
    left as they are.
    """
    # Thursday Jun 7, 2018 00:00:00 UTC
    init_date = 1528329600
    seconds_in_week = 3600*24*7
    # Week index -> timestamp of the start of that week
    week_index_to_ts = {i: init_date + seconds_in_week * i for i in range(10)}

    # Position in the list is the week index
    datasets_per_week = [{0, 1}, {3, 4}, {1}, {2}, {3}, {2}, {0, 1}, {4}]
    weeks_df = pd.DataFrame(
        [[week, ids] for week, ids in enumerate(datasets_per_week)],
        columns=['week_ts', 'datasets_set'],
    )

    # One row per dataset: id, creation week index, size
    catalog_df = pd.DataFrame(
        [
            [1, 1, 1000000],
            [2, 10, 2000000],
            [3, 0, 3000000],
            [4, 30, 4000000],
            [5, 40, 5000000],
            [6, 20, 6000000],
            [7, 10, 7000000],
            [8, 40, 8000000],
            [9, 50, 9000000],
            [0, 10, 10000000],
        ],
        columns=['d_dataset_id', 'creation_week_ts', 'dataset_size'],
    )

    datasets_size_df = catalog_df[['d_dataset_id', 'dataset_size']]

    # Collect the ids created in the same week into one list per week
    datasets_creation_df = (
        catalog_df[['creation_week_ts', 'd_dataset_id']]
        .groupby('creation_week_ts')
        .agg({'d_dataset_id': list})
        .reset_index()
    )

    # Swap the week indices for real timestamps in each frame's week column
    weeks_df = weeks_df.replace({'week_ts': week_index_to_ts})
    datasets_creation_df = datasets_creation_df.replace({'creation_week_ts': week_index_to_ts})

    return 0, weeks_df, datasets_creation_df, datasets_size_df