In [None]:
from __future__ import print_function
import datetime
from functools import reduce
import os

import pandas as pd
import numpy as np
%matplotlib nbagg
%matplotlib inline
import matplotlib.pyplot as plt
import time

# Home made functions
#from myUtils import *
from algorithmUtils import *
import glob

# Do not truncate long cell values when DataFrames are printed.
# `None` is the current spelling of "no limit"; recent pandas releases reject
# the legacy -1 sentinel with a ValueError, while old releases only accept an
# integer here, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [None]:
def merge_days_dataframes(days_df_list):
    """Merge per-month day DataFrames into a single per-day DataFrame.

    Records that share the same week timestamp (``week_ts``) and day
    timestamp (``day_ts``) -- for instance a day that shows up in more than
    one monthly input -- are collapsed into one record.

    Args:
        days_df_list: list of DataFrames (one per month), each providing
            ``week_ts``, ``day_ts`` and an iterable-valued ``datasets_set``.

    Returns:
        A DataFrame with columns ``week_ts``, ``day_ts`` and ``datasets_set``
        holding one row per (week_ts, day_ts) pair. ``datasets_set`` is a
        plain list: the concatenation, in row order, of every collection that
        was merged into the pair. Duplicates are NOT removed here.

    Raises:
        ValueError: if ``days_df_list`` is empty.
    """
    from itertools import chain

    if not days_df_list:
        # pd.concat would raise a ValueError as well, but with a message that
        # does not say which input was empty.
        raise ValueError("days_df_list is empty: there are no day DataFrames to merge")

    # Data arrives split by month: stack everything into one DataFrame.
    all_days = pd.concat(days_df_list).reset_index()

    # Flatten the collections of each (week_ts, day_ts) group in one linear
    # pass. This replaces summing lists with the builtin `sum`, which copies
    # the accumulated list on every addition and depends on how pandas
    # interprets `sum` for object columns.
    merged = all_days.groupby(['week_ts', 'day_ts']).agg(
        {'datasets_set': lambda parts: list(chain.from_iterable(parts))})
    merged.reset_index(inplace=True)

    return merged

def group_days_into_weeks_df(all_days):
    """Group day records into weeks, taking the union of their datasets.

    Args:
        all_days: DataFrame with a ``week_ts`` column and an iterable-valued
            ``datasets_set`` column, one row per day.

    Returns:
        A new DataFrame with columns ``week_ts`` and ``datasets_set``, one row
        per week, where ``datasets_set`` is the set of every dataset seen on
        any day of that week.

    Side effects:
        ``all_days`` is modified in place: its ``datasets_set`` values are
        converted to sets (removing duplicates) and its index is reset in
        place, which inserts the previous index as a new column.
        get_input_data returns ``all_days`` after calling this function, so
        this mutation is part of the observable behaviour.
        NOTE(review): the extra index column looks unintended -- confirm
        before removing it.
    """
    from itertools import chain

    # Union of the daily collections of each week, built in one linear pass
    # instead of summing lists with the builtin `sum` (which copies the
    # accumulated list on every addition and depends on how pandas interprets
    # `sum` for object columns).
    all_weeks = all_days.groupby(['week_ts']).agg(
        {'datasets_set': lambda daily: set(chain.from_iterable(daily))})
    all_weeks.reset_index(inplace=True)

    # Deduplicate the per-day collections as well (in place, see docstring).
    all_days['datasets_set'] = all_days['datasets_set'].apply(set)
    all_days.reset_index(inplace=True)

    return all_weeks


def get_input_data(days_path, datasets_path):
    """Load the per-day access records and the dataset sizes.

    Args:
        days_path: glob pattern matching the parquet files that hold the
            per-day dataset records.
        datasets_path: path of the parquet file that contains the dataset
            sizes.

    Returns:
        A tuple ``(all_days, all_weeks, datasets_size)``:
        1. all_days: each record contains the set of datasets accessed in a
           given day.
        2. all_weeks: each record contains the set of datasets accessed in a
           given week.
        3. datasets_size: the ``d_dataset_id`` and ``dataset_size`` columns of
           the datasets file (dataset ID and its size in bytes).

    Raises:
        ValueError: if no file matches ``days_path``.
    """
    # Resolve the inputs first so that a wrong pattern fails immediately and
    # with a message naming it, instead of surfacing later as an opaque
    # "No objects to concatenate" error. Sorting makes the read order
    # deterministic (glob returns files in arbitrary order).
    list_of_files = sorted(glob.glob(days_path))
    if not list_of_files:
        raise ValueError("No input files match the pattern: " + days_path)

    datasets_df = pd.read_parquet(datasets_path)
    datasets_size = datasets_df[['d_dataset_id', 'dataset_size']]

    days_df_list = []
    for parquet_file in list_of_files:
        print("Reading: " + parquet_file)
        days_df_list.append(pd.read_parquet(parquet_file))

    all_days = merge_days_dataframes(days_df_list)
    # Also converts all_days['datasets_set'] to sets in place.
    all_weeks = group_days_into_weeks_df(all_days)

    return all_days, all_weeks, datasets_size
            



def add_record_report(report, policy, max_ws_size, total_recalled_bytes, total_freed_bytes,
                      max_recall, mr_week_ts, mr_day_ts,
                      weeks_ts, ws_size_per_week_per_policy, recalled_size_per_week_per_policy,
                      freed_size_per_week_per_policy,
                      total_time_s):
    """Append the results of one policy run to ``report``.

    ``report`` is a list that is extended in place with a single dict. The
    four byte counts (max working-set size, total recalled, total freed and
    max recalled per day) are stored as rendered by ``format_bytes``; every
    other argument is stored as received. Nothing is returned.
    """
    # Render the byte counts first, in the same order as they appear in the
    # record.
    ws_text = format_bytes(max_ws_size)
    recalled_text = format_bytes(total_recalled_bytes)
    freed_text = format_bytes(total_freed_bytes)
    peak_recall_text = format_bytes(max_recall)

    report.append({
        'policy': policy,
        'max_workingset_size': ws_text,
        'total_recalled': recalled_text,
        'total_freed': freed_text,
        'max_recalled_per_day': peak_recall_text,
        'max_recalled_week_ts': mr_week_ts,
        'max_recalled_day_ts': mr_day_ts,
        'weeks_ts': weeks_ts,
        'ws_size_per_week_per_policy': ws_size_per_week_per_policy,
        'recalled_size_per_week_per_policy': recalled_size_per_week_per_policy,
        'freed_size_per_week_per_policy': freed_size_per_week_per_policy,
        'total_time_s': total_time_s,
    })



def print_short_report(report):
    """Print the summary columns of ``report`` (a list of record dicts).

    Only the policy, the max recalled per day, the max working-set size and
    the recalled/freed totals are printed.
    """
    table = pd.DataFrame(report)

    # Derive calendar dates from the epoch-second timestamps. They are not
    # part of the selection printed below, but the source columns must exist.
    derived_dates = (('max_recalled_week', 'max_recalled_week_ts'),
                     ('max_recalled_day', 'max_recalled_day_ts'))
    for date_column, ts_column in derived_dates:
        table[date_column] = pd.to_datetime(table[ts_column], unit='s').dt.date

    summary_columns = ['policy', 'max_recalled_per_day', 'max_workingset_size',
                       'total_recalled', 'total_freed']
    print(table[summary_columns])
    
def print_full_report(report):
    """Print every column of ``report`` (a list of record dicts).

    Two calendar-date columns, ``max_recalled_week`` and ``max_recalled_day``,
    are derived from the corresponding epoch-second timestamps and appended
    before printing.
    """
    table = pd.DataFrame(report)

    derived_dates = (('max_recalled_week', 'max_recalled_week_ts'),
                     ('max_recalled_day', 'max_recalled_day_ts'))
    for date_column, ts_column in derived_dates:
        table[date_column] = pd.to_datetime(table[ts_column], unit='s').dt.date

    print(table)


In [None]:
# Locations of the inputs and of the report, all under the same data directory.
basepath = "/Users/ddavila/projects/DOMA/data/model/"
datasets_size_path = os.path.join(basepath, "dataset.parquet")
# Alternative days input kept from the original notebook:
#days_path= basepath+"datatiers/data_tier_days_*_NANO.parquet"
days_path = os.path.join(basepath, "days_*.parquet")
outputfile = os.path.join(basepath, "reports/072018_062019_p1_3")

# Load the per-day records, the per-week records and the dataset sizes.
days_df, weeks_df, datasets_size = get_input_data(days_path, datasets_size_path)

In [None]:

# Sorted list of the weekly sets: the first element is the set of dataset IDs
# accessed in the first week, and so on.
weeks_list = get_sorted_list_of_datasets_sets(weeks_df)

# Week timestamps in ascending order, so that they line up with 'weeks_list'.
weeks_ts = weeks_df['week_ts'].sort_values().values

report = []
# Per-policy series converted with to_petabytes; they are only accumulated in
# this cell (presumably consumed by later cells -- verify).
ws_size_per_week_per_policy = []
recalled_size_per_week_per_policy = []
freed_size_per_week_per_policy = []

# Run the general algorithm once for each delete policy (1, 2 and 3).
for policy in (1, 2, 3):
    time1 = time.time()

    # STEP 1.
    # For every week, compute the working-set size and the sets of datasets
    # freed and recalled under the current delete policy.
    datasets_freed, datasets_recalled, ws_sizes = get_freed_recalled_and_ws_sizes(
        weeks_list, policy, datasets_size)
    # Largest working-set size over all weeks.
    max_ws_size = max(ws_sizes)
    ws_size_per_week_per_policy.append(to_petabytes(ws_sizes))
    time2 = time.time()
    print('STEP1 took %0.2f s' % (time2 - time1))

    # STEP 2.
    # Find the day with the most bytes recalled.
    max_recalled, max_recalled_week_ts, max_recalled_day_ts = get_day_with_max_bytes_recalled(
        datasets_recalled, weeks_ts, days_df, datasets_size)
    time3 = time.time()
    print('STEP2 took %0.2f s' % (time3 - time2))

    # STEP 3.
    # Sizes recalled and freed per week, plus their totals.
    recalled_size_per_week, freed_size_per_week, total_recalled, total_freed = get_recalled_and_freed_sizes(
        datasets_recalled, datasets_freed, datasets_size)
    recalled_size_per_week_per_policy.append(to_petabytes(recalled_size_per_week))
    freed_size_per_week_per_policy.append(to_petabytes(freed_size_per_week))
    time4 = time.time()
    print('STEP3 took %0.2f s' % (time4 - time3))
    total_time = time4 - time1
    print('Total time: %0.2f s' % total_time)

    # Record this policy's results; the per-week series are stored as
    # returned by the helpers, not the petabyte-converted copies.
    add_record_report(
        report, policy, max_ws_size, total_recalled, total_freed,
        max_recalled, max_recalled_week_ts, max_recalled_day_ts,
        weeks_ts, ws_sizes, recalled_size_per_week, freed_size_per_week,
        total_time)

# Save the report
report_df = pd.DataFrame(report)
report_df.to_parquet(outputfile)