In [1]:
from __future__ import print_function
import datetime
import time
from functools import reduce
import os

import pyarrow.parquet as parquet
import pandas as pd
import numpy as np
%matplotlib nbagg
import matplotlib.pyplot as plt
import copy

from fastset import FastSet

In [2]:
resultDirectory = 'Results/classads3/'
if not os.path.exists(resultDirectory):
    os.mkdir(resultDirectory)

In [3]:
startTime = datetime.datetime.now()

In [4]:
# Data collected from a spark query at CERN, in pandas pickle format
# CRAB jobs only have data after Oct. 2017
# ws = pd.read_pickle("data/working_set_day.pkl.gz")
# spark returns lists, we want to use sets
# ws['working_set_blocks'] = ws.apply(lambda x: set(x.working_set_blocks), 'columns')
# ws['working_set'] = ws.apply(lambda x: set(x.working_set), 'columns')

# Reading from working_set_classads 
ws = parquet.read_table('data/classAds2018Total').to_pandas()
#ws = parquet.read_table('data/working_set_classads').to_pandas()
# spark returns lists, we want to use sets
ws['working_set_blocks'] = ws.apply(lambda x: FastSet(x.working_set_blocks), 'columns')

In [5]:
print(ws)

                 day  isCrab           input_campaign  d_data_tier_id  \
0       1.530403e+09    True       Run2016E-03Feb2017           31223   
1       1.530403e+09    True       Run2017B-17Nov2017               9   
2       1.530403e+09    True       Run2017B-31Mar2018           31223   
3       1.530403e+09    True       RunIIFall17MiniAOD           31224   
4       1.530403e+09    True     RunIIFall17MiniAODv2           31224   
5       1.530490e+09    True       Run2017E-17Nov2017               9   
6       1.530576e+09    True  Run2016B-03Feb2017_ver2           31223   
7       1.530576e+09    True       Run2017B-31Mar2018           31223   
8       1.530576e+09    True      Run2018A-PromptReco           31223   
9       1.530576e+09    True       RunIIFall17MiniAOD           31224   
10      1.530662e+09    True                HIRun2015               9   
11      1.530662e+09    True       Run2016D-03Feb2017           31223   
12      1.530662e+09    True       Run2017F-31Mar20

In [6]:
# Isolating the data associated with US sites
#US_ws = ws.loc[ws['SiteName'].str.startswith('T2_US')]
#print(US_ws)

In [7]:
#   DBS BLOCKS table schema:
#     BLOCK_ID NOT NULL NUMBER(38)
#     BLOCK_NAME NOT NULL VARCHAR2(500)
#     DATASET_ID NOT NULL NUMBER(38)
#     OPEN_FOR_WRITING NOT NULL NUMBER(38)
#     ORIGIN_SITE_NAME NOT NULL VARCHAR2(100)
#     BLOCK_SIZE NUMBER(38)
#     FILE_COUNT NUMBER(38)
#     CREATION_DATE NUMBER(38)
#     CREATE_BY VARCHAR2(500)
#     LAST_MODIFICATION_DATE NUMBER(38)
#     LAST_MODIFIED_BY VARCHAR2(500)
if not os.path.exists('data/block_size.npy'):
    blocksize = pd.read_csv("data/dbs_blocks.csv.gz", dtype='i8', usecols=(0,5), names=['block_id', 'block_size'])
    np.save('data/block_size.npy', blocksize.values)
    blocksize = blocksize.values
else:
    blocksize = np.load('data/block_size.npy')

# We'll be accessing randomly, make a dictionary
# blocksize = {v[0]:v[1] for v in blocksize}
# We'll be accessing randomly, make a fast lookup
bsort = np.argsort(blocksize[:,0])
bsize_index = blocksize[bsort, 0]
bsize_values = blocksize[bsort, 1]
def getsize(s):
    return bsize_values[np.searchsorted(bsize_index, s._set)].sum()
def getsetsize(s):
    return bsize_values[np.searchsorted(bsize_index, s)].sum()

In [8]:
# join the data tier definitions
datatiers = pd.read_csv('data/dbs_datatiers.csv').set_index('id')
ws['data_tier'] = datatiers.loc[ws.d_data_tier_id].data_tier.values

In [9]:
date_index = np.arange(np.min(ws.day.values//86400), np.max(ws.day.values//86400)+1)
date_index_ts = np.array(list(datetime.date.fromtimestamp(day*86400) for day in date_index))

In [10]:
print(date_index_ts)

[datetime.date(2017, 5, 31) datetime.date(2017, 6, 1)
 datetime.date(2017, 6, 2) datetime.date(2017, 6, 3)
 datetime.date(2017, 6, 4) datetime.date(2017, 6, 5)
 datetime.date(2017, 6, 6) datetime.date(2017, 6, 7)
 datetime.date(2017, 6, 8) datetime.date(2017, 6, 9)
 datetime.date(2017, 6, 10) datetime.date(2017, 6, 11)
 datetime.date(2017, 6, 12) datetime.date(2017, 6, 13)
 datetime.date(2017, 6, 14) datetime.date(2017, 6, 15)
 datetime.date(2017, 6, 16) datetime.date(2017, 6, 17)
 datetime.date(2017, 6, 18) datetime.date(2017, 6, 19)
 datetime.date(2017, 6, 20) datetime.date(2017, 6, 21)
 datetime.date(2017, 6, 22) datetime.date(2017, 6, 23)
 datetime.date(2017, 6, 24) datetime.date(2017, 6, 25)
 datetime.date(2017, 6, 26) datetime.date(2017, 6, 27)
 datetime.date(2017, 6, 28) datetime.date(2017, 6, 29)
 datetime.date(2017, 6, 30) datetime.date(2017, 7, 1)
 datetime.date(2017, 7, 2) datetime.date(2017, 7, 3)
 datetime.date(2017, 7, 4) datetime.date(2017, 7, 5)
 datetime.date(2017, 7, 

In [11]:
tic = time.time()

#is_crab = ws.SubmissionTool=='crab3'
#is_miniaod = ws.data_tier.str.contains('MINIAOD')
#is_us = ws.SiteName.str.match('T[23]_US')
#is_ucsd = ws.SiteName.str.match('T[23]_US_UCSD')
#ws_filtered = ws[is_crab & is_miniaod & is_ucsd]
ws_filtered = ws[ws.isCrab == True]

blocks_day = []
for i, day in enumerate(date_index):
    today = (ws_filtered.day==day*86400)
    blocks_day.append(reduce(FastSet.union, ws_filtered[today].working_set_blocks, FastSet()))

print("Done assembling blocklists")

nrecords = np.zeros_like(date_index)
lifetimes = {
    '1w': 7,
    '1m': 30,
    '3m': 90,
    '6m': 120,
}
ws_size = {k: np.zeros_like(date_index) for k in lifetimes}
nrecalls = {k: np.zeros_like(date_index) for k in lifetimes}
recall_size = {k: np.zeros_like(date_index) for k in lifetimes}
previous = {k: FastSet() for k in lifetimes}

# for i, day in enumerate(date_index):
#     nrecords[i] = ws_filtered[(ws_filtered.day==day*86400)].size
#     for key in lifetimes:
#         current = reduce(FastSet.union, blocks_day[max(0,i-lifetimes[key]):i+1], FastSet())
#         recall = current - previous[key]
#         nrecalls[key][i] = len(recall)
#         if (i > 390):
#             print(current)
#         ws_size[key][i] = getsize(current)
#         recall_size[key][i] = getsize(recall)
#         previous[key] = current
#     if i%30==0:
#         print("Day ", i)

# elapsed = time.time() - tic
# print("Done in %.1f seconds" % elapsed)

Done assembling blocklists


In [12]:
# block_dict is a dictionary that holds the lists of blocks
# for all of the days for which the lists are nonzero
# Keys: Day
# Values: List of Blocks that were accessed on that Day
block_dict = {}
i=0
for el in blocks_day:
    i=i+1
    if len(el)>0:
        block_dict[i] = el

print("Merging daily block lists into one block set")
block_list = []
for i in range(len(blocks_day)):
    block_list += blocks_day[i]
# block_set is a set of all unique blocks.
# This can be used to isolate properties of individual blocks
# (e.g. how many times a block is accessed)
block_set = set(block_list)
print("Block Set Created")

Merging daily block lists into one block set
Block Set Created


In [13]:
# Rounds down the number to the multiple specified by the divisor
def round_down(num, divisor):
    return num - (num % divisor)

# Counts the frequencies of the quantities of blocks reused in an
# n day period (over all given time) where n = threshold
# Parameters: threshold - integer that determines the range of days (counted back from the given day)
#                         over which the blocks are counted
#             bucket - integer that determines "tolerance" for quantity of blocks such that
#                      the blocks are counted together (e.g. a bucket of 1000 means that a
#                      quantity of blocks less than 2000 would be counted as part of the
#                      1000 bucket because it is being rounded down)
def countBlockReuseFreq(threshold, bucket):
    block_reuse_dict = {}
    for day in block_dict:
        try:
            b = block_dict[day]
            i = day
            for i in range(day+1, day+threshold+1):
                b = b.union(block_dict[i])
            # Rounds down to the bin and checks to see if such a bin exists in the dictionary
            if round_down(len(b), bucket) in block_reuse_dict:
                block_reuse_dict[round_down(len(b), bucket)] += 1
            else:
                block_reuse_dict[round_down(len(b), bucket)] = 1
        except Exception as e:
            emptykey = e
            
    # Returns block_reuse_dict
    # Keys: Number of Blocks reused within the given threshold
    #       (organized by the given buckets)
    # Values: Frequencies of occurrence
    return block_reuse_dict

# Counts the frequencies of the total number of bytes reused in an
# n day period (over all given time) where n = threshold
# Parameters: threshold - integer that determines the range of days (counted back from the given day)
#                         over which the blocks are counted
#             bucket - integer that determines "tolerance" for bytes such that
#                      the bytes are counted together (e.g. a bucket of 1000 means that a
#                      size of less than 2000 bytes would be counted as part of the
#                      1000 byte bucket because it is being rounded down)
def countByteReuseFreq(threshold, bucket):
    byte_reuse_dict = {}
    for day in block_dict:
        try:
            b = block_dict[day]
            i = day
            for i in range(day+1, day+threshold+1):
                b = b.union(block_dict[i])
            # Counting the number of bytes in b
            bSize = 0
            bSize += getsize(b)
            # Rounds down to the bin and checks to see if such a bin exists in the dictionary
            if round_down(bSize, bucket) in byte_reuse_dict:
                # Adding the number of bytes into the given block into the current bin
                byte_reuse_dict[round_down(bSize, bucket)] += 1
            else:
                byte_reuse_dict[round_down(bSize, bucket)] = 1
        except Exception as e:
            emptykey = e
    
    # Returns byte_reuse_dict
    # Keys: Bytes reused within the given threshold (organized by the given buckets)
    # Values: Frequencies of occurrence
    return byte_reuse_dict

# Counts the frequencies of the total number of bytes reused in an
# n day period (over all given time) where n = threshold
# Returns them as a fraction: (Reused Bytes for a given day) / (Bytes accessed for a given day) 
# Parameters: threshold - integer that determines the range of days (counted back from the given day)
#                         over which the blocks are counted
#             bucket - integer that determines "tolerance" for percentage of byte usage 
#                      such that the percentage of byte usage are counted together (e.g. 
#                      a bucket of 0.05 means that a size of less than 0.10 would be counted as
#                      part of the 0.05 percentage bucket because it is being rounded down)
def countByteFracReuseFreq(threshold, bucket):
    byte_reuse_dict = {}
    for day in block_dict:
        try:
            b = block_dict[day]
            i = day
            tSize = 0
            # Populating tSize with the bytes accessed in the given day
            tSize = getsize(block_dict[day])
            for i in range(day+1, day+threshold+1):
                b = b.union(block_dict[i])
                # Counting the number of bytes accessed in the days within the threshold
                tSize += getsize(block_dict[i])
            # Counting the number of bytes in b
            bSize = 0
            bSize += getsize(b)
            percent = float(bSize)/float(tSize)
            # Rounds down to the bin and checks to see if such a bin exists in the dictionary
            if round_down(percent, bucket) in byte_reuse_dict:
                # Adding the number of bytes into the given block into the current bin
                byte_reuse_dict[round_down(percent, bucket)] += 1
            else:
                byte_reuse_dict[round_down(percent, bucket)] = 1
        except Exception as e:
            emptykey = e
    
    # Returns byte_reuse_dict
    # Keys: Bytes reused within the given threshold (organized by the given buckets)
    # Values: Frequencies of occurrence
    return byte_reuse_dict

In [14]:
endTime = datetime.datetime.now()
timeDifference = endTime-startTime
print("Total Runtime:")
print(timeDifference)

Total Runtime:
0:00:17.497963


In [15]:
# Finds the number of days total in working_set_day
print("Number of days in working_set_day:")
print(len(list(ws['day'].unique())))
# Finds the number of days covered in the dbs_blocks dataset
print("Number of days covered:")
print(len(date_index))
# Find the dates that are covered
print("Dates that are covered:")
print(str(date_index_ts[0]) 
      + ' to '
      + str(date_index_ts[len(date_index_ts)-1]))
# Finds the number of unique input_campaigns and outputs them
print("Number of unique input_campaigns:")
print(len(list(ws['input_campaign'].unique())))
# Finds the total number of jobs (assuming njobs is the number
# of jobs ran per row)
# print("Number jobs total:")
# print(ws['njobs'].sum())
# Finds the number of blocks total in the database
print("Total number of blocks in dbs_blocks:")
print(len(blocksize))
# Finds the total number of blocks accessed (in the given working set)
# in the database
accessed_block_set = set()
for block in ws['working_set_blocks']:
    accessed_block_set.update(block)
print("Total number of unique blocks accessed in the working set:")
print(len(accessed_block_set))
# Finds the total data across all blocks
print("Total number of bytes across all blocks:")
print(str(sum(blocksize)[1]/2e15) + " PB")

Number of days in working_set_day:
385
Number of days covered:
579
Dates that are covered:
2017-05-31 to 2018-12-30
Number of unique input_campaigns:
528
Total number of blocks in dbs_blocks:
8935351
Total number of unique blocks accessed in the working set:
66472
Total number of bytes across all blocks:
180.63418737961337 PB


In [16]:
# Counts the number of blocks
def countBytesWorkingSet(bucket):
    accessed_block_set = set()
    size_dict = {}
    for block_set in ws['working_set_blocks']:
        accessed_block_set.update(block_set)
    for block in accessed_block_set:
        try: 
            bSize = getsize(FastSet({block}))
            if round_down(bSize, bucket) in accessed_block_set:
                size_dict[round_down(bSize, bucket)] += 1
            else:
                size_dict[round_down(bSize, bucket)] = 1
        except Exception as e:
            emptykey = e
    return size_dict

In [17]:
#fig, ax = plt.subplots(1,1)
#bucket = 1e3
#workingBlockDict = countBytesWorkingSet(bucket)
#ax.bar(list(workingBlockDict.keys()), list(workingBlockDict.values()), width=1)
#ax.set_title('Block Size Distribution')
#ax.set_xlabel('Blocksize (Bytes)')
#ax.set_ylabel('Number of Blocks')
#ax.set_ylim(0,None)
#ax.set_xlim(0,None)
#plt.savefig(resultDirectory + 'blockSizeDistribution.png')
#print(workingBlockDict)

In [18]:
campaign_list = list(ws['input_campaign'].unique())

# Finds the number of blocks for a given campaign
# Parameters: campaign - String detailing the name of the campaign
#                        to be searched
# Returns the number of blocks for a given campaign
def countCampaignBlocks(campaign):
    campaignBlockQuantity = 0
    campaignIndices = [i for i, 
                       x in enumerate(list(ws['input_campaign']))
                      if x == campaign]
    for i in campaignIndices:
        campaignBlockQuantity += len(ws['working_set_blocks'][i])
    return campaignBlockQuantity

# Finds the total number of bytes used in a given campaign
# Parameters: campaign - String detailing the name of the campaign
#                        to be searched
# Returns the total number of bytes used in a given campaign
def countCampaignBytes(campaign):
    campaignByteQuantity = 0
    campaignIndices = [i for i, 
                       x in enumerate(list(ws['input_campaign']))
                      if x == campaign]
    for i in campaignIndices:
        campaignByteQuantity += getsize(ws['working_set_blocks'][i])
    return campaignByteQuantity

# Creating a dictionary of quantity of blocks per campaign
# Keys: Quantity of Blocks
# Values: Quantity of Campaigns that have that Quantity of Blocks
def campaignBlockSumDict(bucket):
    campaignBlockDict = {}
    for campaign in campaign_list:
        try: 
            campaignBlockQuantity = countCampaignBlocks(campaign)
            if (round_down(campaignBlockQuantity, bucket)) in campaignBlockDict:
                campaignBlockDict[round_down(campaignBlockQuantity, bucket)] += 1
            else:
                campaignBlockDict[round_down(campaignBlockQuantity, bucket)] = 1           
        except exception as e:
            emptykey = e
    return campaignBlockDict
            
# Creating a dictionary of bytes per campaign
# Keys: Bytes
# Values: Quantity of Campaigns that have that number of bytes
def campaignByteSumDict(bucket):
    campaignByteDict = {}
    for campaign in campaign_list:
        try: 
            campaignByteQuantity = countCampaignBytes(campaign)
            if (round_down(campaignByteQuantity, bucket)) in campaignByteDict:
                campaignByteDict[round_down(campaignByteQuantity, bucket)] += 1
            else:
                campaignByteDict[round_down(campaignByteQuantity, bucket)] = 1           
        except exception as e:
            emptykey = e
    return campaignByteDict

In [19]:
fig, ax = plt.subplots(1,1)
bucket = 100
campaignBlockDict = campaignBlockSumDict(bucket)
ax.bar(list(campaignBlockDict.keys()), list(campaignBlockDict.values()), width=1e2)
ax.set_title('Block Quantity Distribution Per Campaign')
ax.set_xscale('log')
ax.set_xlabel('Number of Blocks')
ax.set_ylabel('Number of Campaigns')
ax.set_ylim(0,None)
plt.savefig(resultDirectory + 'blockQuantityPerCampaign.png')

<IPython.core.display.Javascript object>

In [20]:
# fig, ax = plt.subplots(1,1)
# bucket = 2e7
# campaignByteDict = campaignByteSumDict(bucket)
# ax.bar(campaignByteDict.keys(), campaignByteDict.values(), width=1e7)
# ax.set_title('Byte Distribution Per Campaign')
# ax.set_xscale('log')
# ax.set_xlabel('Number of Bytes')
# ax.set_ylabel('Number of Campaigns')
# ax.set_ylim(0,None)
# plt.savefig(resultDirectory + 'bytesPerCampaign.png')

In [21]:
# print(list(campaignByteDict.values()))
# print(list(campaignByteDict.keys()))

In [22]:
# Transposing the blocksize np array to be plotted
blocksizeT = np.transpose(blocksize)

fig, ax = plt.subplots(1,1)
#ax.bar(blocksizeT[0], blocksizeT[1], width=10)
#ax.bar(blocksizeT[0], blocksizeT[1])
ax.set_title('Block Size Distribution')
ax.legend(title='Bucket ' + str(bucket))
ax.set_ylabel('Size (PB)')
ax.set_xlabel('Block ID')
ax.set_ylim(0, None)
ax.set_xlim(0, None)
plt.savefig(resultDirectory + 'blockSizeDistribution.png')

<IPython.core.display.Javascript object>

No handles with labels found to put in legend.


In [23]:
# Counts quantity of unique blocks reused in an n-day period
# Parameters: threshold - integer that determines the range of
#                         days over which the blocks are counted
# Returns: A dictionary where: Keys - Dates
#                              Values - Set of unique blocks accessed
#                                       on the given date
def accessedBlockDict(threshold):
    accessed_block_dict = {}
    for day in block_dict:
        try: 
            # b is the set of all blocks accessed more than once 
            # within the n-day period
            b = block_dict[day].set()
            unique_set = block_dict[day].set()
            # t_block_set is the set of all blocks accessed in the n-day
            # time period
            t_block_set = copy.deepcopy(b)
            for i in range(day+1, day+threshold+1):
                t_block_set.update(block_dict[i])
                unique_set = unique_set.difference(block_dict[i])
            # unique_block_set is the set of all blocks accessed only once
            # in the n-day period
            b = t_block_set.difference(unique_set)
            accessed_block_dict[day] = unique_set
        except Exception as e:
            emptykey = e
    return accessed_block_dict

# Counts total quantity of blocks accessed in an n-day period
# Parameters: threshold - integer that determines the range of
#                         days over which the blocks are counted
# Returns: A dictionary where: Keys - Dates
#                              Values - Set of total blocks accessed
#                                       on the given date
def totalBlockDict(threshold):
    total_block_dict = {}
    for day in block_dict:
        try: 
            b = block_dict[day].set()
            # t_block_set is the set of all blocks accessed in the n-day
            # time period
            t_block_set = copy.deepcopy(b)
            for i in range(day+1, day+threshold+1):
                t_block_set.update(block_dict[i])
            total_block_dict[day] = t_block_set
        except Exception as e:
            emptykey = e
    return total_block_dict

# Tracks total quantity of blocks accessed in an n-day period
# and counts the number of blocks dropped for a given day step
# Parameters: threshold - integer that determines the range of
#                         days over which the blocks are counted
# Returns: A dictionary where: Keys - Dates
#                              Values - Set of blocks dropped
#                                       on the given date
def droppedBlocksDict(threshold):
    dropped_block_dict = {}
    prevOccurred = False
    prev = set()
    for day in block_dict:
        try: 
            # b is the set of blocks accessed in the previous day
            b = block_dict[day].set()
            # t_block_set is the set of all blocks accessed in the n-day
            # time period
            t_block_set = copy.deepcopy(b)
            for i in range(day+1, day+threshold+1):
                t_block_set.update(block_dict[i])
            if (prevOccurred):
                dropped_block_dict[day] = prev.difference(t_block_set.intersection(prev))
                prev.clear()
                prev.update(t_block_set)
            else:
                prev.clear()
                prev.update(t_block_set)
            prevOccurred = True
        except Exception as e:
            emptykey = e
    return dropped_block_dict

# Tracks total quantity of blocks accessed in an n-day period
# and counts the number of blocks gained for a given day step
# Parameters: threshold - integer that determines the range of
#                         days over which the blocks are counted
# Returns: A dictionary where: Keys - Dates
#                              Values - Set of blocks gained
#                                       on the given date
def gainedBlocksDict(threshold):
    gained_block_dict = {}
    prevOccurred = False
    prev = set()
    for day in block_dict:
        try: 
            # b is the set of blocks accessed in the previous day
            b = block_dict[day].set()
            # t_block_set is the set of all blocks accessed in the n-day
            # time period
            t_block_set = copy.deepcopy(b)
            for i in range(day+1, day+threshold+1):
                t_block_set.update(block_dict[i])
            if (prevOccurred):
                gained_block_dict[day] = t_block_set.difference(t_block_set.intersection(prev))
                prev.clear()
                prev.update(t_block_set)
            else:
                prev.clear()
                prev.update(t_block_set)
            prevOccurred = True
        except Exception as e:
            emptykey = e
    return gained_block_dict

In [24]:
accessed_block_dict = accessedBlockDict(14)
total_block_dict = totalBlockDict(14)
dropped_block_dict = droppedBlocksDict(14)
gained_block_dict = gainedBlocksDict(14)

In [25]:
print("Dropped Blocks: "+ str([len(x) for x in list(dropped_block_dict.values())[0:6]]))
print("Gained Blocks: " + str([len(x) for x in list(gained_block_dict.values())[0:6]]))
print("Total Blocks: " + str([len(x) for x in list(total_block_dict.values())[0:6]]))
print("Delta: " + str([len(x)-len(y) for x,y in zip(list(gained_block_dict.values())[0:6],list(dropped_block_dict.values())[0:6])]))

Dropped Blocks: [18, 290, 305, 84, 989, 3908]
Gained Blocks: [0, 0, 0, 0, 0, 8020]
Total Blocks: [6309, 6291, 6001, 5696, 5612, 4623]
Delta: [-18, -290, -305, -84, -989, 4112]


In [26]:
plt.rcParams["figure.figsize"] = (10,5)
fig, ax = plt.subplots(1,1)

dateList = []
for day in accessed_block_dict.keys():
    dateList.append(date_index_ts[day])
    
uniqueBlockQuantityList = []
for blockSet in accessed_block_dict.values():
    uniqueBlockQuantityList.append(len(list(blockSet)))
    
ax.plot(dateList, uniqueBlockQuantityList)
ax.set_title('Unique Blocks Accessed (in 2 Weeks)')
ax.set_ylabel('Number of Unique Blocks Accessed')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'uniqueBlockAccessed2Weeks.png')

<IPython.core.display.Javascript object>


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


In [27]:
# fig, ax = plt.subplots(1,1)

# uniqueByteList = []
# for blockSet in accessed_block_dict.values():
#     bSize = getsize(FastSet(blockSet))
#     uniqueByteList.append(bSize)
    
# ax.plot(dateList, uniqueByteList)
# ax.set_title('Unique Bytes Accessed (in 2 Weeks)')
# ax.set_ylabel('Number of Bytes Accessed')
# ax.set_xlabel('Date')
# ax.set_ylim(0, None)
# plt.savefig(resultDirectory + 'uniqueByteAccessed2Weeks.png')

In [28]:
dateList = []
for day in dropped_block_dict.keys():
    dateList.append(date_index_ts[day])
    
droppedBlockQuantityList = []
droppedByteList = []
for blockSet in dropped_block_dict.values():
    droppedBlockQuantityList.append(len(list(blockSet)))
    try:
        bsize = getsize(FastSet(blockSet))
        droppedByteList.append(bsize)
    except:
        print("Error Here:")
        print(blockSet)
        droppedByteList.append(0)

    
fig, ax = plt.subplots(1,1)
ax.plot(dateList, droppedBlockQuantityList)
ax.set_title('Quantity of Blocks Dropped (in 2 Weeks)')
ax.set_ylabel('Quantity of Blocks Dropped')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'droppedBlockQuantity2Weeks.png')

# print("Date List Length: " + str(len(dateList)))
# print("Dropped Byte List Length: " + str(len(droppedByteList)))
# print("Dropped Block List Length: " + str(len(droppedBlockQuantityList)))
fig, ax = plt.subplots(1,1)
ax.plot(dateList, droppedByteList)
ax.set_title('Bytes Dropped (in 2 Weeks)')
ax.set_ylabel('Bytes Dropped')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'droppedBytes2Weeks.png')

Error Here:
{17352192, 17344002, 17328132, 17352199, 17328648, 17327118, 17255951, 17286672, 16928786, 17272339, 17325074, 17324565, 16936982, 17261595, 17282591, 17272868, 17937956, 18632742, 17166980, 18630186, 18635818, 18985517, 17255470, 17255982, 17258544, 17286193, 19077169, 17259055, 18950703, 17280563, 17323062, 15986738, 17257528, 17343545, 18165814, 18165815, 16921935, 18087997, 17284155, 17322559, 17332351, 17315905, 17331268, 17331269, 19156041, 17257033, 17346633, 17274959, 18929232, 16988241, 17256533, 18641494, 18641495, 17325144, 18676556, 17259098, 17288283, 17260124, 17327707, 17996383, 16928353, 18669154, 17067105, 17259113, 17287274, 18408041, 17257071, 17324656, 17275505, 16943218, 17259631, 17261167, 17360498, 17280116, 17272439, 18910842, 18628730, 17323132, 18910845, 20122750, 18910847, 20122752, 15763585, 20122753, 17255554, 17334404, 18910848, 18910849, 20122751, 17322628, 17257095, 17291906, 16943243, 17271948, 18910860, 18910861, 17286799, 18910864, 1727962

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
dateList = []
for day in gained_block_dict.keys():
    dateList.append(date_index_ts[day])
    
gainedBlockQuantityList = []
gainedByteList = []
for blockSet in gained_block_dict.values():
    gainedBlockQuantityList.append(len(list(blockSet)))
    try:
        bsize = getsize(FastSet(blockSet))
        gainedByteList.append(bsize)
    except:
        print("Error Here:")
        print(blockSet)
        gainedByteList.append(0)
    
fig, ax = plt.subplots(1,1)
ax.plot(dateList, gainedBlockQuantityList)
ax.set_title('Quantity of Blocks Gained (in 2 Weeks)')
ax.set_ylabel('Quantity of Blocks Gained')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'gainedBlockQuantity2Weeks.png')

fig, ax = plt.subplots(1,1)
ax.plot(dateList, gainedByteList)
ax.set_title('Bytes Gained (in 2 Weeks)')
ax.set_ylabel('Bytes Gained')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'gainedBytes2Weeks.png')

Error Here:
{17256971, 18487824, 18798609, 18667545, 15940127, 18354208, 18300456, 17246255, 15934528, 17181257, 19053130, 18641494, 18670684, 18866274, 17250915, 19029607, 17872492, 18309242, 20122750, 20122751, 20122752, 20122753, 17936511, 18666624, 18669183, 19058309, 20122759, 20122760, 20122761, 20122762, 18346130, 18682004, 17471636, 20122777, 20122778, 20122779, 20122780, 18298012, 18147491, 18160804, 20122793, 20122794, 20122795, 20122796, 16979119, 20122803, 18989236, 20122805, 20122806, 15921849, 18766016, 17872578, 20122819, 18796229, 17255625, 17003723, 17253068, 17249483, 18308821, 18301654, 18558684, 16989414, 18672362, 19044586, 18497259, 18084078, 18893555, 18262772, 18979573, 18893048, 19052281, 19025155, 19045126, 18694919, 18669320, 18769679, 18668815, 18769681, 18671890, 19051285, 18769686, 16043798, 18769689, 18769690, 18029340, 18393888, 19046690, 16992546, 17268516, 18690342, 18673447, 18666794, 17249068, 18284333, 18722612, 19022138, 19031868, 18481475, 1726650

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
plt.clf()

dateList = []
for day in accessed_block_dict.keys():
    dateList.append(date_index_ts[day])
    
totalBlockQuantityList = []
for blockSet in total_block_dict.values():
    totalBlockQuantityList.append(len(list(blockSet)))
    
# Dropped/Gained blocks can only be calculated for pairs of days
# so for the first day in totalBlockQuantityList, there is no 
# corresponding gained/dropped value
# These values start on the second day, so we are setting the first
# day to have 0 dropped/gained blocks to maintain index consistency
droppedBlockQuantityList.insert(0,0)   
gainedBlockQuantityList.insert(0,0)   

fig, ax = plt.subplots(1,1)   
ax.plot(dateList, totalBlockQuantityList)
ax.errorbar(dateList, totalBlockQuantityList, yerr=[droppedBlockQuantityList, gainedBlockQuantityList])
plt.rcParams["figure.figsize"] = (10,5)
ax.set_title('Total Blocks Accessed (in 2 Weeks)')
ax.set_ylabel('Number of Total Blocks Accessed')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'totalBlockAccessed2WeeksError.png')

fig, ax = plt.subplots(1,1)   
ax.plot(dateList, totalBlockQuantityList)
plt.rcParams["figure.figsize"] = (10,5)
ax.set_title('Total Blocks Accessed (in 2 Weeks)')
ax.set_ylabel('Number of Total Blocks Accessed')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'totalBlockAccessed2Weeks.png')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
totalByteList = []
for blockSet in total_block_dict.values():
    try:
        bSize = getsize(FastSet(blockSet))
        totalByteList.append(bSize)
    except:
        print("Error Here:")
        # print(blockSet)
        totalByteList.append(0)
    
# Dropped/Gained blocks can only be calculated for pairs of days
# so for the first day in totalBlockQuantityList, there is no 
# corresponding gained/dropped value
# These values start on the second day, so we are setting the first
# day to have 0 dropped/gained blocks to maintain index consistency
droppedByteList.insert(0,0)   
gainedByteList.insert(0,0)   

Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Error Here:
Erro

In [32]:
fig, ax = plt.subplots(1,1)
ax.plot(dateList, totalByteList)
ax.set_title('Total Bytes Accessed (in 2 Weeks)')
ax.errorbar(dateList, totalByteList, yerr=[droppedByteList, gainedByteList])
ax.set_ylabel('Number of Bytes Total')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'totalByteTotal2WeeksError.png')

fig, ax = plt.subplots(1,1)
ax.plot(dateList, totalByteList)
ax.set_title('Total Bytes Accessed (in 2 Weeks)')
ax.set_ylabel('Number of Bytes Total')
ax.set_xlabel('Date')
ax.set_ylim(0, None)
plt.savefig(resultDirectory + 'totalByteTotal2Weeks.png')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# totalUniqueByteRatioList = []
# for i in range(0, len(total_block_dict.values())):
#     tSize = getsize(FastSet(list(total_block_dict.values())[i]))
#     aSize = getsize(FastSet(list(accessed_block_dict.values())[i]))
#     totalUniqueByteRatioList.append(tSize/aSize)
# for blockSet in total_block_dict.values():
#     bSize = getsize(FastSet(blockSet))
#     totalByteList.append(bSize)
    
# fig, ax = plt.subplots(1,1)
# ax.plot(dateList, totalUniqueByteRatioList)
# ax.set_title('Total Bytes/Unique Bytes Accessed (in 2 Weeks)')
# ax.set_ylabel('Total/Unique Bytes Accessed')
# ax.set_xlabel('Date')
# ax.set_ylim(0, None)
# plt.savefig(resultDirectory + 'totalUniqueByteRatio2Weeks.png')

In [34]:
# fig, ax = plt.subplots(1,1)
# ax.bar(dateList, totalByteList, width=1e5)
# ax.set_title('Total Bytes Accessed (in 2 Weeks)')
# ax.errorbar(dateList, totalByteList, yerr=[droppedByteList, gainedByteList])
# ax.set_ylabel('Number of Bytes Total')
# ax.set_xlabel('Date')
# ax.set_ylim(0, None)
#plt.savefig(resultDirectory + 'totalByteTotal2WeeksError.png')