In [3]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from cache import XCacheSite, load_data

matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)

step = 10000

MB = 1024 * 1024
GB = 1024 * MB
TB = 1024 * GB
PB = 1024 * TB

sites = ['DESY-HH', 'LRZ-LMU', 'MPPMU']  
periods = ['AUG']  # must be listed in order
kinds = ['prod']
skipFiles = []  # ':AOD.']

label = 'DE2x40TB_2x500int2'
output = label + '_' + '_'.join(kinds) + '_' + '_'.join(periods) + '_' + '_'.join(sites)
title = label + '\n' + ','.join(kinds) + ' ' + ' '.join(periods) + '\n' + ','.join(sites)

all_data = load_data(sites, periods, kinds, skipFiles)

# all_data = all_data[:100000]

# create caching network
all_sites = {}


# flat US with one central cache.
all_sites['xc_DESY-HH'] = XCacheSite('xc_DESY-HH', upstream='xc_Int2', servers=2, size=40 * TB)
all_sites['xc_LRZ-LMU'] = XCacheSite('xc_LRZ-LMU', upstream='xc_Int2', servers=2, size=40 * TB)
all_sites['xc_MPPMU'] = XCacheSite('xc_MPPMU', upstream='xc_Int2', servers=2, size=40 * TB)
all_sites['xc_Int2'] = XCacheSite('xc_Int2', upstream='Origin', servers=2, size=500 * TB)
all_sites['Origin'] = XCacheSite('Origin', upstream='none')


DESY-HH AUG prod 800852
LRZ-LMU AUG prod 323247
MPPMU AUG prod 450000
      site month  kind   files  unique files  total size [PB]  \
0  DESY-HH   AUG  prod  800852        294909         1.207999   
1  LRZ-LMU   AUG  prod  323247        177722         0.544677   
2    MPPMU   AUG  prod  450000        143315         0.921967   

   avg. filesize [GB]  
0            1.581664  
1            1.766870  
2            2.148340  
---------- merged data -----------
1606021 files	 596074 unique	 2.6746433980574595 PB	 1.781696625028984 GB avg. file size


In [None]:
print('---------- start requests ----------')
acs = []
dac = []
accesses = [0, 0, 0]
dataaccc = [0, 0, 0]
count = 0

for index, row in all_data.iterrows():

    count += 1

    if count > 200000000:
        break

    if not count % step and count > 0:
        # print(count, accesses, dataaccc)
        acs.append(accesses.copy())
        dac.append(dataaccc.copy())
        pacce = []
        pdata = []
        for i in range(len(accesses)):
            pacce.append(accesses[i] / sum(accesses))
            pdata.append(dataaccc[i] / sum(dataaccc))
        print(count, pacce, pdata)

    if row.site not in all_sites:
        continue

    fs = row.filesize
    ts = row.transfer_start
    l0 = all_sites[row.site]
    found = l0.add_request(index, fs, ts)
    if found:
        accesses[0] += 1
        dataaccc[0] += fs
        continue

    l1 = all_sites[l0.upstream]
    found = l1.add_request(index, fs, ts)
    if found:
        accesses[1] += 1
        dataaccc[1] += fs
        continue

    l2 = all_sites[l1.upstream]
    found = l2.add_request(index, fs, ts)
    if found:
        accesses[2] += 1
        dataaccc[2] += fs
        continue

    l3 = all_sites[l2.upstream]
    found = l3.add_request(index, fs, ts)
    if found:
        accesses[3] += 1
        dataaccc[3] += fs
        continue


print('final: ', accesses, dataaccc)


accdf = pd.DataFrame(acs)
dacdf = pd.DataFrame(dac)

dacdf = dacdf / TB


# ### ploting results

accdf.columns = ['level 1', 'level 2',  'origin']
dacdf.columns = ['level 1', 'level 2',  'origin']

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.suptitle(title, fontsize=18)

accdf.plot(ax=axs[0][0])
axs[0][0].set_ylabel('hits')
axs[0][0].set_xlabel('requests [x' + str(step) + ']')
axs[0][0].legend()

dacdf.plot(ax=axs[1][0])
axs[1][0].set_ylabel('data delivered [TB]')
axs[1][0].set_xlabel('requests [x' + str(step) + ']')
axs[1][0].legend()

accdf = accdf.div(accdf.sum(axis=1), axis=0)
dacdf = dacdf.div(dacdf.sum(axis=1), axis=0)

accdf.plot(ax=axs[0][1])
axs[0][1].set_ylabel('hits [%]')
axs[0][1].set_xlabel('requests [x' + str(step) + ']')
axs[0][1].grid(axis='y')
axs[0][1].legend()

dacdf.plot(ax=axs[1][1])
axs[1][1].set_ylabel('data delivered [%]')
axs[1][1].set_xlabel('requests [x' + str(step) + ']')
axs[1][1].grid(axis='y')
axs[1][1].legend()

# plt.show()

fig.savefig('filling_up_' + output + '.png')


# Network states

tp = []
st = pd.DataFrame()
for site in all_sites:
    s = all_sites[site]
    si = [site.replace('xc_', ''), s.requests, s.hits, s.data_asked_for / TB, s.data_from_cache / TB]
    if s.requests > 0 and site != 'Origin':
        st = pd.concat([st, s.get_servers_stats()])
    tp.append(si)
    if s.requests > 0:
        s.plot_throughput()

print(st.groupby(['site']).mean())

sites = pd.DataFrame(tp)
sites.columns = ['xcache', 'requests', 'hits', 'data asked for', 'delivered from cache']
sites = sites[sites.requests != 0]
sites.set_index('xcache', drop=True, inplace=True)
print(sites.head(20))

fig, ax = plt.subplots(figsize=(8, 8))
fig.suptitle(title, fontsize=18)
sites.plot(kind="bar", ax=ax, secondary_y=['data asked for', 'delivered from cache'])
ax.right_ax.set_ylabel('[TB]')
fig.savefig('xcache_sites_' + output + '.png')


---------- start requests ----------
10000 [0.49364936493649364, 0.0009000900090009, 0.5054505450545055] [nan, nan, nan]
20000 [0.5170258512925646, 0.0076503825191259565, 0.4753237661883094] [nan, nan, nan]
30000 [0.4675822527417581, 0.02123404113470449, 0.5111837061235375] [nan, nan, nan]
40000 [0.4223105577639441, 0.025075626890672265, 0.5526138153453837] [nan, nan, nan]
50000 [0.38380767615352307, 0.020340406808136164, 0.5958519170383407] [nan, nan, nan]
60000 [0.3760062667711129, 0.017816963616060267, 0.6061767696128268] [nan, nan, nan]
70000 [0.3768053829340419, 0.01727167530964728, 0.6059229417563108] [nan, nan, nan]
80000 [0.3649295616195202, 0.016662708283853547, 0.6184077300966262] [nan, nan, nan]
90000 [0.35717063522928033, 0.016322403582262026, 0.6265069611884576] [nan, nan, nan]
100000 [0.37291372913729137, 0.016730167301673017, 0.6103561035610356] [nan, nan, nan]
110000 [0.3862398749079537, 0.016118328348439532, 0.5976417967436067] [nan, nan, nan]
120000 [0.392869940582838