In [20]:
"""
LOAD TEST WEB SERVER

CREATED: 13-APR-2022
LAST EDIT: 26-JUL-2022
AUTHORS: DUANE RINEHART

"""
import os
import time
import pandas as pd


win_data_dir = "E:/dev/web_server_load_test/"
linux_data_dir = "/mnt/e/dev/web_server_load_test/"


def load_app_constants():
    """
    LOAD APP/ENVIRONMENT SETTINGS (ESP. CROSS-PLATFORM)
    """
    if os.name == "nt":
        input_fullPath = os.path.join(win_data_dir)
    else:
        input_fullPath = os.path.join(linux_data_dir)
    return input_fullPath


def sizeof_fmt(num, suffix="B"):
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Yi{suffix}"


def capture_dir_listing(data_root):
    """
    PRIOR LIST OF REPRESENTATIVE DIRECTORY CONTENTS GENERATED WITH:
    [ON pons.dk.ucsd.edu]
    cd /data/neuroglancer/DK55/neuroglancer_data/C1
    DIR='325_325_20000' && ls -l $DIR | awk 'BEGIN {OFS="\t"} { print $9, $5 }' > ~/"$DIR.tsv"

    """

    input_filename = "325_325_20000.tsv"
    input_fullPath = os.path.join(data_root, input_filename)

    start = time.time()
    dfFileList = pd.read_csv(
        input_fullPath, sep="\t", names=["fname", "size_(bytes)"], header=None
    )
    dfFileList.index.name = "id"
    end = time.time()
    print(f"Read tsv: {input_fullPath}")
    print("Operation time: ", (end - start), "sec")
    return dfFileList


data_root = load_app_constants()
files_size_array = capture_dir_listing(data_root)


Read tsv: E:/dev/web_server_load_test/325_325_20000.tsv
Operation time:  0.12299537658691406 sec


In [21]:
print('n = ', len(files_size_array.index))
print('μ =', sizeof_fmt(files_size_array['size_(bytes)'].mean()))
print('median = ', sizeof_fmt(files_size_array['size_(bytes)'].median()))
print('σ = ', files_size_array['size_(bytes)'].std())
print('AGGREGATE FOLDER SIZE:', sizeof_fmt(files_size_array['size_(bytes)'].sum()))
#linux command: du -smh

n =  98165
μ = 3.1MiB
median =  18.7KiB
σ =  5261847.595657444
AGGREGATE FOLDER SIZE: 299.9GiB


In [22]:
#REPRESENTATIVE IMAGES DURING SESSION SUM IS BASED ON HISTORICAL DAILY AVERAGE (https://activebrainatlas.ucsd.edu/awstats/awstats.pl)
#MAR, 2022 AVG 8.82GB
aggregate_threshold = (8.82e+9) # IN BYTES
#aggregate_threshold = (2000) # IN BYTES
aggregate_file_size = 0
files_size_array['selected'] = ' ' # ADD TO EXISTING DATAFRAME

for i in files_size_array.sample(frac=1).iterrows(): #ref: https://stackoverflow.com/questions/43509114/randomly-sample-rows-of-a-dataframe-until-the-desired-sum-of-a-column-is-reached
    if (aggregate_file_size + i[1]['size_(bytes)']) <= aggregate_threshold:
        aggregate_file_size += i[1]['size_(bytes)']
        files_size_array.at[i[0], 'selected'] = 1

# SAVE RESULTS TO FILE ('size_(bytes)' COLUMN SUM SHOULD EQUAL aggregate_threshold)
data_root = load_app_constants()
out_filename = 'load_test_target_transfer_list.xlsx'
output_fullPath = os.path.join(data_root, out_filename)
files_size_array.index.name = 'id'
files_size_array[files_size_array['selected']==1].to_excel(output_fullPath, columns=['fname','size_(bytes)'])
files_size_array.to_excel(out_filename)

In [62]:
import pandas as pd
import wget
from timeit import default_timer as timer
import glob

out_filename = 'load_test_results.xlsx'
base_url = 'https://activebrainatlas.ucsd.edu/data/DK59/neuroglancer_data/C2/325_325_20000/'

def speed_test(url):
    local_file = 'test.del'
    start_time = timer()
    wget.download(url, local_file)
    end_time = timer()
    elapsed_time = round(end_time - start_time, 2)

    #CLEANUP
    fileList = glob.glob('*.del')
    for filePath in fileList:
        try:
            os.remove(filePath)
        except:
            print("Error while deleting file : ", filePath)

    return elapsed_time

files_size_array['transfer_time (s)'] = 0 #NEW COLUMN NAME; DEFAULT VALUE = 0

selected_files = files_size_array.loc[files_size_array['selected'] == 1]

for index, row in selected_files.iterrows():
    #for testing
    if index>200:
        continue
    filename = row['fname']
    url = base_url + filename
    elapsed_time = speed_test(url)
    print(index, url)
    print(f"TRANSFER TIME: {elapsed_time}s")

    #UPDATE DATAFRAME WITH NEW TRANSFER DATE
    files_size_array.loc[files_size_array['fname'] == filename, 'transfer_time'] = elapsed_time

files_size_array.to_excel(out_filename)

44 https://activebrainatlas.ucsd.edu/data/DK59/neuroglancer_data/3072_C2/325_325_20000/0-3072_0-3072_137-138.gz
TRANSFER TIME: 0.54s
73 https://activebrainatlas.ucsd.edu/data/DK59/neuroglancer_data/3072_C2/325_325_20000/0-3072_0-3072_163-164.gz
TRANSFER TIME: 0.06s
98 https://activebrainatlas.ucsd.edu/data/DK59/neuroglancer_data/3072_C2/325_325_20000/0-3072_0-3072_186-187.gz
TRANSFER TIME: 0.06s
150 https://activebrainatlas.ucsd.edu/data/DK59/neuroglancer_data/3072_C2/325_325_20000/0-3072_0-3072_233-234.gz
TRANSFER TIME: 0.06s
198 https://activebrainatlas.ucsd.edu/data/DK59/neuroglancer_data/3072_C2/325_325_20000/0-3072_0-3072_276-277.gz
TRANSFER TIME: 0.08s


In [61]:
#files_size_array.at[42, 'transfer_time'] = 22
files_size_array

Unnamed: 0_level_0,fname,size_(bytes),selected,transfer_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,0.0
1,0-3072_0-3072_0-1.gz,19099.0,,0.0
2,0-3072_0-3072_100-101.gz,19099.0,,0.0
3,0-3072_0-3072_101-102.gz,19099.0,,0.0
4,0-3072_0-3072_10-11.gz,19099.0,,0.0
...,...,...,...,...
98161,9216-12288_9216-12288_96-97.gz,9099412.0,,0.0
98162,9216-12288_9216-12288_97-98.gz,6968951.0,,0.0
98163,9216-12288_9216-12288_98-99.gz,14700323.0,,0.0
98164,9216-12288_9216-12288_99-100.gz,14452161.0,,0.0
