In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import directory_functions as dirfuncs
import gather_middleware_statistics as gmws
import gather_memtier_statistics as gmts
import cut_away_warmup_cooldown as cut
import math
# Root directory of the experiment run analysed in this notebook.
# The original absolute path stays the default so existing setups keep working;
# set the ASL_EXP_DIR environment variable to analyse a copy stored elsewhere.
exp_dir = os.environ.get(
    "ASL_EXP_DIR",
    "/home/flo/Documents/eth-asl-final-experiment-data/exp4/4_detailedrun_two_2017-11-29_165453",
)

In [None]:
def gather_requests(exp_dir, reps, middlewares):
    """Gather the trimmed request metrics of every repetition and middleware.

    For each entry of ``reps`` the pipeline runs stage by stage over all names
    in ``middlewares``:

    1. ``dirfuncs.get_only_subdir`` on ``<exp_dir>/<rep>/<middleware>``
    2. ``gmws.concatenate_requestlogs`` on each directory found
    3. ``gmws.extract_metrics`` on each concatenated log
    4. ``cut.cut_away_warmup_cooldown(..., 10, 72)`` on each metrics object

    Returns ``pd.concat`` over all trimmed objects (one per repetition and
    middleware); ``pd.concat`` raises ``ValueError`` when nothing was gathered.
    """
    trimmed = []
    for rep in reps:
        log_dirs = []
        for mw_name in middlewares:
            log_dirs.append(dirfuncs.get_only_subdir(os.path.join(exp_dir, str(rep), mw_name)))

        request_logs = []
        for log_dir in log_dirs:
            request_logs.append(gmws.concatenate_requestlogs(log_dir))

        extracted = []
        for request_log in request_logs:
            extracted.append(gmws.extract_metrics(request_log))

        # 10 and 72 are handed to the trimming helper unchanged.
        # NOTE(review): presumably the warm-up / cool-down bounds -- confirm in cut_away_warmup_cooldown.
        for per_middleware in extracted:
            trimmed.append(cut.cut_away_warmup_cooldown(per_middleware, 10, 72))

    return pd.concat(trimmed)


In [None]:
# Step width used when the memtier response-time histograms are interpolated
# (passed as the `step` of the response-time grid in interpolate_histogram).
# NOTE(review): unit follows memtier's response-time column -- presumably ms, confirm.
granularity = 0.1

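# Helpers for reading the response-time histogram and throughput from a single memtier logfile
# (despite the "GET" in their names they select the "SET"/"Sets" lines by default)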
def parse_line(line):
    """Return the 2nd and 3rd whitespace-separated fields of ``line`` as a tuple of strings.

    Raises ``IndexError`` when the line has fewer than three fields.
    """
    fields = line.split()
    second, third = fields[1], fields[2]
    return (second, third)
def extract_GET_histogram(client_logfile_path, line_prefix="SET"):
    """Collect the histogram rows of one operation type from a client logfile.

    Every line of the file that starts with ``line_prefix`` is passed to
    ``parse_line``, which yields its 2nd and 3rd fields as strings.

    Parameters:
        client_logfile_path: path of the logfile to read.
        line_prefix: prefix selecting the rows to keep. Despite the function
            name, the default is ``"SET"`` (the rows this notebook has always
            read); pass ``"GET"`` to read the GET rows instead.

    Returns:
        list of ``(field_1, field_2)`` string tuples in file order.
        NOTE(review): presumably (response time, cumulative percentile) -- confirm
        against the memtier output format.
    """
    with open(client_logfile_path, "r") as logfile:
        return [parse_line(line) for line in logfile if line.startswith(line_prefix)]
def extract_GET_throughput(client_logfile_path, row_label="Sets"):
    """Read the throughput figure of one summary row from a client logfile.

    The first line starting with ``row_label`` is split on whitespace and its
    2nd field is returned as a float.

    Parameters:
        client_logfile_path: path of the logfile to read.
        row_label: prefix of the summary row to use. Despite the function
            name, the default is ``"Sets"`` (the row this notebook has always
            read); pass ``"Gets"`` for the GET row.

    Returns:
        The parsed float, or ``None`` when no line starts with ``row_label``.
        NOTE(review): presumably operations per second -- confirm against the
        memtier output format.
    """
    with open(client_logfile_path, "r") as logfile:
        for line in logfile:
            if line.startswith(row_label):
                return float(line.split()[1])
    # No matching row: keep the historical "no result" value, but explicitly.
    return None
def interpolate_histogram(hist, granularity):
    """Linearly interpolate a two-column histogram on an evenly spaced grid.

    The grid runs from the first row's column-0 value up to (but excluding,
    as ``np.arange`` does) the last row's column-0 value in steps of
    ``granularity``. Column 1 is interpolated at those grid points.

    Returns a ``(grid, interpolated_values)`` tuple of arrays.
    """
    x_values = hist[:, 0]
    y_values = hist[:, 1]
    grid = np.arange(hist[0, 0], hist[-1, 0], granularity)
    return grid, np.interp(grid, x_values, y_values)
def calculate_numrequests(interpolated, xput, granularity):
    """Turn a cumulative percentile curve into per-step request counts.

    ``interpolated`` is a pair of equally long arrays (response times,
    percentiles). The points are ordered by response time; for every point
    except the first, ``requests`` is the difference between its own
    ``percentile * xput / 100`` and that of the preceding point.

    ``granularity`` is accepted for signature compatibility but not used.

    Returns a DataFrame with the columns ``responsetime``, ``percentile`` and
    ``requests`` (one row fewer than there are input points).
    """
    points = np.vstack(interpolated).transpose()
    ordered = points[np.argsort(points[:, 0])]

    # Pair every point with its predecessor on the response-time axis.
    previous, current = ordered[:-1, :], ordered[1:, :]

    frame = pd.DataFrame({'responsetime': current[:, 0], 'percentile': current[:, 1]})
    cumulative_here = frame['percentile'] * xput / 100
    cumulative_before = pd.Series(previous[:, 1]) * xput / 100
    frame['requests'] = cumulative_here - cumulative_before
    return frame
def combine_multiple_histograms(histogram_arrays, throughputs, granularity):
    """Merge several histograms into one table of request counts per response time.

    Every histogram is first interpolated with ``interpolate_histogram``; each
    interpolated curve is then paired with its throughput and converted by
    ``calculate_numrequests``. The resulting frames are stacked and the
    ``requests`` column is summed per distinct ``responsetime`` value.

    Returns a DataFrame with the columns ``responsetime`` and ``requests``.
    """
    curves = []
    for hist in histogram_arrays:
        curves.append(interpolate_histogram(hist, granularity))

    frames = []
    for curve, xput in zip(curves, throughputs):
        frames.append(calculate_numrequests(curve, xput, granularity))

    stacked = pd.concat(frames)
    summed = stacked.groupby('responsetime').agg({'requests': 'sum'})
    return summed.reset_index()

def wquantile(x, q):
    """Weighted quantile of a two-column frame.

    Parameters:
        x: DataFrame whose first column holds the values and whose second
           column holds the weight of each value. ``x`` is not modified.
        q: quantile as a fraction in ``[0, 1]`` (e.g. ``0.95``).

    Returns:
        A single-element Series (taken from the first column, keeping the
        row's original index label) holding the smallest value at which the
        running weight total, accumulated in ascending value order, reaches
        ``q`` times the total weight.

    Raises:
        ValueError: if ``x`` has no rows or ``q`` lies outside ``[0, 1]``.
    """
    if len(x) == 0:
        raise ValueError("wquantile() needs at least one row")
    if not 0 <= q <= 1:
        raise ValueError("q must be a fraction in [0, 1], got {!r}".format(q))

    value_col, weight_col = x.columns[0], x.columns[1]
    ordered = x.sort_values(value_col)
    threshold = q * x[weight_col].sum()

    # One cumulative sum replaces the per-step boolean-mask lookups.
    running_total = np.cumsum(ordered[weight_col].values)
    reached = np.flatnonzero(running_total >= threshold)

    # Rounding can leave the final running total marginally below the
    # threshold (e.g. q == 1); fall back to the last row instead of indexing
    # past the end.
    position = reached[0] if reached.size else len(ordered) - 1
    return ordered[value_col].iloc[[position]]

def extract_percentiles(data, percentiles):
    """Tabulate the weighted quantile of ``data`` for each requested fraction.

    ``wquantile(data, perc)`` is evaluated for every ``perc`` in
    ``percentiles`` and its first value is recorded.

    Returns a DataFrame with the columns ``percentile`` and ``responsetime``,
    one row per requested fraction, in the order given.
    """
    rows = []
    for perc in percentiles:
        quantile_value = wquantile(data, perc).values[0]
        rows.append((perc, quantile_value))
    return pd.DataFrame(data=rows, columns=['percentile', 'responsetime'])

In [None]:
# Select the run to analyse: the directory name encodes the worker count.
num_workers = 8
run = "writeOnly_32vc{}workers".format(num_workers)
inputdir = os.path.join(exp_dir, run)

# Repetition sub-directories 1..3 and the middleware log folders inside each.
reps_range = range(1, 4)
middlewares = ["middleware_04", "middleware_05"]
reqs = gather_requests(inputdir, reps_range, middlewares)

# Client logfiles read for the histograms and throughputs, per repetition.
client_logfiles = [
    "client_01_0.log", "client_01_1.log",
    "client_02_0.log", "client_02_1.log",
    "client_03_0.log", "client_03_1.log",
]
reps = 3
filepaths = []
for rep_number in range(1, reps + 1):
    for logfile_name in client_logfiles:
        filepaths.append(os.path.join(inputdir, str(rep_number), logfile_name))

# One histogram array and one throughput figure per client logfile, then merged.
histograms = []
for filepath in filepaths:
    histograms.append(np.array(extract_GET_histogram(filepath), dtype=float))
throughputs = []
for filepath in filepaths:
    throughputs.append(extract_GET_throughput(filepath))
hist_data = combine_multiple_histograms(histograms, throughputs, granularity)


In [None]:
# --- Middleware side -------------------------------------------------------
# Per repetition: locate each middleware's log directory, concatenate and
# parse its request logs, trim them with the same (10, 72) arguments used in
# gather_requests, aggregate per window and then across the middlewares.
all_reps = []
for rep in reps_range:

    log_folder_path = os.path.join(inputdir, str(int(rep)))

    # Extract the per-request metrics from each middleware's logs.
    # NOTE(review): which metrics these are is decided inside gmws.extract_metrics;
    # the report cell below reads 'queueLength', 'responseTime_ms' and 'queueTime_ms'.
    middleware_dirs = [dirfuncs.get_only_subdir(os.path.join(log_folder_path, mw_dir)) for mw_dir in middlewares]
    concatenated_requests = [gmws.concatenate_requestlogs(middleware_dir) for middleware_dir in middleware_dirs]
    # The comprehension variable `reqs` below has the same name as the
    # module-level `reqs` frame; in Python 3 it is local to the comprehension.
    metrics = [gmws.extract_metrics(reqs) for reqs in concatenated_requests]
    cut_metrics = [cut.cut_away_warmup_cooldown(mets, 10, 72) for mets in metrics]
    windows = [gmws.aggregate_over_windows(cut_mets) for cut_mets in cut_metrics]
    rep_metrics = gmws.aggregate_over_middlewares(windows)
    all_reps.append(rep_metrics)

# Collapse the repetitions, then the time steps; `avg` is the row labelled 'mean'.
mw_agg_over_reps = gmws.aggregate_over_reps(all_reps)
mw_averages = gmws.aggregate_over_timesteps(mw_agg_over_reps)
avg = mw_averages.loc['mean', :]

# --- Client (memtier) side -------------------------------------------------
# Per repetition: aggregate all client logfiles and trim the result with the
# same (10, 72) arguments as on the middleware side.
all_rep_windows = []
for rep in reps_range:

    log_folder_path = os.path.join(inputdir, str(rep))

    client_logfile_paths = [os.path.join(log_folder_path, client_logfile) for client_logfile in client_logfiles]
    window = gmts.aggregate_over_clients(client_logfile_paths)
    window = cut.cut_away_warmup_cooldown(window, 10, 72)
    all_rep_windows.append(window)

# Collapse the repetitions, then the time steps.
mt_agg_over_reps = gmts.aggregate_over_reps(all_rep_windows)
mt_averages = gmts.aggregate_over_timesteps(mt_agg_over_reps)

In [None]:
# Number of servers m used by the queueing model below: twice num_workers.
# NOTE(review): presumably one worker pool per entry of `middlewares` (two entries) -- confirm.
m = num_workers * 2
def calculate_prob_zero_jobs(rho, m):
    """Probability that an M/M/m queueing system holds zero jobs.

    Evaluates

        p0 = 1 / (1 + (m*rho)**m / (m! * (1 - rho)) + sum_{n=1}^{m-1} (m*rho)**n / n!)

    Parameters:
        rho: traffic intensity (utilisation per server), ``0 <= rho < 1``.
        m: number of servers, an integer ``>= 1``.

    Returns:
        p0 as a float.

    Raises:
        ValueError: if ``m < 1`` or ``rho`` is outside ``[0, 1)``. The formula
            divides by ``1 - rho`` and is only meaningful for a stable system,
            so such inputs would otherwise fail obscurely or yield a value
            that is not a probability.
    """
    if m < 1:
        raise ValueError("m must be at least 1, got {!r}".format(m))
    if not 0 <= rho < 1:
        raise ValueError("rho must satisfy 0 <= rho < 1 for a stable M/M/m system, got {!r}".format(rho))

    offered_load = m * rho
    all_servers_busy_term = (offered_load ** m) / (math.factorial(m) * (1 - rho))
    partially_busy_terms = sum(offered_load ** n / math.factorial(n) for n in range(1, m))
    return 1 / (1 + all_servers_busy_term + partially_busy_terms)
    
# Model inputs taken from the measurements.
# The factors of 1000 convert between the *_ms columns and per-second rates.
arrivalrate_lambda = mt_averages.loc['mean', 'throughput']
servicetime_ms = (
    reqs['workerPreProcessingTime_ms']
    + reqs['memcachedRTT_ms']
    + reqs['workerPostProcessingTime_ms']
).mean()
servicerate_mu = 1000 / servicetime_ms

# rho = lambda / (m * mu): average utilisation of each server.
trafficIntensity_rho = arrivalrate_lambda / (m * servicerate_mu)
interarrivalTime_tao = 1000 / arrivalrate_lambda

# M/M/m: probability of an empty system and probability that an arriving job has to queue.
probability_zero_jobs = calculate_prob_zero_jobs(trafficIntensity_rho, m)
probability_of_queueing = (
    ((m * trafficIntensity_rho) ** m)
    / (math.factorial(m) * (1 - trafficIntensity_rho))
    * probability_zero_jobs
)

# Mean number of jobs waiting, and in the system as a whole (in service + waiting).
mean_number_jobs_in_queue = trafficIntensity_rho * probability_of_queueing / (1 - trafficIntensity_rho)
mean_number_jobs_in_system = m * trafficIntensity_rho + mean_number_jobs_in_queue

# Mean waiting and response time in ms, reported next to the measured middleware values.
mean_waiting_time_w = 1000 * probability_of_queueing / (m * servicerate_mu * (1 - trafficIntensity_rho))
mean_response_time_r = 1000 / servicerate_mu * (1 + probability_of_queueing / (m * (1 - trafficIntensity_rho)))

In [None]:
# Report: model predictions next to the measured values.
# NOTE(review): arrivalrate_lambda is itself mt_averages.loc['mean', 'throughput'],
# so both numbers on the next line are the same value by construction.
print("Arrival Rate Lambda: Predicted: {}, Actual Throughput: {}".format(arrivalrate_lambda, mt_averages.loc['mean', 'throughput']))
print("Service Rate Mu: Predicted: {}".format(servicerate_mu))
print("Traffic Intensity rho / Utilization per server: Predicted: {:0.4f}".format(trafficIntensity_rho))
print("Interarrival Time: Predicted: {:0.4f}, Actual interarrival time: {:0.4f}".format(interarrivalTime_tao, reqs['interarrivalTime_ms'].mean()))
# NOTE(review): servicetime_ms is printed for both "Predicted" and "Actual" here.
print("Worker Service Time: Predicted: {:0.4f}, Actual worker service time: {:0.4f}".format(servicetime_ms, servicetime_ms))
print("Probability zero jobs: Predicted: {:0.4f}".format(probability_zero_jobs))
print("Probability of queueing: Predicted: {:0.4f}".format(probability_of_queueing))
# The "Actual" figure below combines the model's utilisation term
# (num_workers*2*rho, i.e. m*rho) with the measured queue length.
print("Number of Jobs in System: Predicted: {:0.4f}, Actual Num workers in the queue: {:0.4f}".format(mean_number_jobs_in_system, (num_workers*2*trafficIntensity_rho+avg['queueLength'])))
print("Number of Jobs in Queue: Predicted: {:0.4f}, Actual Queue Length: {:0.4f}".format(mean_number_jobs_in_queue, avg['queueLength']))
print("Mean Response Time r: Predicted: {:0.4f}, Actual Response Time (Middleware): {:0.4f}".format(mean_response_time_r, avg['responseTime_ms']))
print("Mean Waiting Time w: Predicted: {:0.4f}, Actual Queue Waiting Time: {:0.4f}".format(mean_waiting_time_w, avg['queueTime_ms']))
