# Performance Data Comparison

This notebook can be used either to view a summary of a single set of performance benchmark data or to compare summaries of two sets of performance benchmark data.

Set the variable below to `True` to compare two sets of data, and rerun all of the cells.

In [None]:
# When True, the "old_*" CSV files are loaded as well and plotted next to the current data;
# when False, only the current data set is summarised.
compareDatasets = False

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

matplotlib.rcParams['figure.figsize'] = 9, 6
matplotlib.rcParams['legend.loc'] = 'best'

# We'll need these packages for plotting fit lines
import statsmodels.api as sm
from patsy import dmatrices

import os.path
# Fail fast, with a readable message, when an expected input CSV is absent.
requiredInputs = [
    ('cpuStats.csv', 'Missing CPU stats file "cpuStats.csv"'),
    ('perfResults.csv', 'Missing performance results file "perfResults.csv"'),
]
if compareDatasets:
    # A comparison additionally needs the baseline ("old") files.
    requiredInputs += [
        ('old_cpuStats.csv', 'Missing old CPU stats file "old_cpuStats.csv"'),
        ('old_perfResults.csv', 'Missing old performance results file "old_perfResults.csv"'),
    ]

for inputPath, missingMessage in requiredInputs:
    assert os.path.isfile(inputPath), missingMessage

## CPU Percentage over time

We are currently sampling the CPU percentage data every `5s`. To resample at a different frequency, change the variable below to the desired frequency (in seconds) and rerun all the cells.

In [None]:
# Width, in seconds, of the time buckets over which the CPU samples are averaged.
resampleFrequency = 5

In [None]:
def processCpuData(data, frequency=None):
    """Average the CPU samples into fixed-width time buckets.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw CPU samples with a string ``timestamp`` column. The column is
        parsed and overwritten in place with datetime values.
    frequency : int, optional
        Bucket width in seconds. Defaults to the notebook-level
        ``resampleFrequency`` setting.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, holding the bucket's ``timestamp`` and the mean of
        the remaining columns, indexed by the number of seconds elapsed since
        the first bucket.
    """
    if frequency is None:
        frequency = resampleFrequency

    # Remove only the literal trailing " +0000 UTC" marker. str.strip() takes a
    # *set of characters*, so strip(' +0000 UTC') also ate trailing zeros of the
    # seconds field (e.g. "12:00:10" became "12:00:1").
    cleaned = data['timestamp'].str.replace(r'\s*\+0000 UTC\s*$', '', regex=True).str.strip()
    data['timestamp'] = pd.to_datetime(cleaned)

    meanData = data.set_index('timestamp').resample('{0}s'.format(frequency)).mean()
    meanData = meanData.reset_index()
    # The positional index 0..n-1 times the bucket width gives elapsed seconds.
    meanData = meanData.set_index(meanData.index.values * frequency)
    return meanData

# Current run: load the raw CPU samples and average them into time buckets.
cpuData = pd.read_csv('cpuStats.csv')
cpuMeanData = processCpuData(cpuData)

if compareDatasets:
    # Baseline run, processed the same way so both can share one plot.
    oldCpuData = pd.read_csv('old_cpuStats.csv')
    oldCpuMeanData = processCpuData(oldCpuData)

In [None]:
fig, ax = plt.subplots()

# Current run in blue; the baseline (when requested) is overlaid in red.
cpuMeanData.plot(ax=ax, kind='line', y='percentage', c='b')
legendLabels = ['mean']
if compareDatasets:
    oldCpuMeanData.plot(ax=ax, kind='line', y='percentage', c='r')
    legendLabels = ['after', 'before']
ax.legend(legendLabels)

ax.set(
    ylabel='CPU percentage',
    xlabel='Time since benchmark started (seconds)',
    title='Mean CPU percentage over time',
)
plt.show()

## Throughput Data over time

The spikes in the graph below are a result of the way we test performance: because this is a ramp-up test, throughput does not increase consistently over time.

In [None]:
def processThroughputData(data):
    """Count the requests that started in each one-second window.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-request results with a datetime ``start-time`` column and a
        ``response-time`` column.

    Returns
    -------
    tuple
        ``(buckets, throughputData)``. ``buckets`` is the one-second resampler
        over ``response-time``. ``throughputData`` is a DataFrame with the
        bucket's ``start-time`` and a ``throughput`` column holding the number
        of requests in that bucket.
    """
    buckets = data.set_index('start-time')['response-time'].resample('1s')
    # Count, then name the result. Passing a renaming dict to aggregate() on a
    # Series resampler ("nested renamer") raises SpecificationError on pandas >= 1.0.
    throughputData = buckets.count().rename('throughput').reset_index()
    return buckets, throughputData

# Current run: load the per-request results and derive requests-per-second.
goData = pd.read_csv('perfResults.csv', parse_dates=['start-time'])
# The result must be bound to `throughputData` (it was misspelled `throughtputData`),
# because that is the name the throughput plot reads.
throughputBuckets, throughputData = processThroughputData(goData)

if compareDatasets:
    # Baseline run, processed the same way.
    oldGoData = pd.read_csv('old_perfResults.csv', parse_dates=['start-time'])
    oldThroughputBuckets, oldThroughputData = processThroughputData(oldGoData)

In [None]:
# `fig` (previously misspelled `fix`) matches the naming used by the other plot cells.
fig, ax = plt.subplots()

# Current run in blue; the baseline (when requested) is overlaid in red.
ax = throughputData.plot(ax=ax, y='throughput', c='b')
if compareDatasets:
    ax = oldThroughputData.plot(ax=ax, y='throughput', c='r')
    ax.legend(['after', 'before'])
ax.set_ylabel('Throughput (req/sec)')
ax.set_xlabel('Time since benchmark started (seconds)')
# Titled like the other figures so the plot stands alone when skimmed.
ax.set_title('Throughput over time')
plt.show()

## Headroom plot

In [None]:
# Attach to every request the size of the one-second bucket it falls into
# (transform(len) repeats each bucket's request count for each of its rows).
# reset_index() drops the 'start-time' index so the values line up with goData
# by position. Assumes goData still carries its default 0..n-1 index; TODO confirm.
goData['throughput'] = throughputBuckets.transform(len).reset_index()['response-time']
# Positional rename: presumes the frame holds exactly start-time and
# response-time (in that order) before 'throughput' was appended. Verify against the CSV.
goData.columns = ['start-time', 'latency', 'throughput']

if compareDatasets:
    # Same preparation for the baseline data set.
    oldGoData['throughput'] = oldThroughputBuckets.transform(len).reset_index()['response-time']
    oldGoData.columns = ['start-time', 'latency', 'throughput']

In [None]:
def generateFitLine(data):
    """Fit latency against throughput and evaluate the fit over the observed range.

    A GLM with an inverse-Gaussian family and inverse-squared link is fitted
    to ``latency ~ throughput``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``latency`` and ``throughput`` columns.

    Returns
    -------
    tuple
        ``(domain, fitLine)``. ``domain`` holds the throughput values from the
        observed minimum to the observed maximum (inclusive) in steps of 1.
        ``fitLine`` holds the predicted latency at each of them.
    """
    y, x = dmatrices('latency ~ throughput', data=data, return_type='dataframe')
    # Inverse-squared is the default link of the InverseGaussian family, so rely
    # on the default. Passing the link *class* (as opposed to an instance) is
    # deprecated/rejected by newer statsmodels releases.
    fit = sm.GLM(y, x, family=sm.families.InverseGaussian()).fit()

    # "+ 1" keeps the highest observed throughput inside the domain and avoids
    # an empty domain when every bucket has the same throughput.
    domain = np.arange(data['throughput'].min(), data['throughput'].max() + 1)
    # Design matrix for prediction: an intercept column of ones plus the throughput.
    predictionInputs = np.column_stack((np.ones(len(domain)), domain))
    fitLine = fit.predict(predictionInputs)
    return domain, fitLine

# Fit line for the current run.
domain, goFitLine = generateFitLine(goData)

if compareDatasets:
    # Fit line for the baseline run.
    oldDomain, oldGoFitLine = generateFitLine(oldGoData)

In [None]:
fig, ax = plt.subplots()

# Change the value of `c` to change the color. http://matplotlib.org/api/colors_api.html
ax = goData.plot(ax=ax, kind='scatter', x='throughput', y='latency', c='b', marker='.', alpha=0.2)
afterFit, = ax.plot(domain, goFitLine, c='b', lw=2)  # Plot the fit line

if compareDatasets:
    ax = oldGoData.plot(ax=ax, kind='scatter', x='throughput', y='latency', c='r', marker='.', alpha=0.2)
    beforeFit, = ax.plot(oldDomain, oldGoFitLine, c='r', lw=2)  # Plot the fit line
    # Pass the line handles explicitly. With labels only, matplotlib pairs them
    # with artists by order, and these axes hold scatter collections as well as
    # lines, so the labels could end up on the wrong artists.
    ax.legend([afterFit, beforeFit], ['after', 'before'])

# To change the x & y axis ranges, edit the arguments of set_xlim / set_ylim (lower_limit, upper_limit)
ax.set_ylim(0, 0.15)
ax.set_xlim(0, 2500)
ax.set_xlabel('Throughput (requests/sec)')
ax.set_ylabel('Latency (sec)')
ax.set_title('Headroom plot', y=1.05)
plt.show()