In [1]:
#import dependencies 
from numpy.random import seed
from numpy.random import randn
from scipy import stats
import numpy as np

import math  

import seaborn as sns
sns.set(style="darkgrid")
from matplotlib import pyplot as plt

import pandas as pd
import os

In [122]:
#list the entries of the data directory so the file name used below can be copied from the output
data_folder = './data/dedicated_host/z1d/'
print(os.listdir(data_folder))

['z1d_ycruncher_dedicated_host_no_idle_-s_flag_11-25-2019_22_39~23_08_us-east-1b - raw.csv', 'z1d_sysbench_dedicated_host_no_idle_11-25-2019_23_18~23_47_us-east-1b - raw.csv']


In [150]:
#file to analyse; it is read from data_folder, so it must be one of the entries listed there
data_name = 'z1d_sysbench_dedicated_host_no_idle_11-25-2019_23_18~23_47_us-east-1b - raw.csv'

In [151]:
#load the raw results; header=0 takes the column names from the first row (adjust if necessary)
csv_path = os.path.join(data_folder, data_name)
df_data = pd.read_csv(csv_path, header=0)

In [152]:
#remove idle vm rows

#y-cruncher
#df_data.dropna(subset=['processorInfo'], inplace= True)

In [153]:
df_data.head()

Unnamed: 0,experimentID,instanceID,instanceType,wallTime,testOption,per-request-avg-time,total-time,thread-num
0,1574752581-0,i-029309b5aee386a90,z1d.large,6.792946,--test=cpu --cpu-max-prime=2000000 --num-threa...,1352.02ms,6.7644s,2
1,1574752581-0,i-029309b5aee386a90,z1d.large,6.781176,--test=cpu --cpu-max-prime=2000000 --num-threa...,1352.26ms,6.7621s,2
2,1574752581-0,i-029309b5aee386a90,z1d.large,6.790641,--test=cpu --cpu-max-prime=2000000 --num-threa...,1353.10ms,6.7718s,2
3,1574752581-1,i-029309b5aee386a90,z1d.large,6.789736,--test=cpu --cpu-max-prime=2000000 --num-threa...,1352.15ms,6.7619s,2
4,1574752581-1,i-029309b5aee386a90,z1d.large,6.780566,--test=cpu --cpu-max-prime=2000000 --num-threa...,1352.01ms,6.7616s,2


In [154]:
#setup cols, metric for the relative experiment type
def setUpMetrics(experimentName):
    """Return the metric column and the columns to keep for an experiment type.

    Parameters
    ----------
    experimentName : str
        One of 'pgbench', 'sysbench', 'y-cruncher' or 'iperf'.

    Returns
    -------
    (metric, cols) : tuple of (str, list of str)
        ``metric`` is the column the summary is computed on; ``cols`` is the
        list of columns to keep from the raw data.

    Raises
    ------
    ValueError
        If ``experimentName`` is not one of the supported names.

    Notes
    -----
    For 'iperf' this adds a 'Total' column (Upload + Download) to the
    module-level ``df_data`` frame as a side effect.
    """
    if experimentName == 'pgbench':
        metric = 'transactions'
        cols = ['instanceID', 'setId', 'transactions']
    elif experimentName in ('sysbench', 'y-cruncher'):
        metric = 'wallTime'
        cols = ['instanceID', 'setId', 'wallTime']
    elif experimentName == 'iperf':
        # iperf is summarised on the combined upload + download figure
        df_data['Total'] = df_data['Upload'] + df_data['Download']
        metric = 'Total'
        cols = ['instanceID', 'setId', 'Upload', 'Download', 'Total']
    else:
        # previously this branch only printed and then failed on the return
        # statement with an UnboundLocalError; fail explicitly instead
        raise ValueError('Illegal Experiment Name..: ' + repr(experimentName))
    return metric, cols

In [155]:
#setup vars for each experiment type... i.e. wallTime vs transactions
#must match the benchmark of the file loaded above (here: a sysbench run)
experimentName = 'sysbench'
metric, cols = setUpMetrics(experimentName)

In [156]:
#create setId column: the integer after the last '-' of experimentID (e.g. '1574752581-10' -> 10)
#splitting on the separator keeps ids of any length intact, unlike taking the last two characters
set_id_text = df_data['experimentID'].str.strip().str.rsplit('-', n=1).str[-1]
df_data['setId'] = set_id_text.astype('int32')

In [157]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 9 columns):
experimentID            900 non-null object
instanceID              900 non-null object
instanceType            900 non-null object
wallTime                900 non-null float64
testOption              900 non-null object
per-request-avg-time    900 non-null object
total-time              900 non-null object
thread-num              900 non-null int64
setId                   900 non-null int32
dtypes: float64(1), int32(1), int64(1), object(6)
memory usage: 59.8+ KB


In [158]:
df_data.tail()

Unnamed: 0,experimentID,instanceID,instanceType,wallTime,testOption,per-request-avg-time,total-time,thread-num,setId
895,1574752581-9,i-03242392bb4c0b0e0,z1d.large,6.77996,--test=cpu --cpu-max-prime=2000000 --num-threa...,1352.16ms,6.7611s,2,9
896,1574752581-9,i-03242392bb4c0b0e0,z1d.large,6.779342,--test=cpu --cpu-max-prime=2000000 --num-threa...,1351.85ms,6.7603s,2,9
897,1574752581-10,i-03242392bb4c0b0e0,z1d.large,6.786702,--test=cpu --cpu-max-prime=2000000 --num-threa...,1351.94ms,6.7607s,2,10
898,1574752581-10,i-03242392bb4c0b0e0,z1d.large,6.780089,--test=cpu --cpu-max-prime=2000000 --num-threa...,1351.76ms,6.7607s,2,10
899,1574752581-10,i-03242392bb4c0b0e0,z1d.large,6.78108,--test=cpu --cpu-max-prime=2000000 --num-threa...,1352.16ms,6.7615s,2,10


In [159]:
#keep only the columns needed for this experiment type
df_data = df_data.loc[:, cols]

In [160]:
df_data.head()

Unnamed: 0,instanceID,setId,wallTime
0,i-029309b5aee386a90,0,6.792946
1,i-029309b5aee386a90,0,6.781176
2,i-029309b5aee386a90,0,6.790641
3,i-029309b5aee386a90,1,6.789736
4,i-029309b5aee386a90,1,6.780566


In [161]:
df_data.describe()

Unnamed: 0,setId,wallTime
count,900.0,900.0
mean,7.666667,6.787505
std,5.767077,0.010632
min,0.0,6.776461
25%,3.0,6.782417
50%,7.0,6.7851
75%,12.0,6.788721
max,23.0,6.898084


In [162]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 3 columns):
instanceID    900 non-null object
setId         900 non-null int32
wallTime      900 non-null float64
dtypes: float64(1), int32(1), object(1)
memory usage: 17.7+ KB


In [163]:
#group by tenants / setId and average the numeric columns
#the string column instanceID is excluded explicitly: older pandas dropped it silently,
#pandas >= 2.0 raises a TypeError when asked to average it
mean = df_data.select_dtypes(include='number').groupby('setId').mean()
mean.head()

Unnamed: 0_level_0,wallTime
setId,Unnamed: 1_level_1
0,6.79357
1,6.786317
2,6.78884
3,6.786352
4,6.787322


In [164]:
#create result dataframe as an independent copy, so columns added to it later do not also appear in `mean`
result_df = mean.copy()

In [165]:
#per-setId standard deviation of the numeric columns
#the string column instanceID is excluded explicitly: older pandas dropped it silently,
#pandas >= 2.0 raises a TypeError when asked to aggregate it
std = df_data.select_dtypes(include='number').groupby('setId').std()
std.head()

Unnamed: 0_level_0,wallTime
setId,Unnamed: 1_level_1
0,0.021659
1,0.007882
2,0.012069
3,0.006573
4,0.010652


In [166]:
#add normalized column: each set's metric relative to the last row of the table (the baseline)
#only the metric column is used, so the cell does not depend on column order and can be re-run
baseline = result_df[metric].iloc[-1]
relative_change = (result_df[metric] - baseline) / baseline

if experimentName == 'pgbench':
    #transactions: expressed as 100% plus the relative change from the baseline
    normalized = relative_change * 100
    normalized += 100
else:
    #wall time / totals: expressed as 100% minus the relative change from the baseline
    normalized = (1 - relative_change) * 100

#formatted as text with two-decimal rounding and a trailing '%'
result_df['normalized'] = normalized.round(2).astype(str) + '%'

In [167]:
result_df.head()

Unnamed: 0_level_0,wallTime,normalized
setId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6.79357,100.11%
1,6.786317,100.21%
2,6.78884,100.18%
3,6.786352,100.21%
4,6.787322,100.2%


In [168]:
#add std column: take the metric's std as a Series (aligned on setId) rather than assigning
#the whole std frame, which fails as soon as that frame has more than one column
result_df['std'] = std[metric]

In [169]:
#reverse table; take an explicit copy so that adding columns afterwards
#does not trigger pandas' SettingWithCopyWarning
result_df = result_df.iloc[::-1].copy()

In [170]:
#add tenants: number the rows 1..N in their current order
row_count = len(result_df)
result_df['tenants'] = np.arange(1, row_count + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [171]:
#rearrange columns: column order for the summary table
#NOTE: this rebinds `cols`, which earlier held the list of raw-data columns returned by setUpMetrics
cols = ['tenants', metric, 'std', 'normalized']

In [172]:
#apply the column order defined in `cols`
result_df = result_df.loc[:, cols]

In [173]:
#nothing to do here: the '%' suffix is already appended when the normalized column is created

In [174]:
result_df

Unnamed: 0_level_0,tenants,wallTime,std,normalized
setId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23,1,6.800862,0.028234,100.0%
22,2,6.791771,0.01776,100.13%
21,3,6.788097,0.009059,100.19%
20,4,6.793567,0.016361,100.11%
19,5,6.789285,0.011875,100.17%
18,6,6.791114,0.013256,100.14%
17,7,6.789383,0.013112,100.17%
16,8,6.788677,0.013163,100.18%
15,9,6.787843,0.010271,100.19%
14,10,6.787601,0.007282,100.19%


In [175]:
#write the summary table to csv, creating the output directory first if it does not exist yet
summary_folder = './data/summary/'
os.makedirs(summary_folder, exist_ok=True)
result_df.to_csv(path_or_buf=summary_folder + data_name + '-Summary.csv')

In [176]:
# graph tenants vs wallTime