In [310]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [311]:
base_path = 'C:/Users/gujju/Desktop/CapstoneWork/'
folder_name = '3IntervalRuns/'

In [312]:
file_name = 'November 14th m1.xlarge matrixprod - 2_300.csv'

In [313]:
df = pd.read_csv(base_path + folder_name + file_name)

In [314]:
column_names = df.columns
bad_columns = ['cpuType', 'cpuMhz', 'sampleTime', 'elapsedTime', 'cputime', 'testName']
df = df.drop(bad_columns, 1)
df = df.dropna()
print(df.head())

     output    vmIpAddr  dsr     dsw       nbr       nbs    cpuusr  cpukrn  \
0  133583.0  10.0.1.102  0.0  4816.0  223607.0  407182.0  118816.0  1565.0   
1  136324.0  10.0.1.103  0.0  4720.0  223193.0  405460.0  118808.0  1529.0   
2  136387.0   10.0.1.11  0.0  4824.0  225907.0  411152.0  118846.0  1572.0   
3  136086.0  10.0.1.116  0.0  4816.0  226333.0  412108.0  118935.0  1476.0   
4  135186.0  10.0.1.122  0.0  5136.0  222561.0  405260.0  118868.0  1536.0   

   cpuidle  cpuIoWait  ...   cpuNice  cpuSteal  dskRds  dskMrgdRds  dskReadTm  \
0   8038.0        0.0  ...         0     106.0     0.0           0        0.0   
1   7867.0        0.0  ...         0       3.0     0.0           0        0.0   
2   8743.0        0.0  ...         0       3.0     0.0           0        0.0   
3   8692.0        0.0  ...         0       3.0     0.0           0        0.0   
4   8053.0        2.0  ...         0       3.0     0.0           0        0.0   

   dskWrts  dskMrgdWrts  dskWrtTm  ldAvg  ti

** Approach 1 =>** Heuristical approach: statistical outlier method - take the average cpuSteal of the pool, calculate standard deviation. The NN-threshold is 'AVG + 2*STD-DEV'.  Classify NN VMs using this approach. 

In [315]:
cpuSteal_avg = np.mean(df['cpuSteal'])
cpuSteal_std = np.std(df['cpuSteal'])
print(cpuSteal_std)
outlier_val = cpuSteal_avg + (2*cpuSteal_std)
nn_df = df[df['cpuSteal'] > outlier_val]
non_nn_df = df[df['cpuSteal'] <= outlier_val]
print(nn_df.describe())
print(non_nn_df.describe())

63.29524637162322
              output  dsr     dsw            nbr           nbs         cpuusr  \
count       3.000000  3.0     3.0       3.000000       3.00000       3.000000   
mean   130549.000000  0.0  4720.0  221094.333333  402620.00000  118299.000000   
std      1657.946622  0.0     8.0     797.205954     925.46853     127.941393   
min    128816.000000  0.0  4712.0  220563.000000  402022.00000  118154.000000   
25%    129763.500000  0.0  4716.0  220636.000000  402087.00000  118250.500000   
50%    130711.000000  0.0  4720.0  220709.000000  402152.00000  118347.000000   
75%    131415.500000  0.0  4724.0  221360.000000  402919.00000  118371.500000   
max    132120.000000  0.0  4728.0  222011.000000  403686.00000  118396.000000   

            cpukrn      cpuidle  cpuIoWait  cIntSrvc     ...      cpuNice  \
count     3.000000     3.000000        3.0       3.0     ...          3.0   
mean   1987.333333  8431.000000        0.0       0.0     ...          0.0   
std     113.191578   

**Approach 2 =>** Percentile approach: determine the 10th and 5th percentile of cpuSteal for the pool.  Classify NN VMs as any VM falling in the 10th and/or 5th percentile.
Here, we’d like to know which percentile is better (10th vs. 5th).
Excel has a percentile function.

In [316]:
percentile_val = df.cpuSteal.quantile(.90)
print(percentile_val)

52.8


In [317]:
nn_df = df[df['cpuSteal'] > percentile_val]
non_nn_df = df[df['cpuSteal'] <= percentile_val]
print(nn_df.describe())
print(non_nn_df.describe())

             output  dsr          dsw            nbr            nbs  \
count       5.00000  5.0     5.000000       5.000000       5.000000   
mean   132060.60000  0.0  4768.000000  221844.200000  403784.800000   
std      2436.42326  0.0    68.117545    1255.230337    2081.686624   
min    128816.00000  0.0  4712.000000  220563.000000  402022.000000   
25%    130711.00000  0.0  4720.000000  220709.000000  402152.000000   
50%    132120.00000  0.0  4728.000000  222011.000000  403686.000000   
75%    133583.00000  0.0  4816.000000  222331.000000  403882.000000   
max    135073.00000  0.0  4864.000000  223607.000000  407182.000000   

              cpuusr     cpukrn      cpuidle  cpuIoWait  cIntSrvc     ...      \
count       5.000000     5.0000     5.000000        5.0       5.0     ...       
mean   118455.600000  1867.4000  8384.800000        0.0       0.0     ...       
std       249.080911   202.1863   396.635475        0.0       0.0     ...       
min    118154.000000  1565.0000  787

In [318]:
percentile_val = df.cpuSteal.quantile(.95)
print(percentile_val)
nn_df = df[df['cpuSteal'] > percentile_val]
non_nn_df = df[df['cpuSteal'] <= percentile_val]
print(nn_df.describe())
print(non_nn_df.describe())

172.6
              output  dsr     dsw            nbr           nbs         cpuusr  \
count       3.000000  3.0     3.0       3.000000       3.00000       3.000000   
mean   130549.000000  0.0  4720.0  221094.333333  402620.00000  118299.000000   
std      1657.946622  0.0     8.0     797.205954     925.46853     127.941393   
min    128816.000000  0.0  4712.0  220563.000000  402022.00000  118154.000000   
25%    129763.500000  0.0  4716.0  220636.000000  402087.00000  118250.500000   
50%    130711.000000  0.0  4720.0  220709.000000  402152.00000  118347.000000   
75%    131415.500000  0.0  4724.0  221360.000000  402919.00000  118371.500000   
max    132120.000000  0.0  4728.0  222011.000000  403686.00000  118396.000000   

            cpukrn      cpuidle  cpuIoWait  cIntSrvc     ...      cpuNice  \
count     3.000000     3.000000        3.0       3.0     ...          3.0   
mean   1987.333333  8431.000000        0.0       0.0     ...          0.0   
std     113.191578   479.196202  

** Approach 3 =>** Two-variable K-Means clustering with Bogo-ops and cpuSteal.  Classify VMs.  Do we get a cluster with VMs that have noisy neighbors?

In [319]:
modified_df = df[['output', 'cpuSteal']]
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(modified_df)
df_normalized = pd.DataFrame(np_scaled)
df_normalized.columns = ['output', 'cpuSteal']

In [320]:
model = KMeans(n_clusters=2)
model.fit(df_normalized)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [321]:
modified_df = df[['output', 'cpuSteal']]
modified_df['classification'] = pd.Series(model.labels_)
nn_df = modified_df[modified_df['classification'] == 1]
non_nn_df = modified_df[modified_df['classification'] == 0]
print(nn_df.describe())
print(non_nn_df.describe())

              output   cpuSteal  classification
count       3.000000    3.00000             3.0
mean   130549.000000  255.00000             1.0
std      1657.946622   45.90207             0.0
min    128816.000000  217.00000             1.0
25%    129763.500000  229.50000             1.0
50%    130711.000000  242.00000             1.0
75%    131415.500000  274.00000             1.0
max    132120.000000  306.00000             1.0
              output    cpuSteal  classification
count      46.000000   46.000000            46.0
mean   134866.565217    8.347826             0.0
std      1486.439036   21.476827             0.0
min    130862.000000    1.000000             0.0
25%    134065.000000    2.000000             0.0
50%    135136.500000    3.000000             0.0
75%    135809.250000    3.000000             0.0
max    137310.000000  106.000000             0.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


** Approach 4 =>** Three-variable K-Means clustering with Bogo-ops, cpuSteal, and next best predictor.  Classify VMs.  Do we get a cluster with VMs that have noisy neighbors?

In [322]:
modified_df = df[['output', 'cpuSteal', 'cpuusr', 'cpukrn']]
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(modified_df)
df_normalized = pd.DataFrame(np_scaled)
df_normalized.columns = ['output', 'cpuSteal', 'cpuusr', 'cpukrn']
print(df_normalized)

      output  cpuSteal    cpuusr    cpukrn
0   0.561220  0.344262  0.803398  0.206096
1   0.883918  0.006557  0.793689  0.153846
2   0.891335  0.006557  0.839806  0.216255
3   0.855898  0.006557  0.947816  0.076923
4   0.749941  0.006557  0.866505  0.164006
5   0.738286  0.009836  0.711165  0.407837
6   0.759830  0.006557  0.859223  0.121916
7   0.779374  0.006557  0.856796  0.129173
8   0.388980  0.790164  0.293689  0.679245
9   0.612785  0.131148  0.780340  0.240929
10  0.959383  0.003279  0.842233  0.201742
11  0.330233  0.006557  0.856796  0.171263
12  0.660702  0.003279  0.867718  0.134978
13  0.805981  0.006557  0.847087  0.185776
14  0.808571  0.009836  0.912621  0.066763
15  0.391335  0.003279  0.849515  0.146589
16  0.859430  0.003279  0.873786  0.074020
17  0.824111  0.006557  0.891990  0.011611
18  0.756769  0.006557  0.913835  0.058055
19  0.614787  0.003279  0.912621  0.072569
20  0.736402  0.003279  0.842233  0.130624
21  0.687780  0.003279  0.862864  0.139332
22  0.24087

In [323]:
model = KMeans(n_clusters=2)
model.fit(df_normalized)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [324]:
modified_df = df[['output', 'cpuSteal', 'cpuusr', 'cpukrn']]
modified_df['classification'] = pd.Series(model.labels_)
nn_df = modified_df[modified_df['classification'] == 1]
non_nn_df = modified_df[modified_df['classification'] == 0]
print(nn_df.describe())
print(non_nn_df.describe())

              output   cpuSteal         cpuusr       cpukrn  classification
count       3.000000    3.00000       3.000000     3.000000             3.0
mean   130549.000000  255.00000  118299.000000  1987.333333             1.0
std      1657.946622   45.90207     127.941393   113.191578             0.0
min    128816.000000  217.00000  118154.000000  1891.000000             1.0
25%    129763.500000  229.50000  118250.500000  1925.000000             1.0
50%    130711.000000  242.00000  118347.000000  1959.000000             1.0
75%    131415.500000  274.00000  118371.500000  2035.500000             1.0
max    132120.000000  306.00000  118396.000000  2112.000000             1.0
              output    cpuSteal         cpuusr       cpukrn  classification
count      46.000000   46.000000      46.000000    46.000000            46.0
mean   134866.565217    8.347826  118845.369565  1536.869565             0.0
std      1486.439036   21.476827      82.235531    89.506700             0.0
min    1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
