# Get the data

Let's load our test and training data:


In [1]:
import os

import svm
import sklearn

# Directory holding the measurement results. The original machine-specific
# location remains the default; set BABYBEL_RESULTS_DIR to run the notebook
# against a different checkout without editing this cell.
# NOTE(review): the default ends with a trailing slash -- keep one in any
# override in case the loader concatenates paths; confirm in svm.py.
data_dir = os.environ.get("BABYBEL_RESULTS_DIR", "/home/gz/workspace/results-babybel/")
config = ["L3-SMT"]
uncore = "shared"
tests = ["PR700"]

# Training features/labels (X, Y) and test features/labels (X_test, Y_test).
# The third return value is not used in this notebook.
# NOTE(review): the meaning of cutoff=1.15 is defined inside
# svm.row_training_and_test_set -- confirm before changing it.
X, Y, _, X_test, Y_test = svm.row_training_and_test_set(
    data_dir, config, tests, cutoff=1.15, uncore=uncore
)

Then we build an SVM classifier:

In [2]:
#clf = svm.CLASSIFIERS['poly2balanced']
#min_max_scaler = sklearn.preprocessing.MinMaxScaler()
#X_scaled = min_max_scaler.fit_transform(X)
#X_test_scaled = min_max_scaler.transform(X_test)
#model = clf.fit(X_scaled, Y)

Now we need the weights of the decision function:

In [3]:
#print("dual_coef", len(model.dual_coef_))
#print("dual_coef[0]", len(model.dual_coef_[0]))

The poly2 kernel maps the features into a different space, so the learned weights no longer correspond 1:1 to our original features. With a linear kernel, however, we get exactly one weight per feature:

In [4]:
import numpy as np
# `import sklearn` on its own does not guarantee that the `preprocessing`
# submodule is loaded; import it explicitly so this cell does not depend on
# another module having pulled it in.
import sklearn.preprocessing

# Fit the linear-kernel classifier on min-max scaled training features.
clf = svm.CLASSIFIERS['linear']
min_max_scaler = sklearn.preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)
# The test set is transformed with the scaling learned from the training set.
X_test_scaled = min_max_scaler.transform(X_test)
model = clf.fit(X_scaled, Y)

print("#coefficients", len(model.coef_[0]))

# Rank features by the magnitude of their weight in the decision function.
abs_coefs = np.abs(model.coef_[0])

# argsort is ascending; reverse it so `weights` holds feature indices from
# the largest to the smallest absolute coefficient.
weight_idx_sorted = np.argsort(abs_coefs)
weights = list(weight_idx_sorted[::-1])

# Sanity check: the first ranked index must point at the largest coefficient.
print("Is 1st equals to max?", abs_coefs[weights[0]] == abs_coefs.max())

# coef_ follows the column order of the data the model was fit on, so the
# feature names are taken from the training frame X (not X_test).
for idx in weights[:10]:
    print(X.columns[idx], abs_coefs[idx])

#coefficients 2380
Is 1st equals to max? True
AVG.UNC_H_BT_OCCUPANCY.REMOTE 0.515895935102
AVG.UNC_R3_RxR_OCCUPANCY.HOM 0.43737861278
STD.UNC_M_WR_CAS_RANK1.BANK3 0.419150762644
STD.UNC_M_WR_CAS_RANK5.BANK4 0.387334687481
STD.UNC_R3_RxR_OCCUPANCY.NDR 0.377309605971
STD.UNC_M_WR_CAS_RANK1.BANK0 0.347131712795
AVG.L2_STORE_LOCK_RQSTS.HIT_M 0.342151596389
STD.UNC_M_WR_CAS_RANK1.BANK7 0.333013627092
AVG.UNC_H_TxR_AK_OCCUPANCY.SCHED0 0.328624892492
AVG.FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE 0.32462826788


And we can compare this list with the ranking we get from the Pearson correlation between each feature and the class label:

In [5]:
import pandas as pd

# Encode the class labels as 0/1 so they can be correlated with the features.
Y_values = Y.map(lambda x: 1 if x else 0)

# Scaled training features as a frame; note that it gets a fresh 0..n-1 index.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# pandas correlation aligns the two operands on their index labels. Y_values
# still carries Y's index while X_scaled_df has a new RangeIndex, so pair the
# labels with the rows by position -- the same pairing clf.fit(X_scaled, Y)
# uses -- instead of relying on the labels happening to match.
y_by_position = pd.Series(Y_values.values, index=X_scaled_df.index)

# One Pearson coefficient per feature column. A column for which the
# coefficient is undefined (NaN) is treated as uncorrelated.
all_correlations = X_scaled_df.corrwith(y_by_position).fillna(0.0).tolist()

# Rank features from the strongest to the weakest absolute correlation.
abs_correlations = np.abs(all_correlations)
correlations_idx_sorted = np.argsort(abs_correlations)

correlations_idx = list(correlations_idx_sorted[::-1])

# Feature names come from the frame the correlations were computed on.
for idx in correlations_idx[:10]:
    print(X_scaled_df.columns[idx], abs_correlations[idx])

AVG.OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6 0.12807788187
AVG.ITLB_MISSES.STLB_HIT 0.123154371258
STD.ITLB_MISSES.STLB_HIT 0.11361604453
STD.OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6 0.110620058147
AVG.OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD 0.108981952328
AVG.UNC_H_TxR_BL_CYCLES_FULL.SCHED0 0.105337678444
AVG.UNC_M_BYP_CMDS.CAS 0.105196306226
STD.MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM 0.103947464821
AVG.UNC_M_PRE_COUNT.BYP 0.103788294069
AVG.UNC_M_BYP_CMDS.PRE 0.102291273264


Finally, we plot a histogram that shows the distribution of the event with the highest weight in the SVM.

In [6]:
from bokeh.plotting import show
from bokeh.charts import Histogram
from bokeh.layouts import column, row
from bokeh.models import Range1d

from bokeh.io import output_notebook
output_notebook()

# Event (feature) with the largest absolute weight in the linear SVM.
top_event = X_scaled_df.columns[weights[0]]
print(top_event)


def _labelled_values(values, labels):
    """Pair feature values with class labels by position.

    values: the scaled values of one feature, one entry per sample.
    labels: the class label string of each sample, in the same order.

    Returns a DataFrame with the columns 'value' and 'class' and a fresh
    0..n-1 index. Pairing is positional, so it does not depend on the index
    or the name carried by either input.
    """
    return pd.DataFrame(
        {'value': np.asarray(values), 'class': np.asarray(labels)},
        columns=['value', 'class'],
    )


# Training data
Y_labels = Y.map(lambda x: "Training Y" if x else "Training N").reset_index(drop=True)
distribution_train = _labelled_values(X_scaled_df[top_event], Y_labels)

# Testing data
Y_test_labels = Y_test.map(lambda x: "Test Y" if x else "Test N").reset_index(drop=True)
X_scaled_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
distribution_test = _labelled_values(X_scaled_test_df[top_event], Y_test_labels)

# One long frame with all four classes; ignore_index avoids duplicate row
# labels from stacking two frames that both start at 0.
distribution = pd.concat([distribution_train, distribution_test], ignore_index=True)





AVG.UNC_H_BT_OCCUPANCY.REMOTE


NameError: name 'measured' is not defined