# Get the data

Let's load our test and training data:


In [1]:
import svm
import sklearn

# Location of the recorded results and the subset of them to load.
data_dir = "/home/gz/Downloads/results-babybel/"
tests = ["PR700"]
config = ["L3-SMT"]
uncore = "shared"

# The third returned value is not used in this notebook, hence `_`.
X, Y, _, X_test, Y_test = svm.row_training_and_test_set(
    data_dir,
    config,
    tests,
    cutoff=1.15,
    uncore=uncore,
)

Then we build an SVM classifier:

In [2]:
# Disabled experiment, kept for reference only: fit the 'poly2balanced'
# classifier on min-max scaled training features. The linear-kernel cell
# further down is the one that actually runs.
#clf = svm.CLASSIFIERS['poly2balanced']
#min_max_scaler = sklearn.preprocessing.MinMaxScaler()
#X_scaled = min_max_scaler.fit_transform(X)
#X_test_scaled = min_max_scaler.transform(X_test)
#model = clf.fit(X_scaled, Y)

Now we need the weights of the decision function:

In [3]:
# Disabled together with the poly2 fit: prints the dimensions of the fitted
# model's dual coefficients.
#print "dual_coef", len(model.dual_coef_)
#print "dual_coef[0]", len(model.dual_coef_[0])

The poly2 kernel maps the features into a different space, so its weights have no 1:1 correspondence to our original features. A linear kernel, however, yields one coefficient per feature, so we use that instead:

In [4]:
import numpy as np

# Scale every feature with a min-max scaler. The scaler is fitted on the
# training set only and then re-applied unchanged to the test set.
min_max_scaler = sklearn.preprocessing.MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)
X_test_scaled = min_max_scaler.transform(X_test)

# Fit the linear-kernel classifier on the scaled training data.
clf = svm.CLASSIFIERS['linear']
model = clf.fit(X_scaled, Y)

# Single-string print calls: same output as the Python 2 print statement.
print("%s %s" % ("#coefficients", len(model.coef_[0])))

# Rank features by the magnitude of their coefficient, largest first.
abs_coefs = np.abs(model.coef_[0])
weight_idx_sorted = np.argsort(abs_coefs)
weights = list(weight_idx_sorted[::-1])

# Sanity check: the first ranked index must point at the largest weight.
print("%s %s" % ("Is 1st equals to max?", abs_coefs[weights[0]] == max(abs_coefs)))

# Show the ten highest-weighted features.
for idx in weights[:10]:
    print("%s %s" % (X_test.columns[idx], abs_coefs[idx]))

#coefficients 2380
Is 1st equals to max? True
STD.OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD 0.347148376353
STD.UNC_M_WR_CAS_RANK1.BANK4 0.307836107212
AVG.UNC_R3_RxR_OCCUPANCY.NDR 0.28454720478
STD.UNC_R3_RxR_OCCUPANCY.NDR 0.279940554382
AVG.UNC_H_BT_OCCUPANCY.READS_LOCAL 0.271433137136
STD.UNC_M_WR_CAS_RANK4.BANK3 0.264367674417
STD.UNC_R3_VNA_CREDITS_ACQUIRED.BL 0.244774587251
AVG.UNC_R3_RxR_OCCUPANCY.SNP 0.242707394143
STD.UNC_M_WR_CAS_RANK1.BANK3 0.231697561455
STD.UNC_M_WR_CAS_RANK1.BANK7 0.202045286918


And we can compare this list with the ranking we get from the Pearson correlation between each feature and the class label:

In [5]:
import pandas as pd

# Encode the class labels as 0/1 so they can be correlated with the features.
Y_values = Y.map(lambda x: 1 if x else 0)

X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Series.corr() pairs observations by index label, not by position.
# X_scaled_df carries a fresh 0..n-1 index while Y keeps whatever index the
# loader gave it, so give the labels the same index (row i <-> label i)
# before correlating. This also fails loudly if the lengths differ.
Y_aligned = pd.Series(np.asarray(Y_values), index=X_scaled_df.index)

all_correlations = []
for col in X_scaled_df.columns:
    corr = X_scaled_df[col].corr(Y_aligned)
    # corr() can yield NaN (e.g. for a column without variance); treat such
    # a feature as uncorrelated so it sorts to the bottom of the ranking.
    if np.isnan(corr):
        corr = 0.0
    all_correlations.append(corr)

# Rank features by the magnitude of their correlation, largest first.
abs_correlations = np.abs(all_correlations)
correlations_idx_sorted = np.argsort(abs_correlations)
correlations_idx = list(correlations_idx_sorted[::-1])

# Show the ten most correlated features. Names come from the frame the
# correlations were computed on (the training columns).
for idx in correlations_idx[:10]:
    print("%s %s" % (X_scaled_df.columns[idx], abs_correlations[idx]))

STD.MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM 0.121486303191
AVG.ITLB_MISSES.STLB_HIT 0.114442852867
AVG.UNC_H_TxR_BL_CYCLES_FULL.SCHED0 0.107097420434
STD.MEM_LOAD_UOPS_RETIRED.L1_MISS 0.104845460427
STD.MEM_LOAD_UOPS_RETIRED.L2_MISS 0.0982564826809
AVG.UNC_M_CAS_COUNT.RD_UNDERFILL 0.0969038615645
AVG.UNC_H_IGR_NO_CREDIT_CYCLES.BL_QPI0 0.0962068847874
AVG.UNC_H_SNOOP_RESP.RSPCNFLCT 0.0927274303824
STD.UNC_M_CAS_COUNT.RD_UNDERFILL 0.0918841256533
STD.UNC_M_WR_CAS_RANK5.BANK5 0.0917919766173


Finally, we plot histograms showing the distribution of the event with the highest weight in the SVM, for both the training and the test data.

In [127]:
from bokeh.plotting import output_notebook, show
from bokeh.charts import Histogram
from bokeh.layouts import column, row

# Render bokeh output inline in the notebook.
output_notebook()


def _class_distribution(scaled_features, labels, event):
    """Build the frame plotted by the histograms for one event.

    scaled_features: DataFrame of scaled feature values, one column per event.
    labels: Series of class labels indexed 0..n-1 in row order.
    event: name of the column (event) to extract.

    Returns a DataFrame with a 'value' column (the event's scaled values,
    re-indexed 0..n-1) and a 'class' column (the matching label).
    """
    frame = pd.DataFrame()
    frame['value'] = scaled_features[event].reset_index(drop=True)
    frame['class'] = labels
    return frame


# The event with the largest absolute weight in the linear SVM.
top_event = X_scaled_df.columns[weights[0]]

# Training data
# reset_index() turns the label Series into a frame whose column 0 holds the
# labels under a fresh 0..n-1 index, matching the scaled feature frame.
Y_labels = Y.map(lambda x: "Y" if x else "N")
Y_labels = Y_labels.reset_index()
distribution = _class_distribution(X_scaled_df, Y_labels[0], top_event)

# Testing data
Y_test_labels = Y_test.map(lambda x: "Y" if x else "N")
Y_test_labels = Y_test_labels.reset_index()

X_scaled_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
# The test histogram must be coloured by the *test* labels (the original
# cell used the training labels here).
distribution_test = _class_distribution(X_scaled_test_df, Y_test_labels[0], top_event)


hist_training = Histogram(distribution, values='value', title="Histogram Training", legend='top_right', bins=20, label='class', stack='class')
hist_test = Histogram(distribution_test, values='value', title="Histogram Test", legend='top_right', bins=20, label='class', stack='class')

show(row(hist_training, hist_test))
