# Imports

In [1]:
%%time
#Imports requisite packages
import os
import time
import numpy
import pickle
import cProfile
import itertools
import matplotlib
import sklearn.svm
import sklearn.tree
import sklearn.metrics
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.learning_curve
import sklearn.model_selection
import sklearn.cross_validation
import sklearn.feature_selection
import sklearn.kernel_approximation
from matplotlib import pyplot as plt

#%jsroot on9
%matplotlib inline
matplotlib.use('Agg')



CPU times: user 606 ms, sys: 58 ms, total: 664 ms
Wall time: 988 ms


because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.




# Function Definitions

In [2]:
%%time
#Takes the converted tree and turns it into an
#n-by-30 array usable by sklearn.
def outputs(array):
    #Only uses events with non-zero luminosity
    goodEvents = array[array['lumi'] != 0]
    ind = numpy.lexsort((goodEvents['lumiId'],goodEvents['runId']))
    events = goodEvents[ind]
    dataset = numpy.empty([len(goodEvents),30])
    target = numpy.empty([len(goodEvents)])
    badOnes = numpy.array([])

    #Fills dataset array with proper features
    for j, event in enumerate(events):
        try:
            dataset[j,0:7] = event['qPFJetPt']
            dataset[j,7:14] = event['qPFJetEta']
            dataset[j,14:21] = event['qPFJetPhi']
            dataset[j,21:28] = event['qNVtx']
            dataset[j,28] = event['crossSection']
            dataset[j,29] = event['lumi']
            target[j] = event['isSig']
        except ValueError:
            badOnes = numpy.append(badOnes,j)
            
    #Takes out corrupt events
    mask = numpy.zeros(len(dataset), dtype=bool)
    mask[badOnes.astype(int)] = True
    mask = ~mask
    dataset = dataset[mask]
    target = target[mask]
       
    return dataset, target

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs


In [3]:
#Initializes and fits SVM classifier
def suppVM(xTrain, yTrain, lumiTrain):
    start = time.time()
    classifier = SVC(C = 100, kernel = 'rbf',tol=0.0001,gamma='auto') 
    
    #Comment sample_weight if the training is to be done without sample weight.
    if weights == True:
        classifier = classifier.fit(xTrain, yTrain, sample_weight = lumiTrain) 
    else:
        classifier = classifier.fit(xTrain, yTrain)
    
    print('It took', time.time()-start, 'seconds for SVM.')
    return classifier

In [4]:
#Function that plots confusion matrix, taken from sklearn website
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):

    #This function prints and plots the confusion matrix.
    #Normalization can be applied by setting `normalize=True`.     
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = numpy.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        print("Normalized confusion matrix")
    else:
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = numpy.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        print('Confusion matrix, without normalization')

    print(cm)
    
    thresh = cm.max()*.7
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Settings and Data Prep

In [5]:
#Pick learning algorithm, crossSection, weights, and random splitting
SVM = True
crossSection = True
weights = False
random = True
odds = False

#Saves dataset for pickling
# dataset, target = outputs(array)
# outFile = open('data.pkl', 'wb')
# pickle.dump(dataset, outFile)
# pickle.dump(target, outFile)
# outFile.close()

#Loads pickled dataset
inFile = open('realData.pkl', 'rb')
dataset = pickle.load(inFile, encoding="latin1")
target = pickle.load(inFile, encoding="latin1")
inFile.close()

In [6]:
# Splits dataset into training and testing sets based on settings

# xTrain = []
# yTrain = []
# xTest = []
# yTest = []

# if random == True:
#     xTrain, xTest, yTrain, yTest = train_test_split(dataset, target, test_size=0.5)
      
      #The 
#     lumiTrain = xTrain[:,-1]
#     lumiTest = xTest[:,-1]
#     lumiTrain = numpy.copy(lumiTrain, order='C')
#     lumiTest = numpy.copy(lumiTest, order='C')
      
#     if crossSection == True:
#         xTrain = xTrain[:,:-1]
#         xTest = xTest[:,:-1]
#     else:
#         xTrain = xTrain[:,:-2]
#         xTest = xTest[:,:-2]
# elif odds == True:
#     lumiTrain = dataset[::2,-1]
#     lumiTest = dataset[1::2,-1]
#     lumiTrain = numpy.copy(lumiTrain, order='C')
#     lumiTest = numpy.copy(lumiTest, order='C')
    
#     dataset = dataset[:,:-1]
    
#     xTrain = dataset[::2]
#     xTest = dataset[1::2]
#     yTrain = target[::2]
#     yTest = target[1::2]
# else:
#     n = int(numpy.floor(len(dataset)/2))
#     lumiTrain = dataset[:n,-1]
#     lumiTest = dataset[n:,-1]
#     lumiTrain = numpy.copy(lumiTrain, order='C')
#     lumiTest = numpy.copy(lumiTest, order='C')
    
#     if crossSection ==  True:
#         xTrain = dataset[:n,:-1]
#         xTest = dataset[n:,:-1]
#     else:
#         xTrain = dataset[:n,:-2]
#         xTest = dataset[n:,:-2]
        
#     yTrain = target[:n]
#     yTest = target[n:]

In [7]:
#Imports training and testing sets used across models
inFile = open('splits.pkl', 'rb')
xTrain = pickle.load(inFile, encoding="latin1")
xTest = pickle.load(inFile, encoding="latin1")
yTrain = pickle.load(inFile, encoding="latin1")
yTest = pickle.load(inFile, encoding="latin1")
lumiTrain = pickle.load(inFile, encoding="latin1")
lumiTest = pickle.load(inFile, encoding="latin1")
inFile.close()

In [8]:
# #Removes features with variance less than 0.1
# sel = VarianceThreshold(threshold = 0.01)
# print(xTrain.shape)
# sel.fit(xTrain)
# indices = sel.get_support()
# xTrain = xTrain[:,indices]
# print(xTrain.shape)
# xTest = xTest[:,indices]
# print(indices)

#Removes features based on different metrics

# print(xTrain.shape)
# genSel = GenericUnivariateSelect(chi2, mode = 'k_best', param = 5)
# genSel.fit(xTrain,yTrain)
# indices = genSel.get_support()
# print(indices)
# xTrain = xTrain[:,indices]
# xTest = xTest[:,indices]
# print(xTrain.shape)
# print(xTest.shape)
# mask = numpy.ones(29,dtype=bool)
# print(mask)
# ind = [1, 2, 6, 7, 9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21]
# mask[ind] = False
# print(mask)
# print(xTrain.shape)
# xTrain = xTrain[:,mask]
# print(xTrain.shape)
# xTest = xTest[:,mask]

[ 0  1  4  5  6 21 22 23 24 25 26 27 28]
[2 3 7 8 9 10 11 12 13 14 15 16 17 18 19 20]

In [9]:
#Scales the data to zero mean and unit variance
scaler = StandardScaler()
scaler.fit(xTrain)
xTrain = scaler.transform(xTrain)
xTest = scaler.transform(xTest)

NameError: name 'StandardScaler' is not defined

In [None]:
%%time
#Grid Search with cross validation

tunedParams = [{'gamma': [.001, 0.0001, .5, 1],}]
scores = ['precision', 'recall', 'roc_auc', 'f1']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    
    clf = GridSearchCV(SVC(kernel='rbf', C = 100, tol = 0.001), tunedParams, scoring='%s' % score, n_jobs=-1)
    clf.fit(xTrain, yTrain)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = yTest, clf.predict(xTest)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
%%time
#Kernel approximation

# rbfFeature = RBFSampler(gamma = .0001, n_components = 10000)
# rbfFeature.fit(xTrain)
# newXTrain = rbfFeature.transform(xTrain)
# newXTest = rbfFeature.transform(xTest)
# nysFeature = Nystroem(gamma = .001, n_components = 1000)
# nysFeature.fit(xTrain)
# newXTrain = nysFeature.transform(xTrain)
# newXTest = nysFeature.transform(xTest)


# Trains and Tests

In [None]:
%%time
#Calls classifier function
svmClf = suppVM(xTrain, yTrain, lumiTrain)

In [None]:
#Pickles classifier

# outClfs = open('svmClf_tuned_weights.pkl', 'wb')
# pickle.dump(svmClf, outClfs)
# outClfs.close()

In [None]:
#Unpickles classifier

# outClfs = open('svmClf_untuned_weights.pkl', 'rb')
# svmClf = pickle.load(outClfs)
# outClfs.close()

In [None]:
%%time
#Provides classification reports 
if weights == True:    
    svmScore = svmClf.fit(xTrain, yTrain, sample_weight = lumiTrain).decision_function(xTest)
else:
    svmScore = svmClf.fit(xTrain, yTrain).decision_function(xTest)
    
svmPredict = svmClf.predict(xTest)
print("Classification report for SVM, Tuned, Weights %s:\n%s\n"
      % (svmClf, metrics.classification_report(yTest, svmPredict)))

In [None]:
#Pickles scores

# outfile = open('svm_classscores_tuned_weights.pkl', 'wb')
# pickle.dump(svmScore, outfile)
# pickle.dump(svmPredict, outfile)
# outfile.close()

In [None]:
#Unpickles scores

# inFile = open('svm_classscores_untuned_noweights', 'rb')
# svmScore = pickle.load(inFile)
# svmPredict = pickle.load(inFile)

# Plots classification results

In [None]:
%%time
#Plots classification results for signal and background
svmArrs = []
svmHists = []

#Separates decision function results into signal and background
#along with training and testing
svmArrs.append(svmClf.decision_function(xTrain[yTrain>0.5]).ravel())
svmArrs.append(svmClf.decision_function(xTrain[yTrain<0.5]).ravel())
svmArrs.append(svmClf.decision_function(xTest[yTest>0.5]).ravel())
svmArrs.append(svmClf.decision_function(xTest[yTest<0.5]).ravel())

#Turns those arrays into histograms
svmHists.append(list(numpy.histogram(svmArrs[0], normed = True, bins = 40)))
svmHists.append(list(numpy.histogram(svmArrs[1], normed = True, bins = 40)))
svmHists.append(list(numpy.histogram(svmArrs[2], normed = True, bins = 40)))
svmHists.append(list(numpy.histogram(svmArrs[3], normed = True, bins = 40)))

#Defines bin edges, centers, and widths
svmMax = max([hist[0].max() for hist in svmHists])*1.2
svmMin = max([hist[0].min() for hist in svmHists])
svmEdges = svmHists[0][1]
svmCenters = (svmEdges[:-1] + svmEdges[1:])/2.
svmWidths = (svmEdges[1:] - svmEdges[:-1])

In [None]:
#Normalizes histogram based on maximum value
svmNormVal1 = max(max(svmHists[0][0]), max(svmHists[1][0]))
svmNormVal2 = max(max(svmHists[2][0]), max(svmHists[3][0]))
svmHists[0][0] = [x/svmNormVal1 for x in svmHists[0][0]]
svmHists[1][0] = [x/svmNormVal1 for x in svmHists[1][0]]
svmHists[2][0] = [x/svmNormVal2 for x in svmHists[2][0]]
svmHists[3][0] = [x/svmNormVal2 for x in svmHists[3][0]]

In [None]:
%%time
#Plots histograms
ax1 = plt.subplot(111)
ax1.bar(svmCenters-svmWidths/2.,svmHists[0][0],facecolor='red',linewidth=0,width=svmWidths,label='Signal',alpha=0.5)
ax1.bar(svmCenters-svmWidths/2.,svmHists[1][0],facecolor='blue',linewidth=0,width=svmWidths,label='Background',alpha=0.5)
#Change depending on which classifier and options are chosen
plt.title("Classification, SVM, Tuned, Weights, Rand, 15 feats, Training Set")
plt.xlabel("classifier score")
plt.ylabel("Counts/Bin")
legend = ax1.legend(loc='upper center', shadow=True,ncol=2)
for alabel in legend.get_texts():
            alabel.set_fontsize('small')
plt.legend(loc='upper left')
plt.show()

ax2 = plt.subplot(111)
ax2.bar(svmCenters-svmWidths/2.,svmHists[2][0],facecolor='red',linewidth=0,width=svmWidths,label='Signal',alpha=0.5)
ax2.bar(svmCenters-svmWidths/2.,svmHists[3][0],facecolor='blue',linewidth=0,width=svmWidths,label='Background',alpha=0.5)
plt.title("Classification, SVM, Tuned, Weights, Rand, 15 feats, Testing Set")
plt.xlabel("classifier score")
plt.ylabel("Counts/Bin")
legend = ax1.legend(loc='upper center', shadow=True,ncol=2)
for alabel in legend.get_texts():
            alabel.set_fontsize('small')
plt.legend(loc='upper left')
plt.show()

In [None]:
%%time
#Plots roc curve, code taken from sklearn website
fpr, tpr, _ = roc_curve(yTest, svmScore)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2;
plt.plot(fpr, tpr, color='darkorange',
        lw = lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw = lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC SVM, Tuned, Weights, Rand, 15 feat')
plt.legend(loc="lower right")
plt.show()

In [None]:
%%time
#Plots confusion matrix, code taken from sklearn
classNames = ['Background','Signal']
confMat = confusion_matrix(yTest, svmPredict)
#numpy.set_printoptions(precision=)

#Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(confMat, classes=classNames,
                      title='Confusion matrix, SVM, Tuned, Un-normalization, Weights')

#Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(confMat, classes=classNames, normalize=True,
                      title='Confusion Matrix, SVM, Tuned, Normalized, Weights')

plt.show()

In [None]:
%%time
#Calculates Matthews Correlation Coefficient
#Ranges from -1 to 1, with 1 being a perfect predictor
matthews_corrcoef(yTest,svmPredict)