## Example: Running a DNN on a DataFrame

In [1]:
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


Minimal example DataFrame.
- 20k events from the anomalyDetection dataset.
- Contains truth information and both signal and background events.
- Intended for testing the setup and establishing a workflow.

In [5]:
## load dataset
# Read the pickled example DataFrame; the path is relative to the notebook's
# working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
in_df = pd.read_pickle("../dataExamples/testDF.pkl")
# Number of rows in the frame; later cells use it when slicing inputs/labels.
nevt = in_df.shape[0]

In [None]:
#inspect
# Show only the first rows, using the notebook's rich table rendering (last
# expression of the cell) instead of printing the plain-text repr of the frame.
in_df.head()

Randomly chosen variables for testing:
- `mjj`: invariant mass of the 2 leading-pt jets
- `dEtajj`: pseudorapidity gap between the 2 leading jets
- `sumPtjj`: scalar sum of the transverse momenta of the 2 leading jets

In [8]:
# Names of the DataFrame columns used as network inputs.
var1 = 'mjj'
var2 = 'dEtajj'
var3 = 'sumPtjj'
# split into input (X) and output (Y) variables
# Both X and Y are cut to the first `nevt` rows *by position*. A label-based
# `.loc[:nevt-1]` slice only matches the positional slice used for Y when the
# index is the default 0..n-1; `.iloc` keeps the two aligned for any index.
X = in_df[[var1, var2, var3]].iloc[:nevt].values
# Target labels taken from the 'isSignal' column.
Y = in_df['isSignal'].values[:nevt]

Create a simple NN (following a default example):
- 2 hidden layers
- 32 nodes each

In [10]:
def create_model(n_features=3, n_nodes=32, dropout_rate=0.2):
  """Build and compile a small fully connected binary classifier.

  Architecture: two ReLU hidden layers of `n_nodes` units, each followed by
  a Dropout layer, and a single sigmoid output unit.

  Args:
    n_features: Number of input features fed to the first layer.
    n_nodes: Number of units in each of the two hidden layers.
    dropout_rate: Rate passed to each Dropout layer.

  Returns:
    A compiled Sequential model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).

  All arguments have defaults that reproduce the original fixed network, so
  the function can still be used as a zero-argument `build_fn`.
  """
  # create model
  model = Sequential()
  # Only the first layer needs to be told the input dimension; subsequent
  # layers take their input size from the previous layer's output.
  model.add(Dense(n_nodes, input_dim=n_features, activation='relu'))
  model.add(Dropout(dropout_rate))
  model.add(Dense(n_nodes, activation='relu'))
  model.add(Dropout(dropout_rate))
  # Single sigmoid unit for the binary output.
  model.add(Dense(1, activation='sigmoid'))
  # Compile model
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [11]:
# Fixed seed so the shuffled cross-validation folds are identical on every
# run. NOTE: the network's weight initialisation and dropout are not seeded
# here, so the scores can still fluctuate slightly between runs.
cv_seed = 42

# Pipeline steps: standardize the inputs, then train the Keras network.
# Because the scaler is inside the pipeline it is re-fitted on each training
# fold during cross-validation.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_model, epochs=10, batch_size=2048, verbose=0)),
]

pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=cv_seed)
# No `scoring` argument is given, so the estimator's own default score is
# used; NOTE(review): for KerasClassifier this is presumably the mean
# accuracy, so the "Efficiency" printed below should be read as accuracy.
results = cross_val_score(pipeline, X, Y, cv=kfold)
#et voila!
print("Training on {0} events".format(nevt))
print("Efficiency: %.2f (%.2f)" % (results.mean(), results.std()))

Training on 20000 events
Efficiency: 0.91 (0.00)
