## Baseline analysis workflow

In [None]:
%%time
## Import common python libraries
import sys
import time
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Import from root_numpy library
import root_numpy
from root_numpy import root2array, rec2array

# Import from root_pandas library
from root_pandas import read_root

# Import panda library
from pandas.tools import plotting
from pandas.tools.plotting import scatter_matrix
from pandas.core.index import Index
import pandas.core.common as com

# Import scipy
import scipy
from scipy.stats import ks_2samp
import scipy as sp
from scipy.stats import distributions

# Import itertools
import itertools
from itertools import cycle

# Import Jupyter
from IPython.core.interactiveshell import InteractiveShell

# Import scikit-learn
import sklearn
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_predict

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RandomizedLasso

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_selection import RFECV
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import (confusion_matrix, roc_auc_score, roc_curve, 
                             auc, average_precision_score, precision_score, 
                             brier_score_loss, recall_score, f1_score, log_loss, 
                             classification_report, precision_recall_curve)
from sklearn.dummy import DummyClassifier

from sklearn.externals import joblib
from sklearn import feature_selection

## Keras deep neural network library
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1, l2 #,WeightRegularizer
from keras.models import model_from_json
from keras.optimizers import SGD


# Import imblearn
import imblearn
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from collections import defaultdict, Counter

# python regular-expression
import re

# Scikit-learn decision-tree graph export
from sklearn.tree import export_graphviz

# Make random draws repeatable across runs of the notebook
seed = 7
np.random.seed(seed)

# Let every top-level expression in a cell display its value, not just the last
InteractiveShell.ast_node_interactivity = "all"

# Report the interpreter and library versions used for this session
print("Python version %s" % sys.version)
print("Sklearn version %s" % sklearn.__version__)
print("Root_numpy version %s" % root_numpy.__version__)
print("Numpy version %s" % np.__version__)
print("Scipy version %s" % scipy.__version__)
print("Pandas version %s" % pd.__version__)
print("Matplotlib version %s" % matplotlib.__version__)
print("Seaborn version %s" % sns.__version__)
print("Imblance version %s" % imblearn.__version__)

print(__doc__)

## Load data files

In [None]:
%%time
## Data loading function

def load(sig_filename, bkg_filename, category, features):
    """Read the signal and background samples into one labelled DataFrame.

    Parameters
    ----------
    sig_filename : str
        Path of the ROOT file holding the signal sample.
    bkg_filename : str
        Path of the ROOT file holding the background sample.
    category : str
        Key handed to ``read_root`` as its second argument for both files
        (presumably the tree name -- confirm against root_pandas).
    features : list of str
        Branch names to load as columns.

    Returns
    -------
    data : pandas.DataFrame
        Signal rows followed by background rows. A target column ``y`` is
        added (1 for signal, 0 for background) and the index is renumbered
        0..n-1 so that row labels are unique.
    """
    frames = []
    for filename, target in ((sig_filename, 1), (bkg_filename, 0)):
        frame = read_root([filename], category, columns=features)
        frame['y'] = target  # class label used as the training target
        frames.append(frame)

    # ignore_index: both inputs are numbered from 0, so a plain concat would
    # leave duplicate labels and make label-based lookups ambiguous.
    return pd.concat(frames, ignore_index=True)

In [None]:
%%time
## Read the signal and background samples into `data`

# Branches to read, written as one comma-separated block
branch_names = """mass_tag_tag_min_deltaR,median_mass_jet_jet,
    maxDeltaEta_tag_tag,mass_higgsLikeDijet,HT_tags,
    btagDiscriminatorAverage_tagged,mass_jet_tag_min_deltaR,
    mass_jet_jet_min_deltaR,mass_tag_tag_max_mass,maxDeltaEta_jet_jet,
    centrality_jets_leps,centrality_tags,globalTimesEventWeight""".split(",")

# Trim surrounding whitespace and turn blanks/dashes into underscores
features = [
    name.strip().replace(" ", "_").replace("-", "_")
    for name in branch_names
]

# Input files and the tree to read from each of them
tree_category = "event_mvaVariables_step7_cate4"
signal_sample = "combined/signalMC.root"
background_sample = "combined/backgroundMC.root"

data = load(signal_sample, background_sample, tree_category, features)

print("Total number of events: {}\nNumber of features: {}".format(data.shape[0], data.shape[1]))

# Keep an independent copy of the freshly loaded dataset for later use
df_archived = data.copy()

In [None]:
%%time
## Function to extract class label counts and percentage

def class_info(classes, labels=None):
    """Print and return the number of events per class.

    Parameters
    ----------
    classes : iterable
        Class value of every event (e.g. the ``y`` column).
    labels : dict, optional
        Maps each class value to its name. When omitted, the module-level
        ``class_label`` mapping is used, as before.

    Returns
    -------
    (n_signal, n_background) : tuple of int
        Counts for the names "signal" and "background"; a class that does
        not occur (including empty input) is reported as 0 instead of
        raising ``KeyError``.
    """
    if labels is None:
        labels = class_label  # looked up at call time, defined by the notebook

    counts = Counter(classes)
    total = float(sum(counts.values()))

    class_count = {}
    for cls, count in counts.items():
        name = labels[cls]
        class_count[name] = count
        print("%10s: %7d  =  % 5.1f%%" % (name, count, count / total * 100.0))

    return (class_count.get("signal", 0), class_count.get("background", 0))

In [None]:
%%time
## Report how many events belong to each class

# Numeric target value -> readable class name
class_label = dict([(0.0, "background"), (1.0, "signal")])
class_info(data["y"]);

In [None]:
%%time
## Separate the target series from the feature table

df_y = data.y
df_X = data.drop("y", axis=1)

In [None]:
%%time
# Event-weight totals per class; the built-in sum is kept so the printed
# figures are bit-for-bit what the repeated inline sums produced
background_weight_sum = sum(df_X["globalTimesEventWeight"][df_y<0.5])
signal_weight_sum = sum(df_X["globalTimesEventWeight"][df_y>0.5])
total_weight_sum = background_weight_sum + signal_weight_sum

print("background:  %s %s %%" % (background_weight_sum,
                                 background_weight_sum/total_weight_sum*100))
print("signal:  %s %s %%" % (signal_weight_sum,
                             signal_weight_sum/total_weight_sum*100))

## Statistical summary

In [None]:
%%time
## Statistical overview of the dataset

# Show every row and column, with floats in a compact 3-significant-digit form
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = '{:,.3g}'.format

print("Head:")
data.head()

print("Describe:")
data.describe()

print("Describe based on signal or data:")
data.groupby('y').describe()

print("Information:")
data.info()

# Mean event weight of each class
print("Signal weight average:")
print(data.groupby(["y"])['globalTimesEventWeight'].mean())

## Feature visualization: basic exploratory data analysis

In [None]:
%%time
## Plot signal and background distributions for some variables

def signal_background(data1, data2, column=None, grid=True,
                      xlabelsize=None, xrot=None, ylabelsize=None,
                      yrot=None, ax=None, sharex=False,
                      sharey=False, figsize=None,
                      layout=None, bins=10, **kwds):
    """Draw weighted histograms comparing each column of `data1` to `data2`.

    Both frames must contain a ``globalTimesEventWeight`` column; it supplies
    the per-event histogram weights. The weights of `data2` are scaled by a
    fixed factor of 20 so the second sample is visible next to the first.
    The legend labels `data1` as "background" and `data2` as "signal".

    data1: DataFrame
       First sample (labelled "background")
    data2: DataFrame
       Second sample (labelled "signal")
    column: string or sequence
       If passed, will be used to limit data to a subset of columns
    grid : boolean, default True
       Whether to show axis grid lines
    xlabelsize : int, default None
       If specified changes the x-axis label size
    xrot : float, default None
       rotation of x axis labels
    ylabelsize : int, default None
       If specified changes the y-axis label size
    yrot : float, default None
       rotation of y axis labels
    ax : matplotlib axes object, default None
    sharex : bool, if True, the X axis will be shared amongst all subplots.
    sharey : bool, if True, the Y axis will be shared amongst all subplots.
    figsize : tuple, default (12, 8)
       The size of the figure to create in inches
    layout: (optional) a tuple (rows, columns) for the layout of the histograms
    bins: integer, default 10
       Number of histogram bins to be used
    kwds : other plotting keyword arguments
       To be passed to hist function (``alpha`` defaults to 0.5)

    Returns the result of ``plt.show()``.
    """
    magnification = 20

    # Take the weights before any column sub-selection can drop them; plain
    # arrays are used so they can be filtered positionally alongside values.
    background_weight = data1["globalTimesEventWeight"].values
    signal_weight = data2["globalTimesEventWeight"].values * magnification

    kwds.setdefault("alpha", 0.5)

    # Honour a caller-supplied size; fall back to the previous fixed size
    if figsize is None:
        figsize = (12, 8)

    if column is not None:
        if not isinstance(column, (list, np.ndarray, Index)):
            column = [column]
        data1 = data1[column]
        data2 = data2[column]

    data1 = data1._get_numeric_data()
    data2 = data2._get_numeric_data()
    naxes = len(data1.columns)

    fig, axes = plotting._subplots(naxes=naxes,
                                   ax=ax,
                                   squeeze=False,
                                   sharex=sharex,
                                   sharey=sharey,
                                   figsize=figsize,
                                   layout=layout)
    xs = plotting._flatten(axes)

    for i, col in enumerate(com._try_sort(data1.columns)):
        ax = xs[i]

        # Drop missing values and the matching weights together; dropping
        # only the values would leave the two arrays with different lengths.
        keep1 = data1[col].notnull().values
        keep2 = data2[col].notnull().values

        # Common range so both samples share identical bin edges
        low = min(data1[col].min(), data2[col].min())
        high = max(data1[col].max(), data2[col].max())

        ax.hist(data1[col].values[keep1], weights=background_weight[keep1],
                bins=bins, histtype='stepfilled', range=(low, high), **kwds)
        ax.hist(data2[col].values[keep2], weights=signal_weight[keep2],
                bins=bins, histtype='stepfilled', range=(low, high), **kwds)
        ax.set_title(col)
        ax.legend(['background', 'signal (%s)' % (magnification)], loc='best')
        ax.set_facecolor('white')

        if grid:
            # Customize the major grid
            ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')

    plotting._set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,
                              ylabelsize=ylabelsize, yrot=yrot)
    fig.subplots_adjust(wspace=0.5, hspace=0.8)

    return plt.show()

In [None]:
%%time
## Overlay the background and signal distribution of every feature

background_events = data[data.y < 0.5]
signal_events = data[data.y > 0.5]

signal_background(background_events, signal_events,
                  bins=40, column=features);

In [None]:
%%time
# Event-weight distribution of the signal rows (y > 0)
data.loc[data.y > 0, "globalTimesEventWeight"].hist(bins=70)
plt.show()

# Event-weight distribution of the background rows (y < 1)
data.loc[data.y < 1, "globalTimesEventWeight"].hist(bins=70)
plt.show()


In [None]:
%%time
import os
    
def ensure_dir(directory):
    """Create `directory` (and missing parents) unless it already exists.

    Arguments:
    directory: name of directory.

    The directory is created first and an "already exists" error is
    tolerated afterwards, so two processes creating the same directory at
    the same time cannot trip over each other. Any other OSError (e.g.
    permission denied) is propagated.
    """
    import errno  # local import: keeps this helper self-contained

    try:
        os.makedirs(directory)
    except OSError as exc:
        # An existing path is not an error, matching the old exists() check
        if exc.errno != errno.EEXIST:
            raise

In [None]:
%%time
## Linear correlation matrix

def correlations(data, **kwds):
    """Plot and save the pairwise feature-correlation matrix of a sample.

    The matrix is drawn as an annotated heatmap in percent and written to
    ``plots/<label>_correlation_<n>_features.pdf``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column ``y``. ``y`` is only used to
        pick the title/file label ("signal" when every value is > 0.5,
        "background" when every value is < 0.5, otherwise "mixed") and is
        removed before the correlations are computed.
    kwds : dict
        Extra arguments passed on to ``DataFrame.corr()``. ``method``
        defaults to ``"spearman"`` when not given.

    Returns
    -------
    The result of ``plt.show()``.
    """
    # Rank correlation stays the default; an explicit method now applies to
    # the plotted matrix as well instead of being silently ignored.
    kwds.setdefault("method", "spearman")

    # Select signal or background label for plot title
    if (data["y"] > 0.5).all(axis=0):
        label = "signal"
    elif (data["y"] < 0.5).all(axis=0):
        label = "background"
    else:
        # Previously `label` stayed unbound here and the title line crashed
        label = "mixed"

    feature_data = data.drop("y", axis=1)

    # Rows are reversed so the self-correlations run along the anti-diagonal
    corr_percent = feature_data.corr(**kwds).iloc[::-1] * 100

    fig, ax1 = plt.subplots(ncols=1, figsize=(9, 8))

    # NOTE: the colour scale starts at 0, so negative correlations all get
    # the lightest colour; the annotation still shows their value.
    opts = {"annot": True,
            "ax": ax1,
            "vmin": 0, "vmax": 1*100,
            "annot_kws": {"size": 8},
            "cmap": plt.get_cmap("Blues", 20),
            }

    ax1.set_title("Correlations: " + label)

    sns.heatmap(corr_percent, **opts)

    # The heatmap takes its tick labels from the DataFrame's own index and
    # columns, so only their orientation is adjusted here. Overwriting the
    # x labels with a reversed list mislabelled the (unreversed) columns.
    plt.setp(ax1.get_yticklabels(), rotation=0)
    plt.setp(ax1.get_xticklabels(), rotation=70, ha="right")

    plt.tight_layout()

    # checks if directory exists and if not creates it
    out_dir = 'plots'
    ensure_dir(out_dir)
    fig.savefig(out_dir+'/'+label+'_correlation_{}_features.pdf'.format(len(feature_data.columns)))

    return plt.show()

In [None]:
%%time
## Correlation matrices of the features, one per class

# The event weight is not a physics feature, so it is left out of the
# matrix; `y` stays in because correlations() uses it to name the sample
unweighted = data.drop('globalTimesEventWeight', axis=1)
sig = unweighted[unweighted["y"] > 0.5]
bg = unweighted[unweighted["y"] < 0.5]

# One heatmap for signal, one for background
correlations(sig)
correlations(bg)

In [None]:
%%time
## Pairwise scatter plots of the features, coloured by class
#%matplotlib inline

sns.set(style="ticks", color_codes=True)

random.seed(seed)

# Event weights double as marker sizes below.
# NOTE(review): this array covers all rows while the grid draws one class at
# a time -- confirm that the sizes line up with the points of each class.
sample_weight = data["globalTimesEventWeight"].values

g = sns.PairGrid(data.drop(["globalTimesEventWeight"], axis=1),
                 hue="y", palette="GnBu_d",
                 hue_kws={"marker": ["o", "s"]})
_=g.map_diag(plt.hist)
_=g.map_offdiag(plt.scatter, s=sample_weight, alpha=0.7, edgecolor="white")
_=g.add_legend();

# Axis titles as they appear on the bottom row and on the left column
xlabels = [bottom_ax.xaxis.get_label_text() for bottom_ax in g.axes[-1,:]]
ylabels = [left_ax.yaxis.get_label_text() for left_ax in g.axes[:,0]]

# Repeat those titles on every panel of the grid
for i, xlabel in enumerate(xlabels):
    for j, ylabel in enumerate(ylabels):
        _=g.axes[j,i].xaxis.set_label_text(xlabel)
        _=g.axes[j,i].yaxis.set_label_text(ylabel)

plt.show()

#_ = sns.pairplot(data.drop(["globalTimesEventWeight"], axis=1), size=2.5, hue="y", #kind="reg",
#                 markers=["o", "s"], plot_kws={ "s": sample_weight, "alpha":0.7 })

## Keras Deep Neural Network modeling

In [None]:
%%time
# Create network with Keras: Function to create model, 
# required for KerasClassifier (model architecture)

def build_model(optimizer='rmsprop', init='glorot_uniform', dropout_rate=0.0,
                     learn_rate=0.01, momentum=0, input_dim=12):
    """Build and compile a small fully-connected binary classifier.

    Architecture: Dense(12, elu) -> Dropout -> Dense(8, elu) -> Dropout ->
    Dense(1, sigmoid), compiled with binary cross-entropy loss, an SGD
    optimizer and the accuracy metric.

    Parameters
    ----------
    optimizer : str
        Not used. The model is always compiled with
        ``SGD(lr=learn_rate, momentum=momentum)``; the argument is kept so
        existing callers and parameter grids keep working.
    init : str
        Weight initialisation passed to every Dense layer.
    dropout_rate : float
        Rate of the two Dropout layers.
    learn_rate : float
        Learning rate of the SGD optimizer.
    momentum : float
        Momentum of the SGD optimizer.
    input_dim : int, default 12
        Number of input features expected by the first layer.

    Returns
    -------
    model : keras.models.Sequential
        The compiled model.
    """
    model = Sequential()

    # First layer: 12 units fed by `input_dim` features
    model.add(Dense(12, input_dim=input_dim, init=init, activation='elu'))
    model.add(Dropout(dropout_rate))

    # Hidden layer with 8 units
    model.add(Dense(8, init=init, activation='elu'))
    model.add(Dropout(dropout_rate))

    # Single sigmoid output unit
    model.add(Dense(1, init=init, activation='sigmoid'))

    # Compile model; a separate name avoids shadowing the unused argument
    sgd = SGD(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

    return model

In [None]:
%%time
# Function to create the Keras network model,
# required for KerasClassifier (model architecture)

def create_model(optimizer='rmsprop', init='glorot_uniform', neurons=12,
                 dropout_rate=0.0, weight_constraint=0,
                 activation='elu',
                 lr=0.01, momentum=0, input_dim=None):
    """Build and compile a configurable fully-connected binary classifier.

    Architecture: Dense(neurons) -> Dropout -> Dense(8) -> Dropout ->
    Dense(1, sigmoid), compiled with binary cross-entropy loss and the
    accuracy metric.

    Parameters
    ----------
    optimizer : str or keras optimizer
        Optimizer handed to ``model.compile``.
    init : str
        Weight initialisation passed to every Dense layer.
    neurons : int
        Number of units in the first layer.
    dropout_rate : float
        Rate of the two Dropout layers.
    weight_constraint, lr, momentum
        Not used by this function; kept so existing callers and parameter
        grids keep working.
    activation : str
        Activation of the first and the hidden layer.
    input_dim : int, optional
        Number of input features. Defaults to `neurons`, which is what the
        function always used before; pass it explicitly when the layer
        width should differ from the number of features.

    Returns
    -------
    model : keras.models.Sequential
        The compiled model.
    """
    if input_dim is None:
        input_dim = neurons  # historical behaviour: width == feature count

    model = Sequential()

    # First layer: `neurons` units fed by `input_dim` features
    model.add(Dense(neurons, input_dim=input_dim, init=init, activation=activation))
    model.add(Dropout(dropout_rate))

    # Hidden layer with 8 units
    model.add(Dense(8, init=init, activation=activation))
    model.add(Dropout(dropout_rate))

    # Single sigmoid output unit
    model.add(Dense(1, init=init, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [None]:
%%time
# Classifier pipelines compared out-of-the-box in the plots below

# Set feature scaling type. Every pipeline gets its OWN scaler instance:
# a pipeline fits its steps in place, so one shared scaler object would be
# re-fitted by whichever pipeline was trained last.
make_scaler = RobustScaler
scaler = make_scaler()  # kept for code that refers to `scaler` directly

# create classifier for use in scikit-learn
#model = KerasClassifier(build_fn=create_model, nb_epoch=50, batch_size=3, verbose=0)

# prepare models: create a mapping of ML classifier name to algorithm.
# Each key, lower-cased, must equal the name make_pipeline gives the final
# step, because the plotting functions pass `<key.lower()>__sample_weight`.
pipe_classifiers = {
    'KerasClassifier':  make_pipeline(make_scaler(), KerasClassifier(build_fn=create_model, nb_epoch=50,
                                                                     batch_size=3, verbose=0)),
    'SVC':  make_pipeline(make_scaler(), SVC(probability=True, class_weight="balanced")),
    'LogisticRegression'    : make_pipeline(make_scaler(), LogisticRegression(class_weight="balanced")),
    'AdaBoostClassifier'    : make_pipeline(None,   AdaBoostClassifier()),
    'RandomForestClassifier': make_pipeline(None,   RandomForestClassifier(min_samples_leaf=10)),
    'DecisionTreeClassifier': make_pipeline(None,   DecisionTreeClassifier(min_samples_leaf=10,
                                                                           class_weight="balanced")),
    'GradientBoostingClassifier': make_pipeline(None,   GradientBoostingClassifier(min_samples_leaf=10)),
    'BaggingClassifier': make_pipeline(None,   BaggingClassifier(n_estimators=50, max_samples=100,
                                                                 max_features=12)),
    'ExtraTreesClassifier' :  make_pipeline(None, ExtraTreesClassifier(min_samples_leaf=10)),#,
    #'LinearDiscriminantAnalysis':  make_pipeline(make_scaler(), LinearDiscriminantAnalysis()),
    #'KNeighborsClassifier':  make_pipeline(make_scaler(), KNeighborsClassifier()),
    #'GaussianNB' :  make_pipeline(make_scaler(), GaussianNB()),
    #'MLPClassifier':  make_pipeline(make_scaler(), MLPClassifier()),
}

## Model performance measure

In [None]:
%%time
## Compute ROC curve and area under the curve

def roc_plot(models, X, y, n_folds=3, sample_weight_flag=True):
    """Fit every classifier on a training split and overlay their ROC curves.

    The data are split twice (33% held out each time); models are trained on
    the inner training part and evaluated on the inner test part. For each
    model a classification report, the test accuracy and the weighted ROC
    AUC are printed, and one curve of signal efficiency against background
    rejection is drawn.

    Parameters
    ----------
    models : dict
        Maps a display name to a fitted-in-place pipeline. The lower-cased
        name must be the pipeline's final step name, since sample weights
        are passed to ``fit`` as ``<name.lower()>__sample_weight``.
    X : pandas.DataFrame
        Features including the column ``globalTimesEventWeight``; that
        column is used as the sample weight and removed before fitting.
    y : pandas.Series
        Target values (0 = background, 1 = signal).
    n_folds : int
        Currently unused; kept for backward compatibility.
    sample_weight_flag : bool
        When True the training weights are passed to ``fit``.

    Returns
    -------
    The result of ``plt.show()``.
    """
    weight_col = "globalTimesEventWeight"

    # CPU-time clock: time.clock() where it exists only in old Pythons,
    # time.process_time() where clock() has been removed.
    timer = getattr(time, "process_time", None) or time.clock

    # Split data into a development and evaluation set (evaluation unused here)
    X_dev, _, y_dev, _ = train_test_split(X, y, test_size=0.33, random_state=42)

    # Split development set into a train and test set
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev,
                                                        test_size=0.33, random_state=seed)

    # Extract training and test weights, then remove them from the features
    sample_weight_train = X_train[weight_col].values
    sample_weight_test = X_test[weight_col].values
    X_train = X_train.drop(weight_col, axis=1)
    X_test = X_test.drop(weight_col, axis=1)

    # contains rates for ML classifiers
    fpr = {}
    tpr = {}
    roc_auc = {}

    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_facecolor('white')

    # Include random by chance 'luck' curve
    plt.plot([1, 0], [0, 1], '--', color=(0.1, 0.1, 0.1), label='Luck')

    # Loop through classifiers
    for name, model in models.items():

        print("\n\x1b[1;31mBuilding model "+name+" ...\x1b[0m")
        started = timer()
        if sample_weight_flag:
            model.fit(X_train, y_train, **{name.lower()+'__sample_weight': sample_weight_train})
        else:
            model.fit(X_train, y_train)
        print("\t%s fit time: %.3f" % (name, timer() - started))

        y_predicted = model.predict(X_test)

        # Statistics summary report. target_names follow the sorted label
        # order, i.e. 0 (background) first and 1 (signal) second.
        print(classification_report(y_test, y_predicted,
                                    target_names=['background', 'signal'],
                                    sample_weight=sample_weight_test))
        print("\tScore (i.e. accuracy) of test dataset: {:.5f}"
              .format(model.score(X_test, y_test)))

        if hasattr(model, "predict_proba"):
            # probability estimates of the positive class (as needed in the roc_curve function)
            decisions = model.predict_proba(X_test)[:, 1]
        else:  # use decision function
            decisions = model.decision_function(X_test)

        started = timer()
        fpr[name], tpr[name], _ = roc_curve(y_test, decisions,
                                            sample_weight=sample_weight_test)

        # Non-cross-validated AUROC
        roc_auc[name] = auc(fpr[name], tpr[name])
        print("\tAUC ROC score for %s: %.4f" % (name, roc_auc[name]))
        print("\tAUC ROC time:  %s" % (timer() - started))

    # color choices: https://css-tricks.com/snippets/css/named-colors-and-hex-equivalents/
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue',
                    'green', 'yellow', 'SlateBlue', 'DarkSlateGrey',
                    'CadetBlue', 'Chocolate', 'darkred', 'GoldenRod'])

    for name, color in zip(models, colors):
        signal_efficiency = tpr[name]  # true positive rate (tpr)
        # background rejection = 1 - background efficiency (false positive rate)
        background_rejection_rate = 1 - fpr[name]

        plt.plot(signal_efficiency, background_rejection_rate, color=color, lw=2,
                 label='%s (AUC = %0.3f)' % (name, roc_auc[name]))

    # Plot all ROC curves
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Signal Efficiency (True Positive Rate)')
    plt.ylabel('Background Rejection Rate (1- False Positive Rate)')
    plt.title("Receiver operating characteristic ({} events)".format(X.shape[0]))
    leg = plt.legend(loc="lower left", frameon=True, fancybox=True, fontsize=10)  # loc='best'
    frame = leg.get_frame()
    frame.set_edgecolor('w')
    frame.set_facecolor('White')

    return plt.show()

In [None]:
%%time
## Compare the out-of-the-box classifiers by their ROC curves

roc_plot(models=pipe_classifiers, X=df_X, y=df_y)

In [None]:
%%time
## Define precision-recall curve

def plot_PR_curve(classifier, X, y, n_folds=5):
    """Plot a precision/recall curve for every classifier.

    Each classifier is trained and evaluated on `n_folds` stratified folds.
    The curve drawn for a classifier is the weighted precision/recall curve
    of its last fold; the value in the legend is the mean unweighted average
    precision over that classifier's folds.

    Parameters
    ----------
    classifier : dict
        Maps a display name to a pipeline. The lower-cased name must be the
        pipeline's final step name, since sample weights are passed to
        ``fit`` as ``<name.lower()>__sample_weight``.
    X : pandas.DataFrame
        Features including the column ``globalTimesEventWeight``; it is used
        as the sample weight and removed before fitting. `X` is not modified.
    y : pandas.Series or array
        Target values (0 = background, 1 = signal), row-aligned with `X`.
    n_folds : int
        Number of stratified folds.

    Returns
    -------
    The result of ``plt.show()``.
    """
    weight_col = "globalTimesEventWeight"

    # Positional arrays: fold indices from the splitter are positions, not
    # index labels, so a label-based `y[train]` can pick the wrong rows.
    y_values = np.asarray(y)

    # Convert negative weights to 1.0 (or else average_precision_score
    # crashes), on a private Series so the caller's frame is left untouched.
    weights = X[weight_col].map(lambda w: 1.0 if w < 0.0 else w).values
    feature_frame = X.drop(weight_col, axis=1)

    # Customize the major grid
    fig, ax = plt.subplots()
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_facecolor('white')

    # Calculate the random luck for PR: the weighted signal fraction
    # (above the constant line is a classifier that is well modeled)
    signal_count = weights[y_values > 0.5].sum()
    background_count = weights[y_values < 0.5].sum()
    ratio = float(signal_count)/float(signal_count + background_count)

    # Same seeded folds for every classifier
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # Loop through classifiers
    for name, model in classifier.items():

        # Average precision per fold, restarted for each classifier so the
        # legend value is not contaminated by the previous classifiers
        avg_scores = []

        for train, test in skf.split(feature_frame, y_values):

            model.fit(feature_frame.iloc[train], y_values[train],
                      **{name.lower()+'__sample_weight': weights[train]})

            test_samples = feature_frame.iloc[test]
            if hasattr(model, "predict_proba"):
                probas_ = model.predict_proba(test_samples)[:, 1]
            else:  # use decision function
                probas_ = model.decision_function(test_samples)

            # Compute (weighted) precision recall curve
            precision, recall, _ = precision_recall_curve(y_values[test],
                                                          probas_, pos_label=1,
                                                          sample_weight=weights[test])

            # Area under the precision-recall curve (AUCPR), computed with
            # unit weights exactly as before
            avg_scores.append(average_precision_score(y_values[test], probas_))

        plt.plot(recall, precision, lw=1,
                 label='{0} (auc = {1:0.2f})'.format(name, np.mean(avg_scores)))

    # Horizontal reference line at the signal fraction
    plt.plot([0, 1], [ratio, ratio], '--', color=(0.1, 0.1, 0.1),
             label='Luck (auc = {0:0.2f})'.format(ratio))

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-recall curve')
    plt.legend(loc="upper right")

    return plt.show()

In [None]:
#%%time
# Precision-recall comparison of the out-of-the-box classifiers (3 folds)

plot_PR_curve(classifier=pipe_classifiers, X=df_X, y=df_y, n_folds=3)

## Overfitting evaluation

### The Kolmogorov-Smirnov statistic

We perform a two-sided asymptotic Kolmogorov-Smirnov test in which the null hypothesis stipulates that two independent samples are drawn from the same continuous parent distribution. If the K-S statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same. 

The null hypothesis is rejected at significance level $\alpha$ if ${\displaystyle D_{n, n'}>c(\alpha ){\sqrt {\frac {n + n'}{n \cdot n'}}}}$, where $n$ and $n'$ are the sizes of the two samples, respectively, and

${\displaystyle c\left(\alpha \right)={\sqrt {-{\frac {1}{2}}\ln \left({\frac {\alpha }{2}}\right)}}}$

We test at the 95% confidence level, i.e. $\alpha=0.05$, which corresponds to $c(0.05) \approx 1.36$.

For signal distributions,

${\displaystyle D_{n_s, n'_s}>c(\alpha ){\sqrt {\frac {n_s + n'_s}{n_s \cdot n'_s}}}}$, where $n_s$ and $n'_s$ are the sizes of the training and test signal samples, respectively.

For background distributions,

${\displaystyle D_{n_b, n'_b}>c(\alpha ){\sqrt {\frac {n_b + n'_b}{n_b \cdot n'_b}}}}$, where $n_b$ and $n'_b$ are the sizes of the training and test background samples, respectively.

In [None]:
%%time
# Source:
#       - https://github.com/scipy/scipy/blob/v0.14.0/scipy/stats/stats.py#L3809
#       - https://stackoverflow.com/questions/40044375/how-to-calculate-the-kolmogorov-smirnov-statistic-between-two-weighted-samples

def ks_weighted_2samp(data1, data2, wei1, wei2, alpha = 0.05):
    """
    Compute the Kolmogorov-Smirnov statistic on two weighted samples.

    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution. The empirical
    CDFs are built from the normalised cumulative weights, and the sums of
    the weights play the role of the sample sizes in the asymptotic
    formulae. A short summary (statistic, critical value and decision at
    level ``alpha``) is printed as a side effect.

    Parameters
    ----------
    data1, data2 : array-like, 1-D
        two arrays of sample observations assumed to be drawn from a
        continuous distribution, sample sizes can be different

    wei1, wei2 : array-like, 1-D
        sample weights, one per observation of data1 / data2

    alpha : float
        significance level used for the printed decision
        (0.05 corresponds to a 95% confidence level)

    Returns
    -------
    D : float
        KS statistic (largest absolute distance between the weighted CDFs)
    p-value : float
        two-tailed asymptotic p-value
    """

    # Accept any array-like (lists included): fancy indexing below needs arrays.
    data1, data2 = map(np.asarray, (data1, data2))
    wei1, wei2 = map(np.asarray, (wei1, wei2))

    # "Sample sizes" are the sums of the weights. (This equals the total of a
    # weighted histogram over the full data range, without building one.)
    n1 = np.sum(wei1)
    n2 = np.sum(wei2)

    # Sort observations and carry the weights along.
    ix1 = np.argsort(data1)
    ix2 = np.argsort(data2)

    data1 = data1[ix1]
    data2 = data2[ix2]

    wei1 = wei1[ix1]
    wei2 = wei2[ix2]

    data_all = np.concatenate([data1, data2])

    # Weighted empirical CDFs with a leading 0 so that index k means
    # "fraction of weight carried by the k smallest observations".
    cwei1 = np.hstack([0., np.cumsum(wei1) * 1. / n1])
    cwei2 = np.hstack([0., np.cumsum(wei2) * 1. / n2])

    # Evaluate both CDFs at every observed point (plain 1-D index arrays).
    cdf1we = cwei1[np.searchsorted(data1, data_all, side='right')]
    cdf2we = cwei2[np.searchsorted(data2, data_all, side='right')]

    # Note: d absolute not signed distance
    d = np.max(np.absolute(cdf1we - cdf2we))

    en = np.sqrt(n1 * n2 / float(n1 + n2))

    try:
        prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except (ArithmeticError, ValueError):
        # Degenerate effective sample size: fall back to "cannot reject".
        prob = 1.0

    # Critical value of the statistic at significance level alpha.
    c_alpha = (-0.5 * np.log(alpha / 2.)) ** (0.5)
    k_alpha = c_alpha / en

    # Single-argument print calls yield the same text on Python 2 and 3.
    print("\n==============================")
    print("Summary Report:")
    print("KS(data) value:  %s" % d)
    print("KS(null) value:  %s" % k_alpha)

    if d > k_alpha:
        print("KS test:  True  (null-hypothesis rejected)")
    else:
        print("KS test:  False  (null-hypothesis not rejected)")

    return d, prob

In [None]:
%%time
def compute_cdf(ordered_weights):
    """Return the symmetrized CDF for weights already sorted by value.

    The weights are expected to sum to 1. Each entry is the running total
    up to and including that weight, minus half of the weight itself,
    i.e. F(x) = 1/2 (F(x-0) + F(x+0)).
    """
    running_total = np.cumsum(ordered_weights)
    half_step = 0.5 * ordered_weights
    return running_total - half_step

In [None]:
%%time
def ks_2samp_weighted(data1, data2, weights1, weights2):
    """Kolmogorov-Smirnov distance, almost the same as ks2samp from scipy.stats, but this version supports weights.

    The distance is evaluated on the symmetrized CDFs returned by
    ``compute_cdf``, on the grid of all distinct observed values.

    :param data1: array-like of shape [n_samples1]
    :param data2: array-like of shape [n_samples2]
    :param weights1: None (uniform weights) or array-like of shape [n_samples1]
    :param weights2: None (uniform weights) or array-like of shape [n_samples2]
    :return: float, Kolmogorov-Smirnov distance.
    """
    data1 = np.asarray(data1)
    data2 = np.asarray(data2)

    # The docstring allows None: treat it as equal weight per observation.
    if weights1 is None:
        weights1 = np.ones(len(data1))
    if weights2 is None:
        weights2 = np.ones(len(data2))

    # Force float arrays so that integer weights are not floor-divided
    # (Python 2 semantics) and plain lists are accepted.
    weights1 = np.asarray(weights1, dtype=float)
    weights2 = np.asarray(weights2, dtype=float)
    weights1 = weights1 / np.sum(weights1)
    weights2 = weights2 / np.sum(weights2)

    # Common grid: every distinct value seen in either sample.
    x = np.unique(np.concatenate([data1, data2]))

    inds1 = np.searchsorted(x, data1)
    inds2 = np.searchsorted(x, data2)

    # Total normalised weight sitting on each grid point.
    w1 = np.bincount(inds1, weights=weights1, minlength=len(x))
    w2 = np.bincount(inds2, weights=weights2, minlength=len(x))

    F1 = compute_cdf(w1)
    F2 = compute_cdf(w2)

    return np.max(np.abs(F1 - F2))

In [None]:
## Defined overfitting plot
from matplotlib.ticker import MultipleLocator, AutoMinorLocator
#plt.style.use(['white_background'])
plt.style.use('seaborn-white')

def compare_train_test(clf, X_in, y_in, bins=50):
    """Fit ``clf`` and overlay its output distributions on train and test data.

    The input is split into development/evaluation sets and the development
    set into train/test sets. ``clf`` is fitted on the train set with the
    per-event weights, then the weighted, unit-normalised histograms of the
    classifier output are drawn for signal and background: bars for the
    train set, points with error bars for the test set. The plot title
    reports the statistic and p-value returned by ``ks_weighted_2samp`` for
    train vs. test, separately for signal and background.

    Parameters
    ----------
    clf : estimator
        If it has a ``steps`` attribute, the display name and the fit
        keyword are derived from the class of its second step; otherwise
        from the class of ``clf`` itself.
    X_in : table with a "globalTimesEventWeight" column
        Features plus the per-event weight column (dropped before fitting).
        Accessed via column indexing and ``.drop(..., axis=1)``.
    y_in : labels
        Entries > 0.5 are treated as signal, entries < 0.5 as background.
    bins : int
        Number of histogram bins.

    Returns
    -------
    The return value of ``plt.show()``.
    """

    # Display name: class of the second pipeline step, or of clf itself
    if hasattr(clf, "steps"):
        name = clf.steps[1][1].__class__.__name__
    else:
        name = clf.__class__.__name__

    # Split data into a development and evaluation set
    X_dev, X_eval, y_dev, y_eval = train_test_split(X_in, y_in, 
                                                    test_size=0.33, random_state=42)
    # Split development set into a train and test set
    # NOTE(review): `seed` is not defined in this function - presumably a notebook-level global
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, 
                                                       test_size=0.33, random_state=seed)

    sample_weight_train = X_train["globalTimesEventWeight"].values

    # Extract signal and background weights for training set
    signal_sample_weight_train = X_train["globalTimesEventWeight"][y_train>0.5].values
    background_sample_weight_train = X_train["globalTimesEventWeight"][y_train<0.5].values

    # Same for the test set
    signal_sample_weight_test = X_test["globalTimesEventWeight"][y_test>0.5].values
    background_sample_weight_test = X_test["globalTimesEventWeight"][y_test<0.5].values

    # The weight column must not be used as a training feature
    X_train = X_train.drop('globalTimesEventWeight', axis=1, inplace=False)
    X_test = X_test.drop('globalTimesEventWeight', axis=1, inplace=False)

    # use subplot to extract axis to add ks and p-value to plot
    fig, ax = plt.subplots()

    # Customize the major grid
    ax.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax.set_facecolor('white')

    # Pipelines receive the weights through a "<step>__sample_weight" keyword;
    # this assumes the step is named after the lower-cased class name - TODO confirm
    if name != "CalibratedClassifierCV":
        clf.fit(X_train, y_train, **{name.lower()+'__sample_weight': sample_weight_train})
    else:
        clf.fit(X_train, y_train, **{'sample_weight': sample_weight_train})

    # Bin edges: span of the decision function over the full input when
    # available, otherwise the [0, 1] probability range
    if hasattr(clf,"decision_function"):
        d = clf.decision_function(X_in.drop('globalTimesEventWeight', axis=1, inplace=False))
        bin_edges_low_high = np.linspace(min(d), max(d), bins + 1)
    else:
        bin_edges_low_high = np.linspace(0., 1., bins + 1)

    # decisions ends up as [train signal, train background, test signal, test background]
    decisions = []
    for X, y in ((X_train, y_train), (X_test, y_test)):

        if hasattr(clf,"decision_function"): #hasattr(clf,"predict_proba")
            label_name = "Decision Function"
            d1 = clf.decision_function(X[y>0.5]).ravel()
            d2 = clf.decision_function(X[y<0.5]).ravel()
            # NOTE(review): this value of d is not used afterwards
            d = clf.decision_function(X)
        else:
            label_name = "Prediction Probability"
            d1 = clf.predict_proba(X[y>0.5])[:, 1]
            d2 = clf.predict_proba(X[y<0.5])[:, 1]
            #d1 = cross_val_predict(clf, X[y>0.5], y, cv=3, method='predict_proba', n_jobs=-1)[:, 1]
            #d2 = cross_val_predict(clf, X[y<0.5], y, cv=3, method='predict_proba', n_jobs=-1)[:, 1]
                                   
            
        decisions += [d1, d2]

    width = np.diff(bin_edges_low_high)

    # Training signal and background histograms
    hist_sig_train, bin_edges = np.histogram(decisions[0], bins=bin_edges_low_high,
                                             weights=signal_sample_weight_train)

    # Normalise to unit sum so train and test shapes are comparable
    hist_sig_train = hist_sig_train/sum(hist_sig_train)

    plt.bar(bin_edges[:-1], hist_sig_train, width=width, color='r', alpha=0.5, 
            label='signal (train)')

    hist_bkg_train, bin_edges = np.histogram(decisions[1], bins=bin_edges_low_high,
                                             weights=background_sample_weight_train)

    hist_bkg_train = hist_bkg_train/sum(hist_bkg_train)

    plt.bar(bin_edges[:-1], hist_bkg_train, width=width, 
            color='steelblue', alpha=0.5, label='background (train)')


    # Test signal and background histograms
    hist_sig_test, bin_edges = np.histogram(decisions[2], bins=bin_edges_low_high,
                                            weights=signal_sample_weight_test)

    hist_sig_test = hist_sig_test/sum(hist_sig_test)
    # Error bars: sqrt of the bin content rescaled to the number of test
    # entries, then scaled back to the normalised histogram
    scale = len(decisions[2]) / sum(hist_sig_test)
    err = np.sqrt(hist_sig_test * scale) / scale
    center = (bin_edges[:-1] + bin_edges[1:]) / 2

    plt.errorbar(center, hist_sig_test, yerr=err, fmt='o', c='r', label='signal (test)')

    hist_bkg_test, bin_edges = np.histogram(decisions[3], bins=bin_edges_low_high,
                                            weights=background_sample_weight_test)

    hist_bkg_test = hist_bkg_test/sum(hist_bkg_test)
    scale = len(decisions[3]) / sum(hist_bkg_test)
    err = np.sqrt(hist_bkg_test * scale) / scale

    plt.errorbar(center, hist_bkg_test, yerr=err, fmt='o', c='steelblue', #range=low_high,
                 label='background (test)')

    # Estimate ks-test and p-values as an indicator of overtraining of fit model
    s_ks, s_pv = ks_weighted_2samp(decisions[0], decisions[2], 
                                   signal_sample_weight_train, signal_sample_weight_test)
    b_ks, b_pv = ks_weighted_2samp(decisions[1], decisions[3], 
                                   background_sample_weight_train, background_sample_weight_test)  
    
    # Cross-check output: both weighted KS implementations on the signal
    # sample. NOTE(review): the first call repeats the signal computation above.
    print "[CC]", ks_weighted_2samp(decisions[0], decisions[2], signal_sample_weight_train, signal_sample_weight_test)
    
    print "[EC]", ks_2samp_weighted(decisions[0], decisions[2], signal_sample_weight_train, signal_sample_weight_test)    
    
    ax.set_title("Classifier: %s\nsignal (background) ks: %f (%f)\n p-value: %f (%f)" 
                 % (name, s_ks, b_ks, s_pv, b_pv))

    plt.xlabel(label_name)
    plt.ylabel("Arbitrary units")
    plt.legend(loc='best')

    # for the minor ticks, use no labels; default NullFormatter
    plt.tick_params(which='both', width=2)
    plt.tick_params(which='major', length=7, color='gray')
    plt.tick_params(which='minor', length=4, color='gray')
    #minorLocator = MultipleLocator(5)
    minorLocator = AutoMinorLocator()
    ax.xaxis.set_minor_locator(minorLocator)
    
    return plt.show()

In [None]:
%%time
## Overfitting evaluation
bins=50
# Uncalibrated model predictions
model = pipe_classifiers["GradientBoostingClassifier"]
compare_train_test(model, df_X, df_y, bins=bins)

# Calibrated with sigmoid calibration (method='sigmoid', 5-fold CV)
# NOTE(review): the variable is named model_isotonic but method='sigmoid' is
# passed - confirm which calibration is intended
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
compare_train_test(model_isotonic, df_X, df_y, bins=bins)

In [None]:
%%time
## Overfitting evaluation
bins=50
# Uncalibrated model predictions
model = pipe_classifiers["AdaBoostClassifier"]
compare_train_test(model, df_X, df_y, bins=bins)

# Calibrated with sigmoid calibration (method='sigmoid', 5-fold CV)
# NOTE(review): the variable is named model_isotonic but method='sigmoid' is
# passed - confirm which calibration is intended
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
compare_train_test(model_isotonic, df_X, df_y, bins=bins)

In [None]:
%%time
## Overfitting evaluation
bins=25
# Uncalibrated model predictions
model = pipe_classifiers["SVC"]
compare_train_test(model, df_X, df_y, bins=bins)

# Calibrated with sigmoid calibration (method='sigmoid', 5-fold CV)
# NOTE(review): the variable is named model_isotonic but method='sigmoid' is
# passed - confirm which calibration is intended
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
compare_train_test(model_isotonic, df_X, df_y, bins=bins)

In [None]:
%%time
## Overfitting evaluation
bins=50
# Uncalibrated model predictions
model = pipe_classifiers["RandomForestClassifier"]
compare_train_test(model, df_X, df_y, bins=bins)

# Calibrated with sigmoid calibration (method='sigmoid', 5-fold CV)
# NOTE(review): the variable is named model_isotonic but method='sigmoid' is
# passed - confirm which calibration is intended
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
compare_train_test(model_isotonic, df_X, df_y, bins=bins)

In [None]:
%%time
## Overfitting evaluation
bins=50
# Uncalibrated model predictions (sample_weight not implemented)
model = pipe_classifiers["ExtraTreesClassifier"]
compare_train_test(model, df_X, df_y, bins=bins)

# Calibrated with sigmoid calibration (method='sigmoid', 5-fold CV)
# NOTE(review): the variable is named model_isotonic but method='sigmoid' is
# passed - confirm which calibration is intended
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
compare_train_test(model_isotonic, df_X, df_y, bins=bins)

In [None]:
%%time
## Overfitting evaluation
bins=50
# Uncalibrated model predictions (sample_weight not implemented)
model = pipe_classifiers["BaggingClassifier"]
compare_train_test(model, df_X, df_y, bins=bins)

# Calibrated with sigmoid calibration (method='sigmoid', 5-fold CV)
# NOTE(review): the variable is named model_isotonic but method='sigmoid' is
# passed - confirm which calibration is intended
model_isotonic = CalibratedClassifierCV(model, cv=5, method='sigmoid')
compare_train_test(model_isotonic, df_X, df_y, bins=bins)

In [None]:
## Overfitting evaluation

# NOTE: Classifiers that do not implement sample_weight:
#      - LinearDiscriminantAnalysis, KNeighborsClassifier, GaussianNB, DecisionTreeClassifier


## Probability calibration

In [None]:
%%time
## Calibration curve (reliability curve)

def plot_calibration_curve(est, X, y, fig_index, n_bins=10):
    """Plot calibration curve for est w/o and with calibration.

    ``est`` is fitted as-is and wrapped in isotonic and sigmoid
    ``CalibratedClassifierCV`` (cv=2). For each of the three models the
    Brier score, precision, recall and F1 on the test split are printed,
    the reliability curve is drawn on the upper axes and the weighted
    histogram of predicted values on the lower axes.

    Parameters
    ----------
    est : pipeline
        Must expose ``steps``; the name of its second step labels the curves
        and the lower-cased class name of that step builds the
        "<step>__sample_weight" fit keyword.
    X : table with a "globalTimesEventWeight" column
        Features plus the per-event weight column (dropped before fitting).
    y : labels
        ``y.max()`` is used as the positive label for the Brier score.
    fig_index : int
        Matplotlib figure number to draw on.
    n_bins : int
        Number of bins for both the reliability curve and the histogram.
    """

    # Split data into a development and evaluation set
    X_dev, X_eval, y_dev, y_eval = train_test_split(X, y,
                                                    test_size=0.33, random_state=42)
    # Split development set into a train and test set
    # (`seed` is expected to be defined at notebook level)
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                        random_state=seed)
    # Extract sample weights
    sample_weight_train = X_train["globalTimesEventWeight"].values
    sample_weight_test = X_test["globalTimesEventWeight"].values

    # The weight column must not be used as a training feature
    X_train = X_train.drop('globalTimesEventWeight', axis=1, inplace=False)
    X_test = X_test.drop('globalTimesEventWeight', axis=1, inplace=False)

    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

    plt.figure(fig_index, figsize=(6, 6))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "--", label="Perfectly calibrated")

    step_name = est.steps[1][0]
    models = [(est, step_name),
              (isotonic, step_name + '_Isotonic'),
              (sigmoid, step_name + '_Sigmoid')]  # Also called Platt Scaling

    for clf, name in models:

        # The calibration wrapper takes sample_weight directly; a bare
        # pipeline needs it routed to its final step.
        if clf.__class__.__name__ == "CalibratedClassifierCV":
            clf.fit(X_train, y_train, sample_weight=sample_weight_train)
        else:
            weight_key = clf.steps[1][1].__class__.__name__.lower() + '__sample_weight'
            clf.fit(X_train, y_train, **{weight_key: sample_weight_train})

        y_pred = clf.predict(X_test)

        if hasattr(clf, "predict_proba"):
            prob_pos = clf.predict_proba(X_test)[:, 1]
        else:  # use decision function, min-max scaled into [0, 1]
            prob_pos = clf.decision_function(X_test)
            prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

        clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max())
        print("\n\x1b[1;31mclassifier %s:\x1b[0m" % name)
        print("\tBrier: %1.3f" % (clf_score))
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))

        # Honour the n_bins argument here as well (it used to be fixed at 10,
        # so only the histogram below followed the caller's choice).
        fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos,
                                                                        n_bins=n_bins)

        ax1.plot(mean_predicted_value, fraction_of_positives, "o-",
                 label="%s (%1.3f)" % (name, clf_score))

        ax2.hist(prob_pos, range=(0, 1), bins=n_bins, label=name, weights=sample_weight_test,
                 histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots  (reliability curve)')

    # Customize the major grid
    ax1.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax1.set_facecolor('white')

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="best", ncol=1)

    # Customize the major grid
    ax2.grid(which='major', linestyle='-', linewidth='0.2', color='gray')
    ax2.set_facecolor('white')

    plt.tight_layout()
    plt.show()

In [None]:
%%time
## Plot reliability curve (i.e. calibration curve)

plot_calibration_curve(make_pipeline(None, SVC()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["SVC"],   df_X, df_y, 2)

In [None]:
%%time
## Plot reliability curve (i.e. calibration curve)

plot_calibration_curve(make_pipeline(None, LogisticRegression()), df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["LogisticRegression"],   df_X, df_y, 2)

In [None]:
%%time
## Plot reliability curve (i.e. calibration curve)

plot_calibration_curve(pipe_classifiers["AdaBoostClassifier"],  df_X, df_y, 2)
plot_calibration_curve(pipe_classifiers["GradientBoostingClassifier"], df_X, df_y, 2)

In [None]:
## Plot reliability curve (i.e. calibration curve)

#plot_calibration_curve(pipe_classifiers["DecisionTreeClassifier"], df_X, df_y, 2) # problem with predict
#plot_calibration_curve(pipe_classifiers["RandomForestClassifier"],  df_X, df_y, 2) # problem with predict

In [None]:
## Plot reliability curve (i.e. calibration curve)

#plot_calibration_curve(make_pipeline(None, GaussianNB()), df_X, df_y, 2)
#plot_calibration_curve(pipe_classifiers["GaussianNB"],   df_X, df_y, 2)

In [None]:
## Plot reliability curve (i.e. calibration curve)

#plot_calibration_curve(make_pipeline(None, LinearDiscriminantAnalysis()), df_X, df_y, 2)
#plot_calibration_curve(pipe_classifiers["LinearDiscriminantAnalysis"],  df_X, df_y, 2)

In [None]:
## Plot reliability curve (i.e. calibration curve)

#plot_calibration_curve(make_pipeline(None, KNeighborsClassifier()), df_X, df_y, 2)
#plot_calibration_curve(pipe_classifiers["KNeighborsClassifier"],  df_X, df_y, 2)

In [None]:
## Plot reliability curve (i.e. calibration curve)

#plot_calibration_curve(make_pipeline(None, MLPClassifier()), df_X, df_y, 2)
#plot_calibration_curve(pipe_classifiers["MLPClassifier"],  df_X, df_y, 2)

In [None]:
%%time
## Confusion matrix plot

def plot_confusion_matrix(clf, X, y, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Fit a pipeline and plot the confusion matrix of its test predictions.

    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    clf : pipeline
        Must expose ``steps``; the lower-cased class name of its second step
        builds the "<step>__sample_weight" fit keyword.
    X : table with a "globalTimesEventWeight" column
        Features plus the per-event weight column (dropped before fitting).
    y : labels
        True class labels.
    classes : sequence of str
        Tick labels, one per class.
    normalize : bool
        If True each row is divided by its sum before plotting.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap for the matrix image.

    Returns
    -------
    The return value of ``plt.show()``.
    """
    # Split data into a development and evaluation set
    X_dev, X_eval, y_dev, y_eval = train_test_split(X, y,
                                                    test_size=0.33, random_state=42)
    # Split development set into a train and test set
    # (`seed` is expected to be defined at notebook level)
    X_train, X_test, y_train, y_test = train_test_split(X_dev, y_dev, test_size=0.33,
                                                        random_state=seed)

    sample_weight = X_train["globalTimesEventWeight"].values

    # The weight column must not be used as a training feature
    X_train = X_train.drop('globalTimesEventWeight', axis=1, inplace=False)
    X_test = X_test.drop('globalTimesEventWeight', axis=1, inplace=False)

    weight_key = clf.steps[1][1].__class__.__name__.lower() + '__sample_weight'
    classifier = clf.fit(X_train, y_train, **{weight_key: sample_weight})

    y_pred = classifier.predict(X_test)

    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Normalise BEFORE drawing so that the colours, the printed numbers and
    # the text-colour threshold all refer to the same matrix.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for fractions, plain integers for counts
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False, which='both')

    return plt.show()

In [None]:
%%time
## Set environment

np.set_printoptions(precision=2)
class_names = ['Background', 'Signal']

In [None]:
%%time
## Generate confusion matrix plot

# Plot non-normalized confusion matrix
clf = pipe_classifiers["GradientBoostingClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Gradient-Boosting"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot normalized confusion matrix
clf = pipe_classifiers["GradientBoostingClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Gradient-Boosting"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot non-normalized confusion matrix
clf = pipe_classifiers["AdaBoostClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Adaptive-Boosting"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot normalized confusion matrix
clf = pipe_classifiers["AdaBoostClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Adaptive-Boosting"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot non-normalized confusion matrix
clf = pipe_classifiers["SVC"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Support Vector"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot normalized confusion matrix
clf = pipe_classifiers["SVC"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Support Vector"))

In [None]:
%%time
# Plot non-normalized confusion matrix
clf = pipe_classifiers["LogisticRegression"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Logistic Regression"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot normalized confusion matrix
clf = pipe_classifiers["LogisticRegression"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Logistic Regression"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot non-normalized confusion matrix
# (the title says Random Forest, so use the Random Forest pipeline rather
# than repeating the Gradient-Boosting one)
clf = pipe_classifiers["RandomForestClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Random Forest"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot normalized confusion matrix
# (the title says Random Forest, so use the Random Forest pipeline rather
# than repeating the Gradient-Boosting one)
clf = pipe_classifiers["RandomForestClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Random Forest"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot non-normalized confusion matrix
clf = pipe_classifiers["DecisionTreeClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
                      title="Classifier: %s\nConfusion matrix, without normalization"%("Decision Tree"))

In [None]:
%%time
## Generate confusion matrix plot

# Plot normalized confusion matrix
clf = pipe_classifiers["DecisionTreeClassifier"]
plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
                      title="Classifier: %s\nNormalized confusion matrix"%("Decision Tree"))

In [None]:
# Plot non-normalized confusion matrix
#clf = pipe_classifiers["LinearDiscriminantAnalysis"]
#plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
#                      title="Classifier: %s\nConfusion matrix, without normalization"%("Linear Discriminant Analysis"))

In [None]:
# Plot normalized confusion matrix
#clf = pipe_classifiers["LinearDiscriminantAnalysis"]
#plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
#                      title="Classifier: %s\nNormalized confusion matrix"%("Linear Discriminant Analysis"))

In [None]:
# Plot non-normalized confusion matrix
#clf = pipe_classifiers["GaussianNB"]
#plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
#                      title="Classifier: %s\nConfusion matrix, without normalization"%("Guassian Naive Bayes"))

In [None]:
# Plot normalized confusion matrix
#clf = pipe_classifiers["GaussianNB"]
#plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
#                      title="Classifier: %s\nNormalized confusion matrix"%("Gaussian Naive Bayes"))

In [None]:
# Plot non-normalized confusion matrix
#clf = pipe_classifiers["KNeighborsClassifier"]
#plot_confusion_matrix(clf, df_X, df_y, classes=class_names,
#                      title="Classifier: %s\nConfusion matrix, without normalization"%("K-Nearest Neighbor"))

In [None]:
# Plot normalized confusion matrix
#clf = pipe_classifiers["KNeighborsClassifier"]
#plot_confusion_matrix(clf, df_X, df_y, classes=class_names, normalize=True,
#                      title="Classifier: %s\nNormalized confusion matrix"%("K-Nearest Neighbor"))

In [None]:
%%time
# Learning curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy; forwarded
        unchanged to ``learning_curve``.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Relative or absolute numbers of training examples used to generate
        the curve; forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The return value of ``plt.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # Forward the caller's cv / n_jobs / train_sizes instead of fixed values,
    # so the number of score points always matches the sizes plotted below.
    train_sizes_abs, train_scores, test_scores = \
    learning_curve(estimator, X, y,
                   train_sizes=train_sizes,
                   cv=cv, scoring=None,
                   exploit_incremental_learning=False,
                   n_jobs=n_jobs, pre_dispatch="all", verbose=0)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # x axis: the absolute training-set sizes actually used, which match the
    # "Training examples" label and the length of the score arrays.
    plt.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes_abs, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes_abs, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes_abs, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")

    return plt.show()

In [None]:
%%time
# FIXME: CURRENTLY CRASHES USED TO WORK
## Plot learning curve

title = "Learning Curves (Gradient Boosting)"

# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 33% of the data randomly selected as a
# validation set (test_size=0.33).
cv = ShuffleSplit(n_splits=100, test_size=0.33, random_state=0)

# The weight column is dropped so it is not used as a feature
estimator = GradientBoostingClassifier()
plot_learning_curve(estimator, title, df_X.drop('globalTimesEventWeight', axis=1, inplace=False), 
                    df_y, ylim=(0.4, 1.01), cv=cv, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))

In [None]:
%%time
from sklearn.linear_model import LinearRegression

def plot_curve():
    """Plot the learning curve of a LinearRegression fitted on the notebook data.

    Takes no arguments: it reads the notebook-level ``df_X`` (the
    'globalTimesEventWeight' column is dropped), ``df_y`` and the
    cross-validation object ``cv``. Training and cross-validation scores are
    drawn as mean lines with a +/- one standard deviation band for five
    training-set sizes between 10% and 100%.
    """
    # instantiate
    lg = LinearRegression()

    # fit
    X = df_X.drop('globalTimesEventWeight', axis=1, inplace=False).values
    y = df_y.values
    lg.fit(X, y)

    train_sizes, train_scores, test_scores = learning_curve(lg, X, y, n_jobs=-1, cv=cv,
                                                            train_sizes=np.linspace(.1, 1.0, 5), verbose=0)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure()
    # Title names the estimator actually used above
    plt.title("LinearRegression")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # NOTE(review): the explicit ylim(-.1, 1.1) below sets increasing limits
    # again, which presumably cancels this inversion - confirm the intent
    plt.gca().invert_yaxis()

    # box-like grid
    plt.grid()

    # plot the std deviation as a transparent range at each training set size
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")

    # plot the average training and test score lines at each training set size
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    # The legend must be built after the labelled lines exist, otherwise it
    # has no entries to show.
    plt.legend(loc="best")

    # sizes the window for readability and displays the plot
    # shows error from 0 to 1.1
    plt.ylim(-.1,1.1)
    plt.show()

plot_curve()

In [None]:
%%time
from sklearn.externals.six import StringIO
import pydot

# Export a tree classifier to Graphviz dot format and render it as a PDF.
# NOTE(review): `ExtraTreeClassifier` and `tree` are not imported in this cell -
# presumably imported elsewhere in the notebook; verify.
# NOTE(review): clf is exported right after construction, with no fit() call
# in this cell - confirm that exporting an unfitted tree is intended.
clf = ExtraTreeClassifier()

# Collect the dot source in an in-memory buffer
dot_data = StringIO() 
tree.export_graphviz(clf, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
file_name = "DT_ttH.pdf"
graph.write_pdf(file_name) 