# Import

In [92]:
import os

import pandas
from sklearn.ensemble import GradientBoostingClassifier

import yandex_evaluation as evaluation

# Read training data

In [93]:
# Directory prefix (relative to the notebook) for every input CSV read below.
folder = 'input/'
# Load the training sample; the 'id' column is used as the row index.
train = pandas.read_csv(
    filepath_or_buffer=folder + 'training.csv',
    index_col='id',
)

In [94]:
# Preview the first five rows, transposed so that each column of the frame
# is displayed as a row.
train.head(n=5).transpose()

id,18453471,5364094,11130990,15173787,1102544
LifeTime,0.001578,0.000988,0.000877,0.000854,0.001129
dira,0.999999,0.999705,0.999984,0.999903,0.999995
FlightDistance,14.033335,5.536157,6.117302,5.228067,39.069534
FlightDistanceError,0.681401,0.302341,0.276463,0.220739,1.898197
IP,0.016039,0.142163,0.034746,0.076389,0.120936
IPSig,0.451886,9.564503,1.970751,4.271331,4.984982
VertexChi2,1.900433,0.865666,10.975849,3.276358,0.468348
pt,1482.037476,3050.720703,3895.908691,4010.781738,4144.546875
DOCAone,0.066667,0.024022,0.055044,0.053779,0.004491
DOCAtwo,0.060602,0.019245,0.047947,0.006417,0.037326


# Define training features
Here we use a subset of all the features so that the model passes the agreement check.

In [95]:
# Feature set: every column of the training frame except the last ten.
# NOTE(review): this is a positional slice, so the selection depends on the
# column order of training.csv -- confirm it if the input file changes.
variables = train.columns[slice(None, -10)]
# Display the selected column names as the cell output.
variables

Index([u'LifeTime', u'dira', u'FlightDistance', u'FlightDistanceError', u'IP',
       u'IPSig', u'VertexChi2', u'pt', u'DOCAone', u'DOCAtwo', u'DOCAthree',
       u'IP_p0p2', u'IP_p1p2', u'isolationa', u'isolationb', u'isolationc',
       u'isolationd', u'isolatione', u'isolationf', u'iso', u'CDF1', u'CDF2',
       u'CDF3', u'ISO_SumBDT', u'p0_IsoBDT', u'p1_IsoBDT', u'p2_IsoBDT',
       u'p0_track_Chi2Dof', u'p1_track_Chi2Dof', u'p2_track_Chi2Dof', u'p0_IP',
       u'p1_IP', u'p2_IP', u'p0_IPSig', u'p1_IPSig', u'p2_IPSig', u'p0_pt',
       u'p1_pt', u'p2_pt', u'p0_p'],
      dtype='object')

# Baseline training

In [96]:
# Baseline gradient-boosting classifier; random_state is fixed so that the
# row subsampling (subsample=0.7) is repeatable between runs.
baseline = GradientBoostingClassifier(
    n_estimators=40,
    learning_rate=0.01,
    max_depth=7,
    min_samples_leaf=10,
    subsample=0.7,
    random_state=11,
)
# Fit on the selected feature columns against the 'signal' column. The call is
# the last expression of the cell, so the fitted estimator is displayed.
baseline.fit(X=train[variables], y=train['signal'])

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=7, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=40,
              random_state=11, subsample=0.7, verbose=0, warm_start=False)

# Check agreement test

In [97]:
# Score the agreement sample with the baseline model (probability of class 1).
check_agreement = pandas.read_csv(folder + 'check_agreement.csv', index_col='id')
agreement_probs = baseline.predict_proba(check_agreement[variables])[:, 1]

# Build the signal == 0 / signal == 1 masks once and reuse them for both the
# predictions and the weights, instead of recomputing them in two different
# ways (array comparison vs. chained DataFrame indexing).
agreement_signal = check_agreement['signal'].values
agreement_weights = check_agreement['weight'].values
is_zero = agreement_signal == 0
is_one = agreement_signal == 1

ks = evaluation.compute_ks(
    agreement_probs[is_zero],
    agreement_probs[is_one],
    agreement_weights[is_zero],
    agreement_weights[is_one])
# Report the metric and whether it is below the 0.09 threshold used here.
# Written as a single formatted string so the line prints the same text under
# Python 2 and is also valid syntax under Python 3.
print('KS metric %s %s' % (ks, ks < 0.09))

KS metric 0.059192038363 True


# Check correlation test

In [98]:
# Score the correlation sample with the baseline model (probability of class 1)
# and pass the predictions together with the 'mass' column to compute_cvm.
check_correlation = pandas.read_csv(folder + 'check_correlation.csv', index_col='id')
correlation_probs = baseline.predict_proba(check_correlation[variables])[:, 1]
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
# Report the metric and whether it is below the 0.002 threshold used here.
# Written as a single formatted string so the line prints the same text under
# Python 2 and is also valid syntax under Python 3.
print('CvM metric %s %s' % (cvm, cvm < 0.002))

CvM metric 0.000970051229354 True


# Compute weighted AUC on the training data with min_ANNmuon > 0.4

In [99]:
# Restrict the evaluation to training rows with min_ANNmuon > 0.4.
train_eval = train[train['min_ANNmuon'] > 0.4]
train_probs = baseline.predict_proba(train_eval[variables])[:, 1]
# NOTE(review): this scores the same data the model was fitted on, so the
# value is presumably optimistic compared with held-out data.
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
# Single formatted string: same text under Python 2, valid syntax on Python 3.
print('AUC %s' % (AUC,))

AUC 0.977884355139


# Predict on the test set and create the submission file for Kaggle

In [100]:
# Load the test sample (indexed by 'id') and build the submission frame in one
# step: the ids taken from the index, next to the predicted probability of
# class 1 for each row.
test = pandas.read_csv(folder + 'test.csv', index_col='id')
result = pandas.DataFrame(
    {
        'id': test.index,
        'prediction': baseline.predict_proba(test[variables])[:, 1],
    },
    columns=['id', 'prediction'],
)

In [101]:
# Write the submission file. The target directory is created first when it is
# missing, so the cell also works on a fresh checkout where 'output/' does not
# exist yet (to_csv does not create directories itself).
output_path = 'output/baseline.csv'
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)
result.to_csv(output_path, index=False, sep=',')