In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import yandex_evaluation as evaluation

In [26]:
# Read the training and test tables from the local `input/` directory,
# in that order, using pandas' default CSV parsing.
train, test = map(pd.read_csv, ("input/training.csv", "input/test.csv"))

In [27]:
# Model inputs are chosen by position: every training column except the
# first one and the last five.
# NOTE(review): presumably the first column is the row id and the trailing
# five are label/auxiliary columns -- confirm against the CSV header.
features = train.columns[1:-5].tolist()

In [28]:
# Random forest baseline: 300 trees with a fixed seed, fitted on the selected
# feature columns against the `signal` column.
# NOTE(review): the later cells visible here predict through `m` (bound to
# `gb`) and `gbm`, so this forest is only used if `m` is rebound to `rf`.
rf = RandomForestClassifier(n_estimators=300, random_state=1)
rf.fit(train[features], train["signal"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [29]:
# Gradient-boosted trees: 40 boosting stages at learning rate 0.01, each fitted
# on a 70% row subsample, with depth-7 trees and at least 10 samples per leaf.
gb = GradientBoostingClassifier(
    # boosting schedule
    n_estimators=40, learning_rate=0.01, subsample=0.7,
    # individual tree shape
    max_depth=7, min_samples_leaf=10,
    # fixed seed so the row subsampling is repeatable
    random_state=11)
gb.fit(train[features], train['signal'])

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=7, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=40,
              random_state=11, subsample=0.7, verbose=0, warm_start=False)

In [30]:
# `m` is the scikit-learn classifier that the following cells query through
# `m.predict_proba(...)` (agreement check, correlation check, and the
# sklearn half of the final blend). Here it is the gradient-boosting model.
m = gb

In [31]:
# xgboost booster settings: logistic objective, with rows and columns each
# subsampled at 0.4.
params = {
    "objective": "binary:logistic",
    "eta": 0.3,
    "max_depth": 6,
    "min_child_weight": 2,
    "silent": 1,
    "subsample": 0.4,
    "colsample_bytree": 0.4,
    "seed": 1,
}
num_trees = 250

# Train for `num_trees` boosting rounds on the same features and labels
# that the scikit-learn models use.
gbm = xgb.train(
    params,
    xgb.DMatrix(train[features], label=train["signal"]),
    num_boost_round=num_trees)

In [32]:
# Agreement check: score the agreement sample with `m`, then pass the scores
# and weights of the signal == 0 rows and the signal == 1 rows to
# `evaluation.compute_ks`. The printed boolean reports whether the KS value
# is below the 0.09 threshold used here.
# NOTE(review): only `m` is scored, whereas the submission cell averages `m`
# with `gbm`; the blended prediction is not checked here -- confirm intended.
check_agreement = pd.read_csv('input/check_agreement.csv', index_col='id')
agreement_probs = m.predict_proba(check_agreement[features])[:, 1]

# Build each label mask once and reuse it for both scores and weights.
agreement_labels = check_agreement['signal'].values
agreement_weights = check_agreement['weight'].values
label_is_0 = agreement_labels == 0
label_is_1 = agreement_labels == 1

ks = evaluation.compute_ks(
    agreement_probs[label_is_0],
    agreement_probs[label_is_1],
    agreement_weights[label_is_0],
    agreement_weights[label_is_1])

# Single-argument print call: same text on Python 2 and Python 3
# (the former `print a, b, c` statement is a SyntaxError on Python 3).
print('KS metric {0} {1}'.format(ks, ks < 0.09))

KS metric 0.0591267190271 True


In [33]:
# Correlation check: score the correlation sample with `m` and pass the scores
# together with the `mass` column to `evaluation.compute_cvm`. The printed
# boolean reports whether the CvM value is below the 0.002 threshold used here.
# NOTE(review): as with the agreement check, only `m` is evaluated, not the
# `m` + `gbm` average that is written to the submission -- confirm intended.
check_correlation = pd.read_csv('input/check_correlation.csv', index_col='id')
correlation_probs = m.predict_proba(check_correlation[features])[:, 1]
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])

# Single-argument print call: same text on Python 2 and Python 3
# (the former `print a, b, c` statement is a SyntaxError on Python 3).
print('CvM metric {0} {1}'.format(cvm, cvm < 0.002))

CvM metric 0.000966771969177 True


In [34]:
# Final prediction: unweighted mean of the class-1 probability from `m` and
# the xgboost booster's output on the test features.
sklearn_test_probs = m.predict_proba(test[features])[:, 1]
xgb_test_probs = gbm.predict(xgb.DMatrix(test[features]))
test_probs = (sklearn_test_probs + xgb_test_probs) / 2

# Write an (id, prediction) table without the DataFrame index.
# NOTE(review): the file name says "rf", but `m` is bound to `gb` in the
# visible cells -- confirm the name is still the intended one.
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("output/ben_hamner_rf_xgboost_submission.csv", index=False)