In [None]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM

In [None]:
# Load the training table from the CSV file (relative path).
df_train = pd.read_csv(filepath_or_buffer='new_train.csv')

In [None]:
# Column labels to apply to the test file: every training column except the last.
# NOTE(review): presumably the last training column is 'TARGET', which the
# test file does not contain -- confirm against the data.
names = df_train.columns[:-1].tolist()

In [None]:
# Read the test table and label its columns with `names`.
# header=None only spells out what pandas already does when `names` is supplied:
# every line of the file, including the first, is parsed as data.
# NOTE(review): if test.csv begins with a header row, that row is loaded as a
# data row -- confirm the file really has no header.
df_test = pd.read_csv('test.csv', header=None, names=names)

In [None]:
# remove constant columns
# A column counts as constant when it holds exactly one distinct non-missing
# value. Counting distinct values is exact; comparing a floating-point std()
# with 0 can miss a constant float column because the mean is rounded.
remove = [col for col in df_train.columns if df_train[col].nunique() == 1]

# Drop the same labels from both tables so their feature columns stay aligned.
# Reassigning (instead of inplace=True) keeps the cell free of in-place mutation.
df_train = df_train.drop(remove, axis=1)
df_test = df_test.drop(remove, axis=1)

In [None]:
# remove duplicated columns
# For each column, flag every later column whose values are element-wise equal.
# A column that is already flagged is skipped on both sides of the comparison:
# anything equal to it was flagged by the earlier column it duplicates, so
# re-checking it would only repeat work and append the same label again.
# Note: np.array_equal treats NaN as unequal to NaN, so columns that contain
# missing values are never flagged as duplicates.
remove = []
flagged = set()
c = df_train.columns
for i in range(len(c) - 1):
    if c[i] in flagged:
        continue
    v = df_train[c[i]].values
    for j in range(i + 1, len(c)):
        if c[j] in flagged:
            continue
        if np.array_equal(v, df_train[c[j]].values):
            flagged.add(c[j])
            remove.append(c[j])

# Drop the same labels from both tables so their feature columns stay aligned.
df_train = df_train.drop(remove, axis=1)
df_test = df_test.drop(remove, axis=1)

In [None]:
# Split the training table into a feature matrix and a label vector,
# both as NumPy arrays.
X_train = df_train.drop('TARGET', axis=1).values
y_train = df_train['TARGET'].values

# Test features as a NumPy array.
X_test = df_test.values

# Row counts of the two feature matrices.
len_train = X_train.shape[0]
len_test = X_test.shape[0]

In [None]:
# classifier
# Gradient-boosted tree classifier; np.nan is declared as the missing-value marker.
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5,
                        n_estimators=350, learning_rate=0.03,
                        nthread=-1, subsample=0.95, colsample_bytree=0.85, seed=4242)

# Hold out 30% of the training rows as an evaluation set.
# random_state pins the shuffle so the split (and therefore early stopping)
# is the same on every run; it reuses the seed already given to the classifier.
X_fit, X_eval, y_fit, y_eval = train_test_split(X_train, y_train, test_size=0.3,
                                                random_state=4242)

In [None]:
# fitting
# Train on the fit split only. (X_eval, y_eval) is carved out of X_train/y_train
# by train_test_split, so fitting on the full training arrays would let the
# model see the evaluation rows and make the early-stopping AUC meaningless.
clf.fit(X_fit, y_fit, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])

In [None]:
# ROC AUC of the classifier's second probability column over all rows of X_train.
train_scores = clf.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, train_scores)
print('Overall AUC:', train_auc)

In [None]:
# predicting
# Score the test rows and keep only the second probability column.
test_proba = clf.predict_proba(X_test)
y_pred = test_proba[:, 1]

In [None]:
# Write the predictions to submission.csv: one value per line, with no header
# row and no index column.
# NOTE(review): the file carries no row identifier -- confirm the consumer does
# not expect one.
submission = pd.DataFrame(y_pred)
submission.to_csv("submission.csv", header=None, index=False)