In [None]:
# Render matplotlib figures inline in the notebook output.
# `InteractiveShell.magic()` is deprecated; `run_line_magic(name, args)` is the
# supported way to invoke the `%matplotlib inline` line magic from code.
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn import cross_validation, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, auc
import pylab as pl
import xgboost as xgb

from functools import reduce

In [None]:
def itercols(dataframe):
    """Lazily yield ``(column_label, column)`` pairs for each column of ``dataframe``.

    Columns are visited in the order given by ``dataframe.columns`` and each
    column is fetched with ``dataframe[label]``.
    """
    yield from ((label, dataframe[label]) for label in dataframe.columns)

In [None]:
# Fixed seed, passed as `random_state` to the train/validation split so that the split is reproducible.
RAND_SEED = 111398

In [None]:
from pandas import read_csv, DataFrame, Series
# Load the training set; later cells read its 'ID' and 'TARGET' columns.
data = read_csv('~/kaggle/santander/train.csv')

In [None]:
# Working feature frame: the training data without the 'ID' and 'TARGET' columns.
cdata = data.drop(['ID', 'TARGET'], axis=1)

# Sanity check: select the columns whose mean is NaN and show their non-null counts.
cdata[[name for name, mean_is_nan in cdata.mean().isnull().items() if mean_is_nan]].count()

In [None]:
# Log-scale every feature whose spread between min and max exceeds `criterion_to_log`.
criterion_to_log = 1e2
cols_to_log = [
    name
    for name, column in itercols(cdata)
    if np.abs(column.min() - column.max()) > criterion_to_log
]

# The shift applied to each column is stored so the identical transform can be
# replayed on the test set.
offsets = {}
for col in cols_to_log:
    # max|x| + e keeps the argument of the log at least e, i.e. strictly positive.
    offset = np.exp(1) + np.abs(cdata[col]).max()
    offsets[col] = offset
    # (Re-centering by subtracting np.log(offset) was considered and is disabled.)
    cdata[col] = np.log(offset + cdata[col])

In [None]:
# Feature scaling: standardize each column to zero mean and unit (sample)
# standard deviation. The fitted parameters are kept in `means` / `deviations`
# so that exactly the same transform can be applied to the test set.
#
# Alternative that was tried and left disabled (per-column sklearn scaler):
#from sklearn import preprocessing
#for col in cdata.columns:
#    cdata[col] = preprocessing.StandardScaler().fit_transform(np.array(cdata[col]).astype(np.float64).reshape(-1,1))

means = {}
deviations = {}

for col in cdata.columns:
    ser = cdata[col]

    dev = ser.std(ddof=1)
    # (Near-)constant columns cannot be scaled; leave them untouched and
    # unrecorded so the test-set transform skips them as well.
    if np.abs(dev) < 1e-10:
        continue

    # A mean close to zero is NOT a reason to skip: such a column is already
    # centered but would otherwise keep a non-unit spread.
    mean = ser.mean()

    #print("mean = ", mean, " dev = ", dev, "\n")

    means[col] = mean
    deviations[col] = dev

    cdata[col] = (ser - mean) / dev

In [None]:
# Load the test set. The raw frame is kept because its 'ID' column is needed for
# the submission file; the ID-less copy is what gets transformed and fed to the model.
testdata = read_csv('~/kaggle/santander/test.csv')
ctestdata = testdata.drop('ID', axis=1)

In [None]:
# Log-scale the test sample, reusing the per-column offsets computed on the training data.
# NOTE(review): a test value at or below -offset makes the log argument
# non-positive (NaN / -inf) — confirm the test range is covered by the training range.
for col in cols_to_log:
    shifted = ctestdata[col] + offsets[col]
    ctestdata[col] = np.log(shifted)

In [None]:
# Standardize the test features with the mean / deviation fitted on the training
# data. Columns skipped during fitting have no entry in `means` and stay as-is.
for col, mean in means.items():
    ctestdata[col] = (ctestdata[col] - mean) / deviations[col]

In [None]:
# Labels come from the raw training frame; features are the transformed frame.
Y = data['TARGET']
X = cdata  # alias of cdata — no copy is made here

In [None]:
# Hold out 20% of the training data for validation. The held-out part must be
# kept (not discarded into `_`): the ROC cell further down scores the model on
# X_test / y_test and would otherwise fail with a NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = RAND_SEED)

In [None]:
#feature count reduction
#rfc = RandomForestClassifier(n_estimators = 1000, n_jobs = -1, random_state = 42)
#rf_tr = rfc.fit(X_train, y_train)

#cdata = DataFrame(rf_tr.transform(cdata))
#ctestdata = DataFrame(rf_tr.transform(ctestdata))

#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
# Classifier used for cross-validation, the ROC check and the final submission.
# A random-forest alternative was tried and is left disabled:
#model_rfc = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
model_xgb = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=100,
    learning_rate=0.1,
    colsample_bytree=0.85,
    min_child_weight=3,
    nthread=-1,
)

In [None]:
# K-fold cross-validation of the XGBoost model on the training split.
kcv = 5
cvd = {}

#scores = cross_validation.cross_val_score(model_rfc, X_train, y_train, cv = kcv)
#cvd['RandomForestClassifier'] = scores

# Score each fold by ROC AUC rather than the estimator's default score
# (accuracy for classifiers), so the cross-validation numbers are comparable
# with the ROC AUC computed on the hold-out split later in this notebook.
scores = cross_validation.cross_val_score(model_xgb, X_train, y_train, cv = kcv, scoring = 'roc_auc')
#cvd['XGBClassifier'] = scores

#plt.plot(range(len(scores)), cvd['RandomForestClassifier'], 'b', range(len(scores)), cvd['XGBClassifier'], 'r')
plt.plot(range(len(scores)), scores, 'r')
plt.xlabel('CV fold')
plt.ylabel('ROC AUC')
plt.show()

In [None]:
# Display the per-fold cross-validation scores.
scores

In [None]:
# ROC curve of the XGBoost model, fitted on the training split and scored
# against X_test / y_test.
pl.clf()

#rf_probas = model_rfc.fit(X_train, y_train).predict_proba(X_test)
#fpr, tpr, thresholds = roc_curve(y_test, rf_probas[:, 1])
#roc_auc  = auc(fpr, tpr)
#pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('RandomForest',roc_auc))

model_xgb.fit(X_train, y_train)
gb_probas = model_xgb.predict_proba(X_test)
# Column 1 of predict_proba is used as the score (presumably the positive class).
fpr, tpr, thresholds = roc_curve(y_test, gb_probas[:, 1])
roc_auc = auc(fpr, tpr)
curve_label = '%s ROC (area = %0.2f)' % ('GradientBoosting', roc_auc)
pl.plot(fpr, tpr, label=curve_label)

#rf_gb_probas = np.maximum(rf_probas, gb_probas)
#fpr, tpr, thresholds = roc_curve(y_test, rf_gb_probas[:, 1])
#roc_auc  = auc(fpr, tpr)
#pl.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('RF + GB',roc_auc))

pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.legend(loc=0, fontsize='small')
pl.show()

In [None]:
# Refit on the full training set, predict on the transformed test features and
# write the submission file with the columns ID, TARGET.
model_xgb.fit(X, Y)
pred_xgb = model_xgb.predict_proba(ctestdata)[:, 1]

result = DataFrame(testdata['ID'])
result.insert(1, 'TARGET', pred_xgb)
result.to_csv('~/ml/kaggle_santander_happy_customers/res_xgb.csv', index=False)

#result = DataFrame(testdata['ID'])
#pred_rfc = model_rfc.fit(X, Y).predict_proba(ctestdata)[:, 1]
#result.insert(1,'TARGET', pred_rfc)
#result.to_csv('~/ml/kaggle_santander_happy_customers/res_rf.csv', index=False)

#result = DataFrame(testdata['ID'])
#result.insert(1,'TARGET', (pred_xgb + pred_rfc) / 2)
#result.to_csv('~/ml/kaggle_santander_happy_customers/res_xgb_rf.csv', index=False)