In [1]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
import os
# Add the parent of the current working directory to the import path.
# Presumably this is what makes the project-local `santander` package
# (imported in a later cell) resolvable from this notebook -- confirm.
sys.path.append(os.path.abspath('..'))

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.covariance import EmpiricalCovariance

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD

from santander.preprocessing import ColumnDropper
from santander.preprocessing import ZERO_VARIANCE_COLUMNS, CORRELATED_COLUMNS

Using Theano backend.
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [4]:
# Load the training set, then separate the label column from the features.
TRAIN_CSV = '../data/train.csv'

df_train = pd.read_csv(TRAIN_CSV)
# Keep the label as its own Series before it is removed from the frame.
df_target = df_train['TARGET']
# ID is dropped together with the label, so neither is left among the feature columns.
df_train = df_train.drop(['ID', 'TARGET'], axis=1)

In [5]:
# Examining the effect of preprocessing/feature engineering on fixed KNN scores -- base case.
# The models and their hyper-parameters are held fixed; only the input features change
# between experiments, so the AUCs printed here are the reference values.

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_train, df_target, test_size=0.3, random_state=0)

pipeline = Pipeline([
    # remove the columns listed in the two project-level constants
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler()),
    # a float n_components asks PCA for a fraction of explained variance (0.6)
    # rather than a fixed number of components
    ('pca', PCA(n_components=0.6)),
])

# Fit the preprocessing on the training split only; the test split is merely
# transformed with the already-fitted steps.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Single KNN model with default hyper-parameters, using all available cores.
knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(X_train, y_train)
# The last predict_proba column is the score of the last class in knn.classes_.
# print(...) with one parenthesised argument prints the same text on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print('KNN: %f' % roc_auc_score(y_test, knn.predict_proba(X_test)[:, -1]))

# Bagged ensemble of 250 KNN models with max_samples=0.01 and max_features=0.9
# (fractions of the training rows / feature columns given to each estimator).
knn_bag = BaggingClassifier(
    KNeighborsClassifier(n_jobs=-1),
    max_samples=0.01,
    max_features=0.9,
    n_estimators=250,
    random_state=0,
)
knn_bag = knn_bag.fit(X_train, y_train)
print('KNN bag: %f' % roc_auc_score(y_test, knn_bag.predict_proba(X_test)[:, -1]))

KNN: 0.629489
KNN bag: 0.794783


In [6]:
# Examining the effect of preprocessing/feature engineering on fixed KNN scores.
# Same models and preprocessing pipeline as the base case; what changes is the
# input: a patched `var3`, an extra `num_zeros` feature, outlier removal on the
# training split, clipping of the test split, and a post-hoc rule on `var15`.

# Rows whose squared Mahalanobis distance reaches this value are removed from training.
# Original note: a 50k cutoff is slightly worse in this CV, but more conservative.
M2_CUTOFF = 40000
# Predictions for rows with var15 below this value are forced to 0.
MIN_AGE = 23

df_train_copy = df_train.copy()
# -999999 in var3 is presumably a missing-value sentinel; it is replaced by 2.0
# (NOTE(review): confirm why 2.0 was chosen, e.g. the column's most common value).
df_train_copy.loc[df_train_copy['var3'] == -999999.000000, 'var3'] = 2.0
# Per-row count of cells equal to zero, computed after the var3 patch.
df_train_copy['num_zeros'] = (df_train_copy == 0).sum(axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    df_train_copy, df_target, test_size=0.3, random_state=0)
# Keep the raw var15 of the test rows (bound to `age`, so presumably customer age)
# before X_test is clipped and replaced by its PCA projection.
age = X_test['var15']

# Outlier removal: fit a covariance estimate on the training split and keep only
# the rows below the cutoff. The mask is built once and applied to both the
# features and the labels so they stay aligned.
ec = EmpiricalCovariance()
ec = ec.fit(X_train)
m2 = ec.mahalanobis(X_train)
inliers = m2 < M2_CUTOFF
X_train = X_train[inliers]
y_train = y_train[inliers]

# clip test to train values (column-wise min/max of the filtered training split)
X_test = X_test.clip(X_train.min(), X_train.max(), axis=1)

pipeline = Pipeline([
    # remove the columns listed in the two project-level constants
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler()),
    # a float n_components asks PCA for a fraction of explained variance (0.6)
    ('pca', PCA(n_components=0.6)),
])

# Fit the preprocessing on the training split only, then apply it to the test split.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Boolean mask of the test rows affected by the var15 rule, as a plain NumPy
# array so it indexes the prediction arrays positionally.
young = (age < MIN_AGE).values

knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(X_train, y_train)
y_pred = knn.predict_proba(X_test)[:, -1]
y_pred[young] = 0
# print(...) with one parenthesised argument prints the same text on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print('KNN: %f' % roc_auc_score(y_test, y_pred))

# Bagged ensemble of 250 KNN models with max_samples=0.01 and max_features=0.9.
knn_bag = BaggingClassifier(
    KNeighborsClassifier(n_jobs=-1),
    max_samples=0.01,
    max_features=0.9,
    n_estimators=250,
    random_state=0,
)
knn_bag = knn_bag.fit(X_train, y_train)
y_pred = knn_bag.predict_proba(X_test)[:, -1]
y_pred[young] = 0
print('KNN bag: %f' % roc_auc_score(y_test, y_pred))



KNN: 0.630247
KNN bag: 0.797944


