In [1]:
import time

import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import xgboost

In [2]:
# Load the training frame from ./data/trainTransformed.csv (path is relative to
# the notebook's working directory). The frame is expected to contain a binary
# TARGET column, which the cells below use to split rows by class.
trainTransformedDF = pd.read_csv('./data/trainTransformed.csv')

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
trainTransformedDF.shape

(76020, 3921)

In [4]:
# Class balance of the label column. The displayed counts show a strong
# imbalance (far more 0s than 1s), which is why the cells below draw balanced samples.
trainTransformedDF.TARGET.value_counts()

0    73012
1     3008
Name: TARGET, dtype: int64

In [5]:
# Partition the rows by label so that each class can be sampled separately.
dataTarget0 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 0]
dataTarget1 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 1]

def getBalancedTrainAndValidationSets(n_per_class=1500, random_state=None):
    """Draw a class-balanced training sample and a disjoint validation sample.

    Rows come from the module-level frames ``dataTarget0`` (TARGET == 0) and
    ``dataTarget1`` (TARGET == 1). Each frame is shuffled independently, then:

    * training   = first ``n_per_class`` rows of each shuffled class;
    * validation = the next ``n_per_class`` rows of class 0 plus *all*
      remaining rows of class 1.

    Parameters
    ----------
    n_per_class : int, default 1500
        Number of rows per class in the training sample (and number of
        class-0 rows in the validation sample).
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. When ``None`` the
        global NumPy generator is reseeded from OS entropy and used, so
        every call yields a different split.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (TARGET column removed) and the matching TARGET series.
    """
    if random_state is None:
        np.random.seed()  # fresh entropy on every call
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # shuffle -- DataFrame.reindex() returns a *new* frame, so the permuted
    # result has to be kept; discarding it leaves the rows in file order and
    # every call would return exactly the same split.
    shuffled0 = dataTarget0.iloc[rng.permutation(len(dataTarget0))]
    shuffled1 = dataTarget1.iloc[rng.permutation(len(dataTarget1))]

    trn = pd.concat([shuffled0.iloc[:n_per_class],
                     shuffled1.iloc[:n_per_class]])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    # Validation rows start where the training rows end, so the two sets
    # never share a row.
    val = pd.concat([shuffled0.iloc[n_per_class:2 * n_per_class],
                     shuffled1.iloc[n_per_class:]])
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    return X_train, y_train, X_val, y_val

In [10]:
# Repeated hold-out evaluation: fit on a balanced sample, score on a separate
# balanced sample, and average the AUC over n_folds repetitions.
# NOTE(review): despite the name this is not k-fold cross-validation, and the
# repetitions are only independent if getBalancedTrainAndValidationSets()
# really returns a different split on each call -- verify.
n_folds = 5
clf = GradientBoostingClassifier(max_features=12)
trn_scores = []
val_scores = []
start = time.time()
for i in range(n_folds):
    X_train, y_train, X_val, y_val = getBalancedTrainAndValidationSets()
    clf.fit(X_train, y_train)
    # AUC is a ranking metric: score with the class-1 probabilities from
    # .predict_proba(), not the hard 0/1 labels from .predict().
    trn_score = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])
    val_score = roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("train auc: %.4f" % trn_score)
    print("validation auc: %.4f" % val_score)
    trn_scores.append(trn_score)
    val_scores.append(val_score)

print("-----------------------")
print("Train Mean: %.4f" % np.mean(trn_scores))
print("Validation Mean: %.4f" % np.mean(val_scores))
print("Total Time (mins): %.1f" % ((time.time()-start)/60.))

train auc: 0.8689
validation auc: 0.8409
train auc: 0.8701
validation auc: 0.8407
train auc: 0.8691
validation auc: 0.8400
train auc: 0.8695
validation auc: 0.8393
train auc: 0.8682
validation auc: 0.8392
-----------------------
Train Mean: 0.8691
Validation Mean: 0.8400
Total Time (mins): 5.6


In [54]:
# Shapes of the split currently held in X_train / y_train / X_val / y_val.
# Single-argument print(...) produces the same text on Python 2 and 3.
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(3000, 3920)
(3000,)
(3008, 3920)
(3008,)


In [102]:
# Confusion matrix of the hard predictions from clf.predict() on the validation
# split currently held in X_val / y_val (presumably the one left by the most
# recent evaluation run). In the displayed output each row sums to a true-class
# count, i.e. rows are true labels and columns are predicted labels.
confusion_matrix(y_val, clf.predict(X_val))

array([[1054,  446],
       [ 419, 1089]])

## TRY WITH ORIGINAL DATA

In [2]:
# Load the raw training data (path is relative to the notebook's working directory).
trainDataFrame = pd.read_csv('./data/train.csv')

# remove constant columns: a column whose standard deviation is exactly 0
# has the same value in every row and cannot help a classifier.
colsToRemove1 = [col for col in trainDataFrame.columns
                 if trainDataFrame[col].std() == 0]

trainDataFrame = trainDataFrame.drop(colsToRemove1, axis=1)

# remove duplicate columns: for every group of value-identical columns keep
# the first one and flag the rest. Columns that are already flagged are
# skipped, so each duplicate is recorded once and is never used as the
# reference column for further (redundant) full-column comparisons.
colsToRemove2 = []
columns = trainDataFrame.columns
for i in range(len(columns)-1):
    if columns[i] in colsToRemove2:
        continue
    v = trainDataFrame[columns[i]].values
    for j in range(i+1, len(columns)):
        if columns[j] in colsToRemove2:
            continue
        if np.array_equal(v, trainDataFrame[columns[j]].values):
            colsToRemove2.append(columns[j])

trainDataFrame = trainDataFrame.drop(colsToRemove2, axis=1)

#trainLabels = trainDataFrame['TARGET']
#trainFeatures = trainDataFrame.drop(['ID','TARGET'], axis=1)

In [5]:
# Use the cleaned original features (ID column removed) and partition the
# rows by label so that each class can be sampled separately.
trainTransformedDF = trainDataFrame.drop('ID', axis=1)
dataTarget0 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 0]
dataTarget1 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 1]

def getBalancedTrainAndValidationSets(n_per_class=1500, random_state=None):
    """Draw a class-balanced training sample and a disjoint validation sample.

    Rows come from the module-level frames ``dataTarget0`` (TARGET == 0) and
    ``dataTarget1`` (TARGET == 1). Each frame is shuffled independently, then:

    * training   = first ``n_per_class`` rows of each shuffled class;
    * validation = the next ``n_per_class`` rows of class 0 plus *all*
      remaining rows of class 1.

    Parameters
    ----------
    n_per_class : int, default 1500
        Number of rows per class in the training sample (and number of
        class-0 rows in the validation sample).
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. When ``None`` the
        global NumPy generator is used.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (TARGET column removed) and the matching TARGET series.
    """
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # shuffle -- DataFrame.reindex() returns a *new* frame, so the permuted
    # result has to be kept; discarding it leaves the rows in file order and
    # every call would return exactly the same split.
    shuffled0 = dataTarget0.iloc[rng.permutation(len(dataTarget0))]
    shuffled1 = dataTarget1.iloc[rng.permutation(len(dataTarget1))]

    trn = pd.concat([shuffled0.iloc[:n_per_class],
                     shuffled1.iloc[:n_per_class]])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    # Validation rows start where the training rows end, so the two sets
    # never share a row.
    val = pd.concat([shuffled0.iloc[n_per_class:2 * n_per_class],
                     shuffled1.iloc[n_per_class:]])
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    return X_train, y_train, X_val, y_val

## TRY WITH ORIGINAL BEST 5 FEATURES 
### from https://www.kaggle.com/selfishgene/santander-customer-satisfaction/advanced-feature-exploration/comments

In [21]:
# Restrict the cleaned original data (ID column removed) to the five selected
# features plus the label, then partition the rows by label.
trainTransformedDF = trainDataFrame.drop('ID', axis=1)[
    ['saldo_var30', 'var15', 'saldo_var5', 'ind_var30', 'var38', 'TARGET']
]

dataTarget0 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 0]
dataTarget1 = trainTransformedDF.loc[trainTransformedDF['TARGET'] == 1]

def getBalancedTrainAndValidationSets(n_per_class=1500, random_state=None):
    """Draw a class-balanced training sample and a disjoint validation sample.

    Rows come from the module-level frames ``dataTarget0`` (TARGET == 0) and
    ``dataTarget1`` (TARGET == 1). Each frame is shuffled independently, then:

    * training   = first ``n_per_class`` rows of each shuffled class;
    * validation = the next ``n_per_class`` rows of class 0 plus *all*
      remaining rows of class 1.

    Parameters
    ----------
    n_per_class : int, default 1500
        Number of rows per class in the training sample (and number of
        class-0 rows in the validation sample).
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. When ``None`` the
        global NumPy generator is used.

    Returns
    -------
    X_train, y_train, X_val, y_val
        Feature frames (TARGET column removed) and the matching TARGET series.
    """
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # shuffle -- DataFrame.reindex() returns a *new* frame, so the permuted
    # result has to be kept; discarding it leaves the rows in file order and
    # every call would return exactly the same split.
    shuffled0 = dataTarget0.iloc[rng.permutation(len(dataTarget0))]
    shuffled1 = dataTarget1.iloc[rng.permutation(len(dataTarget1))]

    trn = pd.concat([shuffled0.iloc[:n_per_class],
                     shuffled1.iloc[:n_per_class]])
    y_train = trn['TARGET']
    X_train = trn.drop(['TARGET'], axis=1)

    # Validation rows start where the training rows end, so the two sets
    # never share a row.
    val = pd.concat([shuffled0.iloc[n_per_class:2 * n_per_class],
                     shuffled1.iloc[n_per_class:]])
    y_val = val['TARGET']
    X_val = val.drop(['TARGET'], axis=1)

    return X_train, y_train, X_val, y_val

In [26]:
# Sanity check: (number of rows, number of columns) of the reduced frame.
trainTransformedDF.shape

(76020, 6)