# Table of Contents
 <p><div class="lev1"><a href="#Preprocessing"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing</a></div><div class="lev2"><a href="#Imports-and-loading-the-data"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports and loading the data</a></div><div class="lev2"><a href="#Cleaning-the-data"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Cleaning the data</a></div><div class="lev3"><a href="#Remove-constant-a-duplicate-columns"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Remove constant a duplicate columns</a></div><div class="lev3"><a href="#Save-the-IDs-and-TARGETs-and-drop-them-from-the-dataframe"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Save the IDs and TARGETs and drop them from the dataframe</a></div><div class="lev3"><a href="#Look-for-outliers-and-missing-values"><span class="toc-item-num">1.2.3&nbsp;&nbsp;</span>Look for outliers and missing values</a></div><div class="lev1"><a href="#Feature-Analysis"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Analysis</a></div><div class="lev2"><a href="#Select-K-Best"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Select K Best</a></div><div class="lev2"><a href="#Select-false-discovery-rate"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Select false discovery rate</a></div><div class="lev2"><a href="#Select-false-positive-rate"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Select false positive rate</a></div><div class="lev2"><a href="#Select-family-wise-error-rates"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Select family-wise error rates</a></div><div class="lev1"><a href="#Classification"><span class="toc-item-num">3&nbsp;&nbsp;</span>Classification</a></div><div class="lev2"><a href="#Basic-Logistic-Regression"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Basic Logistic Regression</a></div><div class="lev2"><a href="#Random-Forest"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Random Forest</a></div><div class="lev1"><a 
href="#Outputting-the-result"><span class="toc-item-num">4&nbsp;&nbsp;</span>Outputting the result</a></div>

# Preprocessing
## Imports and loading the data

In [2]:
%matplotlib inline
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# The raw CSV files are expected in the "./input/" directory.
# Read the training and the test set in one step.
df_train, df_test = pd.read_csv('./input/train.csv'), pd.read_csv('./input/test.csv')

# Show the first rows of the training set as the cell output.
df_train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


## Cleaning the data
### Remove constant a duplicate columns
We remove any constant columns and any duplicated columns (columns whose values are identical to those of another column), as these carry no additional information about the dependent variable. Note that the columns to remove are identified on the training set and are then dropped from both the training set **and the test set**.

In [3]:
# remove constant columns
# A column is constant when it holds a single distinct value. Counting the
# distinct values is exact, whereas comparing a floating-point std() with 0
# can be thrown off by round-off and fails on non-numeric columns.
remove = [col for col in df_train.columns if df_train[col].nunique() == 1]

# The columns are identified on the training set and dropped from both sets.
df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

# remove duplicated columns
# Only the first column of every group of identical columns is kept.
remove = []
flagged = set()  # names already marked as a duplicate of an earlier column
c = df_train.columns
for i in range(len(c)-1):
    if c[i] in flagged:
        # c[i] repeats an earlier column, so every later copy of it was
        # already flagged when that earlier column was scanned.
        continue
    v = df_train[c[i]].values
    for j in range(i+1, len(c)):
        # Skipping flagged columns avoids redundant full-column comparisons
        # and keeps each name in `remove` exactly once.
        if c[j] not in flagged and np.array_equal(v, df_train[c[j]].values):
            flagged.add(c[j])
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

### Save the IDs and TARGETs and drop them from the dataframe

In [4]:
# Keep the identifier and label columns aside before stripping them from the
# feature frames.
IDs, TARGETs = df_train["ID"], df_train["TARGET"]
IDs_test = df_test["ID"]

# Drop the saved columns in place; the test set has no "TARGET" column to drop.
for frame, cols in ((df_train, ["ID", "TARGET"]), (df_test, ["ID"])):
    frame.drop(cols, axis=1, inplace=True)

# Show the remaining feature columns as the cell output.
df_train.head()

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


### Look for outliers and missing values

Before we work hard to remove outliers and missing values, we should perform a rough feature extraction to determine which columns are important. We can then focus on cleaning up those columns.

# Feature Analysis
Now we are left with a training data set containing 306 independent variables. We somehow have to determine which of these affect customer satisfaction. Here we try a few different feature selection tests based on the ANOVA F-value and compare the results. This is a univariate test.

In [5]:
# Feature matrix and labels shared by the selectors and classifiers below.
X = df_train
y = TARGETs
# Number of top-ranked features to select / display.
n_factors = 20

## Select K Best

In [6]:
from sklearn.feature_selection import SelectKBest, f_classif

# Rank all features by their ANOVA F-value and keep the n_factors best ones.
selector = SelectKBest(f_classif, k=n_factors)
selector.fit(X, y)

# Boolean mask of the selected columns, used to subset the original data frame.
kbest_mask = selector.get_support()
X_kbest = X.loc[:, kbest_mask]
print(type(X), " ", X_kbest.shape)

# One row per selected feature: name, score relative to the best selected
# score, and p-value.
kbest_scores = selector.scores_[kbest_mask]
kbest_results = np.asarray([X_kbest.columns.values,
                            kbest_scores / max(kbest_scores),
                            selector.pvalues_[kbest_mask]]).T

kbest_results_df = pd.DataFrame(kbest_results,
                                index=kbest_results[:, 0],
                                columns=["Feature", "Score", "p-value"])
kbest_results_df = kbest_results_df.sort_values("Score", ascending=False)

print(kbest_results_df)

<class 'pandas.core.frame.DataFrame'>   (76020, 20)
                                 Feature      Score       p-value
ind_var30                      ind_var30          1             0
num_meses_var5_ult3  num_meses_var5_ult3   0.978848             0
num_var30                      num_var30   0.849217  3.83395e-321
num_var42                      num_var42   0.817038  3.61245e-309
ind_var5                        ind_var5   0.812819  1.34417e-307
num_var5                        num_var5   0.797562  6.45245e-302
var36                              var36   0.466311  4.63701e-178
var15                              var15   0.451801  1.30228e-172
num_var4                        num_var4   0.281929  1.13782e-108
num_var35                      num_var35   0.258924  5.48092e-100
ind_var8_0                    ind_var8_0  0.0950596   6.36386e-38
num_var8_0                    num_var8_0  0.0948828   7.42947e-38
ind_var13                      ind_var13  0.0684528   8.67724e-28
ind_var13_0             

## Select false discovery rate

In [7]:
from sklearn.feature_selection import SelectFdr, f_classif

# Upper bound on the estimated false discovery rate. 0.5 is the value the
# selector was actually run with; the 0.01 formerly assigned here was never
# passed on, so the variable and the call disagreed.
alpha = 0.5
selector = SelectFdr(f_classif, alpha = alpha)
selector.fit(X,y)
X_fdr= X.loc[:,selector.get_support()] # Subset the original data frame
print(type(X), " ", X_fdr.shape)

# One row per selected feature: name, score relative to the best selected
# score, and p-value.
fdr_results = np.transpose([X_fdr.columns.values, 
                        selector.scores_[selector.get_support()]/max(selector.scores_[selector.get_support()]), 
                        selector.pvalues_[selector.get_support()]])

fdr_results_df = pd.DataFrame(fdr_results, 
                          index= fdr_results[:,0], 
                          columns = ["Feature", "Score", "p-value"]).sort_values("Score", ascending = False)

# Only show the n_factors highest-scoring features.
print(fdr_results_df.iloc[0:n_factors,:])

<class 'pandas.core.frame.DataFrame'>   (76020, 169)
                                 Feature      Score       p-value
ind_var30                      ind_var30          1             0
num_meses_var5_ult3  num_meses_var5_ult3   0.978848             0
num_var30                      num_var30   0.849217  3.83395e-321
num_var42                      num_var42   0.817038  3.61245e-309
ind_var5                        ind_var5   0.812819  1.34417e-307
num_var5                        num_var5   0.797562  6.45245e-302
var36                              var36   0.466311  4.63701e-178
var15                              var15   0.451801  1.30228e-172
num_var4                        num_var4   0.281929  1.13782e-108
num_var35                      num_var35   0.258924  5.48092e-100
ind_var8_0                    ind_var8_0  0.0950596   6.36386e-38
num_var8_0                    num_var8_0  0.0948828   7.42947e-38
ind_var13                      ind_var13  0.0684528   8.67724e-28
ind_var13_0            

## Select false positive rate

In [8]:
from sklearn.feature_selection import SelectFpr, f_classif

# Upper bound on the estimated false positive rate. 0.5 is the value the
# selector was actually run with; the 0.01 formerly assigned here was never
# passed on, so the variable and the call disagreed.
alpha = 0.5
selector = SelectFpr(f_classif, alpha = alpha)
selector.fit(X,y)
X_fpr= X.loc[:,selector.get_support()] # Subset the original data frame
print(type(X), " ", X_fpr.shape)

# One row per selected feature: name, score relative to the best selected
# score, and p-value.
fpr_results = np.transpose([X_fpr.columns.values, 
                        selector.scores_[selector.get_support()]/max(selector.scores_[selector.get_support()]), 
                        selector.pvalues_[selector.get_support()]])

fpr_results_df = pd.DataFrame(fpr_results, 
                          index= fpr_results[:,0], 
                          columns = ["Feature", "Score", "p-value"]).sort_values("Score", ascending = False)

# Only show the n_factors highest-scoring features.
print(fpr_results_df.iloc[0:n_factors,:])

<class 'pandas.core.frame.DataFrame'>   (76020, 201)
                                 Feature      Score       p-value
ind_var30                      ind_var30          1             0
num_meses_var5_ult3  num_meses_var5_ult3   0.978848             0
num_var30                      num_var30   0.849217  3.83395e-321
num_var42                      num_var42   0.817038  3.61245e-309
ind_var5                        ind_var5   0.812819  1.34417e-307
num_var5                        num_var5   0.797562  6.45245e-302
var36                              var36   0.466311  4.63701e-178
var15                              var15   0.451801  1.30228e-172
num_var4                        num_var4   0.281929  1.13782e-108
num_var35                      num_var35   0.258924  5.48092e-100
ind_var8_0                    ind_var8_0  0.0950596   6.36386e-38
num_var8_0                    num_var8_0  0.0948828   7.42947e-38
ind_var13                      ind_var13  0.0684528   8.67724e-28
ind_var13_0            

## Select family-wise error rates

In [9]:
from sklearn.feature_selection import SelectFwe, f_classif

# Upper bound on the family-wise error rate. 0.5 is the value the selector
# was actually run with; the 0.01 formerly assigned here was never passed on,
# so the variable and the call disagreed.
alpha = 0.5
selector = SelectFwe(f_classif, alpha = alpha)
selector.fit(X,y)
X_Fwe= X.loc[:,selector.get_support()] # Subset the original data frame
print(type(X), " ", X_Fwe.shape)

# One row per selected feature: name, score relative to the best selected
# score, and p-value.
Fwe_results = np.transpose([X_Fwe.columns.values, 
                        selector.scores_[selector.get_support()]/max(selector.scores_[selector.get_support()]), 
                        selector.pvalues_[selector.get_support()]])

Fwe_results_df = pd.DataFrame(Fwe_results, 
                          index= Fwe_results[:,0], 
                          columns = ["Feature", "Score", "p-value"]).sort_values("Score", ascending = False)

# Only show the n_factors highest-scoring features.
print(Fwe_results_df.iloc[0:n_factors,:])

<class 'pandas.core.frame.DataFrame'>   (76020, 102)
                                 Feature      Score       p-value
ind_var30                      ind_var30          1             0
num_meses_var5_ult3  num_meses_var5_ult3   0.978848             0
num_var30                      num_var30   0.849217  3.83395e-321
num_var42                      num_var42   0.817038  3.61245e-309
ind_var5                        ind_var5   0.812819  1.34417e-307
num_var5                        num_var5   0.797562  6.45245e-302
var36                              var36   0.466311  4.63701e-178
var15                              var15   0.451801  1.30228e-172
num_var4                        num_var4   0.281929  1.13782e-108
num_var35                      num_var35   0.258924  5.48092e-100
ind_var8_0                    ind_var8_0  0.0950596   6.36386e-38
num_var8_0                    num_var8_0  0.0948828   7.42947e-38
ind_var13                      ind_var13  0.0684528   8.67724e-28
ind_var13_0            

# Classification
All of these tests yield the same best features. That gives some confidence that at least some of these factors are quite important. To streamline the process, we will use scikit-learn's pipeline interface. We will try a few different classifier algorithms and compare the results.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
# Feature matrix for the test set; this is an alias of df_test, not a copy.
X_test = df_test

## Basic Logistic Regression
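Doesn't seem to work so well! Even when evaluated on the training set, about a third of the class-0 samples (24,624 of 73,012) are misclassified as class 1.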

In [11]:
from sklearn.linear_model import LogisticRegression
# Number of top-ranked (ANOVA F-value) features handed to the classifier.
n_features = 50

# L2-regularised logistic regression with class weights balancing the labels.
logreg = LogisticRegression(penalty = "l2",
                            n_jobs=4,
                            C=1,
                            class_weight = "balanced",
                            warm_start = False)

# Univariate feature selection followed by the classifier.
pipeline_steps = [
    ('feature_selection', SelectKBest(f_classif, k=n_features)),
    ('classification', logreg),
]
clf = Pipeline(pipeline_steps)

# Fit, then predict on the very same training data (in-sample check only).
y_test_pred = clf.fit(X, y).predict(X)
confusion_matrix(TARGETs, y_test_pred)

array([[48388, 24624],
       [  778,  2230]])

## Random Forest
Promising results. Computationally expensive, because we have so many features. We need the number of trees to be significantly larger than the number of features.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score

n_features = 'all' #Only use the top 'n_features' features ('all' keeps every feature)
n_estimators = 500 #Number of trees
weights = {0: 1, 1:3.45} #Attempt to balance the classes

# Pipeline: drop zero-variance columns, rank features by ANOVA F-value, then
# fit a class-weighted random forest.
clf = Pipeline([
        ('remove_zero_variance', VarianceThreshold()),
        ('feature_selection', SelectKBest(f_classif, k=n_features)),
        # n_estimators is passed by keyword so the call does not depend on the
        # constructor's positional order; random_state makes the forest (and
        # therefore the numbers printed below) reproducible between runs.
        ('classification', RandomForestClassifier(n_estimators=n_estimators,
                                                max_features = 30,
                                                n_jobs=4,
                                                class_weight= weights,
                                                random_state=42,
                                                warm_start=False))
])
clf.fit(X, y)

# Test on the training set (predict once; a second identical call on the full
# training set would only repeat the same expensive work):
y_test_pred = clf.predict(X)
print(confusion_matrix(TARGETs, y_test_pred))

# Calculate the roc_auc score from the predicted probability of class 1.
# NOTE: this is an in-sample score, so it is an optimistic estimate.
print('Overall AUC:', roc_auc_score(y, clf.predict_proba(X)[:,1]))

[[72841   171]
 [  223  2785]]
Overall AUC: 0.998818272987


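Those numbers don't look too bad. Keep in mind, however, that they were computed on the training set itself, so they overestimate how well the model will perform on unseen data.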

# Outputting the result

In [13]:
# Class probabilities for every test row; column 1 is kept as the prediction.
test_proba = clf.predict_proba(X_test)
y_probs = test_proba[:, 1]

# Pair each test ID with its predicted probability and write the result to a
# CSV file without the index column.
submission_columns = {"ID":IDs_test, "TARGET":y_probs}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False, float_format="%10.8f")