# Table of Contents
 <p><div class="lev1"><a href="#Preprocessing"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing</a></div><div class="lev3"><a href="#Imports-and-loading-the-data"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports and loading the data</a></div><div class="lev3"><a href="#Cleaning-the-data"><span class="toc-item-num">1.0.2&nbsp;&nbsp;</span>Cleaning the data</a></div><div class="lev4"><a href="#Remove-constant-a-duplicate-columns"><span class="toc-item-num">1.0.2.1&nbsp;&nbsp;</span>Remove constant a duplicate columns</a></div><div class="lev4"><a href="#Save-the-IDs-and-TARGETs-and-drop-them-from-the-dataframe"><span class="toc-item-num">1.0.2.2&nbsp;&nbsp;</span>Save the IDs and TARGETs and drop them from the dataframe</a></div><div class="lev4"><a href="#Look-for-outliers-and-missing-values"><span class="toc-item-num">1.0.2.3&nbsp;&nbsp;</span>Look for outliers and missing values</a></div><div class="lev1"><a href="#Feature-Analysis"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Analysis</a></div>

# Preprocessing
### Imports and loading the data

In [1]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Input data files are available in the "./input/" directory.
# load data
df_train = pd.read_csv('./input/train.csv')
df_test = pd.read_csv('./input/test.csv')

df_train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


### Cleaning the data
#### Remove constant a duplicate columns
We remove any constant columns and any duplicated columns (identical values) as these can have no signature in the dependent variable. Note that we remove the constant and duplicate columns in the training set **and the test set**.

In [2]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,df_train[c[j]].values):
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

#### Save the IDs and TARGETs and drop them from the dataframe

In [3]:
IDs = df_train["ID"]
TARGETs = df_train["TARGET"]

df_train.drop(["ID", "TARGET"], axis=1, inplace=True)
df_test.drop(["ID"], axis=1, inplace=True)

df_train.head()

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016


#### Look for outliers and missing values

Before we work hard to remove outliers and missing values, we should perform a rough feature extraction to determine which columns are important. We can then focus on cleaning up those columns.

# Feature Analysis
Now we are left with a training data set containing 306 independent variables. We somehow have to determine which of these affect customer satisfaction. As a first pass, we use the scikit-learn package to perform univariate feature selection (SelectKBest) with the F-test for feature scoring (f_classif). 

In [29]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif

X, y = df_train, TARGETs
print(type(X), " ", X.shape)

selector = SelectKBest(f_classif, k=10)
selector.fit(X,y)
print(type(selector))
scores = -np.log10(selector.pvalues_)
scores /= scores.max()

print(scores, " ", len(scores))

<class 'pandas.core.frame.DataFrame'>   (76020, 306)
<class 'sklearn.feature_selection.univariate_selection.SelectKBest'>
[  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.  nan   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.  