# Table of Contents
 <p><div class="lev1"><a href="#Preprocessing"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocessing</a></div><div class="lev2"><a href="#Imports-and-loading-the-data"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Imports and loading the data</a></div><div class="lev2"><a href="#Cleaning-the-data"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Cleaning the data</a></div><div class="lev3"><a href="#Remove-constant-a-duplicate-columns"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Remove constant a duplicate columns</a></div><div class="lev3"><a href="#Save-the-IDs-and-TARGETs-and-drop-them-from-the-dataframe"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Save the IDs and TARGETs and drop them from the dataframe</a></div><div class="lev1"><a href="#Logistic-regression"><span class="toc-item-num">2&nbsp;&nbsp;</span>Logistic regression</a></div><div class="lev1"><a href="#Output"><span class="toc-item-num">3&nbsp;&nbsp;</span>Output</a></div>

# Preprocessing
## Imports and loading the data

In [2]:
%matplotlib inline
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Input data files are available in the "./input/" directory.
# load data
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

## Cleaning the data
### Remove constant a duplicate columns
We remove any constant columns and any duplicated columns (identical values) as these can have no signature in the dependent variable. Note that we remove the constant and duplicate columns in the training set **and the test set**.

In [3]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,df_train[c[j]].values):
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

### Save the IDs and TARGETs and drop them from the dataframe

In [4]:
IDs = df_train["ID"]
IDs_test = df_test["ID"]
TARGETs = df_train["TARGET"]

df_train.drop(["ID", "TARGET"], axis=1, inplace=True)
df_test.drop(["ID"], axis=1, inplace=True)

# Logistic regression

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix, roc_auc_score

X, y = df_train, TARGETs

n_features = 40 #Only use the top 'n_features' features
weights = {0: 1, 1: 2} #Attempt to balance the classes
clf = Pipeline([
        ('remove_zero_variance', VarianceThreshold()),
        ('feature_selection', SelectKBest(f_classif, k=n_features)),
        ('classification', LogisticRegression(penalty = "l2", 
                                              solver = "sag",
                                              max_iter = 500,
                                              tol = 0.001,
                                              n_jobs=4, C=1,
                                              class_weight = weights, 
                                              warm_start = False
                                       ))
])
clf.fit(X, y)

y_test_pred = clf.predict(X)

# Test on the training set:
y_test_pred = clf.predict(X)
print(confusion_matrix(TARGETs, y_test_pred))

# Calculate the roc_auc score
print('Overall AUC:', roc_auc_score(y, clf.predict_proba(X)[:,1]))



[[72977    35]
 [ 2996    12]]
Overall AUC: 0.473699815704


# Output

In [6]:
from datetime import datetime
y_probs = clf.predict_proba(df_test)[:,1]

# Stamp the output with the current time
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

submission = pd.DataFrame({"ID":IDs_test, "TARGET":y_probs})
submission.to_csv("../results/logistic_reg_" + timestamp + ".csv", index=False, float_format="%10.8f")