In [None]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split

import seaborn as sns

The multivariate-Gaussian models by Clemens Mzr (https://www.kaggle.com/clemensmzr/simple-multivariate-gaussian-anomaly-detection) and Johansing (https://www.kaggle.com/johansing/multivariate-gaussian-vs-oneclasssvm-97-recall) showed that the frauds are not outliers, so outlier detection based on a Gaussian distribution is a poor fit for fraud detection. I therefore try a simple linear logistic regression (further down also with polynomial features) to see how it performs -- despite the very small number of fraud cases, which presumably makes it harder to learn a decision boundary.
This approach reaches an F1 score of 0.81 on the test set.

Reading in the data
====

In [None]:
# Read the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../input/creditcard.csv")
data.head(n=5)

Simple logistic regression
=====

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

In [None]:
# Features: the first 30 columns, selected by position.
X = data.iloc[:, :30]
# Target: the "Class" column.
y = data.loc[:, "Class"]

In [None]:
# Display the dimensions of the feature matrix to confirm the column selection.
X.shape

A quick try without splitting into training and test set.

In [None]:
# Fit on the full data set and print the mean accuracy on that same data
# (no hold-out set yet, so this is a training score).
lr = linear_model.LogisticRegression().fit(X, y)
print(lr.score(X, y))

Compute evaluation measures that are suited to skewed data. `average_precision_score` is the area under the precision-recall curve; `score` is the mean accuracy.

In [None]:
from sklearn.metrics import f1_score, average_precision_score, precision_score, recall_score

In [None]:
# Evaluate the model fitted above on the same data it was trained on.
y_pred = lr.predict(X)
# Average precision summarizes the whole precision-recall curve, so it must be
# computed from continuous decision scores; thresholded 0/1 predictions would
# collapse the curve to a single operating point.
y_score = lr.decision_function(X)
print("Accuracy:", lr.score(X, y))
print("Precision:", precision_score(y, y_pred))
print("Recall:", recall_score(y, y_pred))
print("F1:", f1_score(y, y_pred))
print("Area under precision Recall:", average_precision_score(y, y_score))


Regularization optimization
=====

Optimize regularization for f1 score.

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

Split the data into a training set and a test set.

In [None]:
# Separate the two classes so each one can be split on its own; this keeps the
# same 70/30 proportion of legitimate and fraudulent rows in train and test.
legit = data.query("Class==0")
fraud = data.query("Class==1")
# "Amount" is dropped from the features here; "Class" is the target.
X_Legit = legit.drop(columns=["Amount", "Class"])
y_Legit = legit["Class"]
X_Fraud = fraud.drop(columns=["Amount", "Class"])
y_Fraud = fraud["Class"]
# Split each class into training and test sets. A fixed random_state makes the
# split (and therefore every score reported below) reproducible across runs.
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_Legit, y_Legit, test_size=0.3, random_state=0
)
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_Fraud, y_Fraud, test_size=0.3, random_state=0
)
# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat is
# the supported way to stack the per-class pieces (legit rows first, then fraud).
X_test = pd.concat([X_test_l, X_test_f])
y_test = pd.concat([y_test_l, y_test_f])
X_train = pd.concat([X_train_l, X_train_f])
y_train = pd.concat([y_train_l, y_train_f])

In [None]:
def cv_run(X_train, X_test, y_train, y_test,
           Cs=(0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30), cv=3):
    """Select the regularization strength C by cross-validated F1 and report scores.

    For every candidate C, a logistic regression is scored with ``cv``-fold
    cross-validation on the training data only (F1 of the positive class).
    The best C is then refitted on the full training set and accuracy and F1
    are printed for both the training and the test set.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels; used only for the final report.
    Cs : iterable of candidate inverse regularization strengths.
    cv : number of cross-validation folds used for model selection.

    Returns
    -------
    None. All results are printed.
    """
    bestC = 1.
    bestScore = 0.
    for C in Cs:
        lr = linear_model.LogisticRegression(C=C)
        # Score on held-out folds; scoring on the data the model was fitted on
        # would simply favour the least-regularized model.
        score = cross_val_score(lr, X_train, y_train, scoring="f1", cv=cv).mean()
        if score > bestScore:
            bestC = C
            bestScore = score
    print( "Best C:", bestC)

    # Refit with the selected C on the whole training set for the final report.
    lr = linear_model.LogisticRegression(C=bestC)
    lr.fit(X_train, y_train)
    y_pred_test=lr.predict(X_test)
    y_pred_train=lr.predict(X_train)
    print("Train score:", lr.score(X_train, y_train))
    print("Test score:", lr.score(X_test, y_test))
    print("Train F1:", f1_score(y_train, y_pred_train))
    print("Test F1:", f1_score(y_test, y_pred_test))

In [None]:
# Tune C and report train/test scores on the unscaled features.
cv_run(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

Before we add polynomial features, we normalize the features. This shouldn't change much since most of the features are PCA components, but let's check anyway.

Feature scaling
======

In [None]:
from sklearn import preprocessing

In [None]:
# Learn the per-feature mean and standard deviation from the training set only
# and apply that same transformation to the test set. Scaling each set with its
# own statistics would put train and test on different scales and let test-set
# information influence preprocessing.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Repeat the C search and report on the standardized features.
cv_run(X_train=X_train_scaled, X_test=X_test_scaled, y_train=y_train, y_test=y_test)

Feature scaling improved the F1 score by 4 %. Since Amount was dropped when the train/test split was built, the only feature that should be substantially affected is Time.

# Adding polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

Adding polynomial features and then running the regularization optimization is pretty costly for this data set. With polynomials of order higher than 2 I run out of memory; with second-order polynomials I run out of time. So we just do it for a fixed regularization parameter (the `LogisticRegression` default, C=1) and see what happens.

In [None]:
# Expand the scaled features with all degree-2 terms (squares and pairwise
# products, plus the bias column).
poly = PolynomialFeatures(2)
# Fit the transformer on the training data only ...
X_train_poly = poly.fit_transform(X_train_scaled)
# ... and reuse the fitted transformer on the test data instead of re-fitting it.
X_test_poly = poly.transform(X_test_scaled)

In [None]:
def lr_run(X_train, X_test, y_train, y_test, C=1.0):
    """Fit one logistic regression and print its train/test evaluation.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    C : inverse regularization strength passed to ``LogisticRegression``
        (1.0 is the scikit-learn default, i.e. the previous behavior).

    Returns
    -------
    None. All results are printed.
    """
    lr = linear_model.LogisticRegression(C=C)
    lr.fit(X_train, y_train)
    y_test_pred=lr.predict(X_test)
    # Average precision needs continuous scores to trace the precision-recall
    # curve; hard 0/1 predictions would reduce it to a single threshold.
    y_test_score=lr.decision_function(X_test)
    print("Accuracy on training set:", lr.score(X_train, y_train))
    print("Accuracy on test set:", lr.score(X_test, y_test))
    print("Precision on test set:", precision_score(y_test, y_test_pred))
    print("Recall on test set:", recall_score(y_test, y_test_pred))
    print("F1 on test set:", f1_score(y_test, y_test_pred))
    print("Area under precision Recall on test set:", average_precision_score(y_test, y_test_score))


In [None]:
# Evaluate logistic regression on the degree-2 polynomial features.
lr_run(X_train=X_train_poly, X_test=X_test_poly, y_train=y_train, y_test=y_test)

In [None]:
# NOTE(review): disabled experiment (C search on polynomial features), kept for
# reference only. It is stale: cv_run takes (X_train, X_test, y_train, y_test),
# and X_scaled / y_scaled are not defined in the visible part of the notebook.
#poly = PolynomialFeatures(2)
#X_poly = poly.fit_transform(X_scaled) 
#cv_run(X_poly, y_scaled)

Removing "Time" to compare performance

In [None]:
# Remove the first column (index 0 along axis 1) from both scaled matrices.
X_train_wo_t = np.delete(X_train_scaled, obj=0, axis=1)
X_test_wo_t = np.delete(X_test_scaled, obj=0, axis=1)

In [None]:
# Degree-2 polynomial expansion of the features with the first column removed.
poly = PolynomialFeatures(2)
# Fit on the training data only, then reuse the fitted transformer on the test
# data instead of re-fitting it there.
X_train_wo_t_poly = poly.fit_transform(X_train_wo_t)
X_test_wo_t_poly = poly.transform(X_test_wo_t)

In [None]:
# Evaluate logistic regression on the polynomial features built without the first column.
lr_run(X_train=X_train_wo_t_poly, X_test=X_test_wo_t_poly, y_train=y_train, y_test=y_test)

So including the time of the transaction actually lowers the F1 score.