In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, confusion_matrix, precision_score, accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Dataset Exploration
First, let's quickly explore the dataset. I won't explain every column, as they are already described above.

In [None]:
# Load the raw transactions; the cells below read the "Amount" column and use "Class" == 1 to select frauds.
dataset = pd.read_csv("../input/creditcard.csv")

In [None]:
# Quick look at the raw table: first rows, summary statistics, then dtypes / non-null counts.
display(dataset.head())
display(dataset.describe())
# info() writes its own report and returns None, so wrapping it in print() would add a stray "None".
dataset.info()

In [None]:
# Number of rows per label value in "Class" (the cells below treat Class == 1 as fraud).
dataset['Class'].value_counts()

As explained in the exercise description, the dataset is clearly unbalanced, with only 492 frauds against 284315 normal transactions (called non_fraud later).
Let's now look at the distribution of the transaction amounts.

In [None]:
# Histogram of every transaction amount, split into 50 bins.
fig, ax = plt.subplots()
ax.hist(dataset['Amount'], bins=50)
plt.show()

As we might expect, nearly all transactions are below 1500 \$. To reduce the range of amounts, we can check how many frauds we have above 3000 \$.

In [None]:
# Fraudulent transactions whose amount exceeds 3000 (displayed as the cell output).
dataset.query("Amount > 3000 and Class == 1")

Good news: there are none, so we can remove the transactions with an amount above 3000 \$. Next, we can check the distribution of frauds by amount and what they cost the bank, which has to cover them (warranty).

In [None]:
# Drop the transactions above 3000. The bound is inclusive because the check that found
# no fraud was "Amount > 3000": rows at exactly 3000 were not covered by it and must stay.
dataset = dataset[dataset['Amount'] <= 3000]

In [None]:
# Keep only the fraudulent rows (reused by the cost analysis) and plot their amounts in 50 bins.
is_fraud = dataset['Class'] == 1
fraud = dataset[is_fraud]

fig, ax = plt.subplots()
ax.hist(fraud['Amount'], bins=50)
plt.show()

In [None]:
# Cost of frauds per amount bucket, plus the running total across buckets.
bins = 50            # width of one amount bucket (a width, not a number of bins)
Amount_max = 3000    # upper bound of the amounts covered by the buckets

Y = []                                   # summed fraud amount of each bucket
C = []                                   # running (cumulative) total of Y
X = list(range(0, Amount_max, bins))     # left edge of each bucket
for i in X:
    # Bucket i is the interval (i, i + bins]: left edge excluded, right edge included.
    s = fraud[(fraud['Amount'] > i) & (fraud['Amount'] <= i + bins)]['Amount'].sum()
    Y.append(s)
    c = C[-1] + s if C else s
    C.append(c)
    print("{} => {} $ - {}".format(i, s, c))

# align='edge' makes each bar span its own bucket [i, i + bins]; the default centres the
# bar on i, which shifts the whole chart by half a bucket. The width follows `bins`.
plt.bar(X, Y, width=bins, align='edge')
plt.xlabel('Amount')
plt.ylabel('Cost')
plt.title('Cost of Frauds per amount')
plt.show()

# C[k] is the cost of every fraud up to the RIGHT edge of bucket k, so plot it there.
right_edges = [left + bins for left in X]
plt.plot(right_edges, C)
plt.xlabel('Amount')
plt.ylabel('Cumulative cost')
plt.title('Cumulative cost of Frauds up to a given amount')
plt.show()

So we can see that most frauds are below 500 \$. Nevertheless, in terms of cost, all the frauds below 500 \$ cost the bank 31500 \$ (50\% of the total cost of frauds). If we want to avoid around 90\% of the fraud cost, we should consider frauds up to 1500 \$. At first we will try to catch as many frauds as possible and, depending on the result, we may focus only on frauds < 1500 \$.

# Model simplification

We clearly have an unbalanced dataset, as only 0.17% of the transactions are frauds. One good thing to do in such a case is to try some <a href="https://en.wikipedia.org/wiki/Oversampling_and_undersampling_in_data_analysis" target="_blank">Random Undersampling</a>.

<b>For now we only explore the dataset; it will be split for evaluation later.</b>

In [None]:
random_seed = 42    # passed as random_state to the sampling steps and estimators below
n_components = 3    # number of PCA components kept for the exploratory plots

# Non-fraud sample sizes to try; the last entry is "every non-fraud row currently in dataset".
non_fraud_total = (dataset["Class"] == 0).sum()
n_non_fraud = [100, 1000, 10000, 100000, non_fraud_total]
print(n_non_fraud)

In [None]:
# For each non-fraud sample size: under-sample, standardise, project on 3 principal
# components and plot PC1 vs PC2, with PC3 encoded as marker size and the class as colour.
a = dataset[dataset["Class"] == 1]                  # we keep all frauds (same for every sample size)
non_fraud_pool = dataset[dataset["Class"] == 0]     # rows the non-fraud samples are drawn from

for sample_size in n_non_fraud:
    b = non_fraud_pool.sample(sample_size, random_state=random_seed)   # "sample_size" non-frauds to balance the ratio

    dataset_us = pd.concat([a, b]).sample(frac=1, random_state=random_seed)   # merge and shuffle both sets

    y = dataset_us["Class"]
    X = dataset_us.drop(["Time", "Class"], axis=1)

    X_scale = StandardScaler().fit_transform(X)
    X_proj = PCA(n_components=n_components).fit_transform(X_scale)

    # scatter() reads `s` as a marker area, which has to be positive. Raw PC3 values are
    # centred on 0, so about half of them are negative and those points would not be drawn.
    # Rescale PC3 linearly to the range [2, 60] so every point stays visible.
    third = X_proj[:, 2]
    spread = third.max() - third.min()
    sizes = 2 + 58 * (third - third.min()) / spread if spread > 0 else 20

    plt.scatter(X_proj[:, 0], X_proj[:, 1], s=sizes, c=y)

    plt.xlabel("PCA1")
    plt.ylabel("PCA2")
    plt.title("{}-points".format(sample_size))
    #plt.savefig("{}-points".format(sample_size), dpi=600)
    plt.show()

With 100 and 1000 non-frauds, we can see that the non-frauds are packed together, but some frauds are also grouped with them. With 10k, there are still some yellow points mixed in with the violet ones. With the full dataset, the reduction is useless, as all the points are packed together.
Nevertheless, with 100000 points we get a nice split in 2 dimensions. We can fix this value to fit the PCA and then apply it to the full dataset afterwards.

In [None]:
# --- Fit the scaler and the PCA on all frauds + 100k non-frauds ---
a = dataset[dataset["Class"] == 1]
b = dataset[dataset["Class"] == 0].sample(100000, random_state=random_seed)
pca_fit_set = pd.concat([a, b]).sample(frac=1, random_state=random_seed)   # merge and shuffle

# Only the feature columns may be scaled and projected. Feeding the whole frame would push
# "Time" and, worse, the label "Class" into the principal components, so every classifier
# trained on X_proj would be looking at its own target.
X_fit = pca_fit_set.drop(["Time", "Class"], axis=1)

scaler = StandardScaler()
pca = PCA(n_components=0.95, svd_solver="full")   # keep as many components as needed for 95% of the variance
pca.fit(scaler.fit_transform(X_fit))

# --- Project the full dataset with the scaler and PCA fitted above ---
dataset = pd.read_csv("../input/creditcard.csv")
y = dataset["Class"]
X = dataset.drop(["Time", "Class"], axis=1)

# Reuse the fitted scaler (transform, not fit_transform): the PCA axes were learned on data
# standardised with these exact means and variances.
X_scale = scaler.transform(X)
X_proj = pca.transform(X_scale)

# Setting up a model

Above, instead of keeping only the 3 main dimensions, we keep as many components as needed to retain 95% of the variance (i.e. a 5% loss). We can check how many features are left:

In [None]:
# (number of rows, number of components kept by the PCA)
print(X_proj.shape)

Unfortunately, we drop only 2 additional dimensions, but it's better than nothing. We can also check that our reduction still allows a nice split.

In [None]:
# PC1 vs PC2 of the projected data, with PC3 encoded as marker size and the class as colour.
# scatter() reads `s` as a marker area, which has to be positive. Raw PC3 values are centred
# on 0, so about half of them are negative and those points would not be drawn. Rescale PC3
# linearly to the range [2, 60] so every point stays visible.
third = X_proj[:, 2]
spread = third.max() - third.min()
sizes = 2 + 58 * (third - third.min()) / spread if spread > 0 else 20

plt.scatter(X_proj[:, 0], X_proj[:, 1], s=sizes, c=y)

plt.xlabel("PCA1")
plt.ylabel("PCA2")
plt.title("{}-points".format(X_proj.shape[0]))
#plt.savefig("{}-points".format(X_proj.shape[0]), dpi=600)
plt.show()

For this model, a plain split would be a bad idea because the dataset is unbalanced (492 frauds for 280k non-frauds). In such a case we should definitely go for a StratifiedKFold with, let's say, 5 folds, to have around 100 frauds in each fold.

For now we are going to try some classification models, and our target won't be the number of correct guesses (accuracy). In this exercise that makes no sense: since only 0.17% of the transactions are frauds, a classifier answering "non fraud" every time would already reach 99.8%.

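Instead, our score will be based on the number of undetected frauds (False Negatives), which we want as low as possible. So we must maximise the <b>Recall</b> (the share of real frauds that are detected), while keeping an eye on the Precision (the share of raised alerts that are real frauds).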

Just as a reminder, the confusion matrix printed below has one row per true class and one column per predicted class:

\begin{vmatrix}
Non\_Fraud\_detected\_as\_non\_fraud &  Non\_Fraud\_detected\_as\_Fraud \\
Fraud\_detected\_as\_non\_fraud &  Fraud\_detected\_as\_Fraud
\end{vmatrix}

In [None]:
# 5-fold stratified cross-validation of an SGD classifier on the PCA-projected features.
# Stratification keeps the fraud ratio the same in every fold; shuffle=True mixes the rows
# before they are split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
sgd_clf = SGDClassifier(random_state=random_seed)
for train_index, test_index in skf.split(X_proj, y):
    clone_clf = clone(sgd_clf)   # fresh, unfitted copy for every fold
    X_train, X_test = X_proj[train_index], X_proj[test_index]
    # split() yields row positions, so the Series is indexed by position (.iloc);
    # plain y[...] looks up index labels and only matches on a default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clone_clf.fit(X_train, y_train)
    y_pred = clone_clf.predict(X_test)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print("\nRecall:\t\t {:.4f} \nPrecision:\t {:.4f}".format(recall, precision))
    print(confusion_matrix(y_test, y_pred))

In [None]:
# Stratified cross-validation (folds from `skf`) of a depth-limited decision tree.
tree_clf = DecisionTreeClassifier(max_depth=7, random_state=random_seed)
for train_index, test_index in skf.split(X_proj, y):
    clone_clf = clone(tree_clf)   # fresh, unfitted copy for every fold
    X_train, X_test = X_proj[train_index], X_proj[test_index]
    # split() yields row positions, so the Series is indexed by position (.iloc);
    # plain y[...] looks up index labels and only matches on a default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clone_clf.fit(X_train, y_train)
    y_pred = clone_clf.predict(X_test)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(recall, precision)
    print(confusion_matrix(y_test, y_pred))

In [None]:
# svc_clf = SVC(gamma=2, C=1)
# for train_index, test_index in skf.split(X_proj, y):
#     clone_clf = clone(svc_clf)
#     X_train, X_test = X_proj[train_index], X_proj[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     clone_clf.fit(X_train, y_train)
#     y_pred = clone_clf.predict(X_test)
#     recall = recall_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     print(recall, precision)
#     print(confusion_matrix(y_test, y_pred))

#     Using this model makes the computer crash :(

In [None]:
# Stratified cross-validation (folds from `skf`) of a multi-layer perceptron with two
# hidden layers of 50 and 20 units.
mlp_clf = MLPClassifier(hidden_layer_sizes=(50, 20), random_state=random_seed)
for train_index, test_index in skf.split(X_proj, y):
    clone_clf = clone(mlp_clf)   # fresh, unfitted copy for every fold
    X_train, X_test = X_proj[train_index], X_proj[test_index]
    # split() yields row positions, so the Series is indexed by position (.iloc);
    # plain y[...] looks up index labels and only matches on a default 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clone_clf.fit(X_train, y_train)
    y_pred = clone_clf.predict(X_test)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(recall, precision)
    print(confusion_matrix(y_test, y_pred))

The MLPClassifier again gives a better result than the Tree Classifier. The topology hasn't been tuned, as we already have a perfect catch, but the layer sizes could probably be reduced to ease the computation. This is really great, because it means we won't have to refund victims, as we catch all the frauds, and we won't need many employees to review possible frauds either (this model only asks for a few checks every day, namely the Non_Fraud detected as Fraud).


# Conclusion

By using the MLPClassifier, we can catch nearly all frauds while having nearly no False Positives. The data just needs to go through the StandardScaler and the PCA first. To finish, let's compute the score on the whole dataset (note that the result may differ from the fold scores, as we will also "evaluate" on the training set).

In [None]:
# clone_clf is whatever the most recently run cross-validation cell left behind, i.e. the
# classifier fitted on the training part of that cell's LAST fold only.
best_model = clone_clf
# Predict every row of X_proj, including the rows that classifier was fitted on.
y_pred = best_model.predict(X_proj)

In [None]:
# Report each metric of the whole-dataset prediction as "<name> : <value>", one per line.
report = (
    ("Accuracy score", accuracy_score),
    ("Precision score", precision_score),
    ("Recall score", recall_score),
    ("Confusion Matrix", confusion_matrix),
)
for title, metric in report:
    print("{} : {}".format(title, metric(y, y_pred)))