# Credit card fraud detection


    Binary classification problem: determine whether a transaction is fraudulent or not. The dataset contains transactions made with credit cards in September 2013 by European cardholders.
    To guarantee anonymity, all the independent variables have been transformed into numerical features using PCA transformations.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.ensemble import IsolationForest

In [None]:
# IPython shell escape: print the notebook's current working directory.
!pwd

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _subfolders, files in os.walk(input_root):
    for file_name in files:
        file_path = os.path.join(folder, file_name)
        print(file_path)

In [None]:
# Load the credit card transactions into a DataFrame and preview the first
# three rows.
data_path = '/kaggle/input/creditcardfraud/creditcard.csv'
fraud = pd.read_csv(data_path)
fraud.head(n=3)

In [None]:
# Show the column labels of the loaded transactions DataFrame.
fraud.columns

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
fraud.info()

In [None]:
# Display descriptive statistics for the columns of the dataset.
fraud.describe()

In [None]:
# Show the dimensions of the dataset as (number of rows, number of columns).
fraud.shape

In [None]:
# True if at least one cell anywhere in the dataset is missing: the first
# any() reduces each column, the second reduces across columns.
fraud.isna().any().any()

    There are no NaN values in the entire dataset.

In [None]:
# Count how many transactions carry each value of the 'Class' label.
fraud['Class'].value_counts()

In [None]:
# Bar chart of the number of transactions per class, on a log10 scale so that
# the much smaller fraud class stays visible next to the non-fraud class.
#
# value_counts() orders its result by frequency, not by label, so the counts
# are selected explicitly by label (0 = non fraud, 1 = fraud). This guarantees
# that every bar matches its tick label and colour regardless of which class
# is the most frequent, and computes the counts only once.
class_counts = fraud['Class'].value_counts()
non_fraud_count, fraud_count = class_counts.loc[[0, 1]]

plt.figure(figsize=(9,4))

plt.bar(['non fraud', 'fraud'], np.log10([non_fraud_count, fraud_count]), width=0.3, color=['navy', 'firebrick'], zorder=3)

plt.title('Non fraud/fraud count', fontsize=14)

plt.ylabel('class count (log10 scale)')
plt.xlabel('non fraud/fraud')
plt.grid(color='y', axis='y', linewidth=0.5)

plt.show()
print('Non frauds: {} \nFrauds: {}'.format(non_fraud_count, fraud_count))

    Thus we can see that the dataset is highly imbalanced.

## Data distributions and correlation matrix

In [None]:
# Side-by-side histograms (with KDE overlay) of the 'Time' column, one panel
# per class: non frauds on the left, frauds on the right.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

time_panels = [
    (0, 'navy', 'Non frauds distribution of transaction time'),
    (1, 'firebrick', 'Frauds distribution of transaction time'),
]
for panel, (class_label, panel_color, panel_title) in zip(ax, time_panels):
    class_rows = fraud.loc[fraud['Class'] == class_label]
    sns.histplot(data=class_rows, x="Time", ax=panel, color=panel_color, kde=True)
    panel.set_title(panel_title, fontsize=14)

plt.show()

In [None]:
# Side-by-side histograms (with KDE overlay) of the 'Amount' column.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

# Left panel: non frauds, restricted to amounts of at most 1000 as stated in
# the panel title. Right panel: all frauds, with no restriction on the amount.
non_fraud_small = fraud.loc[(fraud['Class'] == 0) & (fraud['Amount'] <= 1000)]
all_frauds = fraud.loc[fraud['Class'] == 1]

amount_panels = [
    (non_fraud_small, 'navy', 'Non frauds distribution of transaction amount in (0-1000) interval'),
    (all_frauds, 'firebrick', 'Frauds distribution of transaction amount'),
]
for panel, (panel_rows, panel_color, panel_title) in zip(ax, amount_panels):
    sns.histplot(data=panel_rows, x="Amount", color=panel_color, ax=panel, kde=True)
    panel.set_title(panel_title, fontsize=14)

plt.show()

In [None]:
# Heatmap of the pairwise correlations between the columns of the dataset.
corr_fig, corr_ax = plt.subplots(figsize=(11,9))

corr = fraud.corr()
# NOTE(review): annot_kws is only used by seaborn when annot=True is also
# passed, so it currently has no visible effect — confirm whether cell
# annotations were intended.
sns.heatmap(corr, cmap='YlOrBr', annot_kws={'size':20}, ax=corr_ax)
corr_ax.set_title("Correlation matrix of whole dataset", fontsize=14)
plt.show()

## Outliers removal

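    The adopted outlier removal technique is Isolation Forest, fitted separately on the fraud and non-fraud transactions using only the columns whose name contains `V`.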

In [None]:
# Fit one Isolation Forest per class and keep the label it assigns to every
# row of that class (-1 marks an outlier, see the cells below).
# Both forests share the same configuration, including the fixed seed.
forest_params = dict(max_samples='auto', random_state=150, contamination='auto', n_jobs=-1)
removal_fraud = IsolationForest(**forest_params)
removal_nofraud = IsolationForest(**forest_params)

# Split the transactions by class: f = frauds, nof = non frauds.
f = fraud[fraud['Class'] == 1]
nof = fraud[fraud['Class'] == 0]

# Only the columns whose name contains 'V' are used as features; the row
# subsets above keep the same columns as the full DataFrame.
v_columns = [name for name in fraud.columns if 'V' in name]
mask_f = removal_fraud.fit_predict(f[v_columns])
mask_nof = removal_nofraud.fit_predict(nof[v_columns])

for class_rows in (f, nof):
    print(class_rows.shape)

In [None]:
# Number of fraud transactions labelled -1 (outlier) by the Isolation Forest.
np.sum(mask_f == -1)

In [None]:
# Number of non-fraud transactions labelled -1 (outlier) by the Isolation Forest.
np.sum(mask_nof == -1)

In [None]:
# Split the transactions into the rows labelled -1 by the Isolation Forests
# (outliers) and all remaining rows (clean data). In both frames the fraud
# rows come first, followed by the non-fraud rows.
outlier_f = mask_f == -1
outlier_nof = mask_nof == -1

fraud_outliers = pd.concat([f.loc[outlier_f], nof.loc[outlier_nof]])
fraud_clean = pd.concat([f.loc[~outlier_f], nof.loc[~outlier_nof]])

In [None]:
# Class breakdown of the rows flagged as outliers.
fraud_outliers['Class'].value_counts()

In [None]:
# Class breakdown of the rows kept after outlier removal.
fraud_clean['Class'].value_counts()

### Plot the outliers distributions

In [None]:
# 'Time' distribution of the rows flagged as outliers, one histogram (with
# KDE overlay) per class: non frauds on the left, frauds on the right.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

outlier_time_panels = [
    (0, 'navy', 'Non frauds distribution of transaction time'),
    (1, 'firebrick', 'Frauds distribution of transaction time'),
]
for panel, (class_label, panel_color, panel_title) in zip(ax, outlier_time_panels):
    class_rows = fraud_outliers.loc[fraud_outliers['Class'] == class_label]
    sns.histplot(data=class_rows, x="Time", ax=panel, color=panel_color, kde=True)
    panel.set_title(panel_title, fontsize=14)

fig.suptitle('Outlier distribution of Time', fontsize=14)

plt.show()

In [None]:
# 'Amount' distribution of the rows flagged as outliers, one histogram (with
# KDE overlay) per class: non frauds on the left, frauds on the right.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

outlier_amount_panels = [
    (0, 'navy', 'Non frauds distribution of transaction amount'),
    (1, 'firebrick', 'Frauds distribution of transaction amount'),
]
for panel, (class_label, panel_color, panel_title) in zip(ax, outlier_amount_panels):
    class_rows = fraud_outliers.loc[fraud_outliers['Class'] == class_label]
    sns.histplot(data=class_rows, x="Amount", color=panel_color, ax=panel, kde=True)
    panel.set_title(panel_title, fontsize=14)

fig.suptitle('Outlier distribution of Transaction Amount', fontsize=14)

plt.show()

### Plot of distributions of clean data

In [None]:
# 'Time' distribution of the rows kept after outlier removal, one histogram
# (with KDE overlay) per class: non frauds on the left, frauds on the right.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

clean_time_panels = [
    (0, 'navy', 'Non frauds distribution of transaction time'),
    (1, 'firebrick', 'Frauds distribution of transaction time'),
]
for panel, (class_label, panel_color, panel_title) in zip(ax, clean_time_panels):
    class_rows = fraud_clean.loc[fraud_clean['Class'] == class_label]
    sns.histplot(data=class_rows, x="Time", ax=panel, color=panel_color, kde=True)
    panel.set_title(panel_title, fontsize=14)

fig.suptitle('Clean data distribution of Time', fontsize=14)

plt.show()

In [None]:
# 'Amount' distribution of the rows kept after outlier removal, one histogram
# (with KDE overlay) per class: non frauds on the left, frauds on the right.
fig, ax = plt.subplots(1, 2, figsize=(18,4))

clean_amount_panels = [
    (0, 'navy', 'Non frauds distribution of transaction amount'),
    (1, 'firebrick', 'Frauds distribution of transaction amount'),
]
for panel, (class_label, panel_color, panel_title) in zip(ax, clean_amount_panels):
    class_rows = fraud_clean.loc[fraud_clean['Class'] == class_label]
    sns.histplot(data=class_rows, x="Amount", color=panel_color, ax=panel, kde=True)
    panel.set_title(panel_title, fontsize=14)

fig.suptitle('Clean data distribution of transaction amount', fontsize=14)

plt.show()

## 1. Binary classification without Data Augmentation or Undersampling techniques

## 2. Binary classification with Undersampling technique

## 3. Binary classification with Data Augmentation (SMOTE) technique