# Context
It is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

### Content
The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

### Acknowledgements
The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on http://mlg.ulb.ac.be/BruFence and http://mlg.ulb.ac.be/ARTML

Please cite: Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

# Importing required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import interp, stats
from itertools import cycle
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, cohen_kappa_score, classification_report
from sklearn.metrics import r2_score, roc_auc_score, roc_curve, auc

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import StandardScaler
import seaborn as sns
from pylab import rcParams

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

In [None]:
# Read the credit-card transactions CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="../input/creditcard.csv")

# Exploratory Data Analysis

In [None]:
# Collect the column labels as a NumPy array and echo them as the cell output
colNames = np.asarray(data.columns)
colNames

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple
print("Dimension of dataset:", (len(data.index), len(data.columns)))

In [None]:
# get attribute summaries: print the per-column descriptive statistics
# produced by DataFrame.describe()
print(data.describe())

In [None]:
# Class balance: number of rows carrying each label (0 = normal, 1 = fraud)
print("Normal transaction:", (data['Class'] == 0).sum())  # class = 0
print("Fraudulent transaction:", (data['Class'] == 1).sum())  # class = 1

In [None]:
# Split the frame by label (0 -> normal, 1 -> fraudulent) and shuffle each
# subset; frac=1 keeps every row and the fixed seed makes the order repeatable.
normal_class = data.loc[data['Class'] == 0].sample(frac=1, random_state=69)
fraudulent_class = data.loc[data['Class'] == 1].sample(frac=1, random_state=69)

In [None]:
# Two stacked panels sharing the x-axis: fraud on top, normal underneath
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 9))
f.suptitle('Time of transaction vs Amount by class')

ax1.scatter(fraudulent_class['Time'], fraudulent_class['Amount'])
ax1.set_title('Fraud')

ax2.scatter(normal_class['Time'], normal_class['Amount'])
ax2.set_title('Normal')

# Axis labels are attached to the bottom panel (the current axes once
# plt.subplots has returned)
ax2.set_xlabel('Time (in Seconds)')
ax2.set_ylabel('Amount')
plt.show()

### The above graph shows that **Time** is irrelevant for detecting fraudulent transactions

In [None]:
# Histogram of transaction amounts per class, stacked on a shared x-axis
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 9))
f.suptitle('Amount per transaction by class')

bins = 50

ax1.hist(fraudulent_class['Amount'], bins=bins)
ax1.set_title('Fraud')

ax2.hist(normal_class['Amount'], bins=bins)
ax2.set_title('Normal')

# Labels, x-range and the log y-scale are set on the bottom panel (the
# current axes); the x-range carries over to the top panel through sharex.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')
plt.show()

### The above graph shows that most of the fraudulent transactions are for very low amounts

In [None]:
# Remove the 'Time' column; it is not used as a model feature
data = data.drop('Time', axis='columns')

# Replace 'Amount' with its StandardScaler-transformed values; the scaler
# expects a 2-D input, hence the single-column selection
data['Amount'] = StandardScaler().fit_transform(data[['Amount']].values)

In [None]:
# Rebuild the per-class subsets from the updated frame
normal_class = data[data['Class'].eq(0)]
fraudulent_class = data[data['Class'].eq(1)]

# Shuffle every row of each subset with a fixed seed
normal_class = normal_class.sample(frac=1, random_state=69)
fraudulent_class = fraudulent_class.sample(frac=1, random_state=69)

# Oversampling to deal with class imbalance

The examples of the majority class, in this case the normal transactions, drastically outnumber the
incidences of fraudulent transactions in our dataset. One of the strategies employed in the data science community is
to generate synthetic data points for the under-represented class to improve the learning function.

In [None]:
# Target vector: the 'Class' label column
y = data['Class']

# Feature matrix: every column except the label
X = data.drop('Class', axis=1)

In [None]:
def plot_2d_space(X, y, label='Classes'):
    """Scatter-plot the first two columns of ``X``, one series per class.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix; only columns 0 and 1 are drawn. May be a NumPy
        array or a pandas DataFrame.
    y : array-like, 1-D
        Class label for each row of ``X``.
    label : str
        Title of the figure.

    Notes
    -----
    Only two colour/marker pairs are defined, so at most the first two
    distinct labels (in ``np.unique`` order) are plotted.
    """
    # Positional ``X[mask, col]`` indexing only works on ndarrays, so
    # normalise pandas inputs (DataFrame / Series) up front.
    X = np.asarray(X)
    y = np.asarray(y)

    colors = ['#1F77B4', '#FF7F0E']
    markers = ['o', 's']
    plt.figure(figsize=(12, 9), dpi=80)
    for cls, color, marker in zip(np.unique(y), colors, markers):
        mask = y == cls
        plt.scatter(X[mask, 0], X[mask, 1], c=color, label=cls, marker=marker)
    plt.title(label)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

# Synthesize new minority-class (fraud) rows until the classes are balanced.
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names
# for the former `ratio` / `fit_sample`.
smote = SMOTE(sampling_strategy='minority', random_state=69)
X_sm, y_sm = smote.fit_resample(X, y)

# The resampler may hand back pandas objects when given pandas input;
# convert to ndarrays so positional indexing (X_sm[rows], X_sm[mask, col])
# keeps working in the cells that follow.
X_sm, y_sm = np.asarray(X_sm), np.asarray(y_sm)

plot_2d_space(X_sm, y_sm, 'SMOTE over-sampling')

# Time to train and test the performance of various models

In [None]:
# Hold out 30% of the resampled data for testing, with a fixed seed.
# NOTE(review): the split is taken from the SMOTE-resampled arrays, so
# synthetic rows can land in the test set; consider splitting the original
# data first and oversampling the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.3,
    random_state=69,
)

In [None]:
# Tally how many test rows fall under each label
category, records = np.unique(y_test, return_counts=True)
cat_counts = {label: count for label, count in zip(category, records)}

print(cat_counts)

### Random Forest Classifier

In [None]:
# Random forest with 100 trees. The seed matches the one used for sampling,
# SMOTE and the train/test split so that reruns give the same model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=69)

In [None]:
# Train the forest on the training split (the fitted estimator is echoed
# as the cell output)
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for the held-out rows
pred_rf = rf_model.predict(X=X_test)

In [None]:
# Confusion matrix, a blank line, then the per-class precision/recall report
print(
    confusion_matrix(y_test, pred_rf),
    classification_report(y_test, pred_rf),
    sep="\n\n",
)

In [None]:
# Summary metrics, shown as percentages with two decimals.
# The score is scaled *before* rounding: multiplying an already-rounded float
# by 100 can print values such as 99.97000000000001.
print("Cohen's Kappa Score:\t", round(100 * cohen_kappa_score(y_test, pred_rf), 2))
print()
print("R-Squared Score:\t", round(100 * r2_score(y_test, pred_rf), 2))
print()
# ROC AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class (label 1) rather than from hard labels.
print("Area Under ROC Curve:\t",
      round(100 * roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]), 2))

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. Remove the surrounding triple quotes to run it.
'''
# Checking 5-fold Cross-Validation Score for this model

# shuffle=True is needed for random_state to have any effect on the folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

# use area under the ROC curve as the cross-validation score
scoring = 'roc_auc'
results = cross_val_score(rf_model, X_sm, y_sm, cv=kfold, scoring = scoring)
print( "AUC: %.3f (%.3f)" % (results.mean(), results.std()) )
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. Remove the surrounding triple quotes to run it.
'''
# change size of Matplotlib plot
fig_size = list(plt.rcParams["figure.figsize"]) # Get a copy of the current size

# keep a separate copy so the old values survive the edits below
old_fig_params = list(fig_size)
# new figure parameters
fig_size[0] = 15
fig_size[1] = 10

plt.rcParams["figure.figsize"] = fig_size # set new size
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. Remove the surrounding triple quotes to run it. It relies on the
# `kfold` splitter defined in the (also disabled) cross-validation cell.
'''
# plot roc-curve
# code adapted from http://scikit-learn.org

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
lw = 2

for i, ((train, test), color) in enumerate(zip(kfold.split(X_sm, y_sm), colors)):
    # note: this refits rf_model on each fold's training rows
    probas_ = rf_model.fit(X_sm[train], y_sm[train]).predict_proba(X_sm[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y_sm[test], probas_[:, 1])
    # np.interp is used directly instead of the deprecated scipy.interp alias
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=color,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
         label='Luck')

mean_tpr /= kfold.get_n_splits(X_sm, y_sm)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
'''