# Import Basic Libraries:

In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 1. Data Input, Preparation, & Exploration

## 1.1 Read in Transaction Data

In [None]:
# Load the raw credit-card transactions and report the table's dimensions.
data = pd.read_csv("../input/creditcard.csv")
print("No. of Rows: \t\t", len(data.index))
print("No. of Columns: \t", len(data.columns))
# Last expression of the cell: the notebook renders the first five rows.
data.head()

## 1.2 Class Distribution

As described in the Credit Card Fraud summary, this is a highly unbalanced dataset.  Check out the histogram and class counts below (Class 0 for normal, Class 1 for fraudulent).

In [None]:
# Count each class with a boolean mask instead of materialising a filtered
# copy of the whole DataFrame just to read its row count.
norm_count = int((data['Class'] == 0).sum())  # Normal transactions
fraud_count = int((data['Class'] == 1).sum())  # Fraudulent transactions
total_count = data.shape[0]
print("No. of normal transactions: \t\t", norm_count)
print("No. of fraudulent transactions: \t", fraud_count)
print("% normal transactions: \t\t", norm_count/total_count * 100)
print("% fraudulent transcations: \t", fraud_count/total_count * 100)

# Bar chart of the per-class counts, ordered by class label.
# Series.value_counts() replaces the top-level pd.value_counts() helper,
# which is deprecated in recent pandas releases.
data['Class'].value_counts(sort=True).sort_index().plot(kind='bar')
plt.title("Class Histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

## 1.3 Standardize Input Data

Before we tackle the class imbalance, let's bring the transaction dollar amount onto a standard scale (i.e. mean = 0, std = 1.0).  This is a common preprocessing step.  The other feature variables are already the output of a Principal Component Analysis, so we can leave them alone.  Additionally, we'll drop the unscaled dollar amount column and the time column.  A time series analysis would be interesting, but we don't know which account made each transaction, so the time column won't be very helpful.

In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler works on 2-D input, so view the Amount column as (n_rows, 1)
# before scaling it to zero mean and unit variance.
amount_2d = data['Amount'].values.reshape(-1, 1)
data['Amount_scl'] = StandardScaler().fit_transform(amount_2d)

# The raw amount and the timestamp are not used as features from here on.
data = data.drop(['Time', 'Amount'], axis=1)
data.head()

In [None]:
# Separate the predictors from the target column.
# DataFrame.ix was removed in pandas 1.0; .loc with a boolean column mask
# selects exactly the same columns (everything except 'Class').
X = data.loc[:, data.columns != 'Class']  # features
y = data['Class']  # labels
print("X.shape: ", X.shape)
print("y.shape: ", y.shape)

## 1.4 Split Entire Dataset into Train-Test Sets

Next, we will break up the data into training and test sets.  The test set will be untouched until the final evaluation of each model so we have an unbiased estimate of model performance.

In [None]:
from sklearn.model_selection import train_test_split

# 70% training data, 30% testing data.
# stratify=y keeps the (very small) fraud ratio the same in both partitions,
# and the fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)

## 1.5 Undersample Training Data to Create Balanced Class Distributions

Now we will handle the skewed class distribution using a technique known as undersampling:

 1.  Get the number of fraudulent transactions and the indices of the corresponding rows in our training data
 2. Get the indices of normal transactions in our training data
 3. Take a random sample of normal transactions with a sample size equal to the number of fraudulent transactions
 4. Combine the fraudulent transactions and the random sample of normal transactions to get balanced training data set (50% fraud, 50% normal).

In [None]:
# Row labels of the fraudulent and normal transactions in the training split:
fraud_idx = np.array(y_train[y_train == 1].index)
num_fraud = len(fraud_idx)
normal_idx = y_train[y_train == 0].index

# From the normal labels, draw a random subset without replacement
# (subset size = # of frauds). The dedicated, seeded generator means that,
# for a given training split, the same rows are drawn on every run.
normal_idx_sample = np.random.RandomState(0).choice(normal_idx, num_fraud, replace=False)

# Group together our normal and fraud labels:
# (we'll have a balanced class distribution, 50% normal, 50% fraud)
undersample_idx = np.concatenate([fraud_idx, normal_idx_sample])

# Grab the records with those labels. The values come from y_train.index,
# i.e. they are index *labels*, so label-based .loc is the correct accessor;
# positional .iloc only gives the same rows while the index is 0..n-1.
undersample_data = data.loc[undersample_idx, :]

# Split into features and labels (.ix was removed in pandas 1.0):
X_undersample = undersample_data.loc[:, undersample_data.columns != 'Class']
y_undersample = undersample_data['Class']

# Note: these overwrite the whole-dataset counts computed in Section 1.2.
norm_count = int((y_undersample == 0).sum())
fraud_count = int((y_undersample == 1).sum())

print("---Undersampled Data Set---")
print("No. of normal transactions: \t", norm_count)
print("No. of fraudulent transactions: \t\t", fraud_count)

# 2. Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc,roc_auc_score,roc_curve,recall_score,f1_score

## 2.1 Tune Hyperparameters with Cross Validation

Here we'll search the hyperparameter space and use 5-fold cross validation on the undersampled training data to output accuracy, recall, and F1 metrics.  The goal is to find the combination of hyperparameters that gives the highest F1-score.  The cross validation step further subdivides the training data into two smaller components: (1) a training subset and (2) a validation subset.  The training subset is used to train the model with a given pair of hyperparameters, and the validation subset is used to score the trained model.  5-fold CV means we repeat this process 5 times, each time holding out a different fifth of the data for validation, and average the results to get a more reliable score.

Once we identify the pair of hyperparameters with the highest score, we will use them to train the model on the full training data (i.e. training subset + validation subset).  Keep in mind, we still haven't touched the test dataset from Section 1.4.  We will save that for final model evaluation to get an unbiased result.

In [None]:
def _report_lr_cv_scores(model, X, y, heading):
    """Cross-validate ``model`` on (X, y), print the metrics, return the mean F1.

    Three 5-fold ``cross_val_score`` passes are run: one with the estimator's
    default scorer (reported as accuracy), one with ``scoring='recall'`` and
    one with ``scoring='f1'``.  Each is printed as ``mean (+/- std)`` below a
    banner showing ``heading``.
    """
    print('-'*25)
    print(heading)
    print('-'*25)
    print('')

    # Local names deliberately differ from the sklearn.metrics functions
    # (recall_score, f1_score) so those are never hidden.
    acc_scores = cross_val_score(model, X, y, cv=5)
    recall_scores = cross_val_score(model, X, y, cv=5, scoring='recall')
    f1_scores = cross_val_score(model, X, y, cv=5, scoring='f1')

    print("Mean Accuracy: %0.3f (+/- %0.3f)" % (np.mean(acc_scores), np.std(acc_scores)))
    print("Mean Recall: %0.3f (+/- %0.3f)" % (np.mean(recall_scores), np.std(recall_scores)))
    print("Mean F1: %0.3f (+/- %0.3f)" % (np.mean(f1_scores), np.std(f1_scores)))
    print('')

    return np.mean(f1_scores)


def get_best_hypers_lr(X, y):
    """ Search parameter space for the optimal Logistic Regression values.

             -Perform Logistic Regression using a range of C parameter values and two different
             penalty terms (L1 & L2)
             -Compute mean recall, accuracy and f1-scores using 5-fold cross validation
             for each run
             -Output the C parameter and penalty term with the best f1 score

    Parameters:
        X: feature matrix, handed unchanged to ``cross_val_score``.
        y: class labels, handed unchanged to ``cross_val_score``.

    Returns:
        Tuple ``(best_c, penalty)``: the C value and the penalty name
        (``'l1'`` or ``'l2'``) with the highest mean F1.  On an exact tie the
        combination evaluated first wins (smaller C, and L1 before L2).  If no
        combination scores a mean F1 above 0, the initial values ``(0, '')``
        are returned.
    """
    c_range = [0.01, 0.1, 1.0, 10.0, 100.0]
    f1_max = 0
    best_c = 0
    penalty = ''

    for c_param in c_range:
        print('='*25)
        print('C parameter: ', c_param)
        print('='*25)
        print('')

        # L1 is evaluated before L2 and only a strictly better F1 replaces the
        # current best, which reproduces "pick the better of L1/L2, then
        # compare against the running maximum" with L1 winning ties.
        for candidate, heading in (('l1', 'L1-penalty'), ('l2', 'L2-penalty')):
            # liblinear handles both penalties; the default solver of current
            # scikit-learn releases rejects penalty='l1'.
            model = LogisticRegression(C=c_param, penalty=candidate, solver='liblinear')
            mean_f1 = _report_lr_cv_scores(model, X, y, heading)

            if mean_f1 > f1_max:
                f1_max = mean_f1
                best_c = c_param
                penalty = candidate

    print('*'*25)
    print('Optimal C parameter = ', best_c)
    print('Optimal penalty = ', penalty)
    print('Optimal F1 = ', f1_max)
    print('*'*25)

    return best_c, penalty

In [None]:
# Run the Logistic Regression hyperparameter search on the balanced
# (undersampled) training data and keep the winning C value and penalty.
best_c_lr, penalty_lr = get_best_hypers_lr(X_undersample,y_undersample)

## 2.2 Train Model On Undersampled Training Data & Evaluate on Full Test Data

With our optimal hyperparameters found, let's set them on a fresh Logistic Regression model and train it on the "full" training data (this is still our 50/50 balanced training set).

The final step is to use the unseen test data from Section 1.4 to evaluate the model performance.  It's important to remember that the test set has a class imbalance similar to that of the original transaction data we started with in Section 1.2.  For evaluation, we're using Receiver Operating Characteristic (ROC) curves and Area Under the Curve (AUC) to strike a balance between model sensitivity and specificity.

In [None]:
# Use best hyperparameters. The liblinear solver supports both the 'l1' and
# 'l2' penalties; the default solver of current scikit-learn rejects 'l1'.
lr = LogisticRegression(C=best_c_lr, penalty=penalty_lr, solver='liblinear')
# Train on full undersample data set:
lr.fit(X_undersample, y_undersample)
# Test on unseen test data set. The DataFrame itself is passed (not .values)
# so the column names match the ones the model was fitted with.
y_pred_score = lr.decision_function(X_test)
# Compute ROC metrics:
fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_score)
# Get AUC:
roc_auc = auc(fpr, tpr)

# Plot ROC (the dashed red diagonal is the chance-level reference line):
plt.title('ROC Curve - Logistic Regression')
plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
plt.plot([0,1],[0,1],'r--')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# 3. Support Vector Machine Classifier

In [None]:
from sklearn.svm import LinearSVC

## 3.1 Tune Hyperparameters with Cross Validation

In [None]:
def _report_svc_cv_scores(model, X, y, heading):
    """Cross-validate ``model`` on (X, y), print the metrics, return the mean F1.

    Three 5-fold ``cross_val_score`` passes are run: one with the estimator's
    default scorer (reported as accuracy), one with ``scoring='recall'`` and
    one with ``scoring='f1'``.  Each is printed as ``mean (+/- std)`` below a
    banner showing ``heading``.
    """
    print('-'*25)
    print(heading)
    print('-'*25)
    print('')

    # Local names deliberately differ from the sklearn.metrics functions
    # (recall_score, f1_score) so those are never hidden.
    acc_scores = cross_val_score(model, X, y, cv=5)
    recall_scores = cross_val_score(model, X, y, cv=5, scoring='recall')
    f1_scores = cross_val_score(model, X, y, cv=5, scoring='f1')

    print("Mean Accuracy: %0.3f (+/- %0.3f)" % (np.mean(acc_scores), np.std(acc_scores)))
    print("Mean Recall: %0.3f (+/- %0.3f)" % (np.mean(recall_scores), np.std(recall_scores)))
    print("Mean F1: %0.3f (+/- %0.3f)" % (np.mean(f1_scores), np.std(f1_scores)))
    print('')

    return np.mean(f1_scores)


def get_best_hypers_svc(X, y):
    """ Search parameter space for the optimal linear SVC values.

             -Perform Support Vector Classifier using a range of C parameter values and two different
             penalty terms (L1 & L2)
             -Compute mean recall, accuracy, and f1-scores using 5-fold cross validation
             for each run
             -Output the C parameter and penalty term with the best f1-score

    Parameters:
        X: feature matrix, handed unchanged to ``cross_val_score``.
        y: class labels, handed unchanged to ``cross_val_score``.

    Returns:
        Tuple ``(best_c, penalty)``: the C value and the penalty name
        (``'l1'`` or ``'l2'``) with the highest mean F1.  On an exact tie the
        combination evaluated first wins (smaller C, and L1 before L2).  If no
        combination scores a mean F1 above 0, the initial values ``(0, '')``
        are returned.
    """
    c_range = [0.01, 0.1, 1.0, 10.0, 100.0]
    f1_max = 0
    best_c = 0
    penalty = ''

    for c_param in c_range:
        print('='*25)
        print('C parameter: ', c_param)
        print('='*25)
        print('')

        # L1 is evaluated before L2 and only a strictly better F1 replaces the
        # current best, which reproduces "pick the better of L1/L2, then
        # compare against the running maximum" with L1 winning ties.
        for candidate, heading in (('l1', 'L1-penalty'), ('l2', 'L2-penalty')):
            # The L1 penalty requires dual=False. For L2 the dual formulation
            # is requested explicitly rather than left to the library default,
            # so tuning uses the same setting as the final fit in Section 3.2.
            use_dual = (candidate == 'l2')
            model = LinearSVC(C=c_param, penalty=candidate, dual=use_dual)
            mean_f1 = _report_svc_cv_scores(model, X, y, heading)

            if mean_f1 > f1_max:
                f1_max = mean_f1
                best_c = c_param
                penalty = candidate

    print('*'*25)
    print('Optimal C parameter = ', best_c)
    print('Optimal penalty = ', penalty)
    print('Optimal F1 = ', f1_max)
    print('*'*25)

    return best_c, penalty

In [None]:
# Run the linear SVC hyperparameter search on the balanced (undersampled)
# training data and keep the winning C value and penalty.
best_c_svc, penalty_svc = get_best_hypers_svc(X_undersample, y_undersample)

## 3.2 Train Model on Undersampled Training Data & Evaluate on Full Test Data

In [None]:
# Use best hyperparameters:
dual_svc = (penalty_svc == 'l2') # 'dual' option must be set to false if penalty is 'l1'
svc = LinearSVC(C=best_c_svc, penalty=penalty_svc, dual=dual_svc)
# Train on full undersample data set:
svc.fit(X_undersample, y_undersample)
# Test on unseen test data set. The DataFrame itself is passed (not .values)
# so the column names match the ones the model was fitted with.
y_pred_score = svc.decision_function(X_test)
# Compute ROC metrics:
fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_score)
# Get AUC:
roc_auc = auc(fpr, tpr)

# Plot ROC (the dashed red diagonal is the chance-level reference line):
plt.title('ROC Curve - SVC')
plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
plt.plot([0,1],[0,1],'r--')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# 4. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

## 4.1 Omit Hyperparameter Tuning, Use Cross Validation to Get Mean F1-Score

In [None]:
# Decision tree with default hyperparameters; the fixed seed makes it repeatable.
dt = DecisionTreeClassifier(random_state=0)

# 5-fold cross validation on the balanced training data. The results are kept
# under dt_* names because binding them to `recall_score` / `f1_score` at
# module level would overwrite the functions imported from sklearn.metrics.
dt_acc_scores = cross_val_score(dt, X_undersample, y_undersample, cv=5)
dt_recall_scores = cross_val_score(dt, X_undersample, y_undersample, cv=5, scoring='recall')
dt_f1_scores = cross_val_score(dt, X_undersample, y_undersample, cv=5, scoring='f1')
print("Accuracy Score: %0.3f (+/- %0.3f)" % (np.mean(dt_acc_scores), np.std(dt_acc_scores)) )
print("Recall Score: %0.3f (+/- %0.3f)" % (np.mean(dt_recall_scores), np.std(dt_recall_scores)) )
print("Mean F1: %0.3f (+/- %0.3f)" % (np.mean(dt_f1_scores), np.std(dt_f1_scores)) )

## 4.2 Train Model on Undersampled Data & Evaluate on Full Test Data

In [None]:
# Train on full undersample data set. (Fitting on X_train / y_train would use
# the imbalanced training split instead of the balanced sample this section
# and the cross validation above are based on.)
dt.fit(X_undersample, y_undersample)
# Test on unseen test data set; column 1 of predict_proba is the score for
# class 1 (fraud). The DataFrame itself is passed (not .values) so the column
# names match the ones the model was fitted with.
y_pred_score = dt.predict_proba(X_test)[:,1]
# Compute ROC metrics:
fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_score)
# Get AUC:
roc_auc = auc(fpr, tpr)

# Plot ROC (the dashed red diagonal is the chance-level reference line):
plt.title('ROC Curve - DecisionTree')
plt.plot(fpr, tpr, label = 'AUC = %0.2f' % roc_auc)
plt.plot([0,1],[0,1],'r--')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# 5. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

## 5.1 Omit Hyperparameter Tuning, Use Cross Validation to Get Mean F1-Score

In [None]:
# Random forest with default hyperparameters; the fixed seed makes it repeatable.
rfc = RandomForestClassifier(random_state=0)

# 5-fold cross validation on the balanced training data. The results are kept
# under rfc_* names because binding them to `recall_score` / `f1_score` at
# module level would overwrite the functions imported from sklearn.metrics.
rfc_acc_scores = cross_val_score(rfc, X_undersample, y_undersample, cv=5)
rfc_recall_scores = cross_val_score(rfc, X_undersample, y_undersample, cv=5, scoring='recall')
rfc_f1_scores = cross_val_score(rfc, X_undersample, y_undersample, cv=5, scoring='f1')
print("Accuracy Score: %0.3f (+/- %0.3f)" % (np.mean(rfc_acc_scores), np.std(rfc_acc_scores)) )
print("Recall Score: %0.3f (+/- %0.3f)" % (np.mean(rfc_recall_scores), np.std(rfc_recall_scores)) )
print("Mean F1: %0.3f (+/- %0.3f)" % (np.mean(rfc_f1_scores), np.std(rfc_f1_scores)) )

## 5.2 Train Model on Undersampled Data & Evaluate on Full Test Data

In [None]:
# Train on full undersample data set. (Fitting on X_train / y_train would use
# the imbalanced training split instead of the balanced sample this section
# and the cross validation above are based on.)
rfc.fit(X_undersample, y_undersample)
# Test on unseen test data set; column 1 of predict_proba is the score for
# class 1 (fraud). The DataFrame itself is passed (not .values) so the column
# names match the ones the model was fitted with.
y_pred_score = rfc.predict_proba(X_test)[:,1]
# Compute ROC metrics:
fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_score)
# Get AUC:
roc_auc = auc(fpr, tpr)

# Plot ROC (the dashed red diagonal is the chance-level reference line):
plt.title('ROC Curve - RandomForest')
plt.plot(fpr, tpr, label = 'AUC = %0.2f' % roc_auc)
plt.plot([0,1],[0,1],'r--')
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()