In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
import time

%matplotlib inline
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# data exploration

In [3]:
# Load the raw transaction log from the input directory and preview its first rows.
df = pd.read_csv('../input/PS_20174392719_1491204439457_log.csv')
df.head()

In [4]:
# Apply one naming convention: Org/Orig -> Sender, Dest -> Receiver, camelCase balances.
column_renames = {
    'oldbalanceOrg': 'oldBalanceSender',
    'newbalanceOrig': 'newBalanceSender',
    'oldbalanceDest': 'oldBalanceReceiver',
    'newbalanceDest': 'newBalanceReceiver',
    'nameOrig': 'nameSender',
    'nameDest': 'nameReceiver',
}
df = df.rename(columns=column_renames)

In [5]:
# Print a summary of the renamed frame (columns and dtypes).
df.info()

In [6]:
# are there nulls? (per column: True if at least one value is missing)
df.isnull().any()

## Columns Information
* step (int): Unit of time in real world. 1 step = 1 hour of time. Total steps is 744 (31 days of simulation)
* type (object): CASH_IN, CASH_OUT, DEBIT, PAYMENT, TRANSFER
* amount (float): amount of transaction in local currency
* nameSender (object): customer who started transaction
* oldBalanceSender (float): initial balance before transaction
* newBalanceSender (float): new balance after transaction
* nameReceiver (object): customer who receives transaction
* oldBalanceReceiver (float): initial balance before transaction. No information for customers whose name starts with M (merchants)
* newBalanceReceiver (float): new balance after transaction. No information for customers whose name starts with M (merchants)
* isFraud (int): marks whether transactions were fraud
* isFlaggedFraud (int): marks whether a transaction is an illegal attempt. An illegal attempt is attempting to transfer more than 200,000 in a single transaction
<br><br> At first look, there is no missing data. But upon closer inspection, that is because the dataset is zero-filled.

In [7]:
# Is there no information for destination merchants?
# Both conditions are combined into ONE mask applied to `df`, so the filter is index-aligned
# (a second mask built on the full frame should not be applied to an already filtered frame).
receiver_is_merchant = df['nameReceiver'].str.startswith('M')
receiver_has_balance = (df['oldBalanceReceiver'] != 0) | (df['newBalanceReceiver'] != 0)
print('The column information states that there are no receiving bank information (how much money is in their account) for customers who\'s name starts with M. In the dataset, there are {} rows with data on customers who\'s name starts with M.'
      .format(len(df[receiver_is_merchant & receiver_has_balance])))

In [8]:
# What about sending merchants?

In [9]:
# Count rows whose SENDER name starts with 'M' and that carry a non-zero sender balance.
# One combined mask on `df` keeps the filter index-aligned (no chained boolean indexing).
sender_is_merchant = df['nameSender'].str.startswith('M')
sender_has_balance = (df['oldBalanceSender'] != 0) | (df['newBalanceSender'] != 0)
print('There are {} rows with data on customers who\'s name starts with M for sending money.'
      .format(len(df[sender_is_merchant & sender_has_balance])))

In [10]:
# verify limitations of isFlaggedFraud: fraudulent TRANSFERs above the 200,000 threshold
over_threshold = df['amount'] > 200000
marked_fraud = df['isFraud'] == 1
is_transfer = df['type'] == 'TRANSFER'
df[over_threshold & marked_fraud & is_transfer].head(10)

In [11]:
# condition for isFlaggedFraud doesn't seem to match actual data. The only consistency is that all isFlaggedFraud are greater than $200,000 and isFraud is true. However, the reverse is not true.

In [12]:
# where does fraud occur?

In [13]:
# Distinct transaction types that appear among fraudulent rows.
df.loc[df['isFraud'] == 1, 'type'].drop_duplicates()

In [14]:
# fraud occurs only in TRANSFER and CASH_OUT transactions

In [15]:
# Overall fraud rate: count the fraudulent rows once and reuse the number.
fraud_count = len(df[df['isFraud'] == 1])
transaction_count = len(df)
print('There are a total of {} fraudulent transactions out of {} transactions, or {:.2f}%.'
      .format(fraud_count, transaction_count, fraud_count / transaction_count * 100))

## The number of fraudulent transactions is low compared to non-fraudulent transactions

In [16]:
# check to see if there are any merchants (receiver name starts with 'M') with fraudulent charges
receiver_is_merchant = df.nameReceiver.str.startswith('M')
merchant_fraud_count = len(df[(df['isFraud'] == 1) & receiver_is_merchant])
merchant_transaction_count = len(df[receiver_is_merchant])
print('There are a total of {} fraudulent transactions out of {} transactions for Merchants.'
      .format(merchant_fraud_count, merchant_transaction_count))

In [17]:
# check how many fraudulent transactions have an empty (zero) newBalanceReceiver
receiver_balance_is_zero = df['newBalanceReceiver'] == 0
zero_balance_count = len(df[receiver_balance_is_zero])
zero_balance_fraud_count = len(df[receiver_balance_is_zero & (df['isFraud'] == 1)])
print('There are {} fraudulent transactions out of {} with 0 balance for the receiving account, or {:.2f}%.'
      .format(zero_balance_fraud_count,
              zero_balance_count,
              zero_balance_fraud_count / zero_balance_count * 100))

In [18]:
# check how many non fraudulent transactions have an empty (zero) newBalanceReceiver
receiver_balance_is_zero = df['newBalanceReceiver'] == 0
zero_balance_count = len(df[receiver_balance_is_zero])
zero_balance_legit_count = len(df[receiver_balance_is_zero & (df['isFraud'] == 0)])
print('There are {} non fraudulent transactions out of {} with 0 balance for the receiving account, or {:.2f}%.'
      .format(zero_balance_legit_count,
              zero_balance_count,
              zero_balance_legit_count / zero_balance_count * 100))

In [19]:
# summary statistics (mean included) of the amount sent in fraudulent charges
df.loc[df['isFraud'] == 1, 'amount'].describe()

In [20]:
# data cleaning

In [21]:
# only two types of transactions occur in fraud
df_clean = df.loc[df['type'].isin(['TRANSFER', 'CASH_OUT'])]
# fraud occurs only in customers whose name does not start with M, so a row is kept only when
# NEITHER party is a merchant. This must be `&`, not `|`: with `|` a row survives as soon as
# one of the two names is a non-merchant.
neither_is_merchant = (
    ~df_clean.nameSender.str.startswith('M')
    & ~df_clean.nameReceiver.str.startswith('M')
)
# .copy() gives later cells an independent frame instead of a slice of `df`
df_clean = df_clean[neither_is_merchant].copy()

In [22]:
# Preview the filtered frame before the identifier columns are dropped.
df_clean.head()

In [23]:
# clean up the data, remove unnecessary columns
# `columns=` is used because the positional axis argument (`drop(labels, 1)`) is no longer
# accepted by current pandas; reassigning avoids mutating a filtered frame in place.
df_clean = df_clean.drop(columns=['nameSender', 'nameReceiver', 'isFlaggedFraud', 'step'])

# only two values for type, convert to bool; TRANSFER = 1, CASH_OUT = 0
df_clean['type'] = np.where(df_clean['type'] == 'TRANSFER', 1, 0)
df_clean = df_clean.reset_index(drop=True)

In [24]:
# Preview the frame after dropping columns and encoding `type` as 0/1.
df_clean.head()

In [25]:
# Print a summary of the cleaned frame (columns and dtypes).
df_clean.info()

In [26]:
# there is some discrepancy in the data. oldBalanceSender - amount should equal newBalanceSender
# and oldBalanceReceiver + amount should equal newBalanceReceiver but doesn't always occur. create
# features that measure how far each side is from balancing, and remove previous balance from
# both sender and receiver
df_feature = df_clean.copy()  # explicit copy: the engineered columns must not leak into df_clean

# sender side balances when oldBalanceSender - amount - newBalanceSender == 0
# (newBalanceSender has to be SUBTRACTED; adding it never yields 0 for a consistent row)
df_feature['errorBalanceSender'] = df_feature.oldBalanceSender - df_feature.amount - df_feature.newBalanceSender
# receiver side balances when oldBalanceReceiver + amount - newBalanceReceiver == 0
df_feature['errorBalanceReceiver'] = df_feature.oldBalanceReceiver + df_feature.amount - df_feature.newBalanceReceiver

df_feature = df_feature.drop(columns=['oldBalanceSender', 'oldBalanceReceiver'])
df_feature = df_feature.rename(columns={'newBalanceSender':'balanceSender', 'newBalanceReceiver':'balanceReceiver'})
# noErrors = 1 only when both sides balance exactly
df_feature['noErrors'] = np.where((df_feature['errorBalanceSender'] == 0) & (df_feature['errorBalanceReceiver'] == 0), 1, 0)
df_feature.head(5)

In [27]:
# Split the engineered frame into fraudulent and non-fraudulent rows.
df_fraud = df_feature.loc[df_feature['isFraud'] == 1]
df_notFraud = df_feature.loc[df_feature['isFraud'] == 0]

In [28]:
# Overlay the fraud / non-fraud distribution of every column except `type` and `isFraud`.
plotted_columns = [name for name in df_feature.columns if name not in ('type', 'isFraud')]
for col in plotted_columns:
    for subset in (df_fraud, df_notFraud):
        sns.distplot(subset[col])
    plt.legend(['Fraud', 'Not Fraud'], ncol=2, loc='upper right')
    plt.show()

In [29]:
# explore amount, errorBalanceSender more in depth: amount summary, fraud first then non-fraud
for subset in (df_fraud, df_notFraud):
    print(subset['amount'].describe())

In [30]:
# errorBalanceSender summary, fraud first then non-fraud
for subset in (df_fraud, df_notFraud):
    print(subset['errorBalanceSender'].describe())

In [31]:
#find how many are wrong 

In [32]:
# Four panels on a shared x axis: sender/receiver balance error for fraud, then for non-fraud.
f, axes = plt.subplots(ncols=4, figsize=(14, 4), sharex=True)
sns.despine(left=True)
panels = [
    ('Fraudulent Charges', df_fraud.errorBalanceSender),
    ('Fraudulent Charges', df_fraud.errorBalanceReceiver),
    ('Non Fraudulent Charges', df_notFraud.errorBalanceSender),
    ('Non Fraudulent Charges', df_notFraud.errorBalanceReceiver),
]
for panel_ax, (panel_title, panel_values) in zip(axes, panels):
    panel_ax.set_title(panel_title)
    sns.distplot(panel_values, ax=panel_ax)
plt.setp(axes, yticks=[])
plt.tight_layout()

In [33]:
# Distribution plot plus summary statistics of the fraudulent amounts.
fraud_amounts = df_fraud.amount
sns.distplot(fraud_amounts)
plt.title('Fraud Amount Distribution')
fraud_amounts.describe()

In [34]:
# Distribution plot plus summary statistics of the non-fraudulent amounts.
legit_amounts = df_notFraud.amount
sns.distplot(legit_amounts)
plt.title('Non Fraud Amount Distribution')
legit_amounts.describe()

In [35]:
def printErrorOrigin(df):
    """Print how many rows of `df` have `errorBalanceSender == 0`, with the truncated percentage."""
    zero_error_rows = len(df[df.errorBalanceSender == 0])
    percent = int(zero_error_rows / len(df) * 100)
    print('Number of charges with 0 error balance from the Originating account is',
          zero_error_rows, 'or ~', percent, '%')

def printErrorDest(df):
    """Print how many rows of `df` have `errorBalanceReceiver == 0`, with the truncated percentage."""
    zero_error_rows = len(df[df.errorBalanceReceiver == 0])
    percent = int(zero_error_rows / len(df) * 100)
    print('Number of charges with 0 error balance from the Destination account is',
          zero_error_rows, 'or ~', percent, '%')

In [36]:
# Zero-error counts for both accounts: fraudulent rows first, then non-fraudulent rows.
error_reports = (printErrorOrigin, printErrorDest)

print('Fraudulent Charges')
for report in error_reports:
    report(df_fraud)
print('-' * 40)
print('Non Fraudulent Charges')
for report in error_reports:
    report(df_notFraud)

In [37]:
# Correlation heatmap over every column of the engineered frame, label included.
corr_all = df_feature.corr()
ax = sns.heatmap(corr_all, vmin=-.25)
ax.set_title('All Transactions')

In [38]:
# Correlation heatmap for fraudulent rows only; the label column is left out.
corr_fraud = df_fraud.drop(columns=['isFraud'], errors='ignore').corr()
ax = sns.heatmap(corr_fraud, vmin=-.25)
ax.set_title('Fraud Transactions')

In [39]:
# Correlation heatmap for non-fraudulent rows only; the label column is left out.
corr_not_fraud = df_notFraud.drop(columns=['isFraud'], errors='ignore').corr()
ax = sns.heatmap(corr_not_fraud, vmin=-.25)
ax.set_title('Non Fraud Transactions')

Based on the correlation maps, it appears that only column noErrors is correlated to whether a transaction is marked fraud or not.  

In [40]:
def setTrainingData(df, test_size, random_state=None):
    """Split `df` into train/test features and labels.

    Parameters
    ----------
    df : DataFrame with an `isFraud` column; every other column is used as a feature.
    test_size : forwarded unchanged to `train_test_split`.
    random_state : optional seed forwarded to `train_test_split`. The default (None)
        keeps the previous unseeded behaviour.

    Returns
    -------
    The four-element result of `train_test_split`, unpacked by callers as
    trainX, testX, trainY, testY.
    """
    X = df.loc[:, ~df.columns.isin(['isFraud'])]
    Y = df.isFraud

    return train_test_split(X, Y, test_size=test_size, random_state=random_state)

# fixed seed so that re-running the notebook reproduces the same split
trainX, testX, trainY, testY = setTrainingData(df_feature, .2, random_state=42)

**TODO:** make the sample sizes of fraud and non-fraud transactions the same, then try oversampling fraud.  
**TODO:** include cross validation.

## Run models without changing any sample size of fraud and nonfraud transactions.

In [41]:
def drawConfusionMatrix(cm, labels):
    """Render confusion matrix `cm` as a colour-mapped image.

    Parameters
    ----------
    cm : square matrix of counts (rows drawn on the y axis, columns on the x axis).
    labels : sequence of class names, one per row/column of `cm`, in matrix order.
    """
    fig, ax = plt.subplots()
    cax = ax.matshow(cm)
    ax.set_title('Confusion Matrix of Transactions')
    fig.colorbar(cax)
    # Pin one tick per matrix cell before naming them, so each label lands on its own cell
    # regardless of which tick positions matplotlib would have picked automatically.
    tick_positions = list(range(len(labels)))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

def correctFraudCount(y, y_pred):
    """Print the confusion matrix and how many actual fraud rows were predicted as fraud.

    Parameters
    ----------
    y : array-like of true labels (0 = not fraud, 1 = fraud).
    y_pred : array-like of predicted labels, aligned position-by-position with `y`.
    """
    # labels=[0, 1] pins the matrix to 2x2 so the captions below always fit,
    # even when one of the classes does not occur in this split.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    print(pd.DataFrame(cm,
                       ['Actual Not Fraud', 'Actual Fraud'],
                       ['Predicted Not Fraud', 'Predicted Fraud']))

    # vectorised count instead of a Python loop over every row
    actual_fraud = np.asarray(y).ravel() == 1
    predicted_fraud = np.asarray(y_pred).ravel() == 1
    total = int(actual_fraud.sum())
    count = int((actual_fraud & predicted_fraud).sum())

    if total == 0:
        # no fraud rows present: report the counts without dividing by zero
        print(count, 'fraudulent charges correctly identified out of a total of', total, 'fraudulent charges')
        return
    print(count, 'fraudulent charges correctly identified out of a total of', total, 'fraudulent charges or {:.3f}%'.format(count/total*100))
    
def printModel(model, testX, testY, y_pred):
    """Print `model.score` on the test data as a percentage, then the fraud-count summary."""
    accuracy_percent = model.score(testX, testY) * 100
    print('Percent Accuracy: {:.3f}%'.format(accuracy_percent))
    correctFraudCount(testY, y_pred)

def runModel(name, model, trainX, trainY, testX, testY):
    """Fit `model`, predict on the test data, print both wall-clock timings and the evaluation."""
    banner = '-' * 20
    print(banner, name, banner)

    fit_started = time.time()
    model.fit(trainX, trainY)
    fit_seconds = time.time() - fit_started
    print("--- Model Fitting in %s seconds ---" % fit_seconds)

    predict_started = time.time()
    y_pred = model.predict(testX)
    predict_seconds = time.time() - predict_started
    print("--- Model Predicting in %s seconds ---" % predict_seconds)

    printModel(model, testX, testY, y_pred)

## Run models changing sample size of nonfraudulent transactions to equal fraudulent transactions.

In [42]:
# data is highly skewed, keep data of fraudulent charges, but use subsample of non fraudulent charges
SAMPLE_SEED = 42  # fixed seed: same undersample and same tree ensembles on every re-run

lr = LogisticRegression()
randomForest = RandomForestClassifier(max_depth=8, random_state=SAMPLE_SEED)
gradientBoost = GradientBoostingClassifier(learning_rate=.5, max_depth=7, random_state=SAMPLE_SEED)

df_fraud = df_feature[df_feature['isFraud']==1]
# draw as many non-fraud rows as there are fraud rows, so both classes are the same size
df_notFraud = df_feature[df_feature['isFraud']==0].sample(n=len(df_fraud), random_state=SAMPLE_SEED)

sample_data = pd.concat([df_fraud, df_notFraud], ignore_index=True)
trainX, testX, trainY, testY = setTrainingData(sample_data, .3)

In [43]:
# Fit and evaluate the three baseline classifiers on the balanced sample, in this order.
baseline_models = (
    ('Logistic Regression', lr),
    ('Random Forest', randomForest),
    ('Gradient Boosting', gradientBoost),
)
for model_name, estimator in baseline_models:
    runModel(model_name, estimator, trainX, trainY, testX, testY)

# Now test for Multi Layer Perceptrons

In [44]:
# MLP with a single hidden layer of 100 units.
hidden_layers = (100,)
mlp = MLPClassifier(hidden_layer_sizes=hidden_layers)
runModel('Multi Layer Perceptron', mlp, trainX, trainY, testX, testY)

In [45]:
# MLP with two hidden layers of 100 units each.
hidden_layers = (100, 100)
mlp = MLPClassifier(hidden_layer_sizes=hidden_layers)
runModel('Multi Layer Perceptron', mlp, trainX, trainY, testX, testY)

In [46]:
# MLP with three hidden layers of 100 units each.
hidden_layers = (100, 100, 100)
mlp = MLPClassifier(hidden_layer_sizes=hidden_layers)
runModel('Multi Layer Perceptron', mlp, trainX, trainY, testX, testY)

In [47]:
# MLP with four hidden layers of 100 units each.
hidden_layers = (100, 100, 100, 100)
mlp = MLPClassifier(hidden_layer_sizes=hidden_layers)
runModel('Multi Layer Perceptron', mlp, trainX, trainY, testX, testY)

In [48]:
# MLP with two hidden layers of 1000 units each.
hidden_layers = (1000, 1000)
mlp = MLPClassifier(hidden_layer_sizes=hidden_layers)
runModel('Multi Layer Perceptron', mlp, trainX, trainY, testX, testY)

In [50]:
# 10-fold cross-validation of each model on the held-out split (testX / testY).
# `mlp` is whichever MLP was created last in the cells above.
models = [lr, randomForest, gradientBoost, mlp]
names = ['Logistic Regression', 'Random Forest', 'Gradient Boost', 'Multi Layer Perceptron']

print('Cross Validation Scores')
for model_name, model in zip(names, models):
    print('-'*40, model_name, '-'*40)
    print(cross_val_score(model, testX, testY, cv=10))

## MLP is not as accurate as either Gradient Boosting or Random Forest for this analysis and is also more expensive in terms of performance.