In [None]:
# Import all libraries we are planning to use.
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


# Load the credit card transactions from the CSV file in the working directory.
cc_dataframe = pd.read_csv("creditcard.csv")

# Report the dimensions of the loaded dataset.
row_count, column_count = cc_dataframe.shape
print(f"Rows: {row_count} and Columns: {column_count}")

In [None]:
# List the column labels to see which features the dataset provides.
cc_dataframe.columns

In [None]:
# Preview the first 10 rows to see what we have to work with
# (the last 10 rows are shown in the following cell).
cc_dataframe.head(10)

In [None]:
# Preview the last 10 rows of the dataset.
cc_dataframe.tail(10)

In [None]:
# All values are continuous numbers except Class, which is discrete.
# Look for missing data: count the nulls per column and show the 5 columns
# with the highest count (largest missing count on top).
cc_dataframe.isnull().sum().sort_values(ascending = False).head(5)

In [None]:

# No missing data, great!
# Time, Amount, and Class are original values; the other columns are PCA transformations due to confidentiality.
# Explore Time, Amount, and Class first because their meaning is known.
cc_dataframe[["Time","Amount","Class"]].info()

In [None]:
# Time and Amount are floats, Class is an integer.
# Show summary statistics for the three columns whose meaning is known.
cc_dataframe[["Time","Amount","Class"]].describe()

In [None]:

# Time is incremental, starting at 0 and going up to 172792 seconds.
# The Class field is an integer: 0 represents non-fraud and 1 represents fraud.
# Only 0.17% of all transactions are fraud, so the dataset is heavily imbalanced.
# Confirm that by looking at the count of each Class value.
cc_dataframe.Class.value_counts()

In [None]:
# Derive an hour column from Time (seconds elapsed, see the describe() output).
# Work on a copy so the original dataframe is left untouched.
hourly_dataframe = cc_dataframe.copy()
# Vectorized floor instead of a per-row Python lambda: same values, still a
# float column holding whole hours (0.0, 1.0, ...), but computed in one pass.
hourly_dataframe['Hour'] = np.floor(hourly_dataframe['Time'] / 3600)

# Plot the transaction amount for each hour, split by class
# (0 = non-fraud, 1 = fraud). seaborn aggregates the many rows that share an
# hour into a single line per class.
sns.lineplot(data=hourly_dataframe, x="Hour", y="Amount", hue="Class")
plt.suptitle("Transaction amount per Hour")
plt.show()

In [None]:
# Look at the volume of transactions for each hour, separately for
# non-fraudulent and fraudulent transactions.
# 'count' gives the number of transactions per (Hour, Class) group; 'max' of
# Class within a group is just the class value itself.
hourly_agregate = hourly_dataframe.groupby(['Hour','Class'])['Class'].aggregate(['count', 'max']).reset_index()
hourly_dataframe_agregate = pd.DataFrame(hourly_agregate)
hourly_dataframe_agregate.columns = ['Hour', 'Class', 'Transactions', 'Max']

# Split once so each panel receives a clearly named frame.
non_fraud_hourly = hourly_dataframe_agregate.loc[hourly_dataframe_agregate.Class == 0]
fraud_hourly = hourly_dataframe_agregate.loc[hourly_dataframe_agregate.Class == 1]

# One panel per class; each panel is titled so the figure can be read on its
# own (the two classes have very different scales, hence separate y axes).
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
sns.lineplot(ax=ax1, x="Hour", y="Transactions", data=non_fraud_hourly)
ax1.set_title("Non-fraud (Class 0)")
sns.lineplot(ax=ax2, x="Hour", y="Transactions", data=fraud_hourly, color="orange")
ax2.set_title("Fraud (Class 1)")
plt.suptitle("Transactions per Hour")
plt.show()


In [None]:
# Look at the largest transaction amount recorded at each Time value,
# separately for non-fraudulent and fraudulent transactions.
# Several summary statistics of Amount are computed per (Time, Class) group;
# only 'Max' is plotted below.
time_amount_aggregate = cc_dataframe.groupby(['Time','Class'])['Amount'].aggregate(['min', 'max', 'count', 'sum', 'mean', 'median', 'var']).reset_index()
amount_over_time_dataframe = pd.DataFrame(time_amount_aggregate)
amount_over_time_dataframe.columns = ['Time', 'Class', 'Min', 'Max', 'Transactions', 'Sum', 'Mean', 'Median', 'Var']

# Split once so each panel receives a clearly named frame.
non_fraud_over_time = amount_over_time_dataframe.loc[amount_over_time_dataframe.Class == 0]
fraud_over_time = amount_over_time_dataframe.loc[amount_over_time_dataframe.Class == 1]

# One titled panel per class so the figure can be read on its own.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12,6))
sns.lineplot(ax=ax1, x="Time", y="Max", data=non_fraud_over_time)
ax1.set_title("Non-fraud (Class 0)")
sns.lineplot(ax=ax2, x="Time", y="Max", data=fraud_over_time, color="orange")
ax2.set_title("Fraud (Class 1)")
plt.suptitle("Largest Transaction per Time")
plt.show()

In [None]:
# Observation from the plots so far: the hour in which a transaction occurs
# and its amount appear related for fraudulent transactions.
# Next, check whether any features are directly correlated with each other
# by drawing the pairwise correlation matrix as a heatmap.
correlation_dataframe = cc_dataframe.corr()
feature_labels = correlation_dataframe.columns

plt.figure(figsize=(7, 7))
sns.heatmap(
    correlation_dataframe,
    xticklabels=feature_labels,
    yticklabels=feature_labels,
    linewidths=.1,
    cmap="Reds",
)
plt.show()

In [None]:
# Rescale Time and Amount by dividing each column by its largest absolute
# value, so the magnitudes do not exceed 1 (values land in 0-1 provided the
# column has no negative entries).
# NOTE: this overwrites the columns of cc_dataframe in place.
scaled_columns = ['Time', 'Amount']
column_peaks = cc_dataframe[scaled_columns].abs().max()
cc_dataframe[scaled_columns] = cc_dataframe[scaled_columns] / column_peaks

In [None]:
# Instead of a regular k-fold, use a stratified one so every fold keeps the
# class distribution of the imbalanced dataset.
five_fold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# Next, create different variations of the data to test.
# Separate the dataset into valid (Class 0) and invalid (Class 1) transactions.
valid_transactions_dataframe = cc_dataframe[cc_dataframe.Class == 0]
invalid_transactions_dataframe = cc_dataframe[cc_dataframe.Class == 1]

# Randomly trim the VALID transactions down to three times the number of
# invalid ones so the data is far less imbalanced.
# replace=False: resample() draws with replacement by default, which would put
# duplicate rows (and duplicate index labels) into the trimmed set. Sampling
# without replacement needs n_samples <= available rows, hence the min().
trimmed_valid_count = min(
    invalid_transactions_dataframe.shape[0] * 3,
    valid_transactions_dataframe.shape[0],
)
trimmed_valid_transactions_dataframe = resample(
    valid_transactions_dataframe,
    replace=False,
    n_samples=trimmed_valid_count,
    random_state=7,
)

# Merge the trimmed valid transactions with all invalid transactions.
trimmed_cc_dataframe = pd.concat([trimmed_valid_transactions_dataframe,invalid_transactions_dataframe],axis=0)

# Extract the X and y frames used for the cross validation score.
X_trimmed = trimmed_cc_dataframe.drop(columns='Class')
y_trimmed = trimmed_cc_dataframe.Class

X_original = cc_dataframe.drop(columns='Class')
y_original = cc_dataframe.Class

# Use SMOTE to oversample the invalid transactions. The seed makes the
# synthetic samples reproducible, matching the seeded steps above.
# NOTE(review): oversampling before cross-validation / train-test splitting
# lets synthetic points derived from a row end up on the other side of the
# split from that row, which can inflate scores - consider resampling inside
# each training fold only.
X_smote, y_smote = SMOTE(random_state=7).fit_resample(X_original, y_original)


# Data structure to iterate over: trial name -> [features, labels].
X_y_dictionary = {
  'original': [X_original,y_original],
  'trimmed': [X_trimmed,y_trimmed],
  'SMOTE': [X_smote,y_smote]
}


In [None]:
# Random Forest Classifier, seeded so repeated runs give the same scores
# (every other stochastic step in this notebook is seeded as well).
RF = RandomForestClassifier(random_state=7)

# Mean 5-fold cross-validated accuracy for each dataset variant, formatted as
# a percentage string, e.g. {'original': '99.95%', ...}.
# NOTE(review): on the heavily imbalanced 'original' data plain accuracy is
# dominated by the majority class; treat it with care when comparing variants.
results = {}
for trial_type, (features, labels) in X_y_dictionary.items():
    scores = cross_val_score(RF, features, labels, scoring='accuracy', cv=five_fold, n_jobs=-1)
    results[trial_type] = str(round(scores.mean() * 100, 2)) + '%'

results

In [None]:

# Split each dataset variant into train and test, holding out a stratified 25%.
X_original_train, X_original_test, y_original_train, y_original_test = train_test_split(X_original, y_original, test_size=0.25, random_state=7, stratify=y_original)
X_trimmed_train, X_trimmed_test, y_trimmed_train, y_trimmed_test = train_test_split(X_trimmed, y_trimmed, test_size=0.25, random_state=7, stratify=y_trimmed)
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size=0.25, random_state=7, stratify=y_smote)


def _merge_features_and_labels(features, labels):
    """Return `features` with `labels` appended as a trailing `Class` column.

    Rows are paired by position instead of being joined on index labels: an
    index join multiplies rows whenever a label occurs more than once, which
    can happen when a dataset was resampled with replacement.
    """
    return features.assign(Class=labels.to_numpy())


def _per_class_test_sets(merged):
    """Split a merged test frame into single-class feature frames.

    Returns a dict with the feature rows of Class 0 ('valid') and Class 1
    ('invalid'), without the `Class` column, plus the row count of each
    ('valid_count', 'invalid_count'). A class that is absent yields an empty
    frame and a count of 0 instead of raising.
    """
    valid_rows = merged[merged.Class == 0]
    invalid_rows = merged[merged.Class == 1]
    return {
        'valid': valid_rows.drop(columns='Class'),
        'invalid': invalid_rows.drop(columns='Class'),
        'valid_count': len(valid_rows),
        'invalid_count': len(invalid_rows),
    }


# Merge test features and labels; used to validate the predictions.
original_test_merged = _merge_features_and_labels(X_original_test, y_original_test)
trimmed_test_merged = _merge_features_and_labels(X_trimmed_test, y_trimmed_test)
smote_test_merged = _merge_features_and_labels(X_smote_test, y_smote_test)

# This test dataset is used to run predictions and validate them per class.
test_dataset = {
    'original': _per_class_test_sets(original_test_merged),
    'trimmed': _per_class_test_sets(trimmed_test_merged),
    'SMOTE': _per_class_test_sets(smote_test_merged),
}


In [None]:
# Create a structure with one fitted classifier per dataset variant.
# fit() trains the estimator in place and returns that same object, so calling
# RF.fit(...) three times would store the SAME model under all three keys -
# the one left over from the last fit (SMOTE). clone() creates a fresh,
# unfitted estimator with RF's parameters for each variant instead.
# RF itself is left unfitted by this cell.
classifiers = {
    'original': clone(RF).fit(X_original_train, y_original_train),
    'trimmed': clone(RF).fit(X_trimmed_train, y_trimmed_train),
    'SMOTE': clone(RF).fit(X_smote_train, y_smote_train),
}



In [None]:

def _count_predictions(classifier, samples, label):
    """Return how many rows of `samples` the classifier predicts as `label`.

    An empty `samples` frame yields 0 without calling predict(), since
    scikit-learn estimators raise on input with zero rows.
    """
    if len(samples) == 0:
        return 0
    return Counter(classifier.predict(samples))[label]


def _percent_of(hits, total):
    """Format hits / total as a percentage string rounded to two decimals.

    Returns '0%' when there are no hits, which also avoids dividing by a
    zero `total`.
    """
    if not hits:
        return '0%'
    return str(round((hits / total) * 100, 2)) + '%'


def per_class_accuracy(classifier, split):
    """Score a fitted classifier on single-class test subsets.

    `split` is one entry of `test_dataset`: feature frames under 'valid' and
    'invalid' plus their sizes under 'valid_count' and 'invalid_count'.
    Prediction 0 means valid and 1 means invalid, so each rate is the share of
    a subset that was predicted as its own class.
    """
    valid_hits = _count_predictions(classifier, split['valid'], 0)
    invalid_hits = _count_predictions(classifier, split['invalid'], 1)
    return {
        'valid_accuracy': _percent_of(valid_hits, split['valid_count']),
        'invalid_accuracy': _percent_of(invalid_hits, split['invalid_count']),
    }


# Test each classifier against its own respective testing set.
prediction_results = {
    classifier_type: per_class_accuracy(classifier_object, test_dataset[classifier_type])
    for classifier_type, classifier_object in classifiers.items()
}

pprint(prediction_results)

In [None]:
# Cross-check: score the classifier stored under 'trimmed' against the test
# split of the 'original' dataset. The result is keyed by the test set name.
# NOTE(review): the trimmed training split and the original test split come
# from independent train_test_split calls, so the same transaction may appear
# in both - confirm this overlap is acceptable for the comparison.
prediction_results = {}

classifier_object = classifiers['trimmed']
test_against = 'original'
target_split = test_dataset[test_against]

# Predict each single-class subset and tally the predicted labels.
valid_predictions = Counter(classifier_object.predict(target_split['valid']))
invalid_predictions = Counter(classifier_object.predict(target_split['invalid']))

# Prediction 0 is valid, 1 is invalid. Each rate is the share of a subset that
# was predicted as its own class; no correct prediction at all is shown as '0%'.
valid_rate = (
    str(round((valid_predictions[0] / target_split['valid_count']) * 100,2)) + '%'
    if valid_predictions[0]
    else '0%'
)
invalid_rate = (
    str(round((invalid_predictions[1] / target_split['invalid_count']) * 100,2)) + '%'
    if invalid_predictions[1]
    else '0%'
)

# Write results
prediction_results[test_against] = {
    'valid_accuracy': valid_rate,
    'invalid_accuracy': invalid_rate,
}

from pprint import pprint
pprint(prediction_results)