In [None]:
# I use interactive python notebooks, so please adjust accordingly
# if you copy and paste any of this code and are running a script instead.
import os
from pathlib import Path

import pandas as pd

# Location of the credit-card CSV. Set the CREDITCARD_CSV environment variable
# to point at your own copy of the file; when it is not set, the original
# author's local path is used, so existing setups keep working unchanged.
DATA_PATH = Path(os.environ.get(
    "CREDITCARD_CSV",
    "C:\\Users\\ericj\\Documents\\credit_fraud\\creditcard.csv",
))

df = pd.read_csv(DATA_PATH)

# Taking a look at the columns' statistical summary.
# (Left as the last expression so the notebook renders the table.)
df.describe()

In [None]:
import matplotlib.pyplot as plt

# Percentage of transactions in each class, in a fixed, explicit order so the
# bar positions, bar heights and tick labels are guaranteed to line up
# (0 -> 'Legitimate', 1 -> 'Fraudulent') regardless of row order in df.
class_order = [0, 1]
counts = df['Class'].value_counts(normalize=True).reindex(class_order, fill_value=0.0) * 100

# The minority-class bar is too short to see, so pad only the *drawn* height.
# `counts` keeps the true percentages, which are what the text labels show.
MINORITY_BAR_PAD = 0.6
bar_heights = counts.copy()
bar_heights.loc[1] = bar_heights.loc[1] + MINORITY_BAR_PAD

# creating a bar plot
plt.bar(x = class_order, height = bar_heights, color = (0.8,0.5,0), tick_label = ['Legitimate', 'Fraudulent'])
plt.title(label = 'Percentage of Transactions in each Class')
plt.ylabel('Percent of transactions')
plt.ylim(0,105) # expanding y-axis upward for better visual aesthetic

# creating and roughly centering the labels (true, unpadded percentages)
plt.text(x = -0.05, y = 60, s = "{0:5.3f}%".format(counts.loc[0]))
plt.text(x = 0.95, y = 10, s = "{0:4.3f}%".format(counts.loc[1]))
plt.show()

In [None]:
import numpy as np  # needed for np.log below; previously only imported in a later cell
import seaborn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# How many of the features most correlated with Class to keep.
N_TOP_FEATURES = 15

# using a copy of df I started with for correlation analysis
# NOTE(review): the ranking below uses every row of df, including rows that
# later land in the test split -- confirm this is acceptable for the analysis.
df_for_correlation = df.copy(deep=True)

# The Amount column covers a huge range. Converting to log-space.
# eps keeps log() finite for zero amounts: a 0 amount is treated as 0.1¢.
eps = 0.001
df_for_correlation['Log_Amount'] = np.log(df_for_correlation.pop('Amount')+eps)

# splitting data into features and labels
y_all = df_for_correlation['Class']
X_all = df_for_correlation.drop(columns= ['Time', 'Class'])

# Standardizing the features before the correlation analysis.
# The author notes that V1 - V28 were created by PCA. Pearson correlation is
# unchanged by per-column shifting and positive rescaling, so this step does
# not alter the ranking; it is kept from the original analysis.
scaler = StandardScaler()
X_np  = scaler.fit_transform(X_all)
# easier to run and plot correlation analysis from pandas;
# the original index is preserved so the Class labels re-attach to the right rows
df_transformed = pd.DataFrame(data = X_np, columns = X_all.columns, index = X_all.index)
# adding Class label back to dataset
df_transformed['Class'] = y_all

# creating the Pearson correlation matrix.
corr_matrix = df_transformed.corr()
corr_series = corr_matrix['Class'].abs().sort_values(ascending=False)
seaborn.heatmap(corr_matrix)
plt.show()

# Taking the N_TOP_FEATURES (15) features that are most highly correlated with Class.
# Class correlates perfectly with itself, so it is dropped by name first:
# we are not going to use Class to predict... Class
# NOTE(review): if 'Log_Amount' ever ranks in this list, the modelling cell must
# derive that column too -- it selects these names from a frame built from df,
# which only has 'Amount'.
important_features = corr_series.drop(labels='Class').head(N_TOP_FEATURES).index.tolist()
print(important_features)

In [None]:
from collections import Counter
from sklearn.model_selection import train_test_split

import numpy as np
import imblearn

# labels are the Class column; features are every column except Class and Time
y = df['Class']
X = df.drop(columns = ['Class', 'Time'])

seed = 7
test_size = 0.3

# holding out a test_size fraction of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = test_size,
    random_state = seed,
)

# important_features comes from the correlation analysis: features ordered by
# the absolute value of their correlation with Class.
# A hard-coded version of that list is provided here for convenience.
# important_features = ['V17', 'V14', 'V12', 'V10', 'V16', 'V3', 'V7', 'V11', 'V4', 'V18', 'V1', 'V9', 'V5', 'V2', 'V6']

# keeping only the selected columns, as plain numpy arrays
X_train, X_test = (np.array(part[important_features]) for part in (X_train, X_test))

# basic shape inspections
for _name, _arr in (
    ('Training labels', y_train),
    ('Test labels', y_test),
    ('Training features', X_train),
    ('Test features', X_test),
):
    print(_name + ' shape:', _arr.shape)

# creating imblearn resampling object
# a float sampling_strategy is the requested ratio of minority-class to
# majority-class samples after resampling; 1.0 asks for balanced classes.
# Only the training data is resampled; the test split is left untouched.
# NOTE(review): newer imbalanced-learn releases deprecate n_jobs on this
# class -- confirm against the installed version.
over_and_under_sample = imblearn.combine.SMOTETomek(
    sampling_strategy = 1.0,
    n_jobs = -1,
    random_state = seed,
)
X_train, y_train = over_and_under_sample.fit_resample(X_train, y_train)

# checking class counts: resampled training labels vs. original test labels
counter_train = Counter(y_train)
counter_test = Counter(y_test)
print(counter_train, counter_test)

In [None]:
# from xgboost import XGBClassifier

# # declaring an out-of-box-XGBoost classifier
# model = XGBClassifier()
# print(model)

In [None]:
from xgboost import XGBClassifier

# hyperparameters for the XGBoost classifier, one per line so they are easy to
# read and tweak; random_state reuses the notebook-wide seed
# NOTE(review): base_score = 0.95 is set even though the training data is
# resampled toward balanced classes -- confirm this value is intentional.
xgb_params = dict(
    learning_rate = 0.01,
    n_estimators = 600,
    max_depth = 5,
    objective = 'binary:logistic',
    eval_metric = 'logloss',
    base_score = 0.95,
    gamma = 1.55,
    reg_lambda = 9,
    random_state = seed,
)

# declaring the classifier and echoing its configuration
model = XGBClassifier(**xgb_params)
print(model)

In [None]:
# Fitting the model on the training data.
# When the cells are run in order, X_train / y_train here are the resampled
# arrays returned by fit_resample, not the raw train/test split output.
model.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# model output for the held-out test features, each value rounded to an int
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# scoring the predictions against the true test labels
confusion_matrix = metrics.confusion_matrix(y_test, predictions)
recall = metrics.recall_score(y_test, predictions)
accuracy = metrics.accuracy_score(y_test, predictions)
precision = metrics.precision_score(y_test, predictions)

# reporting: confusion matrix first, then each metric as a percentage
print(confusion_matrix)
print("Recall: %.2f%%" % (recall * 100))
print("Accuracy: %.2f%%" % (accuracy * 100))
print("precision: %.2f%%" % (precision * 100))

In [None]:
from sklearn.metrics import classification_report

# display names for the two classes, in label order
target_names = ['Legal', 'Fraud']

# per-class precision / recall / f1 table, shown to 3 decimal places
report_text = classification_report(
    y_true = y_test,
    y_pred = predictions,
    target_names = target_names,
    digits = 3,
)
print(report_text)

# one-shot summary: confusion matrix followed by the metrics computed in the
# evaluation cell (recall, accuracy and precision must already exist)
confusion_matrix = metrics.confusion_matrix(y_test, predictions)
summary_parts = (
    confusion_matrix,
    "\n \n",
    "Recall: %.2f%%" % (recall * 100),
    "\n",
    "Accuracy: %.2f%%" % (accuracy * 100),
    "\n",
    "precision: %.2f%%" % (precision * 100),
)
print(*summary_parts)