In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from matplotlib.patches import Rectangle
from pprint import pprint as pp
import csv
from pathlib import Path
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans, DBSCAN
from itertools import product

In [201]:
df = pd.read_csv('../assets/creditcard.csv')

In [202]:
df.shape

(284807, 31)

In [203]:
df.drop(['Time','Amount'], axis=1, inplace=True)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [204]:
X = df.drop('Class',axis=1) # independent columns - features
y = df.loc[:,'Class']       # target column - Class

In [205]:
print("Input Shape : ", X.shape)
print("Output Shape : ", y.shape)

Input Shape :  (284807, 28)
Output Shape :  (284807,)


In [206]:
#Train test split into train and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.03)


In [207]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(276262, 28) (8545, 28) (276262,) (8545,)


In [208]:
new_df = pd.concat([pd.DataFrame(X_test,columns=X_test.columns),pd.DataFrame(y_test)],axis=1)

new_df = new_df.rename({0:'Class'},axis=1)

new_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
192019,2.271619,-1.369435,-0.903227,-1.448867,-1.391278,-1.026582,-1.007507,-0.270049,-1.178829,1.580577,...,-0.458837,-0.161257,0.033168,0.232366,-0.040112,-0.200107,-0.157518,0.007677,-0.050965,0
184481,-1.940745,-1.201531,1.445564,-2.173858,0.024075,-0.230734,-1.434249,0.977634,-0.206405,-0.40158,...,0.340618,0.257005,0.023941,-0.309287,-0.039663,0.504372,-0.188596,0.163107,-0.140378,0
142093,1.246849,-0.383907,0.898373,-0.721547,-1.251039,-0.833794,-0.60566,-0.005309,1.895362,-1.065548,...,-0.156736,-0.018268,0.173542,-0.030464,0.390655,0.484424,-0.68463,0.090463,0.034199,0
69349,-17.040644,9.262318,-15.901569,2.51814,-10.254723,-3.503558,-7.54849,11.137281,0.162848,0.48687,...,0.341773,0.075578,-1.542569,-0.660104,0.449694,0.94478,-0.382601,-0.414142,-0.331443,0
164392,1.285105,-2.55994,-2.809825,-2.599266,-0.552376,-0.74677,0.541567,-0.455993,0.419278,-0.459698,...,0.464667,0.089803,-0.16985,-0.424827,0.257248,0.225834,-0.774681,-0.062258,0.022316,0


In [209]:
new_df.shape

(8545, 29)

In [210]:
new_df.Class.value_counts()

0    8530
1      15
Name: Class, dtype: int64

In [211]:
labels = new_df.Class

In [212]:
cols = list((df.columns.values))

In [213]:
cols

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Class']

In [214]:
# Take the float values of df for X
X = new_df[cols].values.astype(np.float)

In [215]:
X.shape

(8545, 29)

### Preprocessing: MinMaxScaler

In [216]:
# Define the scaler and apply to the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

### DBSCAN

In [217]:
# Initialize and fit the DBscan model
db = DBSCAN(eps=0.9, min_samples=10, n_jobs=-1).fit(X_scaled)

# Obtain the predicted labels and calculate number of clusters
pred_labels = db.labels_
n_clusters = len(set(pred_labels)) - (1 if -1 in labels else 0)

In [218]:
# Print performance metrics for DBscan
print(f'Estimated number of clusters: {n_clusters}')
print(f'Homogeneity: {homogeneity_score(labels, pred_labels):0.3f}')
print(f'Silhouette Coefficient: {silhouette_score(X_scaled, pred_labels):0.3f}')

Estimated number of clusters: 3
Homogeneity: 0.975
Silhouette Coefficient: 0.650


### Assessing smallest clusters

In [219]:
# Count observations in each cluster number
counts = np.bincount(pred_labels[pred_labels >= 0])

# Print the result
print(counts)

[8529   10]


In [220]:
# Sort the sample counts of the clusters and take the top 3 smallest clusters
smallest_clusters = np.argsort(counts)[:3]

In [221]:
# Print the results 
print(f'The smallest clusters are clusters: {smallest_clusters}')

The smallest clusters are clusters: [1 0]


In [222]:
# Print the counts of the smallest clusters only
print(f'Their counts are: {counts[smallest_clusters]}')

Their counts are: [  10 8529]


### Results verification

In [223]:
# Create a dataframe of the predicted cluster numbers and fraud labels 
df = pd.DataFrame({'clusternr':pred_labels,'fraud':labels})

# Create a condition flagging fraud for the smallest clusters 
df['predicted_fraud'] = np.where((df['clusternr'].isin([21, 17, 9])), 1 , 0)

In [224]:
# Run a crosstab on the results 
print(pd.crosstab(df['fraud'], df['predicted_fraud'], rownames=['Actual Fraud'], colnames=['Flagged Fraud']))

Flagged Fraud     0
Actual Fraud       
0              8530
1                15
