In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
import collections

# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import NearMiss
# from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
# import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

import warnings
import random
warnings.filterwarnings("ignore")

random.seed(42)

In [2]:
data = pd.read_csv(".\data\creditcard.csv")

In [3]:
df = data.copy()

In [5]:
#check for any null values
data.isnull().sum().max()

In [6]:
# The classes are heavily skewed we need to solve this issue later.
print('No Frauds', round(data['Class'].value_counts()[0]/len(data) * 100,2), '% of the dataset = ',data['Class'].value_counts()[0],'records')
print('Frauds', round(data['Class'].value_counts()[1]/len(data) * 100,2), '% of the dataset = ',data['Class'].value_counts()[1],'records')

In [7]:
colors = ["#0101DF", "#DF0101"]

sns.countplot('Class', data=data, palette=colors)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)

In [8]:
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12,4))

bins = 50

ax1.hist(df.Time[df.Class == 1], bins = bins)
ax1.set_title('Fraud')

ax2.hist(df.Time[df.Class == 0], bins = bins)
ax2.set_title('Normal')

plt.xlabel('Time (in Seconds)')
plt.ylabel('Number of Transactions')
plt.show()

In [10]:
seconds_in_day = 24*60*60

df['sin_time'] = np.sin(2*np.pi*df.Time/seconds_in_day)
df['cos_time'] = np.cos(2*np.pi*df.Time/seconds_in_day)

df.drop('Time', axis=1, inplace=True)

In [11]:
# All the features of this dataset, except for time and amount, have already undergone PCA transformation which mean that 
#they have already been scaled. In this step we will scale the 'Amount' feature.
# Since most of our data has already been scaled we should scale the columns that are left to scale (Amount and Time)
from sklearn.preprocessing import StandardScaler, RobustScaler, normalize, MinMaxScaler

# RobustScaler is less prone to outliers.

# std_scaler = StandardScaler()
# rob_scaler = RobustScaler()
# # norm = normalize()
mm_scaler = MinMaxScaler(feature_range=(0,1))

df['scaled_amount'] = mm_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df.drop('Amount', axis=1, inplace=True)
# #df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

# # df.drop(['Time','Amount'], axis=1, inplace=True)

In [12]:
# Since our classes are highly skewed we should make them equivalent in order to have a normal distribution of the classes.

# Lets shuffle the data before creating the subsamples
X_orig = df.drop('Class', axis=1)
y_orig = df['Class']

df = df.sample(frac=1)

# amount of fraud classes 492 rows.
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0][:492]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)

df.head()

X = new_df.drop('Class', axis=1)
y = new_df['Class']

In [13]:
# Modeling the data as is using kmeans clustering

kmeans = KMeans(n_clusters=2)
kmeans.fit(X_orig)
labels = kmeans.predict(X_orig)

kmeans_RI = adjusted_rand_score(y_orig, labels)#calculate_rand_index(y, labels)#rand_index_score(y, labels)
print("KMeans RI score :",kmeans_RI)
f1 = f1_score(y_orig, labels)
print("f1 score :", f1)
precision = precision_score(y_orig, labels)
print("precision :",precision)
recall = recall_score(y_orig, labels)
print("recall :",recall)
average_precision = average_precision_score(y_orig, labels)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_orig, labels)
print("Accuracy :",accuracy)


KMeans RI score : 0.003082901128496951
f1 score : 0.006294806784402868
precision : 0.0034436579299789555
recall : 0.036585365853658534
Average Precision-Recall Score : 0.0017902724220599945
Accuracy : 0.9800461365064763


In [14]:
# # Modeling the data as is using AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=2, linkage="complete").fit_predict(X)

kmeans_RI = adjusted_rand_score(y, clustering)#calculate_rand_index(y, labels)#rand_index_score(y, labels)
print("KMeans RI score :",kmeans_RI)
f1 = f1_score(y, clustering)
print("f1 score :", f1)
precision = precision_score(y, clustering)
print("precision :",precision)
recall = recall_score(y, clustering)
print("recall :",recall)
average_precision = average_precision_score(y, clustering)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y, clustering)
print("Accuracy :",accuracy)

KMeans RI score : 0.0001413498983376267
f1 score : 0.6584531143052703
precision : 0.49638802889576883
recall : 0.9776422764227642
Average Precision-Recall Score : 0.4964687843472862
Accuracy : 0.4928861788617886


## Lets try these same clustering algorithms with over sampling the under represented(fraud) data

In [15]:
from sklearn.utils import resample

# # Separate input features and target
# y = df.Class
# X = df.drop('Class', axis=1)

X_new = df.copy()

# separate minority and majority classes
not_fraud = X_new[X_new.Class==0]
fraud = X_new[X_new.Class==1]

# upsample minority
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

upsampled.Class.value_counts()

y_upsampled = upsampled.Class
X_upsampled = upsampled.drop('Class', axis=1)

#### For AGNEST
upsampled = upsampled.sample(frac=1, random_state=42)

fraud_df = upsampled.loc[upsampled['Class'] == 1][:900]
non_fraud_df = upsampled.loc[upsampled['Class'] == 0][:900]

new_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows
new_df = new_df.sample(frac=1, random_state=42)

X_agnest = new_df.drop('Class', axis=1)
y_agnest = new_df['Class']

In [16]:
# Modeling the upsampled data as is using kmeans clustering

kmeans = KMeans(n_clusters=2)
kmeans.fit(X_upsampled)
labels = kmeans.predict(X_upsampled)

f1 = f1_score(y_upsampled, labels)
print("f1 score :", f1)
precision = precision_score(y_upsampled, labels)
print("precision :",precision)
recall = recall_score(y_upsampled, labels)
print("recall :",recall)
average_precision = average_precision_score(y_upsampled, labels)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_upsampled, labels)
print("Accuracy :",accuracy)


f1 score : 0.12803959475254872
precision : 0.6775770242027911
recall : 0.07069975203559432
Average Precision-Recall Score : 0.5125546515783561
Accuracy : 0.5185287445263176


In [17]:
# Modeling the upsampled data as is using AgglomerativeClustering
# amount of fraud classes 492 rows.


clustering = AgglomerativeClustering(n_clusters=2, linkage="complete").fit_predict(X_agnest)

kmeans_RI = adjusted_rand_score(y_agnest, clustering)#calculate_rand_index(y, labels)#rand_index_score(y, labels)
print("KMeans RI score :",kmeans_RI)
f1 = f1_score(y_agnest, clustering)
print("f1 score :", f1)
precision = precision_score(y_agnest, clustering)
print("precision :",precision)
recall = recall_score(y_agnest, clustering)
print("recall :",recall)
average_precision = average_precision_score(y_agnest, clustering)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_agnest, clustering)
print("Accuracy :",accuracy)

KMeans RI score : -9.78610199803425e-06
f1 score : 0.015350877192982457
precision : 0.5833333333333334
recall : 0.0077777777777777776
Average Precision-Recall Score : 0.5006481481481482
Accuracy : 0.5011111111111111


## Lets try under sampling the over-represented negative(non-fraud) class

In [18]:
# still using our separated classes fraud and not_fraud from above

# downsample majority
not_fraud_downsampled = resample(not_fraud,
                                replace = False, # sample without replacement
                                n_samples = len(fraud), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([not_fraud_downsampled, fraud])

# checking counts
downsampled.Class.value_counts()

y_undersampled = downsampled.Class
X_undersampled = downsampled.drop('Class', axis=1)

In [19]:
# Modeling the undersampled data as is using kmeans clustering

kmeans = KMeans(n_clusters=2)
kmeans.fit(X_undersampled)
labels = kmeans.predict(X_undersampled)

f1 = f1_score(y_undersampled, labels)
print("f1 score :", f1)
precision = precision_score(y_undersampled, labels)
print("precision :",precision)
recall = recall_score(y_undersampled, labels)
print("recall :",recall)
average_precision = average_precision_score(y_undersampled, labels)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_undersampled, labels)
print("Accuracy :",accuracy)

f1 score : 0.1347905282331512
precision : 0.6491228070175439
recall : 0.07520325203252033
Average Precision-Recall Score : 0.5112145200399373
Accuracy : 0.5172764227642277


In [20]:
# Modeling the undersampled data as is using AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=2, linkage="complete").fit_predict(X_undersampled)

f1 = f1_score(y_undersampled, clustering)
print("f1 score :", f1)
precision = precision_score(y_undersampled, clustering)
print("precision :",precision)
recall = recall_score(y_undersampled, clustering)
print("recall :",recall)
average_precision = average_precision_score(y_undersampled, clustering)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_undersampled, clustering)
print("Accuracy :",accuracy)

f1 score : 0.6602608098833219
precision : 0.49844559585492226
recall : 0.9776422764227642
Average Precision-Recall Score : 0.4984803487931252
Accuracy : 0.4969512195121951


## Lets try over-sampling the under-represented class using SMOTE(Synthetic Minority Over-sampling Technique) algorithm.

In [21]:
from imblearn.over_sampling import SMOTE

# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)

sm = SMOTE(random_state=27)#, ratio=1.0)
X_smote, y_smote = sm.fit_sample(X, y)

X_agnest = X_smote[:500]
y_agnest = y_smote[:500]

Using TensorFlow backend.


In [22]:
# Modeling the undersampled data using SMOTE and kmeans clustering

kmeans = KMeans(n_clusters=2)
kmeans.fit(X_smote)
labels = kmeans.predict(X_smote)

f1 = f1_score(y_smote, labels)
print("f1 score :", f1)
precision = precision_score(y_smote, labels)
print("precision :",precision)
recall = recall_score(y_smote, labels)
print("recall :",recall)
average_precision = average_precision_score(y_smote, labels)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_smote, labels)
print("Accuracy :",accuracy)

f1 score : 0.13228178458490494
precision : 0.6796959117752618
recall : 0.0732708439582857
Average Precision-Recall Score : 0.513166471111627
Accuracy : 0.5193711200604962


In [24]:
# Modeling the undersampled data using SMOTE and AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=2, linkage="complete").fit_predict(X_agnest)

f1 = f1_score(y_agnest, clustering)
print("f1 score :", f1)
precision = precision_score(y_agnest, clustering)
print("precision :",precision)
recall = recall_score(y_agnest, clustering)
print("recall :",recall)
average_precision = average_precision_score(y_agnest, clustering)
print("Average Precision-Recall Score :",average_precision)
accuracy = accuracy_score(y_agnest, clustering)
print("Accuracy :",accuracy)

f1 score : 0.004065040650406504
precision : 0.002036659877800407
recall : 1.0
Average Precision-Recall Score : 0.002036659877800407
Accuracy : 0.02
