# Fraud Detection with Graph Databases and Machine Learning

## Importing the required Python libraries

In [None]:
import numpy as np
import pandas as pd
from py2neo import Graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.decomposition import PCA

## Loading and exploring the BankSim dataset

In [None]:
# Load the BankSim CSV into a DataFrame and preview its first rows
dataset_path = "../dataset/bs140513_032310.csv"
banksim_df = pd.read_csv(dataset_path)
banksim_df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
banksim_df.info()

Viewing the split of the output classes - fraudulent and genuine transactions

In [None]:
# Count the transactions belonging to each value of the 'fraud' class label
banksim_df['fraud'].value_counts()

In [None]:
# Number of distinct values in each column of the BankSim dataset
banksim_df.nunique()

## Preprocessing the data 

In [None]:
# Count the null (NaN) entries in each column
banksim_df.isna().sum()

Looks like there are no null values in the dataset.

In [None]:
# Keep the class attribute (the 'fraud' column) aside as the prediction target
Y_before_smote = banksim_df['fraud']
# Preview the first few labels
Y_before_smote.head()

In [None]:
# Removing unwanted columns:
# - zipcodeOri and zipMerchant have the same value for all the rows, so they are redundant
# - fraud is the class label and must not remain among the input features
# - step and customer are dropped as well
#   NOTE(review): the notebook gives no rationale for these two - confirm
feature_df = banksim_df.drop(columns=['step', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud'])

In [None]:
# Preview the columns that remain after dropping the unwanted ones
feature_df.head()

In [None]:
# Replace every categorical column by one indicator column per category value
categorical_columns = ['age', 'gender', 'category', 'merchant']
feature_df = pd.get_dummies(feature_df, columns=categorical_columns)
feature_df.head()

In [None]:
# Standardize every feature column and keep the original column names.
# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split, so held-out rows influence the scaling statistics.
standard_scaler = StandardScaler()
scaled_values = standard_scaler.fit_transform(feature_df)
scaled_df = pd.DataFrame(scaled_values, columns=feature_df.columns)

scaled_df.head()

In [None]:
# Dimensionality reduction with PCA.
# A fractional n_components keeps as many components as are needed for the
# explained variance to reach 95%.
pca = PCA(n_components=0.95, svd_solver='full')

# fit_transform returns a NumPy array, so scaled_df is no longer a DataFrame
scaled_df = pca.fit_transform(scaled_df)

scaled_df.shape


## Training supervised learning models using intrinsic features from the dataset

In [None]:
# Cross-validation scheme: 5 stratified folds, taken without shuffling
k_fold = StratifiedKFold(n_splits=5)

# Classifiers compared on the intrinsic features
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=5000)
random_forest = RandomForestClassifier(n_estimators=150, max_depth=20)
svm_classifier = SVC(gamma="auto")

# Ground-truth class labels used by the training cells
labels = Y_before_smote

In [None]:
# Logistic Regression Classifier, evaluated with stratified k-fold cross-validation.
#
# k_fold.split yields *positional* row indices. Indexing a pandas Series with
# `labels[idx]` looks the integers up as index labels, which only matches the
# positions while the index is exactly 0..n-1. Converting the labels to a NumPy
# array once makes the lookup positional, like the one on scaled_df.
label_array = np.asarray(labels)

for train_index, test_index in k_fold.split(scaled_df, label_array):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = label_array[train_index], label_array[test_index]

    clf = logistic_regression.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # One precision/recall/F1 report per held-out fold
    print(classification_report(y_test, predictions))
    

In [None]:
# Testing the logistic regression classifier after performing oversampling on the training data using SMOTE

# Hold out 20% of the rows for testing. The split is stratified so both parts
# keep the class proportions of this imbalanced dataset, and seeded so the
# reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df, labels, test_size=0.20, random_state=42, stratify=labels
)

# Handling the imbalance in the dataset using SMOTE

# Class counts of the data SMOTE actually resamples (the training split), so
# that this line is comparable with the counts printed after resampling.
print('Original dataset shape %s' % Counter(y_train))

sm = SMOTE(random_state=1)

# Oversample the training split only; the test split is left untouched
X_after_smote, Y_after_smote = sm.fit_resample(X_train, y_train)

print('dataset shape after smote %s' % Counter(Y_after_smote))

clf = logistic_regression.fit(X_after_smote, Y_after_smote)

predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Random Forest Classifier, evaluated with stratified k-fold cross-validation.
#
# k_fold.split yields *positional* row indices. Indexing a pandas Series with
# `labels[idx]` looks the integers up as index labels, which only matches the
# positions while the index is exactly 0..n-1. Converting the labels to a NumPy
# array once makes the lookup positional, like the one on scaled_df.
label_array = np.asarray(labels)

for train_index, test_index in k_fold.split(scaled_df, label_array):

    X_train, X_test = scaled_df[train_index], scaled_df[test_index]
    y_train, y_test = label_array[train_index], label_array[test_index]

    clf = random_forest.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # One precision/recall/F1 report per held-out fold
    print(classification_report(y_test, predictions))
    

In [None]:

# Testing the random forest classifier after performing oversampling on the training data using SMOTE

# Hold out 20% of the rows for testing. The split is stratified so both parts
# keep the class proportions of this imbalanced dataset, and seeded so the
# reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df, labels, test_size=0.20, random_state=42, stratify=labels
)

# Handling the imbalance in the dataset using SMOTE

# Class counts of the data SMOTE actually resamples (the training split), so
# that this line is comparable with the counts printed after resampling.
print('Original dataset shape %s' % Counter(y_train))

sm = SMOTE(random_state=1)

# Oversample the training split only; the test split is left untouched
X_after_smote, Y_after_smote = sm.fit_resample(X_train, y_train)

print('dataset shape after smote %s' % Counter(Y_after_smote))

clf = random_forest.fit(X_after_smote, Y_after_smote)

predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# SVM Classifier, evaluated on a single 80/20 hold-out split.
# The split is stratified so the test part keeps the class proportions of
# this imbalanced dataset, and seeded so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df, labels, test_size=0.20, random_state=42, stratify=labels
)

clf = svm_classifier.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

## Extracting network features

In [None]:
# Connecting to the Neo4j database.
# The password is read from the NEO4J_PASSWORD environment variable so real
# credentials do not have to be stored in the notebook; the previously
# hard-coded value is kept as the fallback for existing local setups.
import os

graph = Graph(password=os.environ.get("NEO4J_PASSWORD", "password"))

# Query to fetch the network features from Neo4j
query = """
MATCH (p:Placeholder)
RETURN p.id AS id, p.degree AS degree, p.pagerank as pagerank, p.community AS community 
"""

data = graph.run(query)

# Map each node id to its network features for direct lookup by id
records = {
    record['id']: {
        'degree': record['degree'],
        'pagerank': record['pagerank'],
        'community': record['community'],
    }
    for record in data
}


In [None]:
# Read the BankSim CSV again into banksim_df before adding the network features
banksim_df = pd.read_csv("../dataset/bs140513_032310.csv")


In [None]:
def _network_features(record):
    """Return the `records` entry for the id found between the first two single quotes of *record*."""
    node_id = record.split("'")[1]
    return records[node_id]


def load_degree(record):
    """Return the 'degree' stored for the node identified by *record*."""
    return _network_features(record)['degree']


def load_community(record):
    """Return the 'community' stored for the node identified by *record*, as a string."""
    return str(_network_features(record)['community'])


def load_pagerank(record):
    """Return the 'pagerank' stored for the node identified by *record*."""
    return _network_features(record)['pagerank']

In [None]:
# Add one network-feature column per (entity, feature) pair, e.g. merchant_degree.
# The nesting order reproduces the column order merchant/customer degree,
# merchant/customer pagerank, merchant/customer community.
network_loaders = (
    ('degree', load_degree),
    ('pagerank', load_pagerank),
    ('community', load_community),
)

for feature_name, loader in network_loaders:
    for entity in ('merchant', 'customer'):
        banksim_df[entity + '_' + feature_name] = banksim_df[entity].apply(loader)

In [None]:
# Preview the transactions together with the newly added network feature columns
banksim_df.head()

In [None]:
# Print a concise summary of the DataFrame, now including the network feature columns
banksim_df.info()

In [None]:
# Class labels for the second round of experiments
labels = banksim_df['fraud']

# Dropping the unnecessary columns (the label itself is among them)
unused_columns = ['step', 'age', 'gender', 'customer', 'zipcodeOri', 'zipMerchant', 'fraud']
feature_df = banksim_df.drop(columns=unused_columns)

# One hot encoding the categorical variables, including both community columns
categorical_columns = ['category', 'merchant', 'merchant_community', 'customer_community']
feature_df = pd.get_dummies(feature_df, columns=categorical_columns)


In [None]:
# Standardize every feature column and keep the original column names.
# NOTE(review): the scaler is fitted on the full dataset, before any
# train/test split, so held-out rows influence the scaling statistics.
standard_scaler = StandardScaler()
scaled_values = standard_scaler.fit_transform(feature_df)
scaled_df = pd.DataFrame(scaled_values, columns=feature_df.columns)

scaled_df.head()

In [None]:
# Switch features and labels to plain NumPy arrays so the training cells can
# index them directly with the positional indices produced by the fold splitter
scaled_df, labels = scaled_df.values, labels.values

## Training supervised learning models using intrinsic features as well as graph-based features

In [None]:
# Cross-validation scheme: 5 stratified folds, taken without shuffling
k_fold = StratifiedKFold(n_splits=5)

# Fresh classifier instances for the experiments that include graph features
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=5000)
svm_classifier = SVC(gamma="auto")
random_forest = RandomForestClassifier(n_estimators=150, max_depth=20)


In [None]:
# Logistic Regression Classifier, evaluated with stratified k-fold cross-validation

fold_splits = k_fold.split(scaled_df, labels)

for train_index, test_index in fold_splits:

    # Rows used for fitting in this fold
    X_train = scaled_df[train_index]
    y_train = labels[train_index]

    # Rows held out for evaluation in this fold
    X_test = scaled_df[test_index]
    y_test = labels[test_index]

    clf = logistic_regression.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    fold_report = classification_report(y_test, predictions)
    print(fold_report)

In [None]:
# Random Forest Classifier, evaluated on a single 80/20 hold-out split.
# The split is stratified so the test part keeps the class proportions of
# this imbalanced dataset, and seeded so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df, labels, test_size=0.20, random_state=42, stratify=labels
)

clf = random_forest.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

In [None]:
# SVM Classifier, evaluated on a single 80/20 hold-out split.
# The split is stratified so the test part keeps the class proportions of
# this imbalanced dataset, and seeded so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df, labels, test_size=0.20, random_state=42, stratify=labels
)

clf = svm_classifier.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))