# CP3403 Data Mining
## Report: Credit Card Fraud

### Group: Matthew Marsh, Dannielle Jones and Callum Gracie

This data mining project explores whether there is a relationship between the geographical location of the merchant and where credit card fraud occurs.

# Import Packages and Get Data

In [None]:
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
#from datetime import datetime, date
from sklearn.cluster import DBSCAN

In [None]:
# Load the fraud training set from the local data folder into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/fraudTrain.csv')
# Optional display tweak, left disabled: render floats in fixed-point notation.
#pd.set_option('display.float_format', lambda x:'%f'%x)  # Format

In [None]:
# Report the dataset dimensions, then preview the first rows.
print('Dataset rows: {} columns: {}'.format(*data.shape))
data.head()

# Pre-Processing: NaN Data and Missing Data

In [None]:
# Restate the dataset size, then count the missing (NaN) entries per column.
print('Dataset rows: {} columns: {}'.format(len(data), len(data.columns)))
missing_values_count = data.isnull().sum()
print(missing_values_count)

# Pre-Processing: Convert and Format Data

In [None]:
# Show the first (unnamed) column, then name it "column_id" and use it as
# the row index.
print(data.iloc[:, 0])
data = data.rename(columns={data.columns[0]: "column_id"}).set_index('column_id')
data.head()

In [None]:
# Convert to numeric
# Each column is coerced from its own values; entries that cannot be parsed
# become NaN (errors='coerce') instead of raising.
# merch_lat and merch_long are converted independently, so the merchant
# latitude is not overwritten with longitude values.
for numeric_column in ('amt', 'zip', 'lat', 'long', 'city_pop',
                       'merch_lat', 'merch_long', 'is_fraud'):
    data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')

In [None]:
# Parse the date of birth into datetimes, then split it into separate
# year_of_birth / month_of_birth / day_of_birth columns.
data['dob'] = pd.to_datetime(data['dob'])
for dob_part in ('year', 'month', 'day'):
    data[dob_part + '_of_birth'] = getattr(data['dob'].dt, dob_part)
data.head()

In [None]:
# Parse the transaction timestamp into datetimes, then derive one column per
# component (year, month, day and time of day).
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
for trans_column, dt_field in (
    ('year_of_trans', 'year'),
    ('month_of_trans', 'month'),
    ('day_of_trans', 'day'),
    ('time_of_trans', 'time'),
):
    data[trans_column] = getattr(data['trans_date_trans_time'].dt, dt_field)
data.head()

In [None]:
# Processing Gender into binary
# is_female is 1 where gender equals "F" (case-insensitive) and 0 otherwise.
gender_count = data['gender'].value_counts()
# Vectorised comparison: unlike a per-row lambda calling x.upper(), this does
# not raise on missing (NaN) gender values; they compare unequal and become 0.
data['is_female'] = data['gender'].str.upper().eq("F").astype('int64')
is_female_count = data['is_female'].value_counts()
print("Gender Count: \n{}".format(gender_count))
print("is_female Count: \n{}".format(is_female_count))
data.head()

# Pre-Processing: Create Data Sub-Set

In [None]:
# Build the working subset: merchant coordinates plus the fraud label.
sub1 = data.loc[:, ['merch_lat', 'merch_long', 'is_fraud']]
print('Subset rows: {} columns: {}'.format(len(sub1), len(sub1.columns)))
sub1.head()

In [None]:
# Keep only the rows flagged as fraud and report how many there are.
is_fraud_count = sub1.loc[sub1['is_fraud'].eq(1)]
print('Fraud count: {}'.format(is_fraud_count.shape[0]))
is_fraud_count.head()

In [None]:
# Take an independent copy of sub1 to work on.
sub2 = sub1.copy(deep=True)

# Data Mining Technique/Method:
## Visualisation: Pre-Processing

In [None]:
# Preview the first five rows of the current working subset.
sub2.head(n=5)

In [None]:
# Narrow the working subset to the two merchant coordinate columns.
sub2 = data.loc[:, ['merch_lat', 'merch_long']]
sub2.head()

In [None]:
# Take an independent copy of the coordinate-only subset.
sub3 = sub2.copy(deep=True)

## Visualisation: Plots/Graphs

In [None]:
# Visualise spatial data using scatter plot
%matplotlib inline

plt.title("Merchant Longitude and Latitude Scatter Plot")
plt.xlabel("Latitude")
plt.ylabel("Longitude")
plt.scatter(sub3['merch_lat'], sub3['merch_long'])
plt.show()

### Conclusions:
Overall, there appear to be three clusters and a general linear relationship. The sample size is too large to determine any specific dot patterns.

In [None]:
# Get random sub-sample
# Two reproducible samples of sub3 are drawn: one by fraction of rows and one
# by a fixed row count. Both are pinned with random_state=42.
np.random.seed(42)

sub_fraction = 0.2  # share of rows kept in the fraction-based sample
random_fraction_sub = sub3.sample(frac=sub_fraction, random_state=42)

sub_size = 300  # number of rows kept in the fixed-size sample
random_size_sub = sub3.sample(n=sub_size, random_state=42)

print(f"Current size of data: {len(sub3)} \n")
# Each summary prints the sample it describes (the fraction summary shows
# random_fraction_sub, not the fixed-size sample).
print(f"Random fraction sub-sample: \n{random_fraction_sub} Records Count: {len(random_fraction_sub)} \n")
print(f"Random size sub-sample: \n{random_size_sub} Records Count: {len(random_size_sub)}")

In [None]:
# First figure: the fraction-based random sample of merchant locations.
plt.scatter(x=random_fraction_sub['merch_lat'], y=random_fraction_sub['merch_long'])
plt.xlabel("Latitude")
plt.ylabel("Longitude")
plt.title(f"Merchant Longitude and Latitude Scatter Plot Random Sample {sub_fraction * 100}%")
plt.show()

# Second figure: the fixed-size random sample of merchant locations.
plt.scatter(x=random_size_sub['merch_lat'], y=random_size_sub['merch_long'])
plt.xlabel("Latitude")
plt.ylabel("Longitude")
plt.title(f"Merchant Longitude and Latitude Scatter Plot Random Sample Size: {sub_size}")
plt.show()

### DBScan clustering algorithm

In [None]:
# Cluster the merchant coordinates with DBSCAN and colour each point by its
# cluster label.
# NOTE(review): eps is a distance in the units of the input columns (raw
# lat/long values here), so 15.5 is a very wide neighbourhood — confirm this
# is intended.
# The model is fit on sub3, the same frame the labels are joined to below.
dbscan_data = DBSCAN(eps=15.5, min_samples=5).fit(sub3)
# Boolean mask with True at the positions DBSCAN reported as core samples.
core_samples_mask = np.zeros_like(dbscan_data.labels_, dtype=bool)
core_samples_mask[dbscan_data.core_sample_indices_] = True
# Give the labels sub3's index so the column-wise concat pairs every label
# with its own row rather than relying on sub3 having a 0..n-1 index.
labels = pd.DataFrame(dbscan_data.labels_, columns=['Cluster ID'], index=sub3.index)
result = pd.concat((sub3, labels), axis=1)
result.plot.scatter('merch_lat', 'merch_long', c='Cluster ID', colormap='jet')

## Visualisation: Results and Data