# CP3403 Data Mining
## Report: Credit Card Fraud

### Group: Matthew Marsh, Dannielle Jones and Callum Gracie

This data mining report explores whether there is a relationship between the geographical location of the merchant and where credit card fraud occurs.
Data selection: Merchant longitude and latitude normalised

# Import Packages and Get Data

In [None]:
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
#from datetime import datetime, date
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the fraudTrain CSV; the path is resolved relative to the current working directory.
DATA_PATH = '../data/fraudTrain.csv'
data = pd.read_csv(DATA_PATH)
# Optional: display floats in fixed-point notation.
#pd.set_option('display.float_format', lambda x:'%f'%x)  # Format

In [None]:
# Report the dataset dimensions (rows, columns), then preview the first rows.
print('Dataset rows: {} columns: {}'.format(*data.shape))
data.head()

# Pre-Processing: NaN Data and Missing Data

In [None]:
# Count the missing (NaN) entries in each column of the dataset.
print('Dataset rows: {} columns: {}'.format(*data.shape))
missing_values_count = data.isnull().sum()  # isnull() is an alias of isna()
print(missing_values_count)

# Pre-Processing: Convert and Format Data

In [None]:
# Show the first (unnamed) column, then name it "column_id" and use it as the index.
print(data.iloc[:, 0])
data = data.rename(columns={data.columns[0]: "column_id"}).set_index('column_id')
data.head()

In [None]:
# Convert to numeric; values that cannot be parsed become NaN (errors='coerce').
# Each column is converted from itself: previously 'merch_lat' was overwritten
# with the values of 'merch_long', and 'merch_long' was never converted.
numeric_columns = [
    'amt',
    'zip',
    'lat',
    'long',
    'city_pop',
    'merch_lat',
    'merch_long',
    'is_fraud',
]
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

In [None]:
# Parse the date of birth once, then split it into year / month / day columns.
dob = pd.to_datetime(data['dob'])
data['dob'] = dob
data['year_of_birth'] = dob.dt.year
data['month_of_birth'] = dob.dt.month
data['day_of_birth'] = dob.dt.day
data.head()

In [None]:
# Parse the transaction timestamp once, then split it into date parts and time of day.
trans_time = pd.to_datetime(data['trans_date_trans_time'])
data['trans_date_trans_time'] = trans_time
data['year_of_trans'] = trans_time.dt.year
data['month_of_trans'] = trans_time.dt.month
data['day_of_trans'] = trans_time.dt.day
data['time_of_trans'] = trans_time.dt.time
data.head()

In [None]:
# Encode gender as a binary flag: 1 when the value is "F" (case-insensitive), otherwise 0.
gender_count = data['gender'].value_counts()
print("Gender Count: \n{}".format(gender_count))

data['is_female'] = data['gender'].apply(lambda g: int(g.upper() == "F"))
is_female_count = data['is_female'].value_counts()
print("is_female Count: \n{}".format(is_female_count))

data.head()

# Pre-Processing: Create Data Sub-Set

In [None]:
# Keep only the merchant coordinates together with the fraud flag.
sub1 = data.loc[:, ['merch_lat', 'merch_long', 'is_fraud']]
print('Subset rows: {} columns: {}'.format(*sub1.shape))
sub1.head()

In [None]:
# Select the rows flagged as fraud and report how many there are.
fraud_mask = sub1['is_fraud'].eq(1)
is_fraud_count = sub1[fraud_mask]
print('Fraud count: {}'.format(is_fraud_count.shape[0]))
is_fraud_count.head()

In [None]:
# Work on an independent (deep) copy so later edits do not touch sub1.
sub2 = sub1.copy(deep=True)

# Data Mining Technique/Method:
## Visualisation: Pre-Processing

In [None]:
# Preview the first five rows of the current data subset.
sub2.head(n=5)

In [None]:
# Select only the merchant latitude and longitude columns.
# NOTE: the values are used as-is; no normalisation/scaling is applied in this cell.
sub2 = data.loc[:, ['merch_lat', 'merch_long']]
sub2.head()

In [None]:
# Independent (deep) copy of the coordinate subset for the visualisation steps.
sub3 = sub2.copy(deep=True)

## Visualisation: Plots/Graphs
### Hierarchical Clustering for Merchant Longitude and Latitude - All cases

In [None]:
%matplotlib inline

# Pre-processing for visualisation
np.random.seed(42)

sub_fraction = 0.01
random_fraction_sub = sub3.sample(frac=sub_fraction, random_state=42)

print(f"Current size of data: {len(sub3)} \n")
print(f"Random fraction sub-sample: \n{random_fraction_sub} Records Count: {len(random_fraction_sub)}")

In [None]:
# Visualise spatial data using scatter plot as locations
# (x = first column, y = second column of the random sample).
# The figure is created first so the title is attached to the axes that hold
# the scatter; calling plt.title() before plt.figure() puts the title on a
# separate, empty figure.
plt.figure(figsize=(10, 7))
plt.subplots_adjust(bottom=0.1)
plt.title("Scatter Plot for All Cases")
plt.scatter(random_fraction_sub.iloc[:, 0], random_fraction_sub.iloc[:, 1])
plt.show()

In [None]:
# Spatial data using Agglomerate Clustering
cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')
cluster.fit_predict(sub3)
print(cluster.labels_)

In [None]:
# Scatter Plot the clusters
plt.title("Hierarchical Clustering Scatter Plot All Cases")
plt.scatter(sub3.iloc[:,0], sub3.iloc[:,1], c=cluster.labels_, cmap='rainbow')

#### Conclusions:
Hierarchical clustering produces a clustering result closer to what was expected. The same technique will now be applied separately to fraud and non-fraud cases to see whether there are any differences.


### Hierarchical Clustering for Merchant Longitude and Latitude - Fraud versus Non-Fraud Cases

In [None]:
# Preview the first five rows of the subset that still carries the fraud flag.
sub1.head(n=5)

In [None]:
# Draw a reproducible 1% random sample of the coordinates plus fraud flag.
sub_fraction = 0.01
np.random.seed(42)  # seeds NumPy's global RNG; sample() itself is seeded via random_state
random_fraction_sub = sub1.sample(random_state=42, frac=sub_fraction)

print(f"Current size of data: {len(sub1)} \n")
print(f"Random fraction sub-sample: \n{random_fraction_sub} Records Count: {len(random_fraction_sub)}")

In [None]:
# Keep only the sampled rows that are flagged as fraud.
sampled_fraud_mask = random_fraction_sub['is_fraud'].eq(1)
is_fraud_cases = random_fraction_sub[sampled_fraud_mask]
print('Fraud count: {}'.format(is_fraud_cases.shape[0]))
is_fraud_cases.head()

In [None]:
# Keep only the coordinate columns of the fraud cases.
# The selection must be applied to the DataFrame; assigning the bare list
# [['merch_lat', 'merch_long']] replaces the data with a plain Python list,
# which has no .head() and cannot be plotted or clustered.
is_fraud_cases = is_fraud_cases[['merch_lat', 'merch_long']]
is_fraud_cases.head()

In [None]:
# Get non-fraud cases: every sampled row whose is_fraud flag is not 1.
is_not_fraud_cases = random_fraction_sub[random_fraction_sub['is_fraud'] != 1]
# Label the count correctly; it was previously reported as 'Fraud count'.
print('Non-fraud count: {}'.format(len(is_not_fraud_cases)))
is_not_fraud_cases.head()

In [None]:
# Keep only the coordinate columns of the non-fraud cases.
# The selection must be applied to the DataFrame; assigning the bare list
# [['merch_lat', 'merch_long']] replaces the data with a plain Python list,
# which has no .head() and cannot be plotted or clustered.
is_not_fraud_cases = is_not_fraud_cases[['merch_lat', 'merch_long']]
is_not_fraud_cases.head()

In [None]:
# Visualise fraud cases (x = first column, y = second column).
# The figure is created first so the title is attached to the axes that hold
# the scatter; calling plt.title() before plt.figure() puts the title on a
# separate, empty figure.
plt.figure(figsize=(10, 7))
plt.subplots_adjust(bottom=0.1)
plt.title("Scatter Plot Fraud Cases")
plt.scatter(is_fraud_cases.iloc[:, 0], is_fraud_cases.iloc[:, 1])
plt.show()

In [None]:
# Visualise non-fraud cases (x = first column, y = second column).
# The figure is created first so the title is attached to the axes that hold
# the scatter; calling plt.title() before plt.figure() puts the title on a
# separate, empty figure.
plt.figure(figsize=(10, 7))
plt.subplots_adjust(bottom=0.1)
plt.title("Scatter Plot Non-Fraud Cases")
plt.scatter(is_not_fraud_cases.iloc[:, 0], is_not_fraud_cases.iloc[:, 1])
plt.show()

In [None]:
# Agglomerative clustering of the fraud cases: 3 clusters, single linkage, Euclidean distance.
cluster = AgglomerativeClustering(
    linkage='single',
    metric='euclidean',
    n_clusters=3,
)
cluster.fit_predict(is_fraud_cases)
print(cluster.labels_)

In [None]:
# Agglomerative clustering of the non-fraud cases: 3 clusters, single linkage, Euclidean distance.
cluster = AgglomerativeClustering(
    linkage='single',
    metric='euclidean',
    n_clusters=3,
)
cluster.fit_predict(is_not_fraud_cases)
print(cluster.labels_)

In [None]:
# Plot the clusters for the fraud cases.
# `cluster` is re-fitted on the non-fraud cases in the preceding cell, so its
# labels_ no longer correspond to is_fraud_cases (different rows and length).
# Fit a dedicated model here so every plotted point gets its own label, and
# leave `cluster` untouched for the non-fraud plot.
fraud_cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')
fraud_labels = fraud_cluster.fit_predict(is_fraud_cases)
plt.title("Hierarchical clustering Fraud Cases")
plt.scatter(is_fraud_cases.iloc[:, 0], is_fraud_cases.iloc[:, 1], c=fraud_labels, cmap='rainbow')

In [None]:
# Plot the non-fraud cases coloured by the labels of the most recent `cluster` fit.
non_fraud_x = is_not_fraud_cases.iloc[:, 0]
non_fraud_y = is_not_fraud_cases.iloc[:, 1]
plt.title("Hierarchical clustering Non-Fraud Cases")
plt.scatter(non_fraud_x, non_fraud_y, c=cluster.labels_, cmap='rainbow')

## Discussion: Results and Data
