# CP3403 Data Mining
## Report: Credit Card Fraud

### Group: Matthew Marsh, Dannielle Jones and Callum Gracie

This data mining report explores a time series to see whether there is a relationship between city population and fraud cases over time.

# Import Packages and Get Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#from datetime import datetime, date
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, pacf
from sklearn.metrics import mean_squared_error
from pmdarima.arima import auto_arima
from math import sqrt

In [None]:
# Load the training transactions; the path is relative to the notebook's working directory
data = pd.read_csv('../data/fraudTrain.csv')  # Read
#pd.set_option('display.float_format', lambda x:'%f'%x)  # Format

In [None]:
# Report the size of the loaded dataset, then preview its first rows
row_count, column_count = data.shape
print('Dataset rows: {} columns: {}'.format(row_count, column_count))
data.head()

# Pre-Processing: NaN Data and Missing Data

In [None]:
# Check dataset for missing or NaN values: print the shape, then the NaN count of every column
total_rows, total_columns = data.shape
print('Dataset rows: {} columns: {}'.format(total_rows, total_columns))
missing_values_count = data.isna().sum()
print(missing_values_count)

# Pre-Processing: Convert and Format Data

In [None]:
# Show the first (unnamed) column, label it "column_id" and promote it to the index
print(data.iloc[:, 0])
first_column_label = data.columns[0]
data = data.rename(columns={first_column_label: "column_id"}).set_index('column_id')
data.head()

In [None]:
# Convert to numeric. Values that cannot be parsed become NaN (errors='coerce').
# Each column is converted from itself: 'merch_lat' is no longer overwritten with
# the 'merch_long' values, and 'merch_long' is now converted as well.
numeric_columns = ['amt', 'zip', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'is_fraud']
for numeric_column in numeric_columns:
    data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')

In [None]:
# Processing date of birth: parse to datetime, then split into year / month / day columns
data['dob'] = pd.to_datetime(data['dob'])
birth_parts = data['dob'].dt
data['year_of_birth'] = birth_parts.year
data['month_of_birth'] = birth_parts.month
data['day_of_birth'] = birth_parts.day
data.head()

In [None]:
# Processing transaction date and time: parse the timestamp, then derive its components
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
trans_parts = data['trans_date_trans_time'].dt
data['trans_year'] = trans_parts.year
data['trans_month'] = trans_parts.month
data['trans_day'] = trans_parts.day
data['trans_time'] = trans_parts.time
data.head()

In [None]:
# Processing Gender into binary: 1 when the value is "F" (any case), 0 otherwise
gender_count = data['gender'].value_counts()
data['is_female'] = data['gender'].apply(lambda gender: int(gender.upper() == "F"))
is_female_count = data['is_female'].value_counts()

# Print both tallies so the encoding can be checked against the raw labels
print("Gender Count: \n{}".format(gender_count))
print("is_female Count: \n{}".format(is_female_count))
data.head()

In [None]:
# Work on a copy so later filtering does not modify the pre-processed frame
sub1 = data.copy()

# Pre-Processing: Create Data Sub-Set

In [None]:
# Reduce sample size for computation: draw a seeded random sub-sample of the rows
np.random.seed(42)
sub_fraction = 0.01

random_fraction_sub = sub1.sample(frac=sub_fraction, random_state=42)
print(f"Current size of data: {len(sub1)} \n")
print(f"Records Count: {len(random_fraction_sub)}")

# From here on sub1 refers to the sub-sample only
sub1 = random_fraction_sub
subset_rows, subset_columns = sub1.shape
print('Subset rows: {} columns: {}'.format(subset_rows, subset_columns))
sub1.head()

In [None]:
# Check how many cases are fraud: split the sub-sample with a single boolean mask
fraud_mask = sub1['is_fraud'] == 1
is_fraud_count = sub1[fraud_mask]
print('Fraud count: {}'.format(len(is_fraud_count)))
is_not_fraud_count = sub1[~fraud_mask]
print('Non-Fraud count: {}'.format(len(is_not_fraud_count)))
is_fraud_count.head()

In [None]:
# Keep only the fraudulent transactions and continue on an independent copy
fraud_only = sub1['is_fraud'].eq(1)
sub1 = sub1[fraud_only]
sub2 = sub1.copy()

# Data Mining Technique/Method: Time Series
## Visualisation: Pre-Processing

In [None]:
# Derive a date-only column from the transaction timestamp (time of day dropped),
# stored as datetime so it can serve as a time index
transaction_dates = sub2['trans_date_trans_time'].dt.date
sub2['trans_date'] = pd.to_datetime(transaction_dates)
sub2.head(2)

In [None]:
# Use the transaction date as the row index
sub2 = sub2.set_index('trans_date')
sub2.head(2)

In [None]:
# Keep only the columns used by the time-series analysis, ordered by transaction date
relevant_columns = ['city_pop', 'is_fraud', 'trans_year', 'trans_month']
sub2 = sub2[relevant_columns].copy().sort_values(by='trans_date')

In [None]:
# First rows of the date-sorted frame
sub2.head(2)  # View start date

In [None]:
# Last rows of the date-sorted frame
sub2.tail(2)  # View end date

In [None]:
# Slice specific time frame: bounds of the date range and the single year analysed below
date_range_from, date_range_to = '2019-06-01', '2020-06-01'
select_year = '2019'

# All dates: an independent copy of the full fraud series
transaction_all_data = sub2.copy()
# transaction_all_data.reset_index(inplace=True)
# transaction_all_data.set_index('trans_year', inplace=True)
transaction_all_data.head()

In [None]:
# Range of date: label-based slice on the date index between the two configured bounds
transaction_range_data = sub2.loc[date_range_from:date_range_to]
# transaction_range_data.reset_index(inplace=True)
# transaction_range_data.set_index('trans_year', inplace=True)
transaction_range_data.tail()

In [None]:
# Partial-string indexing on the date index: a bare year label selects the rows dated in that year
transaction_year_data = sub2.loc[select_year]  # Specific year
# transaction_year_data.reset_index(inplace=True)
# transaction_year_data.set_index('trans_month', inplace=True)
transaction_year_data.head()

## Visualisation: Plots/Graphs

In [None]:
%matplotlib inline

# Plot of transaction data - all
# Line plot of 'city_pop' against the date index for every fraud transaction
plt.title("Time Series Plot Transactions to City Population")
plt.plot(transaction_all_data['city_pop'])

In [None]:
# Distribution of city population per transaction year, all dates
plt.title("Box Plot Transactions to City Population")
ax = sns.boxplot(data=transaction_all_data, x='trans_year', y='city_pop')

In [None]:
# Plot of transaction data - date range
# Draw the date-range subset so the figure matches its title
# (the single-year subset was plotted here by mistake).
plt.title(f"Time Series Plot Transactions to City Population Between {date_range_from} to {date_range_to}")
plt.plot(transaction_range_data)

In [None]:
# Distribution of city population per transaction year, date-range subset
plt.title("Box Plot Transactions to City Population")
ax3 = sns.boxplot(data=transaction_range_data, x='trans_year', y='city_pop')

In [None]:
# Plot of transaction data - year
# NOTE(review): the whole frame is passed to plt.plot, so every column is drawn, not only 'city_pop'
plt.title(f"Time Series Plot Transactions to City Population in Year {select_year}")
plt.plot(transaction_year_data)

In [None]:
# Distribution of city population per transaction month, single-year subset
plt.title("Box Plot Transactions to City Population")
ax2 = sns.boxplot(data=transaction_year_data, x='trans_month', y='city_pop')

### Stationary Check

In [None]:
# Perform Stationary Test
def test_stationary(timeseries, number_of_months):
    """
    Plot a series together with its rolling mean and rolling standard deviation
    :param timeseries: series to inspect
    :param number_of_months: rolling window size, passed unchanged to rolling(window=...)
    :return: None, the figure is shown as a side effect
    """
    # Determine rolling statistics from one shared rolling window
    rolling_window = timeseries.rolling(window=number_of_months)
    rolling_mean, rolling_std = rolling_window.mean(), rolling_window.std()

    # Plot the original series first, then overlay both rolling statistics
    curves = (
        (timeseries, 'blue', 'Original'),
        (rolling_mean, 'red', 'Rolling Mean'),
        (rolling_std, 'black', 'Rolling Std'),
    )
    for values, colour, legend_label in curves:
        plt.plot(values, color=colour, label=legend_label)
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

In [None]:
# Run the rolling-statistics check on each subset with a window of 12
stationary_runs = (
    ("Stationary Test - ALl Data", transaction_all_data),
    (f"Stationary Test - Range {date_range_from} to {date_range_to}", transaction_range_data),
    (f"Stationary Test - Year {select_year}", transaction_year_data),
)
for heading, subset in stationary_runs:
    print(heading)
    test_stationary(subset['city_pop'], 12)
    print()

In [None]:
# Perform Dickey-Fuller Test
def test_dickey_fuller(timeseries):
    """
    Run the augmented Dickey-Fuller test and print a labelled summary
    :param timeseries: series passed to adfuller (lag chosen with autolag='AIC')
    :return: None, the summary is printed
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')

    # The first four entries are labelled scalars; entry 4 maps level -> critical value
    summary_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=summary_labels)
    for level, critical_value in adf_result[4].items():
        summary['Critical Value (%s)'%level] = critical_value
    print(summary)

In [None]:
# Run the Dickey-Fuller test on the city population of each subset
dickey_fuller_runs = (
    ("Dickey Fuller Test - ALl Data", transaction_all_data),
    (f"Dickey Fuller Test - Range {date_range_from} to {date_range_to}", transaction_range_data),
    (f"Dickey Fuller Test - Year {select_year}", transaction_year_data),
)
for heading, subset in dickey_fuller_runs:
    print(heading)
    test_dickey_fuller(subset['city_pop'])
    print()

#### Conclusions:
All data test: The p-value indicates that we reject the null hypothesis of non-stationarity, indicating that the data is stationary.
Range data test: The p-value indicates that we reject the null hypothesis of non-stationarity, indicating that the data is stationary.
Year data test: The p-value indicates that we reject the null hypothesis of non-stationarity, indicating that the data is stationary.

### Auto-correlation

In [None]:
# Perform auto-correlation lag
def auto_correlation_lag(dataframe, column_name, number_of_months):
    """
    Compute the lagged autocorrelation of a single column
    :param dataframe: frame holding the column
    :param column_name: label of the column to analyse
    :param number_of_months: lag passed unchanged to Series.autocorr(lag=...)
    :return: autocorrelation of the column at the given lag
    """
    selected_series = dataframe[column_name]
    lagged_correlation = selected_series.autocorr(lag=number_of_months)
    return lagged_correlation

In [None]:
lag_month_number = 1

# Each line reports the autocorrelation of its own subset; all three previously
# used transaction_all_data, so the range and year figures repeated the first.
print(f"All Data - {lag_month_number} Month Lag: {auto_correlation_lag(transaction_all_data, 'city_pop', lag_month_number)}")
print(f"Range {date_range_from} to {date_range_to} Data - {lag_month_number} Month Lag: {auto_correlation_lag(transaction_range_data, 'city_pop', lag_month_number)}")
print(f"Year {select_year } Data - {lag_month_number} Month Lag: {auto_correlation_lag(transaction_year_data, 'city_pop', lag_month_number)}")

#### Conclusions:
All data lag: Indicates a weak negative linear dependence between observations that are 1 month apart. There is little correlation between consecutive months in the data.
Range data lag: Indicates a weak negative linear dependence between observations that are 1 month apart. There is little correlation between consecutive months in the data.
Year data lag: Indicates a weak negative linear dependence between observations that are 1 month apart. There is little correlation between consecutive months in the data.

### Decomposition

In [None]:
# Get log to make patterns or trends more apparent
def get_log(dataframe, column_name):
    """
    Take the natural logarithm of one column
    :param dataframe: frame holding the column
    :param column_name: label of the column to transform
    :return: element-wise natural log of the column
    """
    selected_column = dataframe[column_name]
    return np.log(selected_column)

In [None]:
# Log-transform the city population of each subset (all dates, date range, single year)
ts_log_all, ts_log_range, ts_log_year = (
    get_log(subset, 'city_pop')
    for subset in (transaction_all_data, transaction_range_data, transaction_year_data)
)

In [None]:
# Log of city population, all dates
plt.plot(ts_log_all)

In [None]:
# Log of city population, date-range subset
plt.plot(ts_log_range)

In [None]:
# Log of city population, single-year subset
plt.plot(ts_log_year)

In [None]:
# Additive decomposition of city population over all dates
decompose_all = seasonal_decompose(transaction_all_data['city_pop'], model='additive', period=12)
# plot() builds a multi-panel figure; title the figure itself, because plt.title
# would attach the heading to the last panel only.
decompose_all_figure = decompose_all.plot()
decompose_all_figure.suptitle("Decompose of Transactions to City Population", y=1.02)
plt.show()

In [None]:
# Additive decomposition of city population within the configured date range
decompose_range = seasonal_decompose(transaction_range_data['city_pop'], model='additive', period=12)
# plot() builds a multi-panel figure; title the figure itself, because plt.title
# would attach the heading to the last panel only.
decompose_range_figure = decompose_range.plot()
decompose_range_figure.suptitle(f"Decompose of Transactions to City Population Between {date_range_from} to {date_range_to}", y=1.02)
plt.show()

In [None]:
# Additive decomposition of city population for the selected year
decompose_year = seasonal_decompose(transaction_year_data['city_pop'], model='additive', period=12)
# plot() builds a multi-panel figure; title the figure itself, because plt.title
# would attach the heading to the last panel only.
decompose_year_figure = decompose_year.plot()
decompose_year_figure.suptitle(f"Decompose of Transactions to City Population in Year {select_year}", y=1.02)
plt.show()

#### Conclusion:
There is very little trend seen in any of the data.

### Forecasting

In [None]:
# Split into train and test data.
# Label slices on the date index include both endpoints, so the test window starts
# the day after the training window ends; otherwise rows dated 2019-12-01 would
# appear in both sets.
train = transaction_all_data['2019-06-01':'2019-12-01'].copy()
train['train'] = train['city_pop']
train = train[['train']]

test = transaction_all_data['2019-12-02':'2020-06-01'].copy()
# The value column keeps the label 'train' in both frames
test['train'] = test['city_pop']
test = test[['train']]

In [None]:
# Preview the training frame
train.head()

In [None]:
# Preview the test frame
test.head()

In [None]:
# Plot the training (black) and test (red) data
plt.plot(train, color='black')
plt.plot(test, color='red')
plt.title('Train/Test Split for Data')
# The frames are plotted against their date index, so dates are on x and
# city population on y (the two labels were swapped).
plt.xlabel('Date')
plt.ylabel('City Population')
sns.set()
plt.show()

In [None]:
# Construct ARIMA model. auto_arima searches the candidate orders and returns the
# best model already fitted on `train`, so no separate fit call is needed.
model = auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)
forecast = model.predict(n_periods=len(test))
# Take the raw values before attaching the test index. If predict() returns a pandas
# Series, passing it straight to DataFrame(index=test.index) realigns on the index
# labels and yields only NaN, which later dropna() reduces to zero rows.
forecast = pd.DataFrame(np.asarray(forecast), index=test.index, columns=['Prediction'])

In [None]:
# Display model: overlay the training data, the test data and the forecast on one figure
plt.plot(train, label='Train')
plt.plot(test, label='Test')
plt.plot(forecast, label='Prediction')
plt.title('#City Population Prediction')
plt.xlabel('Date')
plt.ylabel('Actual #City Population')
plt.legend(loc='upper left', fontsize=8)
plt.show()

In [None]:
# Preparation for the RMSE comparison: drop rows with missing values from both
# frames and report how many rows remain in each
test, forecast = test.dropna(), forecast.dropna()
print("Length of test:", len(test))
print("Length of forecast:", len(forecast))

In [None]:
# rmse = sqrt(mean_squared_error(test, forecast))
# print("RMSE: ", rmse)

## Discussion: Visualisation: Results and Data
Time series analysis has not been useful for identifying patterns relating fraud cases to city population. The forecast length of 0 indicates that the model did not generate any meaningful predictions, so the mean_squared_error test was not required.