In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.ticker as ticker
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
from scipy.stats import iqr
from sklearn.preprocessing import LabelEncoder
from utils import unzip_file

# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)
# Use seaborn's "whitegrid" style for the matplotlib/seaborn figures below.
sns.set(style="whitegrid")

In [None]:
# Unzip the archive (the second argument '.' is presumably the destination
# directory — see utils.unzip_file) and list every entry it returned.
extracted_files = unzip_file('archive.zip', '.')
print("Extracted files:")
for extracted_path in extracted_files:
    print(extracted_path)

In [None]:
# Name of the CSV file that is read into `df` by the next cell.
file = 'PS_20174392719_1491204439457_log.csv'

In [None]:
# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(file)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# Data sanity check

In [None]:
# Drop the `isFlaggedFraud` column.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been removed.
df = df.drop(columns='isFlaggedFraud', errors='ignore')

In [None]:
# Column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# dtype of each column.
df.dtypes

In [None]:
# Percentage of rows in each `isFraud` class.
df['isFraud'].value_counts(normalize=True).mul(100)

In [None]:
# Absolute number of rows in each `isFraud` class.
value_count_trans = df['isFraud'].value_counts()
value_count_trans

In [None]:
# Count of rows labelled 0 divided by count of rows labelled 1 (label lookup).
non_fraud_per_fraud = value_count_trans.loc[0] / value_count_trans.loc[1]
print(f'Dataset with a 1 to {non_fraud_per_fraud} ratio in the fraud to non fraud classes')

We can observe a clear class imbalance: the "Fraud" class has significantly fewer instances than the "Not Fraud" class.

In [None]:
# Number of transactions per value of the `type` column.
type_count = df['type'].value_counts()
type_count

In [None]:
# Split the data by `isFraud` label, then count the fraud rows per transaction type.
non_fraud_df = df.loc[df['isFraud'].eq(0)]
fraud_df = df.loc[df['isFraud'].eq(1)]
fraud_df['type'].value_counts()

Only the `CASH_OUT` and `TRANSFER` payment types contain fraudulent transactions, so we can discard the other types for modelling.

In [None]:
# Total `amount` of the fraud rows for each transaction type.
fraud_amounts_by_type = fraud_df.groupby("type")["amount"].sum()
fraud_amounts_by_type

In [None]:
# Summary statistics of `amount` for each class, as single-column frames
# shown side by side.
describe_non_fraud_df = (
    non_fraud_df['amount']
    .describe()
    .to_frame(name='amount (Non fraud transactions)')
)
describe_fraud_df = (
    fraud_df['amount']
    .describe()
    .to_frame(name='amount (Fraud transactions)')
)

pd.concat([describe_non_fraud_df, describe_fraud_df], axis=1)

- Fraud transactions have, on average, higher amounts than non-fraud transactions.
- The standard deviation for fraud transactions is notably higher, i.e. the amounts are more volatile.
- The minimum amount for fraud transactions is £0; could it actually be an amount below £1?
- The 3rd quartile suggests more outlier amounts among fraud transactions.

In [None]:
# Pie of non-fraud `amount` by transaction `type`.
# The amounts are summed per type with pandas first, so the figure carries one
# value per type instead of one value per transaction row; a pie sums values of
# repeated labels, so the displayed shares are the same.
non_fraud_amount_by_type = non_fraud_df.groupby('type', as_index=False)['amount'].sum()

fig = px.pie(
    non_fraud_amount_by_type,
    values='amount',
    names='type',
    title='Non Fraud Transactions Amount by Type',
)
# Pin the legend inside the plotting area, near the top-left corner.
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))
fig.show()

In [None]:
# Pie of fraud `amount` by transaction `type`.
# The amounts are summed per type with pandas first, so the figure carries one
# value per type instead of one value per transaction row; a pie sums values of
# repeated labels, so the displayed shares are the same.
fraud_amount_by_type = fraud_df.groupby('type', as_index=False)['amount'].sum()

fig = px.pie(
    fraud_amount_by_type,
    values='amount',
    names='type',
    title='Fraud Transactions Amount by Type',
)
# Pin the legend near the top, offset from the left edge.
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.2
))
fig.show()

In [None]:
# Distinct first characters of the origin and destination names among fraud rows.
fraud_df['nameOrig'].str[0].unique(), fraud_df['nameDest'].str[0].unique()

Only customer-to-customer transactions are fraudulent.

In [None]:
# Scatter of `newbalanceOrig` for fraud rows; the x axis is the positional
# order of the row inside `fraud_df`.
fraud_positions = range(len(fraud_df))
axis_labels = {'x': 'Transaction id', 'y': 'Customer New Balance'}
fig = px.scatter(
    x=fraud_positions,
    y=fraud_df['newbalanceOrig'],
    title='Customer New Balance for Fraud Transactions',
    labels=axis_labels,
    width=600,
    height=400,
)
fig.show()

In [None]:
def find_missing_integers(lst):
    """Return the integers strictly between min and max of `lst` that are absent from it.

    Parameters
    ----------
    lst : iterable of int
        Values to inspect (list, tuple, numpy array, pandas Series, ...).

    Returns
    -------
    list of int
        Missing integers in ascending order; empty when `lst` is empty or has
        no gaps.
    """
    # Membership is tested against a set of the *values*. `i in series` on a
    # pandas Series checks the index labels instead of the values, which made
    # the previous check meaningless for Series input; the set also makes each
    # lookup O(1) instead of a linear scan.
    present = set(lst)
    if not present:
        return []
    lowest, highest = min(present), max(present)
    return [i for i in range(lowest + 1, highest) if i not in present]

# List the time steps absent from `df.step`; an empty list means no step is missing.
find_missing_integers(df.step)

In [None]:
# Histogram over `step` with `amount` as the aggregated value, split by `isFraud`.
px.histogram(df, x='step', y='amount', color='isFraud')

- Some transactions seem to be missing, or only specific customers are tracked: there are very few transactions between steps 50-120 and from step 410 onward.

- We can't model spending behaviour.

In [None]:
# Histogram of transaction counts over `step`, split by `isFraud`.
px.histogram(df, x='step', color='isFraud')

In [None]:
# Number of transactions per origin name, most frequent first.
df.nameOrig.value_counts()

We have very few transactions over time for a given person. A person may appear under several `nameOrig` values, but these are hard to link together.

In [None]:
# Percentage of all rows whose origin / destination name starts with 'M' or 'C'.
total_rows = df.shape[0]
orig_names = df['nameOrig'].str
dest_names = df['nameDest'].str

merchant_transaction_orig_proportion = (orig_names.startswith('M').sum() / total_rows) * 100
customer_transaction_orig_proportion = (orig_names.startswith('C').sum() / total_rows) * 100

merchant_transaction_dest_proportion = (dest_names.startswith('M').sum() / total_rows) * 100
customer_transaction_dest_proportion = (dest_names.startswith('C').sum() / total_rows) * 100

In [None]:
# Two side-by-side donut charts: merchant vs customer share at the origin
# (left) and at the destination (right) of all transactions.
labels = ['Merchant', 'Customer']
orig_values = [merchant_transaction_orig_proportion, customer_transaction_orig_proportion]
dest_values = [merchant_transaction_dest_proportion, customer_transaction_dest_proportion]

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for subplot_col, (trace_name, trace_values) in enumerate(
    (('Origin', orig_values), ('Destination', dest_values)), start=1
):
    fig.add_trace(
        go.Pie(
            labels=labels,
            values=trace_values,
            name=trace_name,
            textinfo='label+percent',
            insidetextorientation='radial',
        ),
        1,
        subplot_col,
    )

# A large `hole` turns each pie into a thin ring.
fig.update_traces(hole=.9, hoverinfo="label+percent+name")

# Captions placed in the middle of each ring.
ring_captions = [
    dict(text='Origin', x=0.18, y=0.5, font_size=20, showarrow=False),
    dict(text='Destination', x=0.82, y=0.5, font_size=20, showarrow=False),
]
legend_position = dict(yanchor="middle", y=0.99, xanchor="center", x=0.05)

fig.update_layout(
    title_text="Transaction Origin / Destination",
    annotations=ring_captions,
    legend=legend_position,
)
fig.show()

In [None]:
# Percentage of fraud rows whose origin / destination name starts with 'M' or 'C'.
# The denominator is the number of fraud rows (not the size of the full
# dataset), so each merchant/customer pair is a share of fraud transactions.
fraud_rows = fraud_df.shape[0]

merchant_fraud_transaction_orig_proportion = (fraud_df['nameOrig'].str.startswith('M').sum() / fraud_rows) * 100
customer_fraud_transaction_orig_proportion = (fraud_df['nameOrig'].str.startswith('C').sum() / fraud_rows) * 100

merchant_fraud_transaction_dest_proportion = (fraud_df['nameDest'].str.startswith('M').sum() / fraud_rows) * 100
customer_fraud_transaction_dest_proportion = (fraud_df['nameDest'].str.startswith('C').sum() / fraud_rows) * 100

In [None]:
# Two side-by-side donut charts: merchant vs customer share at the origin
# (left) and at the destination (right) of the fraud transactions.
labels = ['Merchant', 'Customer']
orig_values = [merchant_fraud_transaction_orig_proportion, customer_fraud_transaction_orig_proportion]
dest_values = [merchant_fraud_transaction_dest_proportion, customer_fraud_transaction_dest_proportion]


fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=orig_values, name='Origin', textinfo='label+percent', insidetextorientation='radial'),
              1, 1)
fig.add_trace(go.Pie(labels=labels, values=dest_values, name='Destination', textinfo='label+percent', insidetextorientation='radial'),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.9, hoverinfo="label+percent+name")

fig.update_layout(
    # The title names the fraud subset so this figure is not confused with the
    # identically laid out chart built from all transactions.
    title_text="Fraud Transaction Origin / Destination",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Origin', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Destination', x=0.82, y=0.5, font_size=20, showarrow=False)],
    legend=dict(
        yanchor="middle",
        y=0.99,
        xanchor="center",
        x=0.05))
fig.show()

In [None]:
# Heatmap of the pairwise correlations between the amount/balance columns and
# the `isFraud` label, annotated with three decimals.
corr_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
]
corr_matrix = df[corr_columns].corr()
fig = px.imshow(corr_matrix, text_auto='.3f', aspect="auto")
fig.show()

- correlation between:
    - newbalanceOrig / oldbalanceOrg
    - newbalanceDest / oldbalanceDest

In [None]:
# Histograms of the fraud `amount`: raw values (left) and after the
# 1/log(1+y) transformation (right).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Histogram of Amount from fraudulent transactions before transformation",
        "Histogram of Amount from fraudulent transactions after transformation",
    ),
)

# `add_trace(..., row=, col=)` replaces the deprecated `append_trace`.
fig.add_trace(go.Histogram(x=fraud_df['amount']), row=1, col=1)

# NOTE(review): an amount of exactly 0 gives log1p(0) == 0 and therefore an
# infinite transformed value — confirm such rows should be ignored here.
fig.add_trace(go.Histogram(x=1 / (np.log1p(fraud_df['amount'])), nbinsx=40), row=1, col=2)

# `width` is intentionally not set so the figure uses the available width;
# np.inf is not a meaningful pixel width.
fig.update_layout(height=600, title_text="Distribution of  Amount from fraudulent transactions before and after 1/log(1+y) transformation", showlegend=False, title_x=0.5)
fig.show()


In [None]:
# Work on a copy so the filtering below leaves `df` untouched.
process_df = df.copy()

In [None]:
# Keep only the rows whose `type` is not one of the excluded transaction types.
types_to_exclude = ['CASH_IN', 'DEBIT', 'PAYMENT']
is_excluded_type = process_df['type'].isin(types_to_exclude)
process_df = process_df.loc[~is_excluded_type]

Transactions of types `CASH_IN`, `DEBIT`, and `PAYMENT` are excluded.

In [None]:
# Number of rows removed by the type filter.
removed_rows = len(df) - len(process_df)
print(f"Dataset reduced by: {removed_rows} transactions")

In [None]:
# Re-split by `isFraud` label, this time from the filtered frame.
fraud_df = process_df.loc[process_df['isFraud'].eq(1)]
non_fraud_df = process_df.loc[process_df['isFraud'].eq(0)]

In [None]:
# Percentage of distinct `nameOrig` values that occur more than once, for the
# fraud, non-fraud and combined frames (in that order).
# Series.sum() / Series.nunique() are vectorised; the builtin sum() walks the
# boolean Series element by element in Python, which is slow on large frames.
tuple(
    (frame['nameOrig'].value_counts().gt(1).sum() / frame['nameOrig'].nunique(dropna=False)) * 100
    for frame in (fraud_df, non_fraud_df, process_df)
)

We have very few `nameOrig` values with more than one transaction.

In [None]:
# Percentage of distinct `nameDest` values that occur more than once, for the
# fraud, non-fraud and combined frames (in that order).
# Series.sum() / Series.nunique() are vectorised; the builtin sum() walks the
# boolean Series element by element in Python, which is slow on large frames.
tuple(
    (frame['nameDest'].value_counts().gt(1).sum() / frame['nameDest'].nunique(dropna=False)) * 100
    for frame in (fraud_df, non_fraud_df, process_df)
)

### Outliers

In [None]:
# One box plot per amount/balance column, drawn on log1p-transformed values.
numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for column in numeric_columns:
    log_values = np.log1p(process_df[column])  # Applying log transformation
    box_fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(x=log_values, ax=ax)
    ax.set_title(f'Box plot for log-transformed {column}')
    plt.show()

The `amount`, `oldbalanceOrg`, `newbalanceOrig`, `oldbalanceDest` and `newbalanceDest` columns were log-transformed using `np.log1p` to make the distributions more symmetric.

The output showcases the log-transformed values for each column, and the subsequent box plots visually represent the distribution of these log-transformed values. It's important to note that the box plots are based on log-transformed values for better visualization and outlier detection.

The presence of numerous points outside the box in the box plots indicates the potential existence of outliers or extreme values. These outliers may significantly deviate from the majority of the data and could impact the statistical analysis or modeling processes.

# Save the data

In [None]:
# Persist the filtered frame as a pickle file in the working directory.
process_df.to_pickle('process_df.pickle')