In [94]:
print("Bismillah Hirrahamaa Nirrraheem")

Bismillah Hirrahamaa Nirrraheem


In [95]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from dash import html,dcc,Input,Output,callback
pd.set_option("display.float_format","{:.2f}".format)
pio.templates["mod"] = go.layout.Template(layout=dict(font=dict(family="Fira Code")))
pio.templates.default = "plotly_dark+mod"
import statsmodels.api as sm
from scipy.stats import shapiro,kstest
from zipfile import ZipFile
from glob import glob

In [96]:
path = "https://github.com/h4ck4l1/datasets/raw/main/fraud_detection/"

In [97]:
all_file_names = [
    "account_activity",
    "amount_data",
    "anomaly_scores",
    "customer_data",
    "fraud_indicators",
    "merchant_data",
    "suspicious_activity",
    "transaction_category_labels",
    "transaction_metadata",
    "transaction_records"
    ]

In [98]:
len(all_file_names)

10

In [99]:
def preprocess():
    global account_activity,amount_data,anomaly_scores,customer_data,fraud_indicators,merchant_data,suspicious_activity,transaction_category_labels,transaction_metadata,transaction_records
    account_activity = pd.read_csv(path+all_file_names[0]+".csv")
    amount_data = pd.read_csv(path+all_file_names[1]+".csv")
    anomaly_scores = pd.read_csv(path+all_file_names[2]+".csv")
    customer_data = pd.read_csv(path+all_file_names[3]+".csv")
    fraud_indicators = pd.read_csv(path+all_file_names[4]+".csv")
    merchant_data = pd.read_csv(path+all_file_names[5]+".csv")
    suspicious_activity = pd.read_csv(path+all_file_names[6]+".csv")
    transaction_category_labels = pd.read_csv(path+all_file_names[7]+".csv")
    transaction_metadata = pd.read_csv(path+all_file_names[8]+".csv")
    transaction_records = pd.read_csv(path+all_file_names[9]+".csv")
    df = pd.merge(left=account_activity,right=customer_data,right_on="CustomerID",left_on="CustomerID")
    df = pd.merge(left=df,right=transaction_records,left_on="CustomerID",right_on="CustomerID")
    df = pd.merge(left=df,right=suspicious_activity,left_on="CustomerID",right_on="CustomerID")
    df = pd.merge(left=df,right=transaction_metadata,left_on="TransactionID",right_on="TransactionID")
    df = pd.merge(left=df,right=amount_data,left_on="TransactionID",right_on="TransactionID")
    df = pd.merge(left=df,right=fraud_indicators,left_on="TransactionID",right_on="TransactionID")
    df = pd.merge(left=df,right=anomaly_scores,left_on="TransactionID",right_on="TransactionID")
    df = pd.merge(left=df,right=transaction_category_labels,left_on="TransactionID",right_on="TransactionID")
    df = pd.merge(left=df,right=merchant_data,left_on="MerchantID",right_on="MerchantID")
    df.drop(['Name','Address','MerchantName','Location','LastLogin'],axis=1,inplace=True)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df.sort_values(by="CustomerID",inplace=True)
    return df

In [100]:
df = preprocess()

# Notes

- Location is same as MerchantID
  - Implies that each Merchant is located in a unique location which is same as Merchant Id which is why dropping Location further to ease the analytical process
  - Also Whatever inferences are drawn from Merchant are applicable to Location  
<br>

- LastLogin is converted to Datetime column

- Name and Address are dropped for customers as they are same as CustomerID

In [101]:
time_index = pd.date_range(start=df["Timestamp"].min(),end=df["Timestamp"].max(),freq="H")
display(time_index[0])
time_index[-1]

Timestamp('2022-01-01 00:00:00')

Timestamp('2022-02-11 15:00:00')

In [102]:
df.head()

Unnamed: 0,CustomerID,AccountBalance,Age,TransactionID,Amount,SuspiciousFlag,Timestamp,MerchantID,TransactionAmount,FraudIndicator,AnomalyScore,Category
0,1001,9507.27,54,787,33.67,0,2022-02-02 18:00:00,2627,87.38,0,0.03,Food
2,1003,1715.32,40,641,30.98,0,2022-01-27 16:00:00,2496,46.04,0,0.96,Travel
3,1004,3101.51,30,450,11.17,0,2022-01-19 17:00:00,2184,85.09,0,0.53,Other
5,1004,3101.51,30,921,45.85,0,2022-02-08 08:00:00,2111,36.53,0,0.65,Other
8,1005,5405.77,46,599,81.81,0,2022-01-25 22:00:00,2191,82.85,0,0.66,Other


In [103]:
temp = df.copy()
temp = temp.set_index('Timestamp').sort_index()
temp.head()

Unnamed: 0_level_0,CustomerID,AccountBalance,Age,TransactionID,Amount,SuspiciousFlag,MerchantID,TransactionAmount,FraudIndicator,AnomalyScore,Category
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-01-01 00:00:00,1952,2869.69,50,1,55.53,0,2701,79.41,0,0.69,Other
2022-01-01 01:00:00,1027,9527.95,46,2,12.88,0,2070,12.05,0,0.08,Online
2022-01-01 02:00:00,1955,9288.36,34,3,50.18,0,2238,33.31,0,0.02,Travel
2022-01-01 03:00:00,1796,5588.05,33,4,41.63,0,2879,46.12,0,0.88,Travel
2022-01-01 04:00:00,1946,7324.79,18,5,78.12,0,2966,54.05,0,0.03,Other


In [104]:
temp_night = temp.between_time(start_time="20:00:00",end_time="8:00:00")
temp_day = temp.between_time(start_time="8:00:00",end_time="20:00:00")
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=temp_night.query("FraudIndicator == 1").index,
    y=temp_night.query("FraudIndicator == 1").Amount,
    marker=dict(size=22,opacity=0.7,color="blue"),
    mode="markers",
    name="Fraudulent at Night\n<br>8 PM to 8 AM"
    ))
fig.add_trace(go.Scatter(
    x=temp_day.query("FraudIndicator == 1").index,
    y=temp_day.query("FraudIndicator == 1").Amount,
    marker=dict(size=22,opacity=0.7,color="red"),
    mode="markers",
    name="Fraudulent at day\n<br>8 AM to 8 PM"
    ))
fig.add_trace(go.Scatter(
    x=temp.query("FraudIndicator == 0").index,
    y=temp.query("FraudIndicator == 0").Amount,
    marker=dict(size=5,color="yellow",opacity=0.5),
    mode="markers",
    name="Normal",
    visible="legendonly"
))
fig.add_annotation(text="*Toggle normal to see <br>the non-fraudulent transactions<br> that took place",x=1.15,y=-0.10,xanchor="right",yanchor="bottom",showarrow=False,xref="x domain",yref="y domain")
fig.update_layout(title=dict(text="Fraudulent Activity by Time from January 1,2022 to February 11,2022",font=dict(size=30)),height=700)
fig.update_xaxes(title=dict(text="Time",font=dict(size=20)))
fig.update_yaxes(title=dict(text="Amount",font=dict(size=20)))
fig.show()

##### There are quite a bit of Fraudulent Transactions that took place at night as day its nearly half and half

Fraudulent Transactions percentage at Day on total transactions :

In [105]:
temp_night.shape[0]/temp.shape[0]

0.542

Fraudulent Transactions percentage at Night on total transactions :

In [106]:
temp_day.shape[0]/temp.shape[0]

0.541

In [107]:
temp = df.copy()

In [108]:
temp.loc[temp.AnomalyScore > 0.5].FraudIndicator.value_counts()

FraudIndicator
0    468
1     21
Name: count, dtype: int64

In [109]:
temp.loc[temp.AnomalyScore < 0.5].FraudIndicator.value_counts()

FraudIndicator
0    487
1     24
Name: count, dtype: int64

In [110]:
temp.FraudIndicator.value_counts()

FraudIndicator
0    955
1     45
Name: count, dtype: int64

In [111]:
temp.query("FraudIndicator == 1").AnomalyScore.mean()

0.42827367983466963

In [112]:
temp.query("FraudIndicator == 0").AnomalyScore.mean()

0.4952978147290206

In [113]:
temp.head()

Unnamed: 0,CustomerID,AccountBalance,Age,TransactionID,Amount,SuspiciousFlag,Timestamp,MerchantID,TransactionAmount,FraudIndicator,AnomalyScore,Category
0,1001,9507.27,54,787,33.67,0,2022-02-02 18:00:00,2627,87.38,0,0.03,Food
2,1003,1715.32,40,641,30.98,0,2022-01-27 16:00:00,2496,46.04,0,0.96,Travel
3,1004,3101.51,30,450,11.17,0,2022-01-19 17:00:00,2184,85.09,0,0.53,Other
5,1004,3101.51,30,921,45.85,0,2022-02-08 08:00:00,2111,36.53,0,0.65,Other
8,1005,5405.77,46,599,81.81,0,2022-01-25 22:00:00,2191,82.85,0,0.66,Other


- CusotmerID
  <br>
  <br>
  - Continuous Column with different customers
  <br>
  <br>
  - Value counts will tell us how many transactions they have done
  <br>
  <br>
- Account Balance
  <br>
  <br>
  - Continuous Column with amounts
  <br>
  <br>
  - The Amount of money left in their bank account
  <br>
  <br>
- Age
  <br>
  <br>
  - Countinuous Column with ages
  <br>
  <br>
  - Ages at which they made a transactions
  <br>
  <br>
- Transaction ID
  <br>
  <br>
  - Unique IDs given to transactinos, Continuos Column
  <br>
  <br>
- Transaction Amount
  <br>
  <br>
  - Amount of the transactions thats carried out, Countinuos Column
  <br>
  <br>
- Suspicious Flag
  <br>
  <br>
  - Suspicious flag 0 or 1, 0 for not 1 for yes, Categorical Column
  <br>
  <br>
- Timestamp
  <br>
  <br>
  - Time at which transaction has been carried out, Countinuos time series column
  <br>
  <br>
- MerchantID
  <br>
  <br>
  - UniqueID at the the transaction has been carried out, Contiuous Column
  <br>
  <br>
- Amount
  <br>
  <br>
  - Transaction Amount of the fraudulent activity, Continuous Column
  <br>
  <br>
- Fraud Indicator
  <br>
  <br>
  - Whether its been flagged Fraud or not
  <br>
  <br>
  - Categorical Column
  <br>
  <br>
- Anomaly Score
  <br>
  <br>
  - Continuous Column, The score given whether its potentially fraud or not
  <br>
  <br>