## Imports
---

In [None]:
import pandas as pd
import plotly.express as px
from src.utils.dataset import get_full_transactions_dataset



## Data set load
---

In [2]:
# Load the full transactions table through the project's dataset helper.
# NOTE(review): the recorded output of this cell is a Streamlit "streamlit run ..." hint,
# which suggests the helper touches Streamlit outside of its runtime — confirm, and
# consider a Streamlit-free loading path for notebook use.
df = get_full_transactions_dataset()

2025-02-26 09:18:21.556 
  command:

    streamlit run C:\Users\ferna\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [3]:
# Number of rows and columns in the loaded frame.
df.shape

(4484942, 9)

In [4]:
# Peek at the first three rows to see what each column holds.
df.head(3)

Unnamed: 0,timestamp,sender,receiver,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,2022/09/01 00:20,3208_8000F4580,1_8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
1,2022/09/01 00:26,12_8000EC280,2439_8017BF800,7.66,US Dollar,7.66,US Dollar,Credit Card,0
2,2022/09/01 00:21,1_8000EDEC0,211050_80AEF5310,383.71,US Dollar,383.71,US Dollar,Credit Card,0


In [5]:
# Column labels available for the analysis.
df.columns

Index(['timestamp', 'sender', 'receiver', 'amount_received',
       'receiving_currency', 'amount_paid', 'payment_currency',
       'payment_format', 'is_laundering'],
      dtype='object')

## Exploratory analysis
---

In [6]:
# Inspect the dtype of every column. The recorded output shows `timestamp` stored as
# object (not datetime), which is why it is parsed in the Timestamp section.
df.dtypes

timestamp              object
sender                 object
receiver               object
amount_received       float64
receiving_currency     object
amount_paid           float64
payment_currency       object
payment_format         object
is_laundering           int64
dtype: object

In [7]:
# Count missing values per column.
df.isna().sum()

timestamp             0
sender                0
receiver              0
amount_received       0
receiving_currency    0
amount_paid           0
payment_currency      0
payment_format        0
is_laundering         0
dtype: int64

In [8]:
# Show the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,timestamp,sender,receiver,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,2022/09/01 00:20,3208_8000F4580,1_8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
1,2022/09/01 00:26,12_8000EC280,2439_8017BF800,7.66,US Dollar,7.66,US Dollar,Credit Card,0
2,2022/09/01 00:21,1_8000EDEC0,211050_80AEF5310,383.71,US Dollar,383.71,US Dollar,Credit Card,0
3,2022/09/01 00:04,1_8000F4510,11813_8011305D0,9.82,US Dollar,9.82,US Dollar,Credit Card,0
4,2022/09/01 00:08,1_8000F4FE0,245335_812ED62E0,4.01,US Dollar,4.01,US Dollar,Credit Card,0


### Accounts

In [9]:
# Distinct account identifiers on each side of a transaction, kept as lists so their
# lengths can be reported afterwards.
senders = df["sender"].unique().tolist()
receivers = df["receiver"].unique().tolist()
# Every account that appears at least once, whether as sender or as receiver.
all_customers = set(senders).union(receivers)

In [10]:
# Report how many distinct accounts appear as senders, as receivers, and overall.
print(f"Number of unique senders: {len(senders)}")
print(f"Number of unique receivers: {len(receivers)}")
# This figure is the union of both sides, so it gets its own label instead of reusing
# the "receivers" one.
print(f"Number of unique accounts (senders or receivers): {len(all_customers)}")

Number of unique senders: 305756
Number of unique receivers: 284340
Number of unique receivers: 422690


### Currency

In [11]:
# Distinct payment currencies present in the data.
df["payment_currency"].unique()

array(['US Dollar', 'Bitcoin', 'Euro', 'Australian Dollar', 'Yuan',
       'Rupee', 'Mexican Peso', 'UK Pound', 'Canadian Dollar',
       'Swiss Franc', 'Yen', 'Brazil Real', 'Shekel', 'Ruble',
       'Saudi Riyal'], dtype=object)

In [12]:
# Count transactions per payment currency.
# The columns are named explicitly so the frame always has "payment_currency" and "count":
# a bare value_counts().reset_index() only yields those names on pandas >= 2.0.
currency_count = (
    df["payment_currency"]
    .value_counts()
    .rename_axis("payment_currency")
    .reset_index(name="count")
)
fig = px.pie(
    currency_count,
    values="count",
    names="payment_currency",
    title="Share of transactions by payment currency",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

### Payment format

In [13]:
# Count transactions per payment format.
# The columns are named explicitly so the frame always has "payment_format" and "count":
# a bare value_counts().reset_index() only yields those names on pandas >= 2.0.
payment_format_count = (
    df["payment_format"]
    .value_counts()
    .rename_axis("payment_format")
    .reset_index(name="count")
)
fig = px.pie(
    payment_format_count,
    values="count",
    names="payment_format",
    title="Share of transactions by payment format",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

### Frauds

In [14]:
# Count transactions per laundering flag.
# The columns are named explicitly so the frame always has "is_laundering" and "count":
# a bare value_counts().reset_index() only yields those names on pandas >= 2.0.
frauds_count = (
    df["is_laundering"]
    .value_counts()
    .rename_axis("is_laundering")
    .reset_index(name="count")
)
# Replace the 0/1 flag with readable labels for display and plotting.
frauds_count["is_laundering"] = frauds_count["is_laundering"].map({1: "True", 0: "False"})

In [15]:
# Display the per-label transaction counts.
frauds_count

Unnamed: 0,is_laundering,count
0,False,4479776
1,True,5166


In [16]:
# Pie chart of the class balance. The title states what the "True"/"False" slices refer
# to, since the labels alone are meaningless when the figure is read on its own.
fig = px.pie(
    frauds_count,
    values="count",
    names="is_laundering",
    title="Share of transactions flagged as laundering (is_laundering)",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

### Timestamp

In [17]:
# Parse the textual timestamps and derive the calendar features used by the heatmaps.
# The explicit format matches the "YYYY/MM/DD HH:MM" strings shown by df.head(); it avoids
# slow or ambiguous format inference on millions of rows and fails loudly if a row deviates.
# NOTE(review): only the first rows were inspected — confirm every row uses this format.
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y/%m/%d %H:%M")
df["hour"] = df["timestamp"].dt.hour
df["week_day"] = df["timestamp"].dt.day_name()
# Numeric weekday (Monday=0 ... Sunday=6), kept only to sort the day names chronologically.
df["week_day_number"] = df["timestamp"].dt.day_of_week

In [18]:
def _count_by_weekday_hour(transactions: pd.DataFrame) -> pd.DataFrame:
    """Count transactions per (week_day, hour).

    Expects the `week_day`, `week_day_number`, `hour` and `sender` columns. Returns one
    row per weekday/hour combination present in `transactions`, with the number of
    transactions in a `count` column, sorted by weekday number and then by hour.
    """
    return (
        transactions
        .groupby(["week_day", "week_day_number", "hour"], as_index=False)
        .sender.count()
        .rename(columns={"sender": "count"})
        .sort_values(by=["week_day_number", "hour"])
    )


# Restrict to non-laundering rows so the frame matches its name and the two heatmaps
# compare disjoint populations (legitimate vs. laundering transactions).
timestamp_no_fraud_df = _count_by_weekday_hour(df[df["is_laundering"] == 0])

timestamp_fraud_df = _count_by_weekday_hour(df[df["is_laundering"] == 1])

In [19]:
# Render one weekday-by-hour heatmap per frame: first `timestamp_no_fraud_df`, then
# `timestamp_fraud_df`. Both figures share the same axes, value column and colour scale.
heatmap_options = dict(x="week_day", y="hour", z="count", color_continuous_scale="Viridis")
for hourly_counts in (timestamp_no_fraud_df, timestamp_fraud_df):
    px.density_heatmap(hourly_counts, **heatmap_options).show()