# USE NODE2VEC/DEEPWALK TO GENERATE ACCOUNT EMBEDDINGS

In [1]:
# setup
from modules.data_loader import *
from modules.feature_engineering import *
from modules.visualizer import *

In [2]:
# Run this if you don't have the data downloaded; note that it is a few GB of data.
# import kagglehub
# path = kagglehub.dataset_download("ealtman2019/ibm-transactions-for-anti-money-laundering-aml")
# print("Path to dataset files:", path)

## 1. Create our dataframes from raw data

In [3]:
# Dataset variant to load; passed to both loaders as `dataset_size`.
dataset_name = "HI-Small"

print(f"Loading {dataset_name}...\n")
# Load the transaction table and the account table for the selected variant.
trans_df = load_transactions(dataset_size=dataset_name)
accounts_df = load_accounts(dataset_size=dataset_name)
# Pattern loading is currently disabled.
# patterns_df = load_patterns(dataset_size=dataset_name)


Loading HI-Small...


Loading transactions from: /home/linch/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8/HI-Small_Trans.csv
File size: 453.6 MB

Loaded 5,078,345 transactions
Date range: 2022-09-01 00:00:00 to 2022-09-18 16:18:00
Laundering transactions: 5,177 (0.102%)

Loading accounts from: /home/linch/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8/HI-Small_accounts.csv

Loaded 518,581 accounts from 30470 banks


## 2. Feature engineering - For this notebook, we calculate several different statistics to do manual feature engineering

In [None]:
# Convert all currencies to USD for normalization.
# NOTE(review): this rebinds trans_df, so re-running the cell feeds already-converted
# data back in — confirm convert_currency_to_USD is safe to apply twice.

trans_df = convert_currency_to_USD(trans_df)

In [5]:
# Successfully verified that currency conversion is within margin of error

# trans_df_verification = trans_df.copy()
# trans_df_verification["diff"] = trans_df["amount_received"] - trans_df["amount_paid"]
# show = ["amount_received", "receiving_currency", "amount_paid", "payment_currency", "diff"]
# print(trans_df_verification.loc[trans_df_verification['diff'] > 50, show])

In [6]:
# Add the sinusoidal temporal encodings (hour_sin / hour_cos) and the normalized
# unix timestamp, then print max and min of each new column as a range check.
# The final line prints the mean of time_normalized.

trans_df = temporal_encoding(trans_df)
for encoded_col in ('hour_sin', 'hour_cos', 'time_normalized'):
    print(trans_df[encoded_col].max())
    print(trans_df[encoded_col].min())
print(trans_df['time_normalized'].mean())

1.0
-1.0
1.0
-1.0
4.282624796869565
-1.3776303742917988
1.2980283339374991e-13


In [None]:
# Map every currency and every payment format to a unique integer ID.
# The encoders are applied in order, each one rebinding trans_df.

for encode_ids in (encode_currency_ids, encode_payment_format_ids):
    trans_df = encode_ids(trans_df)
print(trans_df['receiving_currency_id'])

0          12
1          12
2          12
3          12
4          12
           ..
5078340     1
5078341     1
5078342     1
5078343     1
5078344     1
Name: receiving_currency_id, Length: 5078345, dtype: category
Categories (15, int64): [0, 1, 3, 4, ..., 2, 8, 9, 10]


In [None]:
# Give each account a unique integer ID, then compare the accounts that appear
# in transactions against the accounts table.

trans_df, account_to_id, id_to_account = encode_account_ids(trans_df)

# Highest encoded ID across both sides of a transaction.
highest_to_id = trans_df['to_account_id'].max()
highest_from_id = trans_df['from_account_id'].max()
print(max(highest_to_id, highest_from_id))

# Row count vs. distinct account numbers in the accounts table.
account_ids = accounts_df['account_id']
print(len(accounts_df.index))
print(account_ids.nunique(dropna=False))  # dropna=False keeps parity with len(unique())

# In the transactions df the highest account ID is 515079, so (assuming IDs are assigned contiguously starting at 0) only 515079 + 1 = 515080 accounts have transacted. We will ignore accounts with no transactions.
# The accounts df has 518581 rows but only 518573 unique account numbers, which matches the data card on Kaggle. This is likely because a few accounts share the same account ID but have different values for bank. We will treat them as the same account for now, as we are ignoring the bank data.

515079
518581
518573


## 3. Temporal train/test split and account statistics

In [None]:
# Split data temporally: first 80% for training, last 20% for testing.
# NOTE(review): the printed date ranges show the train set ending and the test set
# starting at the same timestamp (2022-09-08 16:12:00), so the cut appears to fall
# inside a single minute — confirm this is acceptable for a temporal split.

train_df, test_df = temporal_train_test_split(trans_df, train_ratio=0.8)

TEMPORAL TRAIN/TEST SPLIT

Train Set:
  Date range: 2022-09-01 00:00:00 to 2022-09-08 16:12:00
  Transactions: 4,062,676
  Laundering: 3,380 (0.083%)

Test Set:
  Date range: 2022-09-08 16:12:00 to 2022-09-18 16:18:00
  Transactions: 1,015,669
  Laundering: 1,797 (0.177%)


In [12]:
# Normalize amounts (log transform, zero center, scale) for the training set.
# The prints below are a sanity check: both columns should report mean ~0 and std ~1.

train_df = normalize_amounts(train_df)
print("Train set amount normalization:")
print(f"  amount_paid - Mean: {train_df['amount_paid'].mean():.2e}, Std: {train_df['amount_paid'].std():.4f}")
print(f"  amount_received - Mean: {train_df['amount_received'].mean():.2e}, Std: {train_df['amount_received'].std():.4f}")

Train set amount normalization:
  amount_paid - Mean: 8.88e-16, Std: 1.0000
  amount_received - Mean: -6.99e-16, Std: 1.0000


In [13]:
# Normalize amounts for the test set.
# NOTE(review): normalize_amounts is called on test_df alone, and the printed
# mean ~0 / std 1.0 suggest the centering and scaling statistics are computed from
# the test set itself rather than reused from train_df. If so, train and test
# amounts end up on different scales and test-set statistics leak into
# preprocessing — consider fitting on train_df and applying those statistics here.

test_df = normalize_amounts(test_df)
print("Test set amount normalization:")
print(f"  amount_paid - Mean: {test_df['amount_paid'].mean():.2e}, Std: {test_df['amount_paid'].std():.4f}")
print(f"  amount_received - Mean: {test_df['amount_received'].mean():.2e}, Std: {test_df['amount_received'].std():.4f}")

Test set amount normalization:
  amount_paid - Mean: -3.86e-16, Std: 1.0000
  amount_received - Mean: 7.16e-16, Std: 1.0000
