# Temporal transformations

## Original code

In [None]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Project Source Code
# `sys` is required for the path manipulation below; import it here so this
# cell does not fail with a NameError.
import sys

# Resolve ../../src relative to the current working directory and make it
# importable. The membership check keeps sys.path free of duplicates when the
# cell is executed more than once.
src_path = os.path.abspath(os.path.join(os.getcwd(), "../../src"))
if src_path not in sys.path:
    sys.path.append(src_path)
from helpers import add_cell_timer

add_cell_timer()

In [None]:
# Load the raw transactions; the "Timestamp" column is parsed into datetimes
# so the `.dt` accessors used later work.
csv_path = "../../HI-Small_Trans.csv"
df = pd.read_csv(csv_path, parse_dates=["Timestamp"])

Rename columns for clarity and standard formatting 

In [None]:
# Raw CSV header -> snake_case name used throughout the rest of the notebook.
column_renames = {
    "Timestamp": "timestamp",
    "From Bank": "from_bank",
    "Account": "from_account",
    "To Bank": "to_bank",
    "Account.1": "to_account",
    "Amount Received": "received_amount",
    "Receiving Currency": "received_currency",
    "Amount Paid": "sent_amount",
    "Payment Currency": "sent_currency",
    "Payment Format": "payment_type",
    "Is Laundering": "is_laundering",
}

# Rename in place so `df` keeps referring to the same frame.
df.rename(columns=column_renames, inplace=True)

Drop duplicates

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each; the frame is modified in place.
df.drop_duplicates(keep="first", inplace=True)

### Preprocessing function for whole dataset

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(42)


def df_label_encoder(df, columns):
    """Overwrite each listed column with integer label codes.

    Every column is cast to ``str`` and encoded on its own: the single
    ``LabelEncoder`` instance is re-fitted for each column, so codes are not
    comparable between columns.

    Args:
        df: DataFrame to encode. It is modified in place.
        columns: Iterable of column names to encode.

    Returns:
        The same ``df`` object, with the listed columns replaced by codes.
    """
    encoder = preprocessing.LabelEncoder()
    for column in columns:
        as_text = df[column].astype(str)
        df[column] = encoder.fit_transform(as_text)
    return df

def preprocess(df):
    """
    Preprocesses the entire dataframe: node mappings, label encoding and
    timestamp-derived features, all of which are independent of the data split.

    Args:
        df: Transactions with the columns ``from_bank``, ``from_account``,
            ``to_bank``, ``to_account``, ``payment_type``, ``sent_currency``,
            ``received_currency`` and a datetime ``timestamp`` column.
            The caller's frame is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which:
          * ``from_account_id`` / ``to_account_id`` hold "<bank>_<account>"
            strings and ``from_account_idx`` / ``to_account_idx`` their
            integer node indices;
          * the categorical columns listed above are label encoded;
          * ``time_of_day``, ``hour_of_day``, ``day_of_week``,
            ``seconds_since_midnight``, ``timestamp_int`` (seconds since the
            Unix epoch, float), ``day_sin``/``day_cos``,
            ``time_of_day_sin``/``time_of_day_cos`` and ``is_weekend`` are
            added;
          * ``from_account`` and ``to_account`` are removed.
    """
    # Take the re-indexed copy first so every column added below lands on the
    # copy and nothing leaks into the frame the caller passed in.
    df = df.reset_index(drop=True)

    ## Create unique account - ID mapping ##
    # Get unique account-bank combos (a couple of acct numbers found at multiple banks)
    df["from_account_id"] = df["from_bank"].astype(str) + "_" + df["from_account"].astype(str)
    df["to_account_id"] = df["to_bank"].astype(str) + "_" + df["to_account"].astype(str)

    # Unique account ids in order of first appearance: all senders first, then
    # receivers that never appear as a sender.
    all_nodes = pd.concat(
        [df["from_account_id"], df["to_account_id"]], ignore_index=True
    ).drop_duplicates()

    # Map node identifiers to integer indices
    node_mapping = {node: idx for idx, node in enumerate(all_nodes)}
    df["from_account_idx"] = df["from_account_id"].map(node_mapping)
    df["to_account_idx"] = df["to_account_id"].map(node_mapping)

    ## Label encode categorical vars ##
    # Use label encoding and let model learn (instead of one-hot embeddings)
    # NOTE(review): each column is encoded independently, so the same bank can
    # get different codes in from_bank and to_bank (likewise for the two
    # currency columns) - confirm this is intended.
    df = df_label_encoder(
        df,
        ["payment_type", "sent_currency", "received_currency", "from_bank", "to_bank"],
    )

    ## Currency conversion ##
    # Not applied for now; only temporal features are being studied.

    ## Time transformations ##
    # Extract items from timestamp
    timestamps = df["timestamp"]
    df["time_of_day"] = timestamps.dt.time
    df["hour_of_day"] = timestamps.dt.hour
    df["day_of_week"] = timestamps.dt.weekday  # 0=Monday,...,6=Sunday
    df["seconds_since_midnight"] = (
        timestamps.dt.hour * 3600     # Convert hours to seconds
        + timestamps.dt.minute * 60   # Convert minutes to seconds
        + timestamps.dt.second        # Keep seconds
    )

    # Transform timestamp to seconds since the Unix epoch. Subtracting the
    # epoch and dividing by one second does not depend on the platform's
    # default integer width or on the datetime resolution of the column,
    # unlike `astype(int) / 10**9`.
    epoch = pd.Timestamp(0, tz=timestamps.dt.tz)
    df["timestamp_int"] = (timestamps - epoch) / pd.Timedelta(seconds=1)

    # Apply cyclical encoding
    df["day_sin"] = np.sin(2 * np.pi * df["day_of_week"] / 7)
    df["day_cos"] = np.cos(2 * np.pi * df["day_of_week"] / 7)
    df["time_of_day_sin"] = np.sin(2 * np.pi * df["seconds_since_midnight"] / 86400)
    df["time_of_day_cos"] = np.cos(2 * np.pi * df["seconds_since_midnight"] / 86400)

    # Create binary weekend indicator
    df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

    # The raw account numbers are superseded by the combined *_account_id columns.
    df = df.drop(columns=["from_account", "to_account"])

    return df

### Preprocessing function for train-val-test separately

In [None]:
def _seconds_since_previous(df, account_col):
    """Per row of ``df``: seconds since the previous transaction sharing the same ``account_col`` value (0 for the first)."""
    # Drop the index so results can be mapped back by row position; this stays
    # correct even if the caller's index contains duplicate labels.
    keyed = df[[account_col, "timestamp"]].reset_index(drop=True)
    # Diffs are only meaningful when each account's rows are in time order.
    chronological = keyed.sort_values(by=[account_col, "timestamp"])
    gaps = (
        chronological.groupby(account_col)["timestamp"]
        .diff()
        .dt.total_seconds()
        .fillna(0)
    )
    # Restore the row order of `df`.
    return gaps.sort_index().to_numpy()


def preprocess_split(df, scaler_time, scaler_amount):
    """
    Adds split-specific temporal features to one data split.

    Args:
        df: One split (train/val/test) containing ``from_account_idx``,
            ``to_account_idx``, ``timestamp`` (datetime) and ``timestamp_int``.
        scaler_time: Already-fitted scaler; its ``transform`` is applied to
            ``timestamp_int``.
        scaler_amount: Currently unused (amount scaling is disabled); kept so
            existing callers do not need to change.

    Returns:
        A new DataFrame sorted by ``from_account_idx`` then ``timestamp`` with
        ``timestamp_scaled``, ``time_diff_from_acct`` and ``time_diff_to_acct``
        added. Each time difference is the number of seconds since the
        previous transaction of the same sending / receiving account within
        this split, and 0 for an account's first transaction.
    """
    # Sort transactions by sending account, then time
    df = df.sort_values(by=["from_account_idx", "timestamp"])

    # Apply scaling
    df["timestamp_scaled"] = scaler_time.transform(df[["timestamp_int"]])

    # Compute time difference between transactions per account, in seconds.
    # Each difference is taken over rows ordered chronologically for the
    # account in question; diffing receiver groups in the sender-sorted order
    # would produce negative or meaningless gaps.
    df["time_diff_from_acct"] = _seconds_since_previous(df, "from_account_idx")
    df["time_diff_to_acct"] = _seconds_since_previous(df, "to_account_idx")

    return df

### Run preprocessing

In [None]:
# Run the split-independent preprocessing on a copy so `df` itself is untouched.
df_original = df.copy()
df_transformed = preprocess(df_original)

### Random stratified
# 80/20 train/test split, stratified on the laundering label.
train_df_rs, test_df_rs = train_test_split(
    df_transformed,
    stratify=df_transformed["is_laundering"],
    test_size=0.2,
    random_state=42,
)

### Scaling the data
# One scaler for the timestamp, one for the amounts.
scaler1 = MinMaxScaler()
scaler2 = StandardScaler()

# Fit on the training split only; the fitted scalers are then passed to the
# per-split preprocessing of both train and test.
scaler1.fit(train_df_rs[["timestamp_int"]])
scaler2.fit(train_df_rs[["sent_amount", "received_amount"]])

### Preprocess
train_df_rs = preprocess_split(train_df_rs, scaler1, scaler2)
test_df_rs = preprocess_split(test_df_rs, scaler1, scaler2)

## Transformations for model pipeline

### Applied to whole dataset

Combine bank + account for unique IDs as there were a few duplicate account numbers at different banks in the dataset:

In [None]:
## Create unique account - ID mapping ##
# An account number alone is not unique (a couple appear at multiple banks),
# so the id is "<bank>_<account>" for both ends of the transaction.
df["from_account_id"] = (
    df["from_bank"].astype(str).str.cat(df["from_account"].astype(str), sep="_")
)
df["to_account_id"] = (
    df["to_bank"].astype(str).str.cat(df["to_account"].astype(str), sep="_")
)

# The raw account numbers are no longer needed once the combined ids exist.
df.drop(columns=["from_account", "to_account"], inplace=True)

Map new node ids (from_bank + from_account, to_bank + to_account) to integer indices:

In [None]:
# Start from a clean 0..n-1 row index.
df = df.reset_index(drop=True)

# Distinct account ids on each side, in order of first appearance.
from_nodes = df["from_account_id"].drop_duplicates().reset_index(drop=True)
to_nodes = df["to_account_id"].drop_duplicates().reset_index(drop=True)

# Union of both sides: senders first, then receivers not already seen.
all_nodes = (
    pd.concat([from_nodes, to_nodes])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Assign each account id its position in `all_nodes` as integer node index.
node_mapping = dict(zip(all_nodes, range(len(all_nodes))))
df["from_account_idx"] = df["from_account_id"].map(node_mapping)
df["to_account_idx"] = df["to_account_id"].map(node_mapping)

Extract items from timestamp:

In [None]:
# Extract items from timestamp
df["hour_of_day"] = df["timestamp"].dt.hour
df["day_of_week"] = df["timestamp"].dt.weekday  # 0=Monday,...,6=Sunday
df["seconds_since_midnight"] = (
    df["timestamp"].dt.hour * 3600     # Convert hours to seconds
    + df["timestamp"].dt.minute * 60   # Convert minutes to seconds
    + df["timestamp"].dt.second        # Keep seconds
)
# Transform timestamp to seconds since the Unix epoch. Subtracting the epoch
# and dividing by one second does not depend on the platform's default integer
# width or on the datetime resolution of the column, unlike
# `astype(int) / 10**9`.
df["timestamp_int"] = (
    df["timestamp"] - pd.Timestamp(0, tz=df["timestamp"].dt.tz)
) / pd.Timedelta(seconds=1)

Cyclical encoding:

In [None]:
# Apply cyclical encoding: place each value on the unit circle so the end of a
# cycle sits next to its start (Sunday next to Monday, 23:59:59 next to 00:00:00).
day_angle = 2 * np.pi * df["day_of_week"] / 7                    # 7 days per week
second_angle = 2 * np.pi * df["seconds_since_midnight"] / 86400  # 86400 seconds per day

df["day_sin"] = np.sin(day_angle)
df["day_cos"] = np.cos(day_angle)
df["time_of_day_sin"] = np.sin(second_angle)
df["time_of_day_cos"] = np.cos(second_angle)

Create binary weekend indicator

In [None]:
# day_of_week runs 0=Monday..6=Sunday, so values 5 and 6 are Saturday/Sunday.
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)

### Applied on train/val/test separately

In [None]:
def _seconds_since_previous(df, account_col):
    """Per row of ``df``: seconds since the previous transaction sharing the same ``account_col`` value (0 for the first)."""
    # Drop the index so results can be mapped back by row position; this stays
    # correct even if the caller's index contains duplicate labels.
    keyed = df[[account_col, "timestamp"]].reset_index(drop=True)
    # Diffs are only meaningful when each account's rows are in time order.
    chronological = keyed.sort_values(by=[account_col, "timestamp"])
    gaps = (
        chronological.groupby(account_col)["timestamp"]
        .diff()
        .dt.total_seconds()
        .fillna(0)
    )
    # Restore the row order of `df`.
    return gaps.sort_index().to_numpy()


def preprocess_split(df, scaler_time, scaler_amount):
    """
    Adds split-specific temporal features to one data split.

    Args:
        df: One split (train/val/test) containing ``from_account_idx``,
            ``to_account_idx``, ``timestamp`` (datetime) and ``timestamp_int``.
        scaler_time: Already-fitted scaler; its ``transform`` is applied to
            ``timestamp_int``.
        scaler_amount: Currently unused; kept so existing callers do not need
            to change.

    Returns:
        A new DataFrame sorted by ``from_account_idx`` then ``timestamp`` with
        ``timestamp_scaled``, ``time_diff_from_acct`` and ``time_diff_to_acct``
        added. Each time difference is the number of seconds since the
        previous transaction of the same sending / receiving account within
        this split, and 0 for an account's first transaction.
    """
    # Sort transactions by sending account, then time
    df = df.sort_values(by=["from_account_idx", "timestamp"])

    # Apply scaling
    df["timestamp_scaled"] = scaler_time.transform(df[["timestamp_int"]])

    # Compute time difference between transactions per account, in seconds.
    # Each difference is taken over rows ordered chronologically for the
    # account in question; diffing receiver groups in the sender-sorted order
    # would produce negative or meaningless gaps.
    df["time_diff_from_acct"] = _seconds_since_previous(df, "from_account_idx")
    df["time_diff_to_acct"] = _seconds_since_previous(df, "to_account_idx")

    return df