# Import Dataset

In [1]:
import numpy as np
import pandas as pd

# Read the transactions CSV from disk into a DataFrame.
file_path = "/Users/zoe_mac/Desktop/EDA/HI-Small_Trans.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [2]:
# Display the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


# Create Features

In [None]:
from tqdm.notebook import tqdm

# Give every transaction an explicit identifier taken from the frame's
# existing index. reset_index() already returns a new DataFrame, so `df`
# itself is left untouched and no separate df.copy() is needed (the extra
# copy only doubled peak memory).
df_nodes = df.reset_index().rename(columns={'index': 'TransactionID'})

# Step 1. Build an edge list for outgoing neighbors.
# There is an edge from a transaction (Source) to a transaction (Neighbor) when
# the receiver of the Source (Account.1) equals the sender of the Neighbor (Account).
edges = df_nodes[['TransactionID', 'Account.1']].rename(
    columns={'TransactionID': 'Source', 'Account.1': 'Receiver'}
)

# Attach each neighbor's columns to its Source. The left join keeps Sources
# whose receiver never appears as a sender; their neighbor columns are NaN.
# NOTE(review): this is a many-to-many join on the account id, so the result
# can be far larger than df_nodes -- confirm it fits in memory on the full data.
neighbors = edges.merge(df_nodes, left_on='Receiver', right_on='Account', how='left')

# Step 2. Compute aggregated statistics for a given feature over outgoing neighbors.
def compute_aggregated_stats(neighbors_df, feature):
    """
    Summarise one neighbor feature per source transaction.

    Rows of ``neighbors_df`` are grouped by their 'Source' value and the
    minimum, maximum and standard deviation of ``feature`` are taken within
    each group (pandas' default sample standard deviation, which is NaN for
    a group with a single value).

    Returns a DataFrame with the columns 'TransactionID',
    '<feature>_min', '<feature>_max' and '<feature>_std', one row per Source.
    """
    per_source = neighbors_df.groupby('Source')[feature].agg(['min', 'max', 'std'])
    # Prefix each statistic with the feature name, e.g. 'min' -> '<feature>_min'.
    labelled = per_source.rename(columns=lambda stat: f'{feature}_{stat}')
    # Name the group key 'TransactionID' before turning it back into a column.
    return labelled.rename_axis('TransactionID').reset_index()

# Function to compute aggregated stats for a list of features with a progress bar.
def compute_all_aggregated_stats(neighbors_df, feature_list):
    """
    Aggregate several neighbor features and combine them into one table.

    For every name in ``feature_list`` the per-Source min/max/std table is
    computed with ``compute_aggregated_stats`` (a progress bar tracks the
    loop) and the tables are outer-merged on 'TransactionID'.

    Parameters:
        neighbors_df: DataFrame holding a 'Source' column plus the feature
            columns to aggregate.
        feature_list: iterable of column names to aggregate.

    Returns:
        DataFrame with a 'TransactionID' column followed by the
        '<feature>_min' / '_max' / '_std' columns of every feature. When
        ``feature_list`` is empty the result holds only the 'TransactionID'
        column (one row per distinct Source), so it can still be merged on
        that key.
    """
    # Materialise once so any iterable can be tested for emptiness.
    features = list(feature_list)

    if not features:
        # Previously this case crashed with IndexError on agg_dfs[0].
        # Return the group keys alone, in the same order groupby would emit.
        return (
            neighbors_df.groupby('Source').size().index
            .to_frame(index=False, name='TransactionID')
        )

    agg_all = None
    for feat in tqdm(features, desc="Aggregating features"):
        agg_df = compute_aggregated_stats(neighbors_df, feat)
        if agg_all is None:
            # The first table seeds the result; later ones are merged onto it.
            agg_all = agg_df
        else:
            agg_all = agg_all.merge(agg_df, on="TransactionID", how="outer")
    return agg_all

# Neighbor features to summarise; add further numeric column names as needed.
features_to_aggregate = ["Amount Received"]

# Build the per-transaction statistics table (a progress bar tracks the features).
agg_features = compute_all_aggregated_stats(neighbors, features_to_aggregate)

# Step 3. Attach the statistics to every transaction; the left join keeps
# transactions that have no row in the statistics table.
df_nodes_agg = df_nodes.merge(agg_features, how="left", on="TransactionID")

# Missing statistics (e.g. a transaction without outgoing neighbors) become 0.
# Every column whose name ends in one of the statistic suffixes is filled.
stat_suffixes = ('_min', '_max', '_std')
zero_fill = {name: 0 for name in df_nodes_agg.columns if name.endswith(stat_suffixes)}
df_nodes_agg = df_nodes_agg.fillna(zero_fill)

print(df_nodes_agg)
