# Initial Preprocessing

In [11]:
# Load the transaction subset and run the basic preprocessing / feature steps.
import os

# The CSV location can be overridden with the TRANSACTIONS_CSV environment
# variable so the notebook is not tied to one machine; the fallback is the
# original local path, so existing runs behave exactly as before.
TRANSACTIONS_CSV = os.environ.get(
    "TRANSACTIONS_CSV",
    "/Users/zoe_mac/Desktop/EDA/subset_transactions2.csv",
)
mp = ModelPipeline(TRANSACTIONS_CSV)

# basic cleaning & features
# NOTE(review): the saved output of this cell logs
# "Error in preprocessing: 'ModelPipeline' object has no attribute 'binary_weekend'"
# -- confirm that every step below completes on a fresh kernel.
mp.rename_columns()
mp.drop_duplicates()
mp.check_for_null()
mp.extract_currency_features()
mp.extract_time_features()
mp.cyclical_encoding()
mp.create_unique_ids()
mp.extract_additional_time_features()

# One-hot encode only these categorical columns; no other columns are passed
# to the encoder here.
mp.apply_one_hot_encoding(
    onehot_categorical_features=["sent_currency", "received_currency", "payment_type"]
)

INFO:root:Running preprocessing pipeline...

INFO:root:Extracting currency features...
INFO:root:Extracting time features...
INFO:root:Adding cyclical encoding to time features...
INFO:root:Error in preprocessing: 'ModelPipeline' object has no attribute 'binary_weekend'


# Train/Val/Test Split

In [12]:
# Split with the "temporal" strategy, using val_size and test_size of 0.15 each.
# The call is the cell's last expression, so its return value is displayed.
mp.split_train_test_val(
    split_type="temporal",
    test_size=0.15,
    val_size=0.15,
)

INFO:root:Using the following set of 'X_cols'
INFO:root:['currency_changed', 'day_cos', 'day_of_week', 'day_sin', 'hour_of_day', 'is_weekend', 'payment_type', 'received_amount', 'received_amount_usd', 'received_currency', 'seconds_since_midnight', 'sent_amount', 'sent_amount_usd', 'sent_currency', 'time_of_day_cos', 'time_of_day_sin', 'timestamp_int', 'timestamp_scaled']
INFO:root:Data split using temporal method.


(        currency_changed  day_cos  day_of_week       day_sin  hour_of_day  \
 51491                  0     -1.0            3  1.224647e-16            0   
 4970                   0     -1.0            3  1.224647e-16            0   
 54530                  0     -1.0            3  1.224647e-16            0   
 63921                  0     -1.0            3  1.224647e-16            0   
 10371                  0     -1.0            3  1.224647e-16            0   
 ...                  ...      ...          ...           ...          ...   
 875162                 0     -0.5            2  8.660254e-01           16   
 875281                 0     -0.5            2  8.660254e-01           16   
 875478                 0     -0.5            2  8.660254e-01           16   
 875841                 0     -0.5            2  8.660254e-01           16   
 875840                 1     -0.5            2  8.660254e-01           16   
 
         is_weekend  payment_type  received_amount  received_a

# Calculate Graph-based Features

In [17]:
# Compute node features separately for each split (train / val / test),
# using the two USD amount columns as the graph features.
usd_amount_columns = ["sent_amount_usd", "received_amount_usd"]
mp.compute_split_specific_node_features(graph_features=usd_amount_columns)

INFO:root:Getting train-test-split-specific node features
INFO:root:Computing train node features...
INFO:root:Computing val node features...


✅ Computed node features for train with 107090 nodes.


INFO:root:Computing test node features...


✅ Computed node features for val with 107355 nodes.
✅ Computed node features for test with 107583 nodes.


# Check with GNN

In [23]:
# Display the training feature matrix held by the pipeline. The recorded
# output below is elided ("..." row and column), not the full frame.
mp.X_train

Unnamed: 0,currency_changed,day_cos,day_of_week,day_sin,edge_id,hour_of_day,is_weekend,payment_type,received_amount,received_amount_usd,...,sent_currency_UK Pound,sent_currency_US Dollar,sent_currency_Yen,sent_currency_Yuan,time_diff_from,time_of_day_cos,time_of_day_sin,timestamp_int,timestamp_scaled,turnaround_time
0,0,-1.0,3,1.224647e-16,0,0,0,Reinvestment,1096.91,8.534399e+04,...,0.0,0.0,0.0,0.0,-1.0,1.000000,0.000000,1.661990e+09,1.661990e+09,-1.0
1534,0,-1.0,3,1.224647e-16,1534,0,0,Credit Card,19152.04,1.634479e+04,...,0.0,0.0,0.0,0.0,-1.0,1.000000,0.000000,1.661990e+09,1.661990e+09,-1.0
1533,0,-1.0,3,1.224647e-16,1533,0,0,Reinvestment,28688.18,1.921409e+05,...,0.0,0.0,0.0,1.0,-1.0,1.000000,0.000000,1.661990e+09,1.661990e+09,-1.0
1532,0,-1.0,3,1.224647e-16,1532,0,0,Credit Card,4.85,4.139101e+00,...,0.0,0.0,0.0,0.0,-1.0,1.000000,0.000000,1.661990e+09,1.661990e+09,-1.0
1531,0,-1.0,3,1.224647e-16,1531,0,0,Reinvestment,16956.70,1.447124e+04,...,0.0,0.0,0.0,0.0,-1.0,1.000000,0.000000,1.661990e+09,1.661990e+09,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875665,0,-0.5,2,8.660254e-01,875665,16,0,Cheque,385262.15,2.997494e+07,...,0.0,0.0,0.0,0.0,180.0,-0.387972,-0.921671,1.662568e+09,1.662568e+09,-1.0
875664,0,-0.5,2,8.660254e-01,875664,16,0,Credit Card,12871.48,1.287148e+04,...,0.0,1.0,0.0,0.0,420.0,-0.387972,-0.921671,1.662568e+09,1.662568e+09,14700.0
875663,0,-0.5,2,8.660254e-01,875663,16,0,Credit Card,63955.70,6.740222e+06,...,0.0,0.0,1.0,0.0,60.0,-0.387972,-0.921671,1.662568e+09,1.662568e+09,5700.0
875662,0,-0.5,2,8.660254e-01,875662,16,0,Cheque,2.06,1.758051e+00,...,0.0,0.0,0.0,0.0,1020.0,-0.387972,-0.921671,1.662568e+09,1.662568e+09,16380.0


In [27]:
# Column names of the local training matrix, as a plain list.
my_cols = mp.X_train.columns.to_list()

# Reference feature list from the other implementation (pasted verbatim),
# grouped by kind: amounts / timing, cyclical time encodings, one-hot currencies,
# one-hot payment types.
friend_cols = [
    'received_amount', 'sent_amount', 'log_exchange_rate', 'sent_amount_usd',
    'timestamp_scaled', 'time_diff_from', 'time_diff_to', 'turnaround_time',
    'day_cos', 'day_sin', 'time_of_day_cos', 'time_of_day_sin',
    'received_currency_Australian Dollar',
    'received_currency_Bitcoin',
    'received_currency_Brazil Real',
    'received_currency_Canadian Dollar',
    'received_currency_Euro',
    'received_currency_Mexican Peso',
    'received_currency_Ruble',
    'received_currency_Rupee',
    'received_currency_Saudi Riyal',
    'received_currency_Shekel',
    'received_currency_Swiss Franc',
    'received_currency_UK Pound',
    'received_currency_US Dollar',
    'received_currency_Yuan',
    'sent_currency_Australian Dollar',
    'sent_currency_Bitcoin',
    'sent_currency_Brazil Real',
    'sent_currency_Canadian Dollar',
    'sent_currency_Euro',
    'sent_currency_Mexican Peso',
    'sent_currency_Ruble',
    'sent_currency_Rupee',
    'sent_currency_Saudi Riyal',
    'sent_currency_Shekel',
    'sent_currency_Swiss Franc',
    'sent_currency_UK Pound',
    'sent_currency_US Dollar',
    'sent_currency_Yuan',
    'payment_type_ACH',
    'payment_type_Bitcoin',
    'payment_type_Cash',
    'payment_type_Cheque',
    'payment_type_Credit Card',
    'payment_type_Reinvestment',
    'payment_type_Wire',
]

# Order-insensitive comparison: the two lists match iff their symmetric
# difference is empty.
print("Exact same columns?", not set(my_cols).symmetric_difference(friend_cols))

# Columns present locally but missing from the reference list.
print("You have but they don’t:", sorted(set(my_cols).difference(friend_cols)))

# Columns in the reference list that the local matrix lacks.
print("They have but you don’t:", sorted(set(friend_cols).difference(my_cols)))

Exact same columns? False
You have but they don’t: ['currency_changed', 'day_of_week', 'edge_id', 'hour_of_day', 'is_weekend', 'received_amount_usd', 'received_currency_Yen', 'seconds_since_midnight', 'sent_currency_Yen', 'timestamp_int']
They have but you don’t: ['log_exchange_rate', 'time_diff_to']


In [28]:
# Display the node-feature table for the train split (one row per node_id in
# the recorded output below).
mp.train_nodes

Unnamed: 0,node_id,degree_centrality,in_degree,out_degree,pagerank,out_count,out_sum,out_mean,out_std,out_min,...,in_sum,in_mean,in_std,in_min,in_max,num_unique_out_partners,num_unique_in_partners,net_flow,avg_txn_in,avg_txn_out
0,0,0.000056,0.000037,0.000019,0.000003,14.0,1.127412e+07,8.052942e+05,5.098178e+05,85343.986037,...,6.758098e+07,2.180032e+06,2.818153e+06,8.534399e+04,8.289030e+06,2.0,4.0,-5.630686e+07,2.180032e+06,8.052942e+05
1,1,0.000037,0.000019,0.000019,0.000006,3.0,2.275002e+04,7.583340e+03,9.225350e+03,21.420000,...,1.006931e+04,2.517328e+03,2.262098e+03,2.142000e+01,4.866600e+03,2.0,2.0,1.268071e+04,2.517328e+03,7.583340e+03
2,2,0.000075,0.000009,0.000065,0.000003,63.0,3.303730e+06,5.244016e+04,2.425593e+05,35.799567,...,1.865087e+06,1.434682e+05,4.648219e+05,1.168599e+02,1.682219e+06,7.0,1.0,1.438643e+06,1.434682e+05,5.244016e+04
3,3,0.000028,0.000009,0.000019,0.000002,2.0,2.801776e+03,1.400888e+03,1.967452e+03,9.689571,...,9.689571e+00,9.689571e+00,0.000000e+00,9.689571e+00,9.689571e+00,2.0,1.0,2.792086e+03,9.689571e+00,1.400888e+03
4,4,0.000047,0.000028,0.000019,0.000012,3.0,2.356892e+06,7.856306e+05,1.349400e+06,6.900000,...,2.173288e+04,9.055367e+02,2.637024e+03,6.900000e+00,1.312005e+04,2.0,3.0,2.335159e+06,9.055367e+02,7.856306e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107085,107085,0.000009,0.000000,0.000009,0.000002,1.0,8.266209e+02,8.266209e+02,0.000000e+00,826.620920,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.0,0.0,0.000000e+00,0.000000e+00,8.266209e+02
107086,107086,0.000009,0.000000,0.000009,0.000002,1.0,5.101010e+02,5.101010e+02,0.000000e+00,510.101043,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.0,0.0,0.000000e+00,0.000000e+00,5.101010e+02
107087,107087,0.000028,0.000009,0.000019,0.000010,2.0,8.443531e+07,4.221765e+07,5.969579e+07,6357.345521,...,8.442895e+07,8.442895e+07,0.000000e+00,8.442895e+07,8.442895e+07,2.0,1.0,6.357346e+03,8.442895e+07,4.221765e+07
107088,107088,0.000009,0.000000,0.000009,0.000002,1.0,5.612109e+01,5.612109e+01,0.000000e+00,56.121087,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.0,0.0,0.000000e+00,0.000000e+00,5.612109e+01


# Combine Node Features & Edge Features

In [None]:
def make_edge_node_table(df, indices, node_df):
    """Attach sender- and receiver-side node features to one split's edges.

    The rows of `df` selected by `indices` (label-based, via `.loc`) are copied
    and then left-joined twice against `node_df` keyed by its "node_id" column:

      * on "from_account_idx", with every node column prefixed "sender_"
      * on "to_account_idx",   with every node column prefixed "receiver_"

    Edges whose account index has no row in `node_df` keep NaN in the joined
    columns. Neither `df` nor `node_df` is modified.

    Returns the enriched edge DataFrame (original columns first, then the
    sender_* columns, then the receiver_* columns).
    """
    table = df.loc[indices].copy()
    features_by_node = node_df.set_index("node_id")

    # Same lookup table, two roles: the prefix keeps the column names distinct.
    role_joins = (
        ("sender_", "from_account_idx"),
        ("receiver_", "to_account_idx"),
    )
    for prefix, key_column in role_joins:
        table = table.join(features_by_node.add_prefix(prefix), on=key_column)

    return table

# Build one edge+node table per split, in train / val / test order. Each split
# pairs its own edge indices (mp.<split>_indices) with its own node-feature
# table (mp.<split>_nodes).
train_table, val_table, test_table = (
    make_edge_node_table(
        mp.df,
        getattr(mp, f"{split}_indices"),
        getattr(mp, f"{split}_nodes"),
    )
    for split in ("train", "val", "test")
)

In [None]:
def make_X_y(df, label_col="is_laundering", drop_cols=None):
    """Split a DataFrame into a feature matrix X and a label series y.

    Parameters
    ----------
    df : DataFrame that contains `label_col`.
    label_col : name of the target column.
    drop_cols : optional iterable of column names (or a single name) to remove
        from X. When None, a default set of identifier columns is removed
        (account / bank identifiers, "edge_id", "timestamp_int").

    Returns
    -------
    (X, y) where y is a copy of df[label_col] and X is df without the dropped
    columns. `label_col` is ALWAYS removed from X, even when a custom
    `drop_cols` does not list it, so the target can never leak into the
    features. Names in the drop set that are absent from df are ignored.

    Raises
    ------
    KeyError if `label_col` is not a column of df.
    """
    default_drops = (
        "from_account_idx", "to_account_idx",
        "from_account", "to_account",
        "from_bank", "to_bank",
        "edge_id", "timestamp_int",
    )

    if drop_cols is None:
        drops = set(default_drops)
    elif isinstance(drop_cols, str):
        # A bare string would otherwise be split into single characters by set().
        drops = {drop_cols}
    else:
        drops = set(drop_cols)

    # The label must never remain among the features.
    drops.add(label_col)

    y = df[label_col].copy()
    X = df.drop(columns=list(drops.intersection(df.columns)))
    return X, y

# Derive (features, label) pairs for each split with the default drop list,
# in train / val / test order.
(train_X, train_y), (val_X, val_y), (test_X, test_y) = (
    make_X_y(split_table)
    for split_table in (train_table, val_table, test_table)
)

# Run model with all features

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# CatBoost model: Logloss objective, F1 as the evaluation metric, and the
# positive class (1) weighted 12x relative to class 0.
# NOTE(review): CatBoostClassifier is not imported anywhere in this section --
# confirm the notebook's import cell provides it.
final_model = CatBoostClassifier(
    iterations=1000,
    depth=8,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='F1',
    class_weights={0: 1, 1: 12},
    random_seed=42,
    silent=True
)

# Fit the estimator defined above on the train split built by make_X_y.
# The validation split is the eval_set; with use_best_model=True the fitted
# model keeps the iteration with the best eval_metric on that set.
# NOTE(review): if train_X still contains string-valued columns (for example a
# raw payment_type column), pass them via cat_features= -- confirm against mp.df.
final_model.fit(train_X, train_y, eval_set=(val_X, val_y), use_best_model=True)

# Evaluate on the held-out test split: hard labels for the report, and the
# predicted probability of class 1 for ROC AUC.
y_pred = final_model.predict(test_X)
y_prob = final_model.predict_proba(test_X)[:, 1]

print("Classification Report:")
print(classification_report(test_y, y_pred))
print(f"ROC AUC Score: {roc_auc_score(test_y, y_prob):.4f}")