# Example notebook for CatBoost

## Notebook configuration

In [1]:
# Load some libraries
import os 
import sys
import pandas as pd
import numpy as np

# Make the project's src/ directory importable (resolved as ../../src relative to
# the current working directory), then import our custom modules:
#   helpers:  provides add_cell_timer
#   pipeline: all data preprocessing (ModelPipeline, CatBoostPipeline)
# NOTE(review): this comment previously also listed "model: for GNN model & trainer",
# but no model module is imported in this cell — looks like a leftover; confirm.
sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))
from helpers import add_cell_timer
from pipeline import ModelPipeline  # NOTE(review): not referenced in the cells visible here — confirm it is still needed
from pipeline.catboost_pipeline import CatBoostPipeline
add_cell_timer()  # presumably enables per-cell timing output — confirm in helpers

# Path to the transactions CSV, relative to the current working directory
data_file = "../../data/subset_transactions2.csv"

## Load and preprocess data

In [2]:
# Build the CatBoost-specific pipeline object for the CSV at `data_file`.
# NOTE(review): presumably the constructor reads the file into pl.df — confirm in CatBoostPipeline.
pl = CatBoostPipeline(data_file)

In [3]:
# Whole-dataset preprocessing: each call is made on the shared pipeline object
# `pl`, in the order listed. The "presumably" notes below are inferred from the
# column names shown by `pl.df.columns` later in the notebook — confirm in the pipeline.
pl.rename_columns()
pl.drop_duplicates()
pl.check_for_null()
pl.extract_currency_features()
pl.extract_time_features()
pl.create_unique_ids()  # presumably adds edge_id, from_account_idx, to_account_idx
# Disabled step — NOTE(review): either remove it or document why it is switched off.
# pl.extract_additional_time_features()
pl.cyclical_encoding()  # presumably adds day_cos/day_sin and time_of_day_cos/time_of_day_sin
pl.apply_one_hot_encoding()  # presumably adds the received_currency_*/sent_currency_*/payment_type_* columns



## Split data into train/val/test, and continue with split-specific feature engineering
There are some features that, if engineered or standardized using the whole dataset, could result in data leakage between our train/val/test sets. Therefore, we must split the data prior to these calculations. 
For CatBoost, we will use a temporal split.

In [4]:
# Show the columns of pl.df as they stand after the preprocessing calls above.
pl.df.columns

Index(['from_bank', 'to_bank', 'received_amount', 'sent_amount',
       'is_laundering', 'log_exchange_rate', 'sent_amount_usd', 'is_weekend',
       'timestamp_int', 'timestamp_scaled', 'edge_id', 'from_account_idx',
       'to_account_idx', 'day_cos', 'day_sin', 'time_of_day_cos',
       'time_of_day_sin', 'received_currency_Australian Dollar',
       'received_currency_Bitcoin', 'received_currency_Brazil Real',
       'received_currency_Canadian Dollar', 'received_currency_Euro',
       'received_currency_Mexican Peso', 'received_currency_Ruble',
       'received_currency_Rupee', 'received_currency_Saudi Riyal',
       'received_currency_Shekel', 'received_currency_Swiss Franc',
       'received_currency_UK Pound', 'received_currency_US Dollar',
       'received_currency_Yen', 'received_currency_Yuan',
       'sent_currency_Australian Dollar', 'sent_currency_Bitcoin',
       'sent_currency_Brazil Real', 'sent_currency_Canadian Dollar',
       'sent_currency_Euro', 'sent_currency_Mex

In [5]:
# Columns left out of the edge (transaction-level) feature set.
excluded_cols = {'hour_of_day', 'is_weekend', 'sent_amount', 'received_amount'}

# Keep the remaining columns in the DataFrame's own column order, de-duplicated.
# A plain set difference would work too, but the iteration order of a set of
# strings changes between kernel sessions (string hash randomization), so the
# feature order handed to the model would differ from run to run.
edge_feats = list(dict.fromkeys(col for col in pl.df.columns if col not in excluded_cols))

# NOTE(review): 'is_laundering' is listed in pl.df.columns and is not excluded here —
# confirm that split_train_test_val separates the label from X_cols.

# Temporal split for edges
pl.split_train_test_val(X_cols=edge_feats) # default is temporal split, keeps account_idx's for node feature mapping

Keeping from_account_idx and to_account_idx (for merging node feats onto tabular data for Catboost)


### Create node features
Node features are specific to accounts and include graph-based features such as PageRank and degree centrality, as well as aggregate statistics such as net flow (total amount sent minus total amount received for a given account).

In [6]:
# Compute node features separately for each split (train / val / test)
pl.compute_split_specific_node_features()

# Scale the node feature frames.
# NOTE(review): per the original note, only the "relevant" node features are scaled
# and others such as pagerank are left raw; which columns those are is decided
# inside the pipeline — confirm in scale_node_data_frames.
pl.scale_node_data_frames()

✅ Computed node features for train with 107090 nodes.
✅ Computed node features for val with 107355 nodes.
✅ Computed node features for test with 107583 nodes.


In [7]:
# Peek at the node-feature columns computed for the training split.
# Left as the cell's last expression (no print) so it is shown as the cell
# result, consistent with the other column-inspection cells in this notebook.
pl.train_nodes.columns

Index(['node_id', 'degree_centrality', 'pagerank', 'net_flow', 'avg_txn_out',
       'avg_txn_in', 'std_txn_out', 'std_txn_in', 'num_unique_out_partners',
       'num_unique_in_partners'],
      dtype='object')


### Add node features to df
With CatBoost, we are working with tabular transaction data, so we need to merge our node-level stats onto it.

In [8]:
# Training-feature columns before the node features are merged in (see the next cell).
pl.X_train.columns

Index(['sent_currency_Rupee', 'payment_type_Reinvestment', 'timestamp_scaled',
       'received_currency_Saudi Riyal', 'received_currency_Rupee',
       'received_currency_Swiss Franc', 'payment_type_Cheque',
       'sent_amount_usd', 'sent_currency_Yen', 'payment_type_Bitcoin',
       'payment_type_Wire', 'day_cos', 'log_exchange_rate', 'day_sin',
       'sent_currency_Euro', 'sent_currency_Shekel', 'sent_currency_UK Pound',
       'edge_id', 'sent_currency_Mexican Peso', 'payment_type_ACH',
       'received_currency_Canadian Dollar', 'sent_currency_Canadian Dollar',
       'received_currency_Mexican Peso', 'received_currency_Ruble',
       'sent_currency_Australian Dollar',
       'received_currency_Australian Dollar', 'received_currency_Bitcoin',
       'sent_currency_Yuan', 'received_currency_Shekel', 'time_of_day_cos',
       'received_currency_US Dollar', 'sent_currency_Ruble', 'time_of_day_sin',
       'sent_currency_Bitcoin', 'received_currency_Euro', 'from_account_idx',
      

In [9]:
# Merge the account-level node features onto the transaction-level data.
# NOTE(review): presumably joined via from_account_idx / to_account_idx, which the
# split step reported keeping for this purpose — confirm in the pipeline.
pl.add_node_graph_feats_to_df()

In [10]:
# Re-inspect the training columns now that add_node_graph_feats_to_df() has run.
pl.X_train.columns

Index(['sent_currency_Rupee', 'payment_type_Reinvestment', 'timestamp_scaled',
       'received_currency_Saudi Riyal', 'received_currency_Rupee',
       'received_currency_Swiss Franc', 'payment_type_Cheque',
       'sent_amount_usd', 'sent_currency_Yen', 'payment_type_Bitcoin',
       'payment_type_Wire', 'day_cos', 'log_exchange_rate', 'day_sin',
       'sent_currency_Euro', 'sent_currency_Shekel', 'sent_currency_UK Pound',
       'edge_id', 'sent_currency_Mexican Peso', 'payment_type_ACH',
       'received_currency_Canadian Dollar', 'sent_currency_Canadian Dollar',
       'received_currency_Mexican Peso', 'received_currency_Ruble',
       'sent_currency_Australian Dollar',
       'received_currency_Australian Dollar', 'received_currency_Bitcoin',
       'sent_currency_Yuan', 'received_currency_Shekel', 'time_of_day_cos',
       'received_currency_US Dollar', 'sent_currency_Ruble', 'time_of_day_sin',
       'sent_currency_Bitcoin', 'received_currency_Euro',
       'sent_currency_US D

**TODO:** `timestamp_int` and `edge_id` need to be dropped before running CatBoost.

In [11]:
# Numeric edge-level columns handed to the pipeline's scaler.
# Not currently included: time_diff_from, time_diff_to, turnaround_time.
edge_numeric_features = ['timestamp_scaled', 'sent_amount_usd']

# Scale the edge features listed above
pl.numerical_scaling(numerical_features=edge_numeric_features)

# Model