# Example notebook for CatBoost

## Notebook configuration

In [None]:
# Standard-library and third-party imports
import os 
import sys
import pandas as pd
import numpy as np

# Make the project's `src` directory (two levels up) importable, then load our custom modules:
#   helpers:  provides the cell timer
#   pipeline: data preprocessing (generic ModelPipeline and the CatBoost-specific pipeline)
sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))
from helpers import add_cell_timer
from pipeline import ModelPipeline
from pipeline.catboost_pipeline import CatBoostPipeline
# Turn on the cell timer helper for this notebook
add_cell_timer()

# Relative path to the transactions CSV that is handed to the pipeline below
data_file = "../../data/subset_transactions2.csv"

## Load and preprocess data

In [None]:
# Build the CatBoost preprocessing pipeline from the transactions CSV.
# The class name must match the import (`CatBoostPipeline`, capital "B");
# the previous spelling `CatboostPipeline` raised a NameError on a fresh kernel.
pl = CatBoostPipeline(data_file)

In [3]:
# Run the pipeline's whole-dataset preprocessing steps, one after another.
# Each call acts on the pipeline object `pl`.
# NOTE(review): the order is presumably significant (later steps likely rely on
# columns created by earlier ones) — confirm against CatBoostPipeline before reordering.
pl.rename_columns()
pl.drop_duplicates()
pl.check_for_null()
pl.extract_currency_features()
pl.extract_time_features()
pl.create_unique_ids()
pl.extract_additional_time_features()
pl.cyclical_encoding()
pl.apply_one_hot_encoding()



## Split data into train/val/test, and continue with split-specific feature engineering
There are some features that, if engineered or standardized using the whole dataset, could result in data leakage between our train/val/test sets. Therefore, we must split the data prior to these calculations. 
For CatBoost, we will use a temporal split.

In [None]:
# Display the column names currently present on pl.df (useful when choosing feature columns)
pl.df.columns

In [None]:
# Temporal split for edges (temporal is the pipeline's default split strategy).
# The previous call passed a bare `X_cols=` with no value, which is a SyntaxError
# and prevented the cell from running; rely on the method's defaults instead.
# NOTE(review): if a specific feature subset is intended, pass it explicitly as
# X_cols=[...] using names from pl.df.columns — confirm the default for X_cols.
pl.split_train_test_val()

Keeping from_account_idx (for merging node feats onto tabular data for Catboost)


(        day_cos       day_sin  edge_id  hour_of_day  is_weekend  \
 0          -1.0  1.224647e-16        0            0           0   
 1534       -1.0  1.224647e-16     1534            0           0   
 1533       -1.0  1.224647e-16     1533            0           0   
 1532       -1.0  1.224647e-16     1532            0           0   
 1531       -1.0  1.224647e-16     1531            0           0   
 ...         ...           ...      ...          ...         ...   
 875665     -0.5  8.660254e-01   875665           16           0   
 875664     -0.5  8.660254e-01   875664           16           0   
 875663     -0.5  8.660254e-01   875663           16           0   
 875662     -0.5  8.660254e-01   875662           16           0   
 875661     -0.5  8.660254e-01   875661           16           0   
 
         log_exchange_rate  payment_type_ACH  payment_type_Bitcoin  \
 0                0.693147               0.0                   0.0   
 1534             0.693147               0

### Create node features
Node features are specific to accounts. They include graph-based features such as PageRank and degree centrality, as well as aggregate statistics such as net flow (total amount sent minus total amount received for a specific account).

In [5]:
# Build the node-level features independently for each split
pl.compute_split_specific_node_features()

# Scale the node feature frames. Only the relevant features are scaled
# (others, like pagerank, are left raw). The three returned values are
# not needed here, so they are discarded.
_, _, _ = pl.scale_node_data_frames()

✅ Computed node features for train with 107090 nodes.
✅ Computed node features for val with 107355 nodes.
✅ Computed node features for test with 107583 nodes.


In [6]:
# Peek at the column names of the train-split node feature table
print(pl.train_nodes.columns)

Index(['node_id', 'degree_centrality', 'pagerank', 'net_flow', 'avg_txn_out',
       'avg_txn_in', 'std_txn_out', 'std_txn_in', 'num_unique_out_partners',
       'num_unique_in_partners'],
      dtype='object')


### Add node features to df
With CatBoost, we are working with tabular transaction data, so we need to merge our node-level stats onto it.

In [7]:
# Attach the node-level graph features to the tabular transaction data held by the pipeline.
# NOTE(review): presumably joined on the account index columns kept during the split — confirm.
pl.add_node_graph_feats_to_df()

(        day_cos       day_sin  edge_id  hour_of_day  is_weekend  \
 0          -1.0  1.224647e-16        0            0           0   
 1          -1.0  1.224647e-16     1534            0           0   
 2          -1.0  1.224647e-16     1533            0           0   
 3          -1.0  1.224647e-16     1532            0           0   
 4          -1.0  1.224647e-16     1531            0           0   
 ...         ...           ...      ...          ...         ...   
 875625     -0.5  8.660254e-01   875665           16           0   
 875626     -0.5  8.660254e-01   875664           16           0   
 875627     -0.5  8.660254e-01   875663           16           0   
 875628     -0.5  8.660254e-01   875662           16           0   
 875629     -0.5  8.660254e-01   875661           16           0   
 
         log_exchange_rate  payment_type_ACH  payment_type_Bitcoin  \
 0                0.693147               0.0                   0.0   
 1                0.693147               0

In [None]:
# Scale the numerical edge (transaction-level) features listed below
pl.numerical_scaling(
    numerical_features=[
        'timestamp_scaled',
        'sent_amount_usd',
        'time_diff_from',
        'time_diff_to',
        'turnaround_time',
    ]
)

(        day_cos       day_sin  edge_id  hour_of_day  is_weekend  \
 0          -1.0  1.224647e-16        0            0           0   
 1          -1.0  1.224647e-16     1534            0           0   
 2          -1.0  1.224647e-16     1533            0           0   
 3          -1.0  1.224647e-16     1532            0           0   
 4          -1.0  1.224647e-16     1531            0           0   
 ...         ...           ...      ...          ...         ...   
 875625     -0.5  8.660254e-01   875665           16           0   
 875626     -0.5  8.660254e-01   875664           16           0   
 875627     -0.5  8.660254e-01   875663           16           0   
 875628     -0.5  8.660254e-01   875662           16           0   
 875629     -0.5  8.660254e-01   875661           16           0   
 
         log_exchange_rate  payment_type_ACH  payment_type_Bitcoin  \
 0                0.693147               0.0                   0.0   
 1                0.693147               0

# Model