In [17]:
from __future__ import print_function, division
import numpy as np 
import pandas as pd 

import os
        
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

import optuna
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import xgboost as xgb
torch.manual_seed(15)

<torch._C.Generator at 0x7f9d11efb2f0>

# Import data

In [4]:
df_classes = pd.read_csv("elliptic_bitcoin_dataset/elliptic_txs_classes.csv")
df_edges = pd.read_csv("elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv")
df_features = pd.read_csv("elliptic_bitcoin_dataset/elliptic_txs_features.csv", header=None)
df_features.columns = ['txId', 'time_step'] + [f'trans_feat_{i}' for i in range(93)] + [f'agg_feat_{i}' for i in range(72)]
df_features = pd.merge(df_features,df_classes,left_on="txId",right_on="txId",how='left')
df_features['class'] = df_features['class'].apply(lambda x: '0' if x == "unknown" else x)

# Train and test data

In [7]:
tx_features = ["trans_feat_"+str(i) for i in range(93)]
agg_features = ["agg_feat_"+str(i) for i in range(72)]
data = df_features[(df_features['class']=='1') | (df_features['class']=='2')]
X = data[tx_features + agg_features] 
y = data['class']
y = y.apply(lambda x: 0 if x == '2' else 1 )
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=15,shuffle=False)

# Fine-tuning

In [12]:
rsn=15
kf = StratifiedKFold(n_splits=3)

def objective_xgb(trial):
    n_estimators = trial.suggest_int('n_estimators', 20, 320)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    gamma = trial.suggest_float('gamma', 0, 1)
    subsample = trial.suggest_float('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1.0)

    params = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree
    }
    model = xgb.XGBClassifier(n_estimators=n_estimators, **params)
    f1_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='f1_micro')
    return f1_scores.mean()

def print_trial_callback(study, trial):
    print(f"Trial {trial.number}: {trial.value}")
    print(f"Params: {trial.params}")

In [14]:
#XGB
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30, callbacks=[print_trial_callback])

xgb_best_params = study_xgb.best_trial.params
xgbc = xgb.XGBClassifier(**xgb_best_params, random_state=rsn).fit(X_train, y_train)
xgb_preds = xgbc.predict(X_test)
xgb_prec, xgb_rec, xgb_f1, xgb_num = precision_recall_fscore_support(y_test, xgb_preds, average=None)


[I 2023-10-04 12:22:30,022] A new study created in memory with name: no-name-c87fc4b3-d2d3-417f-80cf-5e5363b7ca92
[I 2023-10-04 12:22:33,356] Trial 0 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 53, 'max_depth': 10, 'learning_rate': 0.007251779774762484, 'min_child_weight': 4, 'gamma': 0.8848518863952797, 'subsample': 0.7484253696091445, 'colsample_bytree': 0.9073313258780329}. Best is trial 0 with value: 0.8875253111952577.


Trial 0: 0.8875253111952577
Params: {'n_estimators': 53, 'max_depth': 10, 'learning_rate': 0.007251779774762484, 'min_child_weight': 4, 'gamma': 0.8848518863952797, 'subsample': 0.7484253696091445, 'colsample_bytree': 0.9073313258780329}


[I 2023-10-04 12:22:38,762] Trial 1 finished with value: 0.9651160050795218 and parameters: {'n_estimators': 219, 'max_depth': 15, 'learning_rate': 0.008170108539597468, 'min_child_weight': 18, 'gamma': 0.14053849704246602, 'subsample': 0.2636346810474465, 'colsample_bytree': 0.20412117985510647}. Best is trial 1 with value: 0.9651160050795218.


Trial 1: 0.9651160050795218
Params: {'n_estimators': 219, 'max_depth': 15, 'learning_rate': 0.008170108539597468, 'min_child_weight': 18, 'gamma': 0.14053849704246602, 'subsample': 0.2636346810474465, 'colsample_bytree': 0.20412117985510647}


[I 2023-10-04 12:22:43,648] Trial 2 finished with value: 0.8197733525498481 and parameters: {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.06510708583141818, 'min_child_weight': 18, 'gamma': 0.5634243875273641, 'subsample': 0.625582662318, 'colsample_bytree': 0.9868717705666786}. Best is trial 1 with value: 0.9651160050795218.


Trial 2: 0.8197733525498481
Params: {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.06510708583141818, 'min_child_weight': 18, 'gamma': 0.5634243875273641, 'subsample': 0.625582662318, 'colsample_bytree': 0.9868717705666786}


[I 2023-10-04 12:22:48,292] Trial 3 finished with value: 0.9120047839090946 and parameters: {'n_estimators': 176, 'max_depth': 9, 'learning_rate': 0.03530066573003325, 'min_child_weight': 4, 'gamma': 0.36755839560173964, 'subsample': 0.9438352696935766, 'colsample_bytree': 0.12885695447483966}. Best is trial 1 with value: 0.9651160050795218.


Trial 3: 0.9120047839090946
Params: {'n_estimators': 176, 'max_depth': 9, 'learning_rate': 0.03530066573003325, 'min_child_weight': 4, 'gamma': 0.36755839560173964, 'subsample': 0.9438352696935766, 'colsample_bytree': 0.12885695447483966}


[I 2023-10-04 12:22:55,328] Trial 4 finished with value: 0.9753331120474624 and parameters: {'n_estimators': 275, 'max_depth': 9, 'learning_rate': 0.005221004727527383, 'min_child_weight': 1, 'gamma': 0.060071690764745944, 'subsample': 0.267771515127166, 'colsample_bytree': 0.19908627339554574}. Best is trial 4 with value: 0.9753331120474624.


Trial 4: 0.9753331120474624
Params: {'n_estimators': 275, 'max_depth': 9, 'learning_rate': 0.005221004727527383, 'min_child_weight': 1, 'gamma': 0.060071690764745944, 'subsample': 0.267771515127166, 'colsample_bytree': 0.19908627339554574}


[I 2023-10-04 12:22:59,037] Trial 5 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 268, 'max_depth': 2, 'learning_rate': 0.0015646513904588948, 'min_child_weight': 14, 'gamma': 0.34787807893046985, 'subsample': 0.46331112322059875, 'colsample_bytree': 0.21403333658411855}. Best is trial 4 with value: 0.9753331120474624.


Trial 5: 0.8875253111952577
Params: {'n_estimators': 268, 'max_depth': 2, 'learning_rate': 0.0015646513904588948, 'min_child_weight': 14, 'gamma': 0.34787807893046985, 'subsample': 0.46331112322059875, 'colsample_bytree': 0.21403333658411855}


[I 2023-10-04 12:23:01,927] Trial 6 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 49, 'max_depth': 15, 'learning_rate': 0.0015002663236101596, 'min_child_weight': 2, 'gamma': 0.610077222338752, 'subsample': 0.9744172533751791, 'colsample_bytree': 0.631448825354699}. Best is trial 4 with value: 0.9753331120474624.


Trial 6: 0.8875253111952577
Params: {'n_estimators': 49, 'max_depth': 15, 'learning_rate': 0.0015002663236101596, 'min_child_weight': 2, 'gamma': 0.610077222338752, 'subsample': 0.9744172533751791, 'colsample_bytree': 0.631448825354699}


[I 2023-10-04 12:23:05,309] Trial 7 finished with value: 0.7923125158565615 and parameters: {'n_estimators': 142, 'max_depth': 10, 'learning_rate': 0.04220589727900873, 'min_child_weight': 8, 'gamma': 0.8635755651482873, 'subsample': 0.33553987030843063, 'colsample_bytree': 0.6716276290978999}. Best is trial 4 with value: 0.9753331120474624.


Trial 7: 0.7923125158565615
Params: {'n_estimators': 142, 'max_depth': 10, 'learning_rate': 0.04220589727900873, 'min_child_weight': 8, 'gamma': 0.8635755651482873, 'subsample': 0.33553987030843063, 'colsample_bytree': 0.6716276290978999}


[I 2023-10-04 12:23:06,339] Trial 8 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 22, 'max_depth': 13, 'learning_rate': 0.008822434434564427, 'min_child_weight': 13, 'gamma': 0.3568007060358419, 'subsample': 0.20197714571352804, 'colsample_bytree': 0.3568935473972258}. Best is trial 4 with value: 0.9753331120474624.


Trial 8: 0.8875253111952577
Params: {'n_estimators': 22, 'max_depth': 13, 'learning_rate': 0.008822434434564427, 'min_child_weight': 13, 'gamma': 0.3568007060358419, 'subsample': 0.20197714571352804, 'colsample_bytree': 0.3568935473972258}


[I 2023-10-04 12:23:09,769] Trial 9 finished with value: 0.7634710484883769 and parameters: {'n_estimators': 114, 'max_depth': 9, 'learning_rate': 0.024296435369671035, 'min_child_weight': 9, 'gamma': 0.9450999031952263, 'subsample': 0.8444866711380007, 'colsample_bytree': 0.2895902384967275}. Best is trial 4 with value: 0.9753331120474624.


Trial 9: 0.7634710484883769
Params: {'n_estimators': 114, 'max_depth': 9, 'learning_rate': 0.024296435369671035, 'min_child_weight': 9, 'gamma': 0.9450999031952263, 'subsample': 0.8444866711380007, 'colsample_bytree': 0.2895902384967275}


[I 2023-10-04 12:23:15,650] Trial 10 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 317, 'max_depth': 20, 'learning_rate': 0.00018126784375417174, 'min_child_weight': 7, 'gamma': 0.01598417239517428, 'subsample': 0.1249426795968212, 'colsample_bytree': 0.46356979260404363}. Best is trial 4 with value: 0.9753331120474624.


Trial 10: 0.8875253111952577
Params: {'n_estimators': 317, 'max_depth': 20, 'learning_rate': 0.00018126784375417174, 'min_child_weight': 7, 'gamma': 0.01598417239517428, 'subsample': 0.1249426795968212, 'colsample_bytree': 0.46356979260404363}


[I 2023-10-04 12:23:20,083] Trial 11 finished with value: 0.9595014437236934 and parameters: {'n_estimators': 225, 'max_depth': 16, 'learning_rate': 0.0039665604214786775, 'min_child_weight': 20, 'gamma': 0.0026890496685574306, 'subsample': 0.3183225982883512, 'colsample_bytree': 0.13303905760728477}. Best is trial 4 with value: 0.9753331120474624.


Trial 11: 0.9595014437236934
Params: {'n_estimators': 225, 'max_depth': 16, 'learning_rate': 0.0039665604214786775, 'min_child_weight': 20, 'gamma': 0.0026890496685574306, 'subsample': 0.3183225982883512, 'colsample_bytree': 0.13303905760728477}


[I 2023-10-04 12:23:23,920] Trial 12 finished with value: 0.9640115073171184 and parameters: {'n_estimators': 231, 'max_depth': 6, 'learning_rate': 0.015200862717690123, 'min_child_weight': 16, 'gamma': 0.1402516244796412, 'subsample': 0.10019550433673072, 'colsample_bytree': 0.344962802347598}. Best is trial 4 with value: 0.9753331120474624.


Trial 12: 0.9640115073171184
Params: {'n_estimators': 231, 'max_depth': 6, 'learning_rate': 0.015200862717690123, 'min_child_weight': 16, 'gamma': 0.1402516244796412, 'subsample': 0.10019550433673072, 'colsample_bytree': 0.344962802347598}


[I 2023-10-04 12:23:29,375] Trial 13 finished with value: 0.9467998945418636 and parameters: {'n_estimators': 213, 'max_depth': 19, 'learning_rate': 0.003039827301877006, 'min_child_weight': 12, 'gamma': 0.16384804644500023, 'subsample': 0.42880874057590646, 'colsample_bytree': 0.10582597300259544}. Best is trial 4 with value: 0.9753331120474624.


Trial 13: 0.9467998945418636
Params: {'n_estimators': 213, 'max_depth': 19, 'learning_rate': 0.003039827301877006, 'min_child_weight': 12, 'gamma': 0.16384804644500023, 'subsample': 0.42880874057590646, 'colsample_bytree': 0.10582597300259544}


[I 2023-10-04 12:23:39,106] Trial 14 finished with value: 0.876903860495242 and parameters: {'n_estimators': 266, 'max_depth': 13, 'learning_rate': 0.011744281791973175, 'min_child_weight': 1, 'gamma': 0.1984618867001684, 'subsample': 0.330454461124317, 'colsample_bytree': 0.25435748259012103}. Best is trial 4 with value: 0.9753331120474624.


Trial 14: 0.876903860495242
Params: {'n_estimators': 266, 'max_depth': 13, 'learning_rate': 0.011744281791973175, 'min_child_weight': 1, 'gamma': 0.1984618867001684, 'subsample': 0.330454461124317, 'colsample_bytree': 0.25435748259012103}


[I 2023-10-04 12:23:43,282] Trial 15 finished with value: 0.9033827538444891 and parameters: {'n_estimators': 174, 'max_depth': 6, 'learning_rate': 0.08040169586639774, 'min_child_weight': 16, 'gamma': 0.0741327631107644, 'subsample': 0.24130100985727496, 'colsample_bytree': 0.44757390723230783}. Best is trial 4 with value: 0.9753331120474624.


Trial 15: 0.9033827538444891
Params: {'n_estimators': 174, 'max_depth': 6, 'learning_rate': 0.08040169586639774, 'min_child_weight': 16, 'gamma': 0.0741327631107644, 'subsample': 0.24130100985727496, 'colsample_bytree': 0.44757390723230783}


[I 2023-10-04 12:23:51,051] Trial 16 finished with value: 0.8590775073247995 and parameters: {'n_estimators': 268, 'max_depth': 17, 'learning_rate': 0.02299843515805856, 'min_child_weight': 6, 'gamma': 0.25888202361208623, 'subsample': 0.4282515186613643, 'colsample_bytree': 0.23114192300961203}. Best is trial 4 with value: 0.9753331120474624.


Trial 16: 0.8590775073247995
Params: {'n_estimators': 268, 'max_depth': 17, 'learning_rate': 0.02299843515805856, 'min_child_weight': 6, 'gamma': 0.25888202361208623, 'subsample': 0.4282515186613643, 'colsample_bytree': 0.23114192300961203}


[I 2023-10-04 12:23:56,333] Trial 17 finished with value: 0.7640851139560106 and parameters: {'n_estimators': 190, 'max_depth': 12, 'learning_rate': 0.004716404886777776, 'min_child_weight': 10, 'gamma': 0.09912195185462734, 'subsample': 0.5986487862458094, 'colsample_bytree': 0.39090736216189625}. Best is trial 4 with value: 0.9753331120474624.


Trial 17: 0.7640851139560106
Params: {'n_estimators': 190, 'max_depth': 12, 'learning_rate': 0.004716404886777776, 'min_child_weight': 10, 'gamma': 0.09912195185462734, 'subsample': 0.5986487862458094, 'colsample_bytree': 0.39090736216189625}


[I 2023-10-04 12:23:58,833] Trial 18 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 116, 'max_depth': 7, 'learning_rate': 0.0017442400447336046, 'min_child_weight': 20, 'gamma': 0.004371603094406082, 'subsample': 0.2065310402063304, 'colsample_bytree': 0.20396780456365718}. Best is trial 4 with value: 0.9753331120474624.


Trial 18: 0.8875253111952577
Params: {'n_estimators': 116, 'max_depth': 7, 'learning_rate': 0.0017442400447336046, 'min_child_weight': 20, 'gamma': 0.004371603094406082, 'subsample': 0.2065310402063304, 'colsample_bytree': 0.20396780456365718}


[I 2023-10-04 12:24:04,986] Trial 19 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 245, 'max_depth': 14, 'learning_rate': 0.0008460188237191085, 'min_child_weight': 11, 'gamma': 0.221311953399761, 'subsample': 0.48749748102090484, 'colsample_bytree': 0.5240178523502241}. Best is trial 4 with value: 0.9753331120474624.


Trial 19: 0.8875253111952577
Params: {'n_estimators': 245, 'max_depth': 14, 'learning_rate': 0.0008460188237191085, 'min_child_weight': 11, 'gamma': 0.221311953399761, 'subsample': 0.48749748102090484, 'colsample_bytree': 0.5240178523502241}


[I 2023-10-04 12:24:11,729] Trial 20 finished with value: 0.9139682498264391 and parameters: {'n_estimators': 292, 'max_depth': 18, 'learning_rate': 0.013400790511356765, 'min_child_weight': 15, 'gamma': 0.2736333883812108, 'subsample': 0.2793977042718116, 'colsample_bytree': 0.31755882849600564}. Best is trial 4 with value: 0.9753331120474624.


Trial 20: 0.9139682498264391
Params: {'n_estimators': 292, 'max_depth': 18, 'learning_rate': 0.013400790511356765, 'min_child_weight': 15, 'gamma': 0.2736333883812108, 'subsample': 0.2793977042718116, 'colsample_bytree': 0.31755882849600564}


[I 2023-10-04 12:24:16,105] Trial 21 finished with value: 0.9644408770240203 and parameters: {'n_estimators': 217, 'max_depth': 7, 'learning_rate': 0.015044950173871084, 'min_child_weight': 17, 'gamma': 0.1349568102468605, 'subsample': 0.148506496458426, 'colsample_bytree': 0.28390069005271745}. Best is trial 4 with value: 0.9753331120474624.


Trial 21: 0.9644408770240203
Params: {'n_estimators': 217, 'max_depth': 7, 'learning_rate': 0.015044950173871084, 'min_child_weight': 17, 'gamma': 0.1349568102468605, 'subsample': 0.148506496458426, 'colsample_bytree': 0.28390069005271745}


[I 2023-10-04 12:24:19,999] Trial 22 finished with value: 0.958304969432277 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.005257499286679307, 'min_child_weight': 18, 'gamma': 0.11420236326963584, 'subsample': 0.18385954698369536, 'colsample_bytree': 0.17362250577760716}. Best is trial 4 with value: 0.9753331120474624.


Trial 22: 0.958304969432277
Params: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.005257499286679307, 'min_child_weight': 18, 'gamma': 0.11420236326963584, 'subsample': 0.18385954698369536, 'colsample_bytree': 0.17362250577760716}


[I 2023-10-04 12:24:24,582] Trial 23 finished with value: 0.9633365261077792 and parameters: {'n_estimators': 246, 'max_depth': 11, 'learning_rate': 0.008584922595878796, 'min_child_weight': 18, 'gamma': 0.08068225985674617, 'subsample': 0.15329739748417132, 'colsample_bytree': 0.28586848130695974}. Best is trial 4 with value: 0.9753331120474624.


Trial 23: 0.9633365261077792
Params: {'n_estimators': 246, 'max_depth': 11, 'learning_rate': 0.008584922595878796, 'min_child_weight': 18, 'gamma': 0.08068225985674617, 'subsample': 0.15329739748417132, 'colsample_bytree': 0.28586848130695974}


[I 2023-10-04 12:24:27,197] Trial 24 finished with value: 0.9627839948301115 and parameters: {'n_estimators': 144, 'max_depth': 4, 'learning_rate': 0.020183431908295288, 'min_child_weight': 17, 'gamma': 0.19908265890610055, 'subsample': 0.26711601635981386, 'colsample_bytree': 0.17899780459008252}. Best is trial 4 with value: 0.9753331120474624.


Trial 24: 0.9627839948301115
Params: {'n_estimators': 144, 'max_depth': 4, 'learning_rate': 0.020183431908295288, 'min_child_weight': 17, 'gamma': 0.19908265890610055, 'subsample': 0.26711601635981386, 'colsample_bytree': 0.17899780459008252}


[I 2023-10-04 12:24:30,241] Trial 25 finished with value: 0.9510952379822796 and parameters: {'n_estimators': 289, 'max_depth': 1, 'learning_rate': 0.008012940651506375, 'min_child_weight': 14, 'gamma': 0.13529616093735608, 'subsample': 0.3614491030981073, 'colsample_bytree': 0.25472141208672394}. Best is trial 4 with value: 0.9753331120474624.


Trial 25: 0.9510952379822796
Params: {'n_estimators': 289, 'max_depth': 1, 'learning_rate': 0.008012940651506375, 'min_child_weight': 14, 'gamma': 0.13529616093735608, 'subsample': 0.3614491030981073, 'colsample_bytree': 0.25472141208672394}


[I 2023-10-04 12:24:33,483] Trial 26 finished with value: 0.9606671961035613 and parameters: {'n_estimators': 205, 'max_depth': 4, 'learning_rate': 0.03659724079963899, 'min_child_weight': 20, 'gamma': 0.27045139634211945, 'subsample': 0.10600854553268504, 'colsample_bytree': 0.1114799940926001}. Best is trial 4 with value: 0.9753331120474624.


Trial 26: 0.9606671961035613
Params: {'n_estimators': 205, 'max_depth': 4, 'learning_rate': 0.03659724079963899, 'min_child_weight': 20, 'gamma': 0.27045139634211945, 'subsample': 0.10600854553268504, 'colsample_bytree': 0.1114799940926001}


[I 2023-10-04 12:24:38,998] Trial 27 finished with value: 0.8846665524655357 and parameters: {'n_estimators': 247, 'max_depth': 7, 'learning_rate': 0.01679376231043707, 'min_child_weight': 12, 'gamma': 0.0669876491439647, 'subsample': 0.23784557524177521, 'colsample_bytree': 0.386027025758811}. Best is trial 4 with value: 0.9753331120474624.


Trial 27: 0.8846665524655357
Params: {'n_estimators': 247, 'max_depth': 7, 'learning_rate': 0.01679376231043707, 'min_child_weight': 12, 'gamma': 0.0669876491439647, 'subsample': 0.23784557524177521, 'colsample_bytree': 0.386027025758811}


[I 2023-10-04 12:24:42,400] Trial 28 finished with value: 0.8875253111952577 and parameters: {'n_estimators': 147, 'max_depth': 11, 'learning_rate': 0.003003097829738819, 'min_child_weight': 5, 'gamma': 0.4130089068956302, 'subsample': 0.16672840902935382, 'colsample_bytree': 0.19529781039052085}. Best is trial 4 with value: 0.9753331120474624.


Trial 28: 0.8875253111952577
Params: {'n_estimators': 147, 'max_depth': 11, 'learning_rate': 0.003003097829738819, 'min_child_weight': 5, 'gamma': 0.4130089068956302, 'subsample': 0.16672840902935382, 'colsample_bytree': 0.19529781039052085}


[I 2023-10-04 12:24:50,901] Trial 29 finished with value: 0.7714178686025058 and parameters: {'n_estimators': 320, 'max_depth': 9, 'learning_rate': 0.006409483993872512, 'min_child_weight': 3, 'gamma': 0.16575748365846027, 'subsample': 0.272721157841323, 'colsample_bytree': 0.29922229156982094}. Best is trial 4 with value: 0.9753331120474624.


Trial 29: 0.7714178686025058
Params: {'n_estimators': 320, 'max_depth': 9, 'learning_rate': 0.006409483993872512, 'min_child_weight': 3, 'gamma': 0.16575748365846027, 'subsample': 0.272721157841323, 'colsample_bytree': 0.29922229156982094}


XGB Classifier Model
Precision: 0.990 
Recall: 0.568 
F1 Score: 0.722
Micro-Average F1 Score: 0.9724409448818898


In [20]:
# Evaluation
print("\n\nXGB Classifier Model")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f" % (xgb_prec[1], xgb_rec[1], xgb_f1[1]))
micro_f1 = f1_score(y_test, xgb_preds, average='micro')
print("Micro-Average F1 Score:", micro_f1)
classification = classification_report(y_test, xgb_preds)
print("Classification Report: \n", classification)



XGB Classifier Model
Precision: 0.990 
Recall: 0.568 
F1 Score: 0.722
Micro-Average F1 Score: 0.9724409448818898
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     13091
           1       0.99      0.57      0.72       879

    accuracy                           0.97     13970
   macro avg       0.98      0.78      0.85     13970
weighted avg       0.97      0.97      0.97     13970



In [16]:
# Best params
xgb_best_params

{'n_estimators': 275,
 'max_depth': 9,
 'learning_rate': 0.005221004727527383,
 'min_child_weight': 1,
 'gamma': 0.060071690764745944,
 'subsample': 0.267771515127166,
 'colsample_bytree': 0.19908627339554574}

# Embedding

In [21]:
#Then add embedding features and run it all over again...

embed_names = ["emb_"+str(i) for i in range(1,51)]
embeddings = pd.read_csv('elliptic_bitcoin_dataset/elliptic.emb',delimiter=" ",skiprows=1,header=None)
embeddings.columns = ['txId'] + ["emb_"+str(i) for i in range(1,51)]

data = df_features[(df_features['class']=='1') | (df_features['class']=='2')]
data = pd.merge(data,embeddings,how='inner')
X = data[tx_features+agg_features+embed_names]
y = data['class']
y = y.apply(lambda x: 0 if x == '2' else 1 )
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=15,shuffle=False)

In [25]:
#XGB
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=20, callbacks=[print_trial_callback])

xgb_best_params = study_xgb.best_trial.params
xgbc = xgb.XGBClassifier(**xgb_best_params, random_state=rsn).fit(X_train, y_train)
xgb_preds = xgbc.predict(X_test)
xgb_prec, xgb_rec, xgb_f1, xgb_num = precision_recall_fscore_support(y_test, xgb_preds, average=None)

[I 2023-10-04 12:36:48,143] A new study created in memory with name: no-name-aeee6744-0132-48a4-a1e7-ecc2e5a7389c


[I 2023-10-04 12:37:00,856] Trial 0 finished with value: 0.887473525678869 and parameters: {'n_estimators': 249, 'max_depth': 14, 'learning_rate': 0.00033095825586489704, 'min_child_weight': 20, 'gamma': 0.35714827378744807, 'subsample': 0.8450006684154654, 'colsample_bytree': 0.8875084020743679}. Best is trial 0 with value: 0.887473525678869.


Trial 0: 0.887473525678869
Params: {'n_estimators': 249, 'max_depth': 14, 'learning_rate': 0.00033095825586489704, 'min_child_weight': 20, 'gamma': 0.35714827378744807, 'subsample': 0.8450006684154654, 'colsample_bytree': 0.8875084020743679}


[I 2023-10-04 12:37:07,423] Trial 1 finished with value: 0.7327888774186851 and parameters: {'n_estimators': 195, 'max_depth': 18, 'learning_rate': 0.004841990774581337, 'min_child_weight': 20, 'gamma': 0.9482291259579523, 'subsample': 0.8226381464741157, 'colsample_bytree': 0.5331141424011663}. Best is trial 0 with value: 0.887473525678869.


Trial 1: 0.7327888774186851
Params: {'n_estimators': 195, 'max_depth': 18, 'learning_rate': 0.004841990774581337, 'min_child_weight': 20, 'gamma': 0.9482291259579523, 'subsample': 0.8226381464741157, 'colsample_bytree': 0.5331141424011663}


[I 2023-10-04 12:37:09,654] Trial 2 finished with value: 0.887473525678869 and parameters: {'n_estimators': 71, 'max_depth': 10, 'learning_rate': 0.000609336152560853, 'min_child_weight': 20, 'gamma': 0.587333645876906, 'subsample': 0.10945180825643815, 'colsample_bytree': 0.41719520462369586}. Best is trial 0 with value: 0.887473525678869.


Trial 2: 0.887473525678869
Params: {'n_estimators': 71, 'max_depth': 10, 'learning_rate': 0.000609336152560853, 'min_child_weight': 20, 'gamma': 0.587333645876906, 'subsample': 0.10945180825643815, 'colsample_bytree': 0.41719520462369586}


[I 2023-10-04 12:37:12,475] Trial 3 finished with value: 0.7120378996106744 and parameters: {'n_estimators': 51, 'max_depth': 9, 'learning_rate': 0.05164607116042524, 'min_child_weight': 12, 'gamma': 0.3839917469691956, 'subsample': 0.8298047133680132, 'colsample_bytree': 0.9464514577602904}. Best is trial 0 with value: 0.887473525678869.


Trial 3: 0.7120378996106744
Params: {'n_estimators': 51, 'max_depth': 9, 'learning_rate': 0.05164607116042524, 'min_child_weight': 12, 'gamma': 0.3839917469691956, 'subsample': 0.8298047133680132, 'colsample_bytree': 0.9464514577602904}


[I 2023-10-04 12:37:25,596] Trial 4 finished with value: 0.9103097995167816 and parameters: {'n_estimators': 208, 'max_depth': 13, 'learning_rate': 0.002444301668264382, 'min_child_weight': 2, 'gamma': 0.008489354567771112, 'subsample': 0.7744726725868947, 'colsample_bytree': 0.3397421300686435}. Best is trial 4 with value: 0.9103097995167816.


Trial 4: 0.9103097995167816
Params: {'n_estimators': 208, 'max_depth': 13, 'learning_rate': 0.002444301668264382, 'min_child_weight': 2, 'gamma': 0.008489354567771112, 'subsample': 0.7744726725868947, 'colsample_bytree': 0.3397421300686435}


[I 2023-10-04 12:37:28,377] Trial 5 finished with value: 0.9643631201916231 and parameters: {'n_estimators': 72, 'max_depth': 19, 'learning_rate': 0.029715619497538776, 'min_child_weight': 16, 'gamma': 0.19760318519452913, 'subsample': 0.17898434445187558, 'colsample_bytree': 0.4364008080894991}. Best is trial 5 with value: 0.9643631201916231.


Trial 5: 0.9643631201916231
Params: {'n_estimators': 72, 'max_depth': 19, 'learning_rate': 0.029715619497538776, 'min_child_weight': 16, 'gamma': 0.19760318519452913, 'subsample': 0.17898434445187558, 'colsample_bytree': 0.4364008080894991}


[I 2023-10-04 12:37:32,751] Trial 6 finished with value: 0.887473525678869 and parameters: {'n_estimators': 162, 'max_depth': 4, 'learning_rate': 0.0005466198807683364, 'min_child_weight': 12, 'gamma': 0.25888167206320134, 'subsample': 0.7585955101367238, 'colsample_bytree': 0.634628576291521}. Best is trial 5 with value: 0.9643631201916231.


Trial 6: 0.887473525678869
Params: {'n_estimators': 162, 'max_depth': 4, 'learning_rate': 0.0005466198807683364, 'min_child_weight': 12, 'gamma': 0.25888167206320134, 'subsample': 0.7585955101367238, 'colsample_bytree': 0.634628576291521}


[I 2023-10-04 12:37:35,960] Trial 7 finished with value: 0.887473525678869 and parameters: {'n_estimators': 62, 'max_depth': 15, 'learning_rate': 0.00012260902699902143, 'min_child_weight': 12, 'gamma': 0.6601830872978524, 'subsample': 0.6150811603495036, 'colsample_bytree': 0.8877124270585637}. Best is trial 5 with value: 0.9643631201916231.


Trial 7: 0.887473525678869
Params: {'n_estimators': 62, 'max_depth': 15, 'learning_rate': 0.00012260902699902143, 'min_child_weight': 12, 'gamma': 0.6601830872978524, 'subsample': 0.6150811603495036, 'colsample_bytree': 0.8877124270585637}


[I 2023-10-04 12:37:44,142] Trial 8 finished with value: 0.887473525678869 and parameters: {'n_estimators': 247, 'max_depth': 15, 'learning_rate': 0.0004283612797320777, 'min_child_weight': 20, 'gamma': 0.9879592396749104, 'subsample': 0.9198707712511314, 'colsample_bytree': 0.3376927781173215}. Best is trial 5 with value: 0.9643631201916231.


Trial 8: 0.887473525678869
Params: {'n_estimators': 247, 'max_depth': 15, 'learning_rate': 0.0004283612797320777, 'min_child_weight': 20, 'gamma': 0.9879592396749104, 'subsample': 0.9198707712511314, 'colsample_bytree': 0.3376927781173215}


[I 2023-10-04 12:37:46,030] Trial 9 finished with value: 0.887473525678869 and parameters: {'n_estimators': 78, 'max_depth': 2, 'learning_rate': 0.00025834684216907575, 'min_child_weight': 2, 'gamma': 0.08597849340542307, 'subsample': 0.6488312304717282, 'colsample_bytree': 0.10034968521102411}. Best is trial 5 with value: 0.9643631201916231.


Trial 9: 0.887473525678869
Params: {'n_estimators': 78, 'max_depth': 2, 'learning_rate': 0.00025834684216907575, 'min_child_weight': 2, 'gamma': 0.08597849340542307, 'subsample': 0.6488312304717282, 'colsample_bytree': 0.10034968521102411}


[I 2023-10-04 12:37:50,868] Trial 10 finished with value: 0.8822498400305133 and parameters: {'n_estimators': 135, 'max_depth': 20, 'learning_rate': 0.05177806299622465, 'min_child_weight': 16, 'gamma': 0.2223413347553722, 'subsample': 0.30318874515472133, 'colsample_bytree': 0.6680136196184752}. Best is trial 5 with value: 0.9643631201916231.


Trial 10: 0.8822498400305133
Params: {'n_estimators': 135, 'max_depth': 20, 'learning_rate': 0.05177806299622465, 'min_child_weight': 16, 'gamma': 0.2223413347553722, 'subsample': 0.30318874515472133, 'colsample_bytree': 0.6680136196184752}


[I 2023-10-04 12:38:04,270] Trial 11 finished with value: 0.7587270499116671 and parameters: {'n_estimators': 291, 'max_depth': 12, 'learning_rate': 0.006637727714910985, 'min_child_weight': 2, 'gamma': 0.05446937088742758, 'subsample': 0.456183579499455, 'colsample_bytree': 0.29833021533765813}. Best is trial 5 with value: 0.9643631201916231.


Trial 11: 0.7587270499116671
Params: {'n_estimators': 291, 'max_depth': 12, 'learning_rate': 0.006637727714910985, 'min_child_weight': 2, 'gamma': 0.05446937088742758, 'subsample': 0.456183579499455, 'colsample_bytree': 0.29833021533765813}


[I 2023-10-04 12:38:09,338] Trial 12 finished with value: 0.8311708324874836 and parameters: {'n_estimators': 130, 'max_depth': 7, 'learning_rate': 0.01307744621997517, 'min_child_weight': 7, 'gamma': 7.230804101646116e-05, 'subsample': 0.984450609127774, 'colsample_bytree': 0.21304381714593412}. Best is trial 5 with value: 0.9643631201916231.


Trial 12: 0.8311708324874836
Params: {'n_estimators': 130, 'max_depth': 7, 'learning_rate': 0.01307744621997517, 'min_child_weight': 7, 'gamma': 7.230804101646116e-05, 'subsample': 0.984450609127774, 'colsample_bytree': 0.21304381714593412}


[I 2023-10-04 12:38:17,139] Trial 13 finished with value: 0.887473525678869 and parameters: {'n_estimators': 218, 'max_depth': 18, 'learning_rate': 0.0018134860545388322, 'min_child_weight': 7, 'gamma': 0.15092513093425516, 'subsample': 0.39181521406282793, 'colsample_bytree': 0.45049002940633615}. Best is trial 5 with value: 0.9643631201916231.


Trial 13: 0.887473525678869
Params: {'n_estimators': 218, 'max_depth': 18, 'learning_rate': 0.0018134860545388322, 'min_child_weight': 7, 'gamma': 0.15092513093425516, 'subsample': 0.39181521406282793, 'colsample_bytree': 0.45049002940633615}


[I 2023-10-04 12:38:20,575] Trial 14 finished with value: 0.9594828368385858 and parameters: {'n_estimators': 112, 'max_depth': 18, 'learning_rate': 0.01595577993009329, 'min_child_weight': 16, 'gamma': 0.02457018616531137, 'subsample': 0.10224983057882997, 'colsample_bytree': 0.2671129922760398}. Best is trial 5 with value: 0.9643631201916231.


Trial 14: 0.9594828368385858
Params: {'n_estimators': 112, 'max_depth': 18, 'learning_rate': 0.01595577993009329, 'min_child_weight': 16, 'gamma': 0.02457018616531137, 'subsample': 0.10224983057882997, 'colsample_bytree': 0.2671129922760398}


[I 2023-10-04 12:38:22,116] Trial 15 finished with value: 0.887473525678869 and parameters: {'n_estimators': 25, 'max_depth': 20, 'learning_rate': 0.019558559466468964, 'min_child_weight': 16, 'gamma': 0.170693045258559, 'subsample': 0.14244598933930203, 'colsample_bytree': 0.1586906468032283}. Best is trial 5 with value: 0.9643631201916231.


Trial 15: 0.887473525678869
Params: {'n_estimators': 25, 'max_depth': 20, 'learning_rate': 0.019558559466468964, 'min_child_weight': 16, 'gamma': 0.170693045258559, 'subsample': 0.14244598933930203, 'colsample_bytree': 0.1586906468032283}


[I 2023-10-04 12:38:25,748] Trial 16 finished with value: 0.9533735909781901 and parameters: {'n_estimators': 106, 'max_depth': 17, 'learning_rate': 0.02460534855145782, 'min_child_weight': 15, 'gamma': 0.309042894839574, 'subsample': 0.25701519346967755, 'colsample_bytree': 0.24737856999626945}. Best is trial 5 with value: 0.9643631201916231.


Trial 16: 0.9533735909781901
Params: {'n_estimators': 106, 'max_depth': 17, 'learning_rate': 0.02460534855145782, 'min_child_weight': 15, 'gamma': 0.309042894839574, 'subsample': 0.25701519346967755, 'colsample_bytree': 0.24737856999626945}


[I 2023-10-04 12:38:28,962] Trial 17 finished with value: 0.9163536753918793 and parameters: {'n_estimators': 108, 'max_depth': 17, 'learning_rate': 0.09973604167703974, 'min_child_weight': 17, 'gamma': 0.13351057987674883, 'subsample': 0.2166106115105469, 'colsample_bytree': 0.21580531919284776}. Best is trial 5 with value: 0.9643631201916231.


Trial 17: 0.9163536753918793
Params: {'n_estimators': 108, 'max_depth': 17, 'learning_rate': 0.09973604167703974, 'min_child_weight': 17, 'gamma': 0.13351057987674883, 'subsample': 0.2166106115105469, 'colsample_bytree': 0.21580531919284776}


[I 2023-10-04 12:38:33,670] Trial 18 finished with value: 0.9651305610489861 and parameters: {'n_estimators': 161, 'max_depth': 20, 'learning_rate': 0.012863656763929164, 'min_child_weight': 14, 'gamma': 0.21748014880223387, 'subsample': 0.17171000095056022, 'colsample_bytree': 0.4351229387814561}. Best is trial 18 with value: 0.9651305610489861.


Trial 18: 0.9651305610489861
Params: {'n_estimators': 161, 'max_depth': 20, 'learning_rate': 0.012863656763929164, 'min_child_weight': 14, 'gamma': 0.21748014880223387, 'subsample': 0.17171000095056022, 'colsample_bytree': 0.4351229387814561}


[I 2023-10-04 12:38:39,266] Trial 19 finished with value: 0.7942743219337373 and parameters: {'n_estimators': 166, 'max_depth': 20, 'learning_rate': 0.004018140697820689, 'min_child_weight': 10, 'gamma': 0.39845959537245684, 'subsample': 0.3384067757527567, 'colsample_bytree': 0.46739408731522386}. Best is trial 18 with value: 0.9651305610489861.


Trial 19: 0.7942743219337373
Params: {'n_estimators': 166, 'max_depth': 20, 'learning_rate': 0.004018140697820689, 'min_child_weight': 10, 'gamma': 0.39845959537245684, 'subsample': 0.3384067757527567, 'colsample_bytree': 0.46739408731522386}


In [26]:
# Evaluation
print("\n\nXGB Classifier Model")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f" % (xgb_prec[1], xgb_rec[1], xgb_f1[1]))
micro_f1 = f1_score(y_test, xgb_preds, average='micro')
print("Micro-Average F1 Score:", micro_f1)
classification = classification_report(y_test, xgb_preds)
print("Classification Report: \n", classification)



XGB Classifier Model
Precision: 0.973 
Recall: 0.651 
F1 Score: 0.780
Micro-Average F1 Score: 0.9769390532120604
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     13087
           1       0.97      0.65      0.78       876

    accuracy                           0.98     13963
   macro avg       0.97      0.82      0.88     13963
weighted avg       0.98      0.98      0.97     13963



In [27]:
# Best params
xgb_best_params

{'n_estimators': 161,
 'max_depth': 20,
 'learning_rate': 0.012863656763929164,
 'min_child_weight': 14,
 'gamma': 0.21748014880223387,
 'subsample': 0.17171000095056022,
 'colsample_bytree': 0.4351229387814561}