In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(style="darkgrid")

np.random.seed(0)
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

from utils import (
    get_prepared_train_test,
    get_anomaly_detection_report,
    get_detection_score
)

### New split

In [4]:
train_data_raw = pd.read_csv("../dataset/custom/features_fixed/wallets_features_with_users__train.csv")
test_data_raw = pd.read_csv("../dataset/custom/features_fixed/wallets_features_with_users__test.csv")

X_train, X_test, y_train, y_test = get_prepared_train_test(
    train_data_raw,
    test_data_raw,
)
X_train.shape, X_test.shape

((552376, 53), (281034, 53))

In [5]:
X_train_no_users = X_train.drop(columns=[
    'user_addr_cnt', 'user_outcoming_tx_cnt',
    'user_incoming_tx_cnt', 'user_input_users_cnt', 'user_output_users_cnt',
    'user_active_time_steps_cnt', 'user_btc_transacted_total',
    'user_btc_sent_total', 'user_btc_received_total',
    'user_btc_sent_median', 'user_btc_received_median',
    'user_interracted_output_address_cnt',
    'user_interracted_input_address_cnt', 'user_overall_activity_coef',
    'user_user_ts_fees_share_mean', 'user_user_ts_fees_share_min',
    'user_user_ts_fees_share_max', 'user_whole_fee_5',
])
X_test_no_users = X_test.drop(columns=[
    'user_addr_cnt', 'user_outcoming_tx_cnt',
    'user_incoming_tx_cnt', 'user_input_users_cnt', 'user_output_users_cnt',
    'user_active_time_steps_cnt', 'user_btc_transacted_total',
    'user_btc_sent_total', 'user_btc_received_total',
    'user_btc_sent_median', 'user_btc_received_median',
    'user_interracted_output_address_cnt',
    'user_interracted_input_address_cnt', 'user_overall_activity_coef',
    'user_user_ts_fees_share_mean', 'user_user_ts_fees_share_min',
    'user_user_ts_fees_share_max', 'user_whole_fee_5',
])

X_train_no_users.shape, X_test_no_users.shape

((552376, 35), (281034, 35))

In [6]:
X_train

Unnamed: 0,btc_received_total,btc_received_min,btc_received_max,btc_received_mean,incoming_tx_cnt,incoming_tx_input_address_cnt_mean,incoming_tx_output_address_cnt_mean,btc_received_gini,input_address_cnt,incoming_tx_fees_min,...,user_interracted_input_address_cnt,user_overall_activity_coef,user_user_ts_fees_share_mean,user_user_ts_fees_share_min,user_user_ts_fees_share_max,user_whole_fee_5,exchange_flg,transacted_w_exchange_flg,big_bct_received,big_inp_addr
0,6.999303,6.999303,6.999303,6.999303,1.0,1.0,2.0,0.0,1.0,0.001000,...,1.0,0.000000,5.977542,5.977542,5.977542,1.0,False,False,0,0
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.597754,0.597754,0.597754,1.0,False,False,0,0
2,0.390310,0.390310,0.390310,0.390310,1.0,4.0,2.0,0.0,4.0,0.000100,...,4.0,0.333333,0.597754,0.597754,0.597754,1.0,False,False,0,0
3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.597754,0.597754,0.597754,1.0,False,False,0,0
4,3.200299,3.200299,3.200299,3.200299,1.0,1.0,2.0,0.0,1.0,0.000100,...,1.0,0.000000,0.597754,0.597754,0.597754,1.0,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552371,7.938932,7.938932,7.938932,7.938932,1.0,1.0,2.0,0.0,1.0,0.000367,...,1.0,0.000000,0.000000,0.000000,0.000000,0.0,False,True,0,0
552372,7.177773,7.177773,7.177773,7.177773,1.0,1.0,2.0,0.0,1.0,0.000369,...,1.0,0.000000,0.000000,0.000000,0.000000,0.0,False,True,0,0
552373,7.148613,7.148613,7.148613,7.148613,1.0,1.0,2.0,0.0,1.0,0.000369,...,1.0,0.000000,0.000000,0.000000,0.000000,0.0,False,True,0,0
552374,0.118777,0.118777,0.118777,0.118777,1.0,1.0,2.0,0.0,1.0,0.000366,...,358.0,0.110065,2.804893,0.640190,5.910495,6.0,False,True,0,0


In [7]:
X_train_licit = X_train.loc[train_data_raw["class"] == 2]

X_test_licit = X_test.loc[test_data_raw["class"] == 2]
X_test_illicit = X_test.loc[test_data_raw["class"] == 1]
X_test_unknown = X_test.loc[test_data_raw["class"] == 3]

X_train_licit.shape

(168269, 53)

In [8]:
from sklearn.svm import OneClassSVM

model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)
model.fit(X_train_licit)


In [9]:
labels_illicit = model.predict(X_test_illicit)
labels_licit = model.predict(X_test_licit)
labels_unknown = model.predict(X_test_unknown)

In [10]:
get_anomaly_detection_report(
    labels_licit,
    labels_illicit,
    labels_unknown
)

Fraud wallets found pct: 0.6512763596004439
Licit wallets fraud pct: 0.6473561385198129
Licit wallets accuracy: 0.35264386148018717
Unknown wallets fraud pct: 0.547716150081566
Unknown wallets accuracy: 0.45228384991843396

Total accuracy: 0.4252403623760826
Total precicion: 0.01801215544232304
Total recall: 0.6512763596004439
Total illicit cnt: 162890 / 281034, true illicit: 4505


In [28]:
np.concatenate([labels_licit, labels_illicit]).shape

(89778,)

In [11]:
all_true = np.concatenate([
    np.ones(labels_illicit.shape),
    np.zeros(labels_licit.shape),
    np.zeros(labels_unknown.shape),  
])

all_preds = np.concatenate([
    labels_illicit >= 0,
    labels_licit >= 0,
    labels_unknown >= 0,
]).astype(int)

prec = precision_score(all_true, all_preds)
rec = recall_score(all_true, all_preds)
prec, rec

(0.013297332069339111, 0.34872364039955606)

In [12]:
from utils import print_confusion_matrix

print_confusion_matrix(all_preds, 0.1, all_true)

	real 1	real 0
pred 1	1571	116573	
pred 0	2934	159956	


In [13]:
(1571+159956)/(1571+159956+2934+116573)

0.5747596376239174