In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(style="darkgrid")

np.random.seed(0)
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from utils import get_best_threshold

In [3]:
wallets_features = pd.read_csv("../dataset/custom/wallets_features_aggregated.csv")
%load_ext autoreload

In [4]:
%autoreload 2

from utils import (
    prepare_wallets_features_data,
    get_training_data
)

In [5]:
good_wallets = wallets_features[wallets_features["class"] == 2]
bad_wallets = wallets_features[wallets_features["class"] == 1]

good_wallets.shape, bad_wallets.shape

((251088, 46), (14266, 46))

In [7]:

prep_data = prepare_wallets_features_data(wallets_features[wallets_features["class"] != 3], type="full")
good_wallets = prep_data[prep_data["class"] == 2]
bad_wallets = prep_data[prep_data["class"] == 1]
good_wallets[:3000].shape, bad_wallets.shape

((3000, 44), (14266, 44))

In [26]:
from sklearn.svm import OneClassSVM

prep_data = prepare_wallets_features_data(wallets_features[wallets_features["class"] != 3], type="full")

X_train, X_test, y_train, y_test = get_training_data(prep_data, no_unknown=True, binary=False)

good_wallets = X_train[y_train == 2]
bad_train_wallets = X_train[y_train == 1]

test_data = pd.concat([X_test, bad_train_wallets])
test_target = pd.concat([y_test, y_train[y_train == 1]])

test_data.shape, test_target.shape


((89633, 43), (89633,))

In [53]:
good_wallets.shape, bad_train_wallets.shape, X_train.shape

((175721, 43), (10026, 43), (185747, 43))

In [27]:
from sklearn.svm import OneClassSVM

# prep_data = prepare_wallets_features_data(wallets_features[wallets_features["class"] != 3], type="full")

# X_train, X_test, y_train, y_test = get_training_data(prep_data, no_unknown=False)

# good_wallets = prep_data[prep_data["class"] == 2]
# bad_wallets = prep_data[prep_data["class"] == 1]

# good_wallets.drop("class", axis=1, inplace=True)
# bad_wallets.drop("class", axis=1, inplace=True)

# good_wallets = good_wallets[:100000]
# bad_wallets = bad_wallets


column_transformer = ColumnTransformer(
    transformers=[
        ('min_max_scale', MinMaxScaler(), list(set(prep_data.columns) - set(["addrId", "class"]))),
    ]
)

pipeline = Pipeline(steps=[
    ('transformers', column_transformer),
    ('oc_svm', OneClassSVM(kernel="rbf", gamma="auto", nu=0.05))
])

model = pipeline.fit(good_wallets)
y_pred = model.predict(test_data)
# y_pred_vals = model.predict(X_test)

# get_best_threshold(y_pred, y_test)

In [39]:
((y_pred == -1) * (test_target == 1)).sum(), (test_target == 1).shape

(565, (89633,))

In [55]:
y_pred1 = model.predict(bad_train_wallets)

In [57]:
(y_pred1 == 1).sum() / y_pred1.shape[0]

0.9603032116497108

In [58]:
bad_train_wallets

Unnamed: 0,num_timesteps_appeared_in,num_addr_transacted_multiple,num_txs_as_sender,num_txs_as receiver,lifetime_in_blocks,total_txs,btc_transacted_total,btc_sent_total,btc_received_total,fees_total,...,user_output_users_cnt,user_active_time_steps_cnt,user_btc_sent_total,user_btc_received_total,user_interracted_output_address_cnt,user_interracted_input_address_cnt,user_overall_activity_coef,user_whole_fee_5,addr_gini,whole_fees_5
6756,1.0,0.0,6.0,30.0,72.0,36.0,19.232433,9.616216,9.616216,0.016617,...,2.0,1,24.922905,17.388800,2.0,20.0,0.000000,0.0,0.0,0.0
7534,1.0,0.0,1.0,0.0,0.0,1.0,0.042047,0.042047,0.000000,0.003441,...,3.0,18,12.027439,5.246751,3.0,113.0,0.252978,0.0,0.0,0.0
12835,1.0,0.0,2.0,2.0,2.0,4.0,0.006160,0.003080,0.003080,0.160957,...,1.0,1,2.714874,2.714874,1.0,217.0,0.000000,0.0,0.0,0.0
13995,1.0,0.0,1.0,0.0,0.0,1.0,0.004272,0.004272,0.000000,0.285129,...,1.0,1,2.813016,0.942745,1.0,70.0,0.000000,0.0,0.0,0.0
100,1.0,0.0,0.0,1.0,0.0,1.0,1.676183,0.000000,1.676183,0.000267,...,0.0,1,0.000000,1.676183,0.0,1.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13986,1.0,0.0,2.0,2.0,14.0,4.0,0.036568,0.018284,0.018284,0.593970,...,1.0,1,2.813016,0.942745,1.0,70.0,0.000000,0.0,0.0,0.0
9268,1.0,0.0,2.0,2.0,2.0,4.0,0.013200,0.006600,0.006600,0.045543,...,1.0,1,4.138774,4.138774,1.0,93.0,0.000000,0.0,0.0,0.0
11534,1.0,0.0,2.0,2.0,4.0,4.0,0.008000,0.004000,0.004000,0.089526,...,2.0,1,21.819429,21.819429,2.0,356.0,0.000000,0.0,0.0,0.0
3890,1.0,0.0,2.0,2.0,2.0,4.0,0.186531,0.093266,0.093266,0.001165,...,2.0,1,0.093266,0.093266,2.0,3.0,0.000000,0.0,0.0,0.0


In [15]:
(y_pred == -1).sum() / y_pred.shape[0], f"accurately {(y_pred == -1).sum()} / {y_pred.shape[0]}"

(0.8914902565540446, 'accurately 12718 / 14266')

In [None]:
y_pred1 = model.predict(good_wallets)


0.97297

In [14]:
(y_pred1 == 1).sum() / y_pred1.shape[0], f"badly classified {(y_pred1 == -1).sum()} / {y_pred1.shape[0]}"

(0.97297, 'badly classified 2703 / 100000')