# Package Import

In [1]:
import dask.dataframe as dd
import os
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
import pandas as pd

In [2]:
path = 'C:/Eigene Dateien/Masterarbeit/FraudDetection/Daten/tx_out_filesplit/'
os.chdir(path)

# Read data, check nulls, upsample and encode

In [3]:
df = dd.read_parquet('final_data_set')
df = df.set_index('address')
df_illicit = df[df['illicit'] == 1]
df_licit = df[df['illicit'] == 0]
df.head()

Unnamed: 0_level_0,count_addresses,count_addresses_sender,count_addresses_receiver,count_transactions,count_transactions_sender,count_transactions_receiver,count_transactions_s_equal_r,darknet_markets,lifetime,min_transaction_value,...,mean_transactions_fee,mean_transactions_fee_sender,mean_transactions_fee_receiver,mean_transactions_volume,mean_transactions_volume_sender,mean_transactions_volume_receiver,concentration_addresses,concentration_addresses_sender,concentration_addresses_receiver,illicit
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1121fvgKUFNeBmVSLLcPaNS8TtWn4GrZdX,3,2,2,2,1.0,1,0.0,61.0,1,0.013723,...,4.2e-05,3.3e-05,5.2e-05,0.013723,0.013723,0.013723,-1.0,1.0,1.0,0
1122LfQRzJdC31Tw9h8XyBRqT7QUM9yKrZ,271,2,269,2,,2,,54.1,6,7e-06,...,4.6e-05,,4.6e-05,1.2e-05,,1.2e-05,-269.0,,-267.0,0
1122Mgd8GnaBcBv6gJ5QcvKt76GqGsRvjY,7,2,6,2,1.0,1,0.0,59.0,1,0.193508,...,2.2e-05,3.1e-05,1.3e-05,0.193508,0.193508,0.193508,-5.0,1.0,1.0,0
11239Kbtbp9v6iQa7NhFFiZ4ByhKLwThq4,6,3,4,2,1.0,1,0.0,42.5,91,2.6e-05,...,0.000236,4.5e-05,0.000426,2.6e-05,2.6e-05,2.6e-05,-4.0,1.0,1.0,0
1123LLCPfHznjsYh4exJbkyiUAfkC7pbAt,2424,303,2148,118,51.0,67,0.0,42.9,151,9e-05,...,0.000973,0.000616,0.001245,0.493333,0.570716,0.434429,-19.709402,-5.04,-31.530303,0


Check if null values

In [4]:
df_is_null = df.isnull().sum().reset_index()
df_is_null = df_is_null[df_is_null[0] > 0]
df_is_null = df_is_null.rename(columns = {'index': 'feature is null', 0: 'count'})
df_is_null.compute()

Unnamed: 0,feature is null,count
4,count_transactions_sender,16510
6,count_transactions_s_equal_r,16510
11,std_transaction_value,14946
12,min_transaction_value_sender,16510
13,max_transaction_value_sender,16510
14,std_transaction_value_sender,239719
17,std_transaction_value_receiver,236633
19,std_balance,14851
23,std_addresses_per_transaction_sender,14837
27,std_addresses_per_transaction_receiver,14837


Closer look to the feature count_transactions_sender is null values. To make assumptions to other features. (Left: count_transactions_sender feature columns Right: original dataframe)

In [5]:
df_is_null_1 = df[df['count_transactions_sender'].isnull()].isnull().sum().reset_index()
df_is_null_1 = dd.concat([df_is_null_1[df_is_null_1[0] > 0], df_is_null], axis = 1)
df_is_null_1.compute()

We're assuming that the indices of each dataframes are 
 aligned. This assumption is not generally safe.


Unnamed: 0,index,0,feature is null,count
4,count_transactions_sender,16510,count_transactions_sender,16510
6,count_transactions_s_equal_r,16510,count_transactions_s_equal_r,16510
11,std_transaction_value,14851,std_transaction_value,14946
12,min_transaction_value_sender,16510,min_transaction_value_sender,16510
13,max_transaction_value_sender,16510,max_transaction_value_sender,16510
14,std_transaction_value_sender,16510,std_transaction_value_sender,239719
17,std_transaction_value_receiver,14851,std_transaction_value_receiver,236633
19,std_balance,14848,std_balance,14851
23,std_addresses_per_transaction_sender,14837,std_addresses_per_transaction_sender,14837
27,std_addresses_per_transaction_receiver,14837,std_addresses_per_transaction_receiver,14837


The null values appeared because of missing sender transactions. Therefore a imputation with 0 is suitable.

In [6]:
df_is_null_1[df_is_null_1['index'].str.contains('sender')].compute()

Unnamed: 0,index,0,feature is null,count
4,count_transactions_sender,16510,count_transactions_sender,16510
12,min_transaction_value_sender,16510,min_transaction_value_sender,16510
13,max_transaction_value_sender,16510,max_transaction_value_sender,16510
14,std_transaction_value_sender,16510,std_transaction_value_sender,239719
23,std_addresses_per_transaction_sender,14837,std_addresses_per_transaction_sender,14837
33,transaction_volume_sender_btc,16510,transaction_volume_sender_btc,16510
36,transaction_volume_sender_euro,16510,transaction_volume_sender_euro,16510
39,transaction_fee_sender,16510,transaction_fee_sender,16510
43,mean_time_diff_transaction_sender,16510,mean_time_diff_transaction_sender,239635
44,std_time_diff_transaction_sender,16510,std_time_diff_transaction_sender,250474


In [7]:
df_is_null_1[~df_is_null_1['index'].str.contains('sender')].compute()

Unnamed: 0,index,0,feature is null,count
6,count_transactions_s_equal_r,16510,count_transactions_s_equal_r,16510
11,std_transaction_value,14851,std_transaction_value,14946
17,std_transaction_value_receiver,14851,std_transaction_value_receiver,236633
19,std_balance,14848,std_balance,14851
27,std_addresses_per_transaction_receiver,14837,std_addresses_per_transaction_receiver,14837
31,std_addresses_per_transaction,14837,std_addresses_per_transaction,14837
41,mean_time_diff_transaction,14837,mean_time_diff_transaction,14837
42,std_time_diff_transaction,15578,std_time_diff_transaction,237273
45,mean_time_diff_transaction_receiver,14837,mean_time_diff_transaction_receiver,236532
46,std_time_diff_transaction_receiver,15578,std_time_diff_transaction_receiver,248931


"count_transactions_s_equal_r" and "mean_transactions_s_equal_r" compare sender and receiver. If there are no senders, there is nothing to compare. Therefore 0 is imputed.

In [8]:
rest_isnull = df_is_null[~(df_is_null_1['index'].str.contains('sender') | df_is_null_1['index'].str.contains('s_equal_r'))]
rest_isnull.compute()

Unnamed: 0,feature is null,count
11,std_transaction_value,14946
17,std_transaction_value_receiver,236633
19,std_balance,14851
27,std_addresses_per_transaction_receiver,14837
31,std_addresses_per_transaction,14837
41,mean_time_diff_transaction,14837
42,std_time_diff_transaction,237273
45,mean_time_diff_transaction_receiver,236532
46,std_time_diff_transaction_receiver,248931


'time_diff' can be nan if there is only one transaction. Same is if the standard deviation is calculated but there is only one transaction (correction term of the standard deviation n-1 results in 0 division). Therefore std_transaction_value, std_balance, std_addresses_per_transaction_receiver, std_addresses_per_transaction and mean_time_diff_transaction can be imputed with 0

In [9]:
(df[df['std_transaction_value'].isnull()]['count_transactions'] == 1).sum().compute()

14837

The same problem with receiver transactions which are 0. Therefore std_transaction_value_receiver, mean_time_diff_transaction_receiver and std_time_diff_transaction_receiver can be imputed with 0.

In [10]:
(df[df['std_transaction_value_receiver'].isnull()]['count_transactions_receiver'] == 1).sum().compute()

236536

In [11]:
df[df['std_transaction_value_receiver'].isnull()][['count_transactions_receiver', 'std_transaction_value_receiver']].head()

Unnamed: 0_level_0,count_transactions_receiver,std_transaction_value_receiver
address,Unnamed: 1_level_1,Unnamed: 2_level_1
1121fvgKUFNeBmVSLLcPaNS8TtWn4GrZdX,1,
1122Mgd8GnaBcBv6gJ5QcvKt76GqGsRvjY,1,
11239Kbtbp9v6iQa7NhFFiZ4ByhKLwThq4,1,
1124Vc7iWXAWYjJqexhwi2oxMnbfxppvRd,1,
1125oHW3KCah1esMomGda4kPqLYcQcfmfT,1,


In [12]:
df[((df['std_transaction_value_receiver'].isnull()) & (df['std_transaction_value_receiver'] > 1))].compute()

Unnamed: 0_level_0,count_addresses,count_addresses_sender,count_addresses_receiver,count_transactions,count_transactions_sender,count_transactions_receiver,count_transactions_s_equal_r,darknet_markets,lifetime,min_transaction_value,...,mean_transactions_fee,mean_transactions_fee_sender,mean_transactions_fee_receiver,mean_transactions_volume,mean_transactions_volume_sender,mean_transactions_volume_receiver,concentration_addresses,concentration_addresses_sender,concentration_addresses_receiver,illicit
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


Conclusion to null-values: mostly a result of no transactions in this category or only one transaction. Therefore imputation of 0 for null values is suggested.

Upsample

In [13]:
df_illicit = df_illicit.sample(frac = 3.876,
                               replace = True,
                               random_state = 190)

Check if upsampled data contains all illicit addresses. The sample method doesn't gurantee for it. Therefore the missing entries were added manually.

In [14]:
len(df[df['illicit'] == 1].index.unique())

13561

In [15]:
len(df_illicit.index.unique())

13312

In [16]:
len(df_illicit.index)

52595

In [24]:
index_df = df_illicit.index.unique().compute()
df_1 = df[df['illicit'] == 1].compute()
df_1 = df_1[~df_1.index.isin(index_df)]
df_1

Unnamed: 0_level_0,count_addresses,count_addresses_sender,count_addresses_receiver,count_transactions,count_transactions_sender,count_transactions_receiver,count_transactions_s_equal_r,darknet_markets,lifetime,min_transaction_value,...,mean_transactions_fee,mean_transactions_fee_sender,mean_transactions_fee_receiver,mean_transactions_volume,mean_transactions_volume_sender,mean_transactions_volume_receiver,concentration_addresses,concentration_addresses_sender,concentration_addresses_receiver,illicit
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
126DFo5prJxqBv48ZPRjCoHb3mxkHViqjx,1145,837,310,18,8.0,10,0.0,39.6,97,0.000695,...,0.009858,0.021076,0.000884,0.008153,0.009172,0.007338,-66.294118,-118.428571,-33.333333,1
1271BQNWeDc8CpWgirPjgVeHmJss8Y5tsj,1827,1176,734,13,6.0,7,0.0,45.8,227,0.000005,...,0.012599,0.026368,0.000798,0.056251,0.060938,0.052233,-151.166667,-234.000000,-121.166667,1
13KVgnAeh2afikNT22ZgtcsTETdRLmHgQf,255,6,251,2,1.0,1,0.0,48.0,11,0.000298,...,0.000464,0.000245,0.000682,0.000298,0.000298,0.000298,-253.000000,1.000000,1.000000,1
13c1qs8kgQg6u8xQPsLwy37Wd4ovZweBwR,6246,1617,4737,662,265.0,397,0.0,45.8,263,0.000005,...,0.000572,0.000340,0.000728,0.021690,0.027092,0.018084,-8.447806,-5.121212,-10.959596,1
13w2fsyT97RoE1E7jjoVDwrzXoztgPTchZ,611,3,608,1,,1,,49.0,1,0.000005,...,0.000202,,0.000202,0.000005,,0.000005,1.000000,,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3QpNHy9GjNxGp3VXKeh8hEJMxMUSayYkGh,19,7,15,2,1.0,1,0.0,49.0,2,0.018048,...,0.000381,0.000343,0.000419,0.018048,0.018048,0.018048,-17.000000,1.000000,1.000000,1
3QsLNUptraBrHt8SPgHRZkYHcBuigtADcg,133,1,133,1,,1,,37.0,1,0.000523,...,0.002556,,0.002556,0.000523,,0.000523,1.000000,,1.000000,1
3Qu3sQREAMT6gKR94CihDkLeeKyXT7A2EJ,8,6,4,2,1.0,1,0.0,44.6,38,0.168764,...,0.000285,0.000343,0.000227,0.168764,0.168764,0.168764,-6.000000,1.000000,1.000000,1
3QyjuxejZuBRkpP7HhovEvnUET4o47J8c8,7,5,4,2,1.0,1,0.0,49.0,1,0.016708,...,0.000182,0.000343,0.000020,0.016708,0.016708,0.016708,-5.000000,1.000000,1.000000,1


In [25]:
df_ml = dd.concat([df_licit, df_illicit, df_1], axis = 0)
len(df_ml.index.unique())

264139

In [None]:
len(df_ml.index)

In [26]:
df_ml_features = df_ml.iloc[:, :-1]
df_ml_target = df_ml.iloc[:, [-1]]

Encoding

In [27]:
SimpleImputer(strategy='constant', fill_value=0).fit_transform(X = df_ml_features)

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:


num_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value=0, copy=False))
num_pipeline.fit_transform(df_ml.iloc[:, 1:])

In [None]:
num_attribs = df_ml.iloc[:, 1:].columns

num_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value=0, copy=False),
                             StandardScaler())

preprocessing = make_column_transformer((num_pipeline, num_attribs), remainder = 'passthrough').set_output(transform="pandas")

df_encoded = preprocessing.fit_transform(df_ml.iloc[:, 1:])

In [None]:
num_attribs = df_ml.iloc[:, 1:].columns

num_pipeline = make_pipeline('standardscaler', StandardScaler())

preprocessing = make_column_transformer( 
        (num_pipeline, num_attribs))

df_encoded = preprocessing.fit_transform(df.iloc[:, :-1], 
                                         df.iloc[:, -1])

df_encoded_upsample = preprocessing.fit_transform(df_ml.iloc[:, :-1], 
                                                  df_ml.iloc[:, -1])


# General

In [None]:
print(df.info())
print(df.describe())

# Pairplots

In [None]:
sns.pairplot(plot_set, 
             markers='o', 
             hue='illicit', 
             plot_kws={'alpha' : 0.25})
plt.show()

# Complete correlation

In [None]:
corr_all = df.corr()

vmax = np.round(corr.iloc[:-1, -1].abs().sort_values(ascending = False).head(1)[0], 1)

ax = sns.heatmap(corr, 
                 vmax = vmax, 
                 center = 0)

# Correlation with target variable

In [None]:
highlight_val = np.concatenate([corr_all['illicit'].head(3).values, corr_all['illicit'].tail(4).values], axis = 0)
colors = ['cornflowerblue' if i not in highlight_val else 'lightgreen' if i in highlight_val and i > 0 else 'lightcoral' for i in corrs[:-1]]

plt.bar(corr_all['illicit'].index, corr_all['illicit'], color = colors)
plt.title("Correlation between the features and 'credit rating'")
plt.xticks(rotation = 90)
plt.show()