In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(style="darkgrid")

np.random.seed(0)
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
CUTOFF = 40

addr_tx_df = pd.read_csv("../../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv").rename(columns={"input_address": "addrId"})
tx_addr_df = pd.read_csv("../../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv").rename(columns={"output_address": "addrId"})

tx_info = pd.read_csv("../../dataset/Elliptic++ Dataset/txs_features.csv")[["txId", "Time step"]]

addr_to_ts_output = addr_tx_df \
    .rename(columns={"txId": "outcoming_txId"})

addr_to_ts_input = tx_addr_df \
    .rename(columns={"txId": "incoming_txId"})


In [3]:
TX_FEATS_LIST = [
    'txId',
    'Time step',
    'in_txs_degree',
    'out_txs_degree',
    'total_BTC',
    'fees',
    'size',

    'num_input_addresses',
    'num_output_addresses',

    'in_BTC_min',
    'in_BTC_max',
    'in_BTC_mean',
    'in_BTC_median',
    'in_BTC_total',
    'out_BTC_min',
    'out_BTC_max',
    'out_BTC_mean',
    'out_BTC_median',
    'out_BTC_total'
]
tx_info = pd.read_csv("../../dataset/Elliptic++ Dataset/txs_features.csv")[TX_FEATS_LIST]
list(tx_info.columns)

['txId',
 'Time step',
 'in_txs_degree',
 'out_txs_degree',
 'total_BTC',
 'fees',
 'size',
 'num_input_addresses',
 'num_output_addresses',
 'in_BTC_min',
 'in_BTC_max',
 'in_BTC_mean',
 'in_BTC_median',
 'in_BTC_total',
 'out_BTC_min',
 'out_BTC_max',
 'out_BTC_mean',
 'out_BTC_median',
 'out_BTC_total']

In [4]:
addr_to_ts_input

Unnamed: 0,incoming_txId,addrId
0,230325127,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH
1,230325127,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,230325139,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G
3,86875675,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c
4,86875675,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA
...,...,...
837119,157659306,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh
837120,157668825,38jMiiZs2C5n5MPkyc5pSA7wwW6H4p6hPa
837121,125788182,3G9b7hWZccuft1V4eGUcZqTZaxsqx699bM
837122,157670868,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2


In [5]:
user_to_tx_input_feats = addr_to_ts_input \
    .merge(
        tx_info,
        how="left",
        left_on="incoming_txId",
        right_on="txId"
    )
user_to_tx_output_feats = addr_to_ts_output \
    .merge(
        tx_info,
        how="left",
        left_on="outcoming_txId",
        right_on="txId"
    )

user_to_tx_input_feats.head()

Unnamed: 0,incoming_txId,addrId,txId,Time step,in_txs_degree,out_txs_degree,total_BTC,fees,size,num_input_addresses,...,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total
0,230325127,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH,230325127,1,0.0,1.0,6.999303,0.001,225.0,1.0,...,7.000303,7.000303,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303
1,230325127,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,0.0,1.0,6.999303,0.001,225.0,1.0,...,7.000303,7.000303,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303
2,230325139,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G,230325139,1,0.0,1.0,5.525802,0.0001,486.0,3.0,...,0.007822,4.323354,1.841967,1.194726,5.525902,5.525802,5.525802,5.525802,5.525802,5.525802
3,86875675,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c,86875675,1,0.0,1.0,11.811174,0.0001,521.0,3.0,...,3.684089,4.274925,3.937091,3.85226,11.811274,1.266853,10.544321,5.905587,5.905587,11.811174
4,86875675,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA,86875675,1,0.0,1.0,11.811174,0.0001,521.0,3.0,...,3.684089,4.274925,3.937091,3.85226,11.811274,1.266853,10.544321,5.905587,5.905587,11.811174


In [6]:
user_to_tx_input_feats_with_addrs = user_to_tx_input_feats \
    .merge(addr_tx_df.rename(columns={"addrId": "input_address"}), how="left", on="txId")[["addrId", "txId", "Time step", "input_address"]]
user_to_tx_input_feats_with_addrs

Unnamed: 0,addrId,txId,Time step,input_address
0,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH,230325127,1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G,230325139,1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w
3,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c,86875675,1,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz
4,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA,86875675,1,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz
...,...,...,...,...
2868959,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh,157659306,49,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh
2868960,38jMiiZs2C5n5MPkyc5pSA7wwW6H4p6hPa,157668825,49,3DzbpEogZ1mn9FgCHcmzYPLDbV9GuxYHpi
2868961,3G9b7hWZccuft1V4eGUcZqTZaxsqx699bM,125788182,49,34yD1sQg6C16aANCtibYXRj5NsX6tt4v5R
2868962,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2,157670868,49,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2


In [7]:
user_to_tx_input_feats["whole_fee_4_flg"] = (
    np.floor(user_to_tx_input_feats["fees"] * (10**4)) == 
    user_to_tx_input_feats["fees"] * (10**4)
).astype(int)
user_to_tx_input_feats["whole_fee_4_flg"].value_counts()

whole_fee_4_flg
0    607257
1    229867
Name: count, dtype: int64

In [8]:
user_to_tx_output_feats_with_addrs = user_to_tx_output_feats \
    .merge(tx_addr_df.rename(columns={"addrId": "output_address"}), how="left", on="txId")[["addrId", "txId", "Time step", "output_address"]]
user_to_tx_output_feats_with_addrs

Unnamed: 0,addrId,txId,Time step,output_address
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,230325139,1,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA
...,...,...,...,...
2868959,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh,157659306,49,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh
2868960,3DzbpEogZ1mn9FgCHcmzYPLDbV9GuxYHpi,157668825,49,38jMiiZs2C5n5MPkyc5pSA7wwW6H4p6hPa
2868961,34yD1sQg6C16aANCtibYXRj5NsX6tt4v5R,125788182,49,3G9b7hWZccuft1V4eGUcZqTZaxsqx699bM
2868962,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2,157670868,49,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2


In [9]:
user_to_tx_output_feats["whole_fee_4_flg"] = (
    np.floor(user_to_tx_output_feats["fees"] * (10**4)) == 
    user_to_tx_output_feats["fees"] * (10**4)
).astype(int)
user_to_tx_output_feats["whole_fee_4_flg"].value_counts()

whole_fee_4_flg
0    367923
1    109194
Name: count, dtype: int64

# Addr feats by ts

In [10]:
addr_feats = pd.concat([user_to_tx_output_feats[["addrId", "Time step"]], user_to_tx_input_feats[["addrId", "Time step"]]])
addr_feats = addr_feats.drop_duplicates(["addrId", "Time step"])

In [11]:
def gini_coefficient(values):
    sorted_values = sorted(values)
    n = len(values)
    cumsum = sum([(i+1) * val for i, val in enumerate(sorted_values)])
    return (2 * cumsum) / (n * sum(values)) - (n + 1) / n

def safe_gini_coefficient(values):
    if len(values) < 2:
        return 0
    if sum(values) == 0:
        return np.nan
    try:
        return gini_coefficient(values)
    except:
        return np.nan
# user_to_ts_gini = wallets_features.groupby('userId')['Time step'].apply(safe_gini_coefficient)


In [12]:
def add_tx_metric(
        addr_feats,
        tx_feats,
        source_name,
        agg_fn,
        dest_name,
        fillna=None,
        group_fields=["addrId", "Time step"]
):
    if agg_fn == "gini":
        addr_to_tx_feat = tx_feats.groupby(group_fields)[source_name].apply(safe_gini_coefficient) \
            .reset_index() \
            .rename(columns={source_name: dest_name})
        
    else:
        addr_to_tx_feat = tx_feats \
            .groupby(group_fields) \
            .agg({
                source_name: agg_fn
            }) \
            .rename(columns={source_name: dest_name})
    res = addr_feats \
        .merge(
            addr_to_tx_feat,
            how="left",
            on=group_fields
        )
    if fillna is not None:
        res[dest_name] = res[dest_name].fillna(fillna)
    return res

In [13]:
addr_feats_full = add_tx_metric(addr_feats, user_to_tx_input_feats, source_name="total_BTC", agg_fn="sum", dest_name="btc_received_total", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="min", dest_name="btc_received_min", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="max", dest_name="btc_received_max", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="mean", dest_name="btc_received_mean", fillna=0)

addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="txId", agg_fn="nunique", dest_name="incoming_tx_cnt", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="num_input_addresses", agg_fn="mean", dest_name="incoming_tx_input_address_cnt_mean", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="num_output_addresses", agg_fn="mean", dest_name="incoming_tx_output_address_cnt_mean", fillna=0)

addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="gini", dest_name="btc_received_gini", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats_with_addrs, source_name="input_address", agg_fn="nunique", dest_name="input_address_cnt", fillna=0)

addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="fees", agg_fn="mean", dest_name="incoming_tx_fees_mean", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="fees", agg_fn="min", dest_name="incoming_tx_fees_min", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="fees", agg_fn="min", dest_name="incoming_tx_fees_max", fillna=0)
addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="whole_fee_4_flg", agg_fn="sum", dest_name="incoming_tx_whole_fee_4_cnt", fillna=0)



In [14]:
# Add output data
addr_feats_full1 = add_tx_metric(addr_feats_full,  user_to_tx_output_feats, source_name="total_BTC", agg_fn="sum", dest_name="btc_sent_total", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="min", dest_name="btc_sent_min", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="max", dest_name="btc_sent_max", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="mean", dest_name="btc_sent_mean", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="txId", agg_fn="nunique", dest_name="outcoming_tx_cnt", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="num_output_addresses", agg_fn="mean", dest_name="outcoming_tx_output_address_cnt_mean", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="num_input_addresses", agg_fn="mean", dest_name="outcoming_tx_input_address_cnt_mean", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="gini", dest_name="btc_sent_gini", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats_with_addrs, source_name="output_address", agg_fn="nunique", dest_name="output_address_cnt", fillna=0)

addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="mean", dest_name="outcoming_tx_fees_mean", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="min", dest_name="outcoming_tx_fees_min", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="min", dest_name="outcoming_tx_fees_max", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="whole_fee_4_flg", agg_fn="sum", dest_name="outcoming_tx_whole_fee_4_cnt", fillna=0)
addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="sum", dest_name="outcoming_tx_fees_total", fillna=0)


In [15]:
user_to_tx_output_feats.shape, user_to_tx_output_feats_with_addrs.shape

((477117, 22), (2868964, 4))

In [16]:
addr_feats_full1.shape, addr_feats_full.shape

((920691, 29), (920691, 15))

In [17]:
addr_feats_full1

Unnamed: 0,addrId,Time step,btc_received_total,btc_received_min,btc_received_max,btc_received_mean,incoming_tx_cnt,incoming_tx_input_address_cnt_mean,incoming_tx_output_address_cnt_mean,btc_received_gini,...,outcoming_tx_cnt,outcoming_tx_output_address_cnt_mean,outcoming_tx_input_address_cnt_mean,btc_sent_gini,output_address_cnt,outcoming_tx_fees_mean,outcoming_tx_fees_min,outcoming_tx_fees_max,outcoming_tx_whole_fee_4_cnt,outcoming_tx_fees_total
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,1,6.999303,6.999303,6.999303,6.999303,1.0,1.0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,0.0010,0.0010,0.0010,1.0,0.0010
1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,1.0,1.0,3.0,0.0,1.0,0.0001,0.0001,0.0001,1.0,0.0001
2,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,1.0,2.0,3.0,0.0,2.0,0.0001,0.0001,0.0001,1.0,0.0001
3,16zs5SVSyADh5WrLNbZbpRLsBsN5uEzgeK,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0001,0.0001,0.0001,1.0,0.0001
4,1QJpwtUorBKPGUJkSyrRcBKTAHq4CXrdYh,1,3.200299,3.200299,3.200299,3.200299,1.0,1.0,2.0,0.0,...,1.0,2.0,1.0,0.0,2.0,0.0001,0.0001,0.0001,1.0,0.0001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920686,13h3xr1LJXQXn9QR3gio62xgdZYAoBrKYQ,49,16.877397,16.877397,16.877397,16.877397,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0000
920687,1Dow13WsGdCYewjtxP1acvM47A2XoPx1fF,49,16.871536,16.871536,16.871536,16.871536,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0000
920688,18rfNEMrpF4yqw3s2Y9jTSvsfiH6dYC7ph,49,0.319490,0.319490,0.319490,0.319490,1.0,31.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0000
920689,1Ct6Sx8uKTubXPcuLX5jSKb6QKMcqf1Ybj,49,16.858818,16.858818,16.858818,16.858818,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0000


In [18]:
addr_feats.shape

(920691, 2)

In [19]:
COMMON_ADDRESS_FEATURES = [
    'address',
    'Time step',
    'class',
    
    'first_block_appeared_in',
    'last_block_appeared_in',
    'lifetime_in_blocks',

    # 'num_timesteps_appeared_in',
]

In [20]:
wallets_feats_old = pd.read_csv("../../dataset/Elliptic++ Dataset/wallets_features_classes_combined.csv")[COMMON_ADDRESS_FEATURES] \
    .rename(
        columns={"address": "addrId"}
    ) \
    .drop_duplicates(["addrId", "Time step"])


In [21]:
wallets_feats_full = addr_feats_full1 \
    .merge(wallets_feats_old, on=("addrId", "Time step"), how="left")
wallets_feats_full

Unnamed: 0,addrId,Time step,btc_received_total,btc_received_min,btc_received_max,btc_received_mean,incoming_tx_cnt,incoming_tx_input_address_cnt_mean,incoming_tx_output_address_cnt_mean,btc_received_gini,...,output_address_cnt,outcoming_tx_fees_mean,outcoming_tx_fees_min,outcoming_tx_fees_max,outcoming_tx_whole_fee_4_cnt,outcoming_tx_fees_total,class,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,1,6.999303,6.999303,6.999303,6.999303,1.0,1.0,2.0,0.0,...,2.0,0.0010,0.0010,0.0010,1.0,0.0010,3,391200.0,391200.0,0.0
1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,1.0,0.0001,0.0001,0.0001,1.0,0.0001,3,391200.0,391200.0,0.0
2,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,2.0,0.0001,0.0001,0.0001,1.0,0.0001,3,391200.0,399271.0,8071.0
3,16zs5SVSyADh5WrLNbZbpRLsBsN5uEzgeK,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,1.0,0.0001,0.0001,0.0001,1.0,0.0001,3,391200.0,391200.0,0.0
4,1QJpwtUorBKPGUJkSyrRcBKTAHq4CXrdYh,1,3.200299,3.200299,3.200299,3.200299,1.0,1.0,2.0,0.0,...,2.0,0.0001,0.0001,0.0001,1.0,0.0001,3,391200.0,391200.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920686,13h3xr1LJXQXn9QR3gio62xgdZYAoBrKYQ,49,16.877397,16.877397,16.877397,16.877397,1.0,1.0,2.0,0.0,...,0.0,0.0000,0.0000,0.0000,0.0,0.0000,3,487975.0,487975.0,0.0
920687,1Dow13WsGdCYewjtxP1acvM47A2XoPx1fF,49,16.871536,16.871536,16.871536,16.871536,1.0,1.0,2.0,0.0,...,0.0,0.0000,0.0000,0.0000,0.0,0.0000,3,487975.0,487975.0,0.0
920688,18rfNEMrpF4yqw3s2Y9jTSvsfiH6dYC7ph,49,0.319490,0.319490,0.319490,0.319490,1.0,31.0,2.0,0.0,...,0.0,0.0000,0.0000,0.0000,0.0,0.0000,3,487975.0,487975.0,0.0
920689,1Ct6Sx8uKTubXPcuLX5jSKb6QKMcqf1Ybj,49,16.858818,16.858818,16.858818,16.858818,1.0,1.0,2.0,0.0,...,0.0,0.0000,0.0000,0.0000,0.0,0.0000,3,487975.0,487975.0,0.0


In [22]:
wallets_feats_full["class"].value_counts()

class
3    629272
2    276699
1     14720
Name: count, dtype: int64

In [23]:
wallets_feats_full.to_csv("../../dataset/custom/features_fixed/wallets_features_by_ts.csv", index=False)

In [24]:
CUTOFF = 35
CUTOFF_LEFT = 35

In [25]:
def get_training_agg_feats(addr_feats, user_to_tx_input_feats, user_to_tx_input_feats_with_addrs, user_to_tx_output_feats, user_to_tx_output_feats_with_addrs):
    addr_feats_full = add_tx_metric(addr_feats, user_to_tx_input_feats, source_name="total_BTC", agg_fn="sum", dest_name="btc_received_total", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="min", dest_name="btc_received_min", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="max", dest_name="btc_received_max", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="mean", dest_name="btc_received_mean", fillna=0, group_fields=["addrId"])

    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="txId", agg_fn="nunique", dest_name="incoming_tx_cnt", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="num_input_addresses", agg_fn="mean", dest_name="incoming_tx_input_address_cnt_mean", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="num_output_addresses", agg_fn="mean", dest_name="incoming_tx_output_address_cnt_mean", fillna=0, group_fields=["addrId"])

    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="total_BTC", agg_fn="gini", dest_name="btc_received_gini", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats_with_addrs, source_name="input_address", agg_fn="nunique", dest_name="input_address_cnt", fillna=0, group_fields=["addrId"])

    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="fees", agg_fn="mean", dest_name="incoming_tx_fees_mean", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="fees", agg_fn="min", dest_name="incoming_tx_fees_min", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="fees", agg_fn="min", dest_name="incoming_tx_fees_max", fillna=0, group_fields=["addrId"])
    addr_feats_full = add_tx_metric(addr_feats_full, user_to_tx_input_feats, source_name="whole_fee_4_flg", agg_fn="sum", dest_name="incoming_tx_whole_fee_4_cnt", fillna=0, group_fields=["addrId"])

    addr_feats_full1 = add_tx_metric(addr_feats_full,  user_to_tx_output_feats, source_name="total_BTC", agg_fn="sum", dest_name="btc_sent_total", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="min", dest_name="btc_sent_min", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="max", dest_name="btc_sent_max", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="mean", dest_name="btc_sent_mean", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="txId", agg_fn="nunique", dest_name="outcoming_tx_cnt", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="num_output_addresses", agg_fn="mean", dest_name="outcoming_tx_output_address_cnt_mean", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="num_input_addresses", agg_fn="mean", dest_name="outcoming_tx_input_address_cnt_mean", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="total_BTC", agg_fn="gini", dest_name="btc_sent_gini", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats_with_addrs, source_name="output_address", agg_fn="nunique", dest_name="output_address_cnt", fillna=0, group_fields=["addrId"])

    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="mean", dest_name="outcoming_tx_fees_mean", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="min", dest_name="outcoming_tx_fees_min", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="min", dest_name="outcoming_tx_fees_max", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="whole_fee_4_flg", agg_fn="sum", dest_name="outcoming_tx_whole_fee_4_cnt", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="fees", agg_fn="sum", dest_name="outcoming_tx_fees_total", fillna=0, group_fields=["addrId"])

    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="Time step", agg_fn="nunique", dest_name="outcoming_tx_ts_cnt", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_output_feats, source_name="Time step", agg_fn="gini", dest_name="outcoming_tx_ts_gini", fillna=0, group_fields=["addrId"])

    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_input_feats, source_name="Time step", agg_fn="nunique", dest_name="incoming_tx_ts_cnt", fillna=0, group_fields=["addrId"])
    addr_feats_full1 = add_tx_metric(addr_feats_full1, user_to_tx_input_feats, source_name="Time step", agg_fn="gini", dest_name="incoming_tx_ts_gini", fillna=0, group_fields=["addrId"])

    return addr_feats_full1


In [26]:
addr_feats_full_source = addr_feats.drop("Time step", axis=1).drop_duplicates(["addrId"])

addr_feats_train = addr_feats[addr_feats["Time step"] <= CUTOFF]
user_to_tx_input_feats_train = user_to_tx_input_feats[user_to_tx_input_feats["Time step"] <= CUTOFF]
user_to_tx_input_feats_with_addrs_train = user_to_tx_input_feats_with_addrs[user_to_tx_input_feats_with_addrs["Time step"] <= CUTOFF]
user_to_tx_output_feats_train = user_to_tx_output_feats[user_to_tx_output_feats["Time step"] <= CUTOFF]
user_to_tx_output_feats_with_addrs_train = user_to_tx_output_feats_with_addrs[user_to_tx_output_feats_with_addrs["Time step"] <= CUTOFF]


addr_feats_train = addr_feats_train.drop("Time step", axis=1).drop_duplicates(["addrId"])
user_to_tx_input_feats_train = user_to_tx_input_feats_train
user_to_tx_input_feats_with_addrs_train = user_to_tx_input_feats_with_addrs_train


addr_feats_test = addr_feats[addr_feats["Time step"] > CUTOFF_LEFT]
user_to_tx_input_feats_test = user_to_tx_input_feats[user_to_tx_input_feats["Time step"] > CUTOFF_LEFT]
user_to_tx_input_feats_with_addrs_test = user_to_tx_input_feats_with_addrs[user_to_tx_input_feats_with_addrs["Time step"] > CUTOFF_LEFT]
user_to_tx_output_feats_test = user_to_tx_output_feats[user_to_tx_output_feats["Time step"] > CUTOFF_LEFT]
user_to_tx_output_feats_with_addrs_test = user_to_tx_output_feats_with_addrs[user_to_tx_output_feats_with_addrs["Time step"] > CUTOFF_LEFT]


addr_feats_test = addr_feats_test.drop("Time step", axis=1).drop_duplicates(["addrId"])
user_to_tx_input_feats_test = user_to_tx_input_feats_test
user_to_tx_input_feats_with_addrs_test = user_to_tx_input_feats_with_addrs_test


In [27]:
addr_feats_train.shape, addr_feats_test.shape, "\n", user_to_tx_input_feats_train.shape, user_to_tx_input_feats_test.shape

((552376, 1), (281034, 1), '\n', (586667, 22), (250457, 22))

In [28]:
wallets_features_train_agg = get_training_agg_feats(
    addr_feats_train,
    user_to_tx_input_feats_train,
    user_to_tx_input_feats_with_addrs_train,
    user_to_tx_output_feats=user_to_tx_output_feats_train,
    user_to_tx_output_feats_with_addrs=user_to_tx_output_feats_with_addrs_train,
)

In [29]:
wallets_features_test_agg = get_training_agg_feats(
    addr_feats_test,
    user_to_tx_input_feats_test,
    user_to_tx_input_feats_with_addrs_test,
    user_to_tx_output_feats=user_to_tx_output_feats_test,
    user_to_tx_output_feats_with_addrs=user_to_tx_output_feats_with_addrs_test,
)

In [30]:
wallets_features_full_agg = get_training_agg_feats(
    addr_feats_full_source,
    user_to_tx_input_feats,
    user_to_tx_input_feats_with_addrs,
    user_to_tx_output_feats=user_to_tx_output_feats,
    user_to_tx_output_feats_with_addrs=user_to_tx_output_feats_with_addrs,
)

In [31]:
wallets_features_train_agg = wallets_features_train_agg.merge(wallets_feats_old.drop_duplicates(["addrId"]), on=("addrId"), how="left")
wallets_features_test_agg = wallets_features_test_agg.merge(wallets_feats_old.drop_duplicates(["addrId"]), on=("addrId"), how="left")
wallets_features_full_agg = wallets_features_full_agg.merge(wallets_feats_old.drop_duplicates(["addrId"]), on=("addrId"), how="left")

In [32]:
wallets_features_train_agg.to_csv("../../dataset/custom/features_fixed/wallets_features__train_new_split.csv", index=False)
wallets_features_test_agg.to_csv("../../dataset/custom/features_fixed/wallets_features__test_new_split.csv", index=False)

In [33]:
wallets_features_full_agg.to_csv("../../dataset/custom/features_fixed/wallets_features_agg.csv", index=False)

In [34]:
wallets_features_train_agg["class"].value_counts()

class
3    374301
2    168269
1      9806
Name: count, dtype: int64

In [35]:
wallets_features_test_agg["class"].value_counts()

class
3    191256
2     85273
1      4505
Name: count, dtype: int64

# Add users features

In [36]:
USERS_VALID_COLUMNS = [
    'userId',
    'addr_cnt',
    'outcoming_tx_cnt',
    'incoming_tx_cnt',
    'input_users_cnt',
    'output_users_cnt',
    'class',
    'active_time_steps_cnt',
    'btc_transacted_total',
    'btc_sent_total',
    'btc_received_total',
    'btc_sent_median',
    'btc_received_median',
    'interracted_output_address_cnt',
    'interracted_input_address_cnt',
    'overall_activity_coef',
    'user_ts_fees_share_mean',
    'user_ts_fees_share_min',
    'user_ts_fees_share_max',
    'whole_fee_5',
]
users_feats = pd.read_csv("../../dataset/custom/UserData.csv") \
    .drop(["class", "whole_fee_4", "whole_fee_6"], axis=1) \
    .rename(columns={
        x: ("user_" + x) for x in USERS_VALID_COLUMNS[1:]
    })

addr_to_user = pd.read_csv("../../dataset/custom/AddrUser.csv")
    
users_feats.head()

Unnamed: 0,userId,user_addr_cnt,user_outcoming_tx_cnt,user_incoming_tx_cnt,user_input_users_cnt,user_output_users_cnt,user_active_time_steps_cnt,user_btc_transacted_total,user_btc_sent_total,user_btc_received_total,user_btc_sent_median,user_btc_received_median,user_interracted_output_address_cnt,user_interracted_input_address_cnt,user_overall_activity_coef,user_user_ts_fees_share_mean,user_user_ts_fees_share_min,user_user_ts_fees_share_max,user_whole_fee_5
0,1,1,0.0,1.0,1.0,0.0,1,0.004675,0.0,0.004675,0.0,0.004675,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2,1,1.0,5.0,5.0,1.0,1,0.06,0.03,0.03,0.0,0.0008,1.0,5.0,0.0,0.977704,0.977704,0.977704,1.0
2,3,1,0.0,1.0,1.0,0.0,1,0.0049,0.0,0.0049,0.0,0.0049,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,4,5656,167.0,4110.0,3546.0,215.0,26,23970.064106,12838.606215,11131.457891,0.03835,0.015823,328.0,11097.0,0.084105,9.835605,3.099915,19.605217,0.0
4,5,1,0.0,1.0,1.0,0.0,1,0.007039,0.0,0.007039,0.0,0.007039,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [37]:
wallets_features_full_agg_with_users = wallets_features_full_agg \
    .merge(
        addr_to_user,
        how="left",
        on="addrId"
    ) \
    .merge(
        users_feats,
        how="left",
        on="userId"
    )


In [38]:
wallets_features_train_agg_with_users = wallets_features_train_agg \
    .merge(
        addr_to_user,
        how="left",
        on="addrId"
    ) \
    .merge(
        users_feats,
        how="left",
        on="userId"
    )
wallets_features_test_agg_with_users = wallets_features_test_agg \
    .merge(
        addr_to_user,
        how="left",
        on="addrId"
    ) \
    .merge(
        users_feats,
        how="left",
        on="userId"
    )
wallets_features_test_agg_with_users.shape

(281034, 56)

In [39]:
wallets_features_full_agg_with_users

Unnamed: 0,addrId,btc_received_total,btc_received_min,btc_received_max,btc_received_mean,incoming_tx_cnt,incoming_tx_input_address_cnt_mean,incoming_tx_output_address_cnt_mean,btc_received_gini,input_address_cnt,...,user_btc_received_total,user_btc_sent_median,user_btc_received_median,user_interracted_output_address_cnt,user_interracted_input_address_cnt,user_overall_activity_coef,user_user_ts_fees_share_mean,user_user_ts_fees_share_min,user_user_ts_fees_share_max,user_whole_fee_5
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,6.999303,6.999303,6.999303,6.999303,1.0,1.0,2.0,0.0,1.0,...,0.000000,0.174495,0.000000,2.0,1.0,0.000000,5.977542,5.977542,5.977542,1.0
1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,5.525902,0.000000,1.0,0.0,0.000000,0.597754,0.597754,0.597754,1.0
2,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,0.390310,0.390310,0.390310,0.390310,1.0,4.0,2.0,0.0,4.0,...,0.760619,5.905637,0.190155,2.0,4.0,0.333333,0.597754,0.597754,0.597754,1.0
3,16zs5SVSyADh5WrLNbZbpRLsBsN5uEzgeK,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.000000,3.770000,0.000000,1.0,0.0,0.000000,0.597754,0.597754,0.597754,1.0
4,1QJpwtUorBKPGUJkSyrRcBKTAHq4CXrdYh,3.200299,3.200299,3.200299,3.200299,1.0,1.0,2.0,0.0,1.0,...,0.000000,2.317006,0.000000,2.0,1.0,0.000000,0.597754,0.597754,0.597754,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822937,13h3xr1LJXQXn9QR3gio62xgdZYAoBrKYQ,16.877397,16.877397,16.877397,16.877397,1.0,1.0,2.0,0.0,1.0,...,0.005785,0.000000,0.005785,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.0
822938,1Dow13WsGdCYewjtxP1acvM47A2XoPx1fF,16.871536,16.871536,16.871536,16.871536,1.0,1.0,2.0,0.0,1.0,...,0.012649,0.000000,0.012649,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.0
822939,18rfNEMrpF4yqw3s2Y9jTSvsfiH6dYC7ph,0.319490,0.319490,0.319490,0.319490,1.0,31.0,2.0,0.0,29.0,...,0.009000,0.000000,0.009000,0.0,29.0,0.000000,0.000000,0.000000,0.000000,0.0
822940,1Ct6Sx8uKTubXPcuLX5jSKb6QKMcqf1Ybj,16.858818,16.858818,16.858818,16.858818,1.0,1.0,2.0,0.0,1.0,...,0.010000,0.000000,0.010000,0.0,1.0,0.000000,0.000000,0.000000,0.000000,0.0


In [40]:
wallets_features_test_agg.shape

(281034, 37)

In [41]:
wallets_features_full_agg_with_users.to_csv("../../dataset/custom/features_fixed/wallets_features_with_users_agg.csv", index=False)

In [42]:
wallets_features_train_agg_with_users.to_csv("../../dataset/custom/features_fixed/wallets_features_with_users__train.csv", index=False)
wallets_features_test_agg_with_users.to_csv("../../dataset/custom/features_fixed/wallets_features_with_users__test.csv", index=False)