In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Directory that every CSV in this notebook is read from and written back to.
DATA_DIR = "../elliptic_dataset"

# wallet data: file names (inside DATA_DIR) of the wallet feature and label tables
WALLETS_FEATURES = "wallets_features.csv"
WALLETS_CLASSES = "wallets_classes.csv"

# wallet to wallet edges: file name (inside DATA_DIR) of the address-to-address edge list
ADDRESS_TO_ADDRESS = "AddrAddr_edgelist.csv"


## 1. Load and inspect the actors dataset

#### 1.1. Let's first inspect the features of wallets

In [4]:
# Read the wallet feature table and report how many rows it has.
wallet_features = pd.read_csv(Path(DATA_DIR) / WALLETS_FEATURES)
print(len(wallet_features))

1268260


In [5]:
wallet_features.head(5)

Unnamed: 0,address,Time step,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,25,0.0,1.0,439586.0,439586.0,0.0,1.0,0.0,439586.0,...,0.0,0.0,0.0,0.0,0.0,24.0,1.0,1.0,1.0,1.0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,25,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
2,1111DAYXhoxZx2tsRnzimfozo783x1yC2,29,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
3,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
4,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0


#### 1.2. Let's now look at the wallet labels:
- Class 1 - Illicit (should be 14,266, so 2%)
- Class 2 - Licit (should be 251,088, so 31%)
- Class 3 - Unknown (the rest)

In [6]:
# Read the wallet label table and report how many rows it has.
wallet_classes = pd.read_csv(Path(DATA_DIR) / WALLETS_CLASSES)
print(len(wallet_classes))

822942


In [7]:
wallet_classes.head(5)

Unnamed: 0,address,class
0,111112TykSw72ztDN2WJger4cynzWYC5w,2
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3
2,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2
3,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3
4,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2


Let's verify the counts

In [8]:
# Count the rows per label in one pass; a label absent from the data counts as 0.
class_counts = wallet_classes["class"].value_counts()
illicit_count = int(class_counts.get(1, 0))
licit_count = int(class_counts.get(2, 0))
unknown_count = int(class_counts.get(3, 0))

# The rows of wallet_classes are wallets (addresses), not transactions,
# so the messages name them accordingly.
print(f"Illicit wallet count: {illicit_count}")
print(f"Licit wallet count: {licit_count}")
print(f"Unknown wallet count: {unknown_count}")

Illicit transaction count: 14266
Licit transaction count: 251088
Unknown transaction count: 557588


OK, so the illicit and licit counts check out. When we later expand the wallets over time steps, the counts should match up to the total of `wallet_features`.

#### 1.3. Let's inspect the address to address edges

In [9]:
# Read the address-to-address edge list and report how many edges it has.
address_to_address = pd.read_csv(Path(DATA_DIR) / ADDRESS_TO_ADDRESS)
print(len(address_to_address))

2868964


In [10]:
address_to_address.head(5)

Unnamed: 0,input_address,output_address
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA


OK, this is not enough: we want to have some edge features. Let's build them ourselves and include transaction features such as the amount.

## 2. Prepare an address-address transaction list with extra features (amount, time, etc.)

In [11]:
# File names (inside DATA_DIR) of the transaction-level tables.
TRANSACTION_FEATURES = "txs_features.csv"
TRANSACTION_CLASSES = "txs_classes.csv"
TRANSACTION_EDGE_LIST = "txs_edgelist.csv"
ADDRESES_TO_TRANSACTIONS = "AddrTx_edgelist.csv"
TRANSACTIONS_TO_ADDRESSES = "TxAddr_edgelist.csv"

# Load each table into its own frame; the reads are independent of each other.
data_root = Path(DATA_DIR)
transaction_edges = pd.read_csv(data_root / TRANSACTION_EDGE_LIST)
transaction_features = pd.read_csv(data_root / TRANSACTION_FEATURES)
transaction_classes = pd.read_csv(data_root / TRANSACTION_CLASSES)
address_transaction = pd.read_csv(data_root / ADDRESES_TO_TRANSACTIONS)
transaction_address = pd.read_csv(data_root / TRANSACTIONS_TO_ADDRESSES)

In [12]:
transaction_edges.head(5)

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206


In [13]:
transaction_features.shape, transaction_classes.shape

((203769, 184), (203769, 2))

In [14]:
# Named transaction columns worth eyeballing before the merge.
key_transaction_cols = [
    'txId',
    'Time step',
    'in_txs_degree',
    'out_txs_degree',
    'total_BTC',
    'fees',
    'size',
    'num_input_addresses',
    'num_output_addresses',
    'in_BTC_total',
    'out_BTC_total',
]

transaction_features.loc[:, key_transaction_cols].head()

Unnamed: 0,txId,Time step,in_txs_degree,out_txs_degree,total_BTC,fees,size,num_input_addresses,num_output_addresses,in_BTC_total,out_BTC_total
0,3321,1,1.0,0.0,0.533972,0.0001,225.0,1.0,2.0,0.534072,0.533972
1,11108,1,1.0,1.0,5.611778,0.0001,225.0,1.0,2.0,5.611878,5.611778
2,51816,1,1.0,1.0,0.456508,0.0001,226.0,1.0,2.0,0.456608,0.456508
3,68869,1,0.0,1.0,9.3088,0.0001,853.0,3.0,2.0,9.3089,9.3088
4,89273,1,1.0,288.0,852.16468,0.0,445268.0,1.0,13107.0,852.16468,852.16468


In [17]:
address_transaction.head()

Unnamed: 0,input_address,txId
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127
1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,230325139
2,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675
3,16zs5SVSyADh5WrLNbZbpRLsBsN5uEzgeK,230325147
4,1QJpwtUorBKPGUJkSyrRcBKTAHq4CXrdYh,230325154


Build the complete address-to-address transaction dataframe

In [15]:
# Attach the per-transaction features to every (input_address, txId) row.
# how='left' keeps an input row even when its txId has no feature row.
address_transaction_with_features = address_transaction.merge(
    transaction_features,
    on='txId',
    how='left'
)

# Join the output addresses of the same txId. Each input row is paired with
# every output row of that transaction; how='inner' drops inputs whose txId
# has no row in transaction_address.
address_address_transactions = address_transaction_with_features.merge(
    transaction_address,
    on='txId',
    how='inner'
)

# merge with transaction classes (we might want to drop that later)
address_address_transactions = address_address_transactions.merge(
    transaction_classes,
    on='txId',
    how='left'
)

# Print the class distribution explicitly: a bare expression in the middle of
# a cell is evaluated but never displayed, so the counts were silently lost.
print(address_address_transactions['class'].value_counts().sort_index())
address_address_transactions.shape

(2868964, 187)

We can see that the number of entries is the same as for the address-address edge list, which checks out. We also have more entries than transaction features - this is because each BTC transaction can include multiple actors. A transaction with N inputs and M outputs therefore adds N×M entries to the `address_address_transactions` table.

In [122]:
address_address_transactions.groupby('Time step').size().describe()

count        49.000000
mean      58550.285714
std       40317.169123
min        9932.000000
25%       26861.000000
50%       59974.000000
75%       73202.000000
max      190334.000000
dtype: float64

Let's have a look at whether the BTC amount features differ. Note that the amounts are misleading: each one is the total amount of a transaction, which can involve multiple addresses and a bunch of sub-flows. We could scrape the per-address data (painful), or rely on different features, or maybe normalize using the `num_*_addresses` features.

In [17]:
address_address_transactions.head(5)

Unnamed: 0,input_address,txId,Time step,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total,output_address,class
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,-0.128834,0.048298,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH,3
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,-0.128834,0.048298,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,3
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,230325139,1,-0.138128,-0.184668,-1.201369,0.028105,-0.063725,-0.113002,0.547008,...,1.841967,1.194726,5.525902,5.525802,5.525802,5.525802,5.525802,5.525802,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G,3
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1,-0.098483,-0.184668,-1.201369,0.028105,-0.043875,-0.113002,0.547008,...,3.937091,3.85226,11.811274,1.266853,10.544321,5.905587,5.905587,11.811174,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c,3
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1,-0.098483,-0.184668,-1.201369,0.028105,-0.043875,-0.113002,0.547008,...,3.937091,3.85226,11.811274,1.266853,10.544321,5.905587,5.905587,11.811174,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA,3


In [123]:
# Normalise the address strings on both tables so stray whitespace cannot
# produce spurious mismatches.
for col in ("input_address", "output_address"):
    cleaned = address_address_transactions[col].astype(str).str.strip()
    address_address_transactions[col] = cleaned

wallet_features["address"] = wallet_features["address"].astype(str).str.strip()


# Addresses seen on either side of an edge vs. addresses that have wallet features.
tx_addrs = set(address_address_transactions["input_address"]).union(
    address_address_transactions["output_address"]
)
wallet_addrs = set(wallet_features["address"])

missing = wallet_addrs.difference(tx_addrs)
all_present = wallet_addrs <= tx_addrs

print("All wallet addresses in transactions:", all_present)
print("Wallet count:", len(wallet_addrs), "| In transactions:", len(tx_addrs), "| Missing:", len(missing))
print("Sample missing:", sorted(missing)[:10])

All wallet addresses in transactions: True
Wallet count: 822942 | In transactions: 822942 | Missing: 0
Sample missing: []


Prepare a version with the known subset of features

In [89]:
# Keep every column except the Local_feature_* / Aggregate_feature_* ones.
# The explicit .copy() makes this an independent frame: later cells add an
# "id" column and set the index on it in place, and those edits must never be
# ambiguous writes against a selection of address_address_transactions.
address_address_transactions_known_features = address_address_transactions.loc[
    :, ~address_address_transactions.columns.str.startswith(("Aggregate_feature", "Local_feature"))
].copy()

In [90]:
address_address_transactions_known_features.head(5)

Unnamed: 0,input_address,txId,Time step,in_txs_degree,out_txs_degree,total_BTC,fees,size,num_input_addresses,num_output_addresses,...,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total,output_address,class
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,0.0,1.0,6.999303,0.001,225.0,1.0,2.0,...,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH,3
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,0.0,1.0,6.999303,0.001,225.0,1.0,2.0,...,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,3
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,230325139,1,0.0,1.0,5.525802,0.0001,486.0,3.0,1.0,...,1.841967,1.194726,5.525902,5.525802,5.525802,5.525802,5.525802,5.525802,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G,3
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1,0.0,1.0,11.811174,0.0001,521.0,3.0,2.0,...,3.937091,3.85226,11.811274,1.266853,10.544321,5.905587,5.905587,11.811174,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c,3
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675,1,0.0,1.0,11.811174,0.0001,521.0,3.0,2.0,...,3.937091,3.85226,11.811274,1.266853,10.544321,5.905587,5.905587,11.811174,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA,3


In [91]:
print(address_address_transactions_known_features.columns.to_list())

['input_address', 'txId', 'Time step', 'in_txs_degree', 'out_txs_degree', 'total_BTC', 'fees', 'size', 'num_input_addresses', 'num_output_addresses', 'in_BTC_min', 'in_BTC_max', 'in_BTC_mean', 'in_BTC_median', 'in_BTC_total', 'out_BTC_min', 'out_BTC_max', 'out_BTC_mean', 'out_BTC_median', 'out_BTC_total', 'output_address', 'class']


### Calculate properly aggregated per-timestep features

In [92]:
# First number: distinct transactions; second number: rows in the table.
print(address_address_transactions_known_features["txId"].nunique())
print(address_address_transactions_known_features.shape[0])

# There are more rows than transactions because a transaction that involves
# multiple addresses contributes one row per (input address, output address) pair.

202804
2868964


In [93]:
# Gather the distinct addresses from each side of the edges, then merge both
# sides into one de-duplicated list.
adresses_list = [
    *address_address_transactions_known_features['input_address'].unique(),
    *address_address_transactions_known_features['output_address'].unique(),
]
addresses_unique = list(set(adresses_list))
print(len(addresses_unique))

822942


In [94]:
# Give every row a positional "id" (0..n-1) as the first column.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "id" not in address_address_transactions_known_features.columns:
    address_address_transactions_known_features.insert(
        0, "id", range(0, address_address_transactions_known_features.shape[0])
    )

In [None]:
# Index the frame by "id" so rows can be fetched with .loc[<ids>] later on.
# drop=False keeps "id" as a regular column as well, because later cells
# also select it by name.
address_address_transactions_known_features.set_index("id", drop=False, inplace=True)
address_address_transactions_known_features.head(2)

Unnamed: 0_level_0,id,input_address,txId,Time step,in_txs_degree,out_txs_degree,total_BTC,fees,size,num_input_addresses,...,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total,output_address,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,0.0,1.0,6.999303,0.001,225.0,1.0,...,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH,3
1,1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127,1,0.0,1.0,6.999303,0.001,225.0,1.0,...,7.000303,7.000303,7.000303,0.173495,6.825808,3.499652,3.499652,6.999303,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,3


In [None]:
# Slim (address, Time step, id) lookups, one per edge side.
# "id" is unique per row (it was assigned from a range), so every
# (address, Time step, id) triple is already distinct: a drop_duplicates()
# pass over the ~2.9M rows would remove nothing, and plain column selection
# yields the same frames.
input_adddress_time_steps_ids = address_address_transactions_known_features[['input_address', 'Time step', 'id']]
output_adddress_time_steps_ids = address_address_transactions_known_features[['output_address', 'Time step', 'id']]

(417525, 2)
(721887, 2)


In [109]:
# Accumulator for the per-(address, time step) feature rows.
rows = []

# Columns we would like to summarise; only those that actually exist in the
# known-features table are kept.
_wanted_numeric_cols = (
    "fees", "size", "in_txs_degree", "out_txs_degree",
    "num_input_addresses", "num_output_addresses",
    "total_BTC",
    "in_BTC_min", "in_BTC_max", "in_BTC_mean", "in_BTC_median", "in_BTC_total",
    "out_BTC_min", "out_BTC_max", "out_BTC_mean", "out_BTC_median", "out_BTC_total",
)
_available_cols = set(address_address_transactions_known_features.columns)
numeric_cols = [name for name in _wanted_numeric_cols if name in _available_cols]

In [110]:
def _agg(df, prefix):
    out = {}
    if df is None or df.empty:
        out[f"{prefix}num"] = 0
        out[f"{prefix}total_fees"] = 0.0
        out[f"{prefix}mean_fees"] = float("nan")
        out[f"{prefix}median_fees"] = float("nan")
        for col in numeric_cols:
            out[f"{prefix}{col}_sum"] = 0.0
            out[f"{prefix}{col}_mean"] = float("nan")
            out[f"{prefix}{col}_median"] = float("nan")
        # btc convenience
        base = "btc_in" if prefix=="in_" else "btc_out"
        out[f"{prefix}total_{base}"] = 0.0
        out[f"{prefix}mean_{base}"] = float("nan")
        out[f"{prefix}median_{base}"] = float("nan")
        return out

    out[f"{prefix}num"] = int(df["txId"].nunique()) if "txId" in df.columns else int(len(df))

    fees = df["fees"] if "fees" in df.columns else pd.Series(dtype=float)
    out[f"{prefix}total_fees"]  = float(fees.sum()) if not fees.empty else 0.0
    out[f"{prefix}mean_fees"]   = float(fees.mean()) if not fees.empty else float("nan")
    out[f"{prefix}median_fees"] = float(fees.median()) if not fees.empty else float("nan")

    if prefix == "in_":   # wallet receives -> summarize tx total out
        btc_series, base = df.get("out_BTC_total", pd.Series(dtype=float)), "btc_in"
    else:                  # wallet sends   -> summarize tx total in
        btc_series, base = df.get("in_BTC_total",  pd.Series(dtype=float)), "btc_out"
    out[f"{prefix}total_{base}"]  = float(btc_series.sum()) if not btc_series.empty else 0.0
    out[f"{prefix}mean_{base}"]   = float(btc_series.mean()) if not btc_series.empty else float("nan")
    out[f"{prefix}median_{base}"] = float(btc_series.median()) if not btc_series.empty else float("nan")

    for col in numeric_cols:
        if col in df.columns:
            out[f"{prefix}{col}_sum"]    = float(df[col].sum())
            out[f"{prefix}{col}_mean"]   = float(df[col].mean())
            out[f"{prefix}{col}_median"] = float(df[col].median())
        else:
            out[f"{prefix}{col}_sum"]    = 0.0
            out[f"{prefix}{col}_mean"]   = float("nan")
            out[f"{prefix}{col}_median"] = float("nan")
    return out

In [None]:
from tqdm import tqdm

# Start from an empty list every time this cell runs; otherwise re-running it
# would append a second copy of every row to `rows`.
rows = []

# Positional row indices of each address inside the (address, Time step, id) frames.
inp_map = input_adddress_time_steps_ids.groupby("input_address", sort=False).indices
out_map = output_adddress_time_steps_ids.groupby("output_address", sort=False).indices
no_rows = np.empty(0, dtype=np.intp)  # typed empty indexer for addresses absent on one side

for address in tqdm(addresses_unique):
    inp_rows = input_adddress_time_steps_ids.take(inp_map.get(address, no_rows))
    out_rows = output_adddress_time_steps_ids.take(out_map.get(address, no_rows))

    # Fetch this address's full feature rows once, instead of once per time step.
    sent_all = address_address_transactions_known_features.loc[inp_rows['id'].to_numpy()]
    received_all = address_address_transactions_known_features.loc[out_rows['id'].to_numpy()]

    # Sorted so rows are produced in a deterministic, chronological order.
    unique_time_steps = sorted(set(inp_rows['Time step'].tolist() + out_rows['Time step'].tolist()))
    for time_step in unique_time_steps:
        # Cumulative view: everything the address did up to and including this step.
        input_data_df = sent_all[sent_all['Time step'] <= time_step]
        output_data_df = received_all[received_all['Time step'] <= time_step]

        row = {"address": address, "Time step": time_step}
        # Rows where the address is the output side are its incoming transactions,
        # rows where it is the input side are its outgoing ones.
        row.update(_agg(output_data_df, prefix="in_"))
        row.update(_agg(input_data_df,  prefix="out_"))
        rows.append(row)



100%|██████████| 822942/822942 [21:07<00:00, 649.25it/s]


In [112]:
# Materialise the collected rows and order them by wallet, then by time step,
# with a fresh 0..n-1 index.
wallets_ts_final = pd.DataFrame(rows).sort_values(
    by=["address", "Time step"],
    ignore_index=True,
)
wallets_ts_final.head(5)

Unnamed: 0,address,Time step,appears_at_t,in_num,in_total_fees,in_mean_fees,in_median_fees,in_total_btc_in,in_mean_btc_in,in_median_btc_in,...,out_out_BTC_max_median,out_out_BTC_mean_sum,out_out_BTC_mean_mean,out_out_BTC_mean_median,out_out_BTC_median_sum,out_out_BTC_median_mean,out_out_BTC_median_median,out_out_BTC_total_sum,out_out_BTC_total_mean,out_out_BTC_total_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,25,True,1,0.169015,0.007042,0.007042,154.368773,6.432032,6.432032,...,,0.0,,,0.0,,,0.0,,
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,25,True,1,0.000122,0.000122,0.000122,0.01989,0.01989,0.01989,...,,0.0,,,0.0,,,0.0,,
2,1111DAYXhoxZx2tsRnzimfozo783x1yC2,29,True,2,0.000245,0.000123,0.000123,0.153667,0.076833,0.076833,...,,0.0,,,0.0,,,0.0,,
3,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,True,4,0.001405,0.000351,0.000351,0.162912,0.040728,0.012617,...,,0.0,,,0.0,,,0.0,,
4,1111DAYXhoxZx2tsRnzimfozo783x1yC2,43,True,6,0.001889,0.000315,0.000242,0.2191,0.036517,0.014352,...,,0.0,,,0.0,,,0.0,,


In [113]:
# The row-building loop in this notebook does not emit an `appears_at_t`
# column, so only inspect it when it is actually present (otherwise this cell
# raises KeyError on a fresh run).
if "appears_at_t" in wallets_ts_final.columns:
    count_non_t = int((wallets_ts_final["appears_at_t"] == False).sum())
else:
    # Rows are only built for time steps at which the address has at least one
    # transaction, so no row can refer to a step where the address is absent.
    count_non_t = 0
print(f"Number of rows where address does not appear at time step: {count_non_t}")

Number of rows where address does not appear at time step: 0


In [114]:
# errors="ignore" keeps this cell runnable when the helper column is absent
# (it is not created by the loop above) and when the cell is run a second time.
wallets_ts_final = wallets_ts_final.drop(columns=["appears_at_t"], errors="ignore")

In [117]:
# Rows whose wallet has both received and sent at least one transaction so far.
has_incoming = wallets_ts_final["in_num"].gt(0)
has_outgoing = wallets_ts_final["out_num"].gt(0)
num_both_in_out = (has_incoming & has_outgoing).sum()
print(f"Number of rows where address has both in and out transactions: {num_both_in_out}")

Number of rows where address has both in and out transactions: 247522


In [119]:
# Replace every NaN with 0. The NaNs come from the mean/median entries that
# _agg emits when a wallet has no transactions on one side.
# NOTE(review): after this, "no transactions" is indistinguishable from a
# genuine mean/median of 0 — confirm that is acceptable downstream.
wallets_ts_final_filled = wallets_ts_final.fillna(0)
wallets_ts_final_filled.head(5)

Unnamed: 0,address,Time step,in_num,in_total_fees,in_mean_fees,in_median_fees,in_total_btc_in,in_mean_btc_in,in_median_btc_in,in_fees_sum,...,out_out_BTC_max_median,out_out_BTC_mean_sum,out_out_BTC_mean_mean,out_out_BTC_mean_median,out_out_BTC_median_sum,out_out_BTC_median_mean,out_out_BTC_median_median,out_out_BTC_total_sum,out_out_BTC_total_mean,out_out_BTC_total_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,25,1,0.169015,0.007042,0.007042,154.368773,6.432032,6.432032,0.169015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,25,1,0.000122,0.000122,0.000122,0.01989,0.01989,0.01989,0.000122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1111DAYXhoxZx2tsRnzimfozo783x1yC2,29,2,0.000245,0.000123,0.000123,0.153667,0.076833,0.076833,0.000245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,4,0.001405,0.000351,0.000351,0.162912,0.040728,0.012617,0.001405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1111DAYXhoxZx2tsRnzimfozo783x1yC2,43,6,0.001889,0.000315,0.000242,0.2191,0.036517,0.014352,0.001889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# Persist the NaN-filled cumulative wallet features next to the source data.
features_until_t_path = os.path.join(DATA_DIR, "wallets_features_until_t.csv")
wallets_ts_final_filled.to_csv(features_until_t_path, index=False)

In [125]:
def _address_time_pairs(frame):
    """Return the distinct (address, Time step) combinations of ``frame`` as a set of tuples."""
    unique_rows = frame[["address", "Time step"]].drop_duplicates()
    return {tuple(record) for record in unique_rows.to_records(index=False)}


# Pairs present in the original wallet table vs. pairs we built ourselves.
expected_pairs = _address_time_pairs(wallet_features)
built_pairs = _address_time_pairs(wallets_ts_final)

missing_pairs = expected_pairs - built_pairs
extra_pairs = built_pairs - expected_pairs

print("All (address, t) covered:", expected_pairs.issubset(built_pairs))
print("Missing pairs:", len(missing_pairs))
print("Extra pairs:", len(extra_pairs))
print("Sample missing:", list(missing_pairs)[:10])

All (address, t) covered: True
Missing pairs: 0
Extra pairs: 0
Sample missing: []


In [127]:
print(len(wallet_features))
print(len(wallets_ts_final))

1268260
920691


In [128]:
# Compare the number of distinct (address, Time step) pairs in the original
# wallet table with the number of rows we built.
pair_cols = ["address", "Time step"]
wf_pairs = wallet_features[pair_cols]
n_unique_pairs = wf_pairs.drop_duplicates().shape[0]
print("unique pairs in wallet_features:", n_unique_pairs)
print("rows in wallets_ts_final:", wallets_ts_final.shape[0])

# keep=False flags every member of a duplicated group, not just the repeats.
dups_mask = wf_pairs.duplicated(keep=False)
print("duplicated (address,t) rows in wallet_features:", dups_mask.sum())


# The ten (address, Time step) groups with the most repeated rows.
duplicated_rows = wallet_features[dups_mask]
group_sizes = duplicated_rows.groupby(pair_cols).size()
dup_groups = group_sizes.sort_values(ascending=False).head(10)
print("top duplicate groups:\n", dup_groups)

assert n_unique_pairs == len(wallets_ts_final)

unique pairs in wallet_features: 920691
rows in wallets_ts_final: 920691
duplicated (address,t) rows in wallet_features: 594114
top duplicate groups:
 address                             Time step
3P9wZeSQN44aozpCzjvCzW3bkM8RdZnYng  10           568
366Dgw4pi3rnvu5zizVWZF6nijWxZWc6RA  19           479
3QBEjSdESMK2puAvs7J8d15e8SBnsX1pN1  33           397
13vHWR3iLsHeYwT42RnuKYNBoVPrKKZgRv  39           393
1JS2HEY2WgesydnEBTxKnTWUVe6SccXPuL  10           372
366Dgw4pi3rnvu5zizVWZF6nijWxZWc6RA  20           348
33fjpQKTzzyQWu6T4PcXUxgbqRbd618u8U  9            333
1P9RQEr2XeE3PEb44ZE35sfZRRW1JHU8qx  45           313
3Kd1EUax2vjYNL1Hg1i11SoSd8brvKcrHp  14           260
3GjVjz9Rx3Vq97C5NnonsfZdzRV7LLa9LR  33           237
dtype: int64


Write to CSV - the edge list is split into 8 parts so that Git LFS does not complain about the file size

In [33]:
def split_into_n_parts(df, n, out_dir, base):
    """Write ``df`` as ``n`` consecutive CSV files and return their paths.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split row-wise; row order is preserved across the parts.
    n : int
        Number of parts. Part sizes differ by at most one row, with the
        larger parts first.
    out_dir : str
        Directory the files are written to.
    base : str
        File-name stem; part ``i`` (1-based) is written to
        ``{base}_part_{i}.csv``.

    Returns
    -------
    list of str
        Paths of the written files, in part order.
    """
    # Split the row positions rather than the frame itself: np.array_split on
    # a DataFrame goes through a deprecated pandas code path and emits a
    # warning on recent pandas versions.
    position_chunks = np.array_split(np.arange(len(df)), n)
    paths = []
    for i, positions in enumerate(position_chunks, 1):
        p = os.path.join(out_dir, f"{base}_part_{i}.csv")
        df.iloc[positions].to_csv(p, index=False)
        paths.append(p)
    return paths

split_into_n_parts(address_address_transactions, n=8, out_dir=DATA_DIR, base="AddrTxAddr_edgelist")

  return bound(*args, **kwds)


The rest is good to go at once

In [None]:
# The full edge list is written in parts by split_into_n_parts, so the
# single-file write below stays disabled:
# address_address_transactions.to_csv(os.path.join(DATA_DIR, "AddrTxAddr_edgelist.csv"), index=False)
# NOTE(review): `address_pair_aggregated` is not assigned anywhere in the
# visible notebook; unless it is defined elsewhere, the next line raises
# NameError on a fresh kernel — confirm where it is supposed to come from.
address_pair_aggregated.to_csv(os.path.join(DATA_DIR, "AddrTxAddr_edgelist_aggregated.csv"), index=False)
# index=False drops the "id" index; the "id" column itself is still written
# because the index was set with drop=False.
address_address_transactions_known_features.to_csv(os.path.join(DATA_DIR, "AddrTxAddr_edgelist_known_features.csv"), index=False)