In [3]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

In [4]:
# Directory (relative path) that holds the dataset CSV files.
DATA_DIR = "elliptic_dataset"

# Wallet (address) data: per-wallet features and per-wallet class labels.
WALLETS_FEATURES = "wallets_features.csv"
WALLETS_CLASSES = "wallets_classes.csv"

# Wallet-to-wallet edges.
ADDRESS_TO_ADDRESS = "AddrAddr_edgelist.csv"

# Address<->transaction edge lists, usable to build the address-to-address
# graph ourselves (we probably don't need these).
# The original constant name is misspelled ("ADDRESES"); it is kept so existing
# references keep working, and a correctly spelled alias is provided.
ADDRESES_TO_TRANSACTIONS = "AddrTx_edgelist.csv"
ADDRESSES_TO_TRANSACTIONS = ADDRESES_TO_TRANSACTIONS
TRANSACTIONS_TO_ADDRESSES = "TxAddr_edgelist.csv"


## 1. Load and inspect the actors dataset

#### 1.1. Let's first inspect the features of wallets

In [6]:
# Load the per-wallet feature table from the dataset directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV carries a saved index — consider index_col=0 after
# confirming nothing downstream relies on that column.
wallet_features = pd.read_csv(Path(DATA_DIR) / WALLETS_FEATURES)

In [13]:
# Number of rows in the wallet feature table.
print(len(wallet_features))

1268260


In [16]:
# Preview the first five rows (head() defaults to n=5).
# NOTE(review): in the recorded preview the same address appears at several
# time steps, and time step 39 appears twice with identical visible values —
# check whether the table contains duplicate rows.
wallet_features.head()

Unnamed: 0,address,Time step,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,25,0.0,1.0,439586.0,439586.0,0.0,1.0,0.0,439586.0,...,0.0,0.0,0.0,0.0,0.0,24.0,1.0,1.0,1.0,1.0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,25,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
2,1111DAYXhoxZx2tsRnzimfozo783x1yC2,29,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
3,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
4,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0


#### 1.2. Let's now look at the wallet labels:
- Class 1 - Illicit (should be 14,266, so 2%)
- Class 2 - Licit (should be 251,088, so 31%)
- Class 3 - Unknown (the rest)

In [8]:
# Load the per-wallet class labels from the dataset directory.
wallet_classes = pd.read_csv(Path(DATA_DIR) / WALLETS_CLASSES)

In [15]:
# Number of rows in the wallet label table.
print(len(wallet_classes))

822942


In [10]:
# Preview the first five labelled wallets (head() defaults to n=5).
wallet_classes.head()

Unnamed: 0,address,class
0,111112TykSw72ztDN2WJger4cynzWYC5w,2
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3
2,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2
3,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3
4,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2


Let's verify the counts

In [14]:
# Tally every label in a single pass instead of filtering the frame once per class.
class_counts = wallet_classes["class"].value_counts()

# Labels: 1 = illicit, 2 = licit, 3 = unknown. A label that never occurs counts as 0.
illicit_count = int(class_counts.get(1, 0))
licit_count = int(class_counts.get(2, 0))
unknown_count = int(class_counts.get(3, 0))

# wallet_classes is keyed by wallet address, so these are wallet counts
# (the previous messages mislabelled them as transaction counts).
print(f"Illicit wallet count: {illicit_count}")
print(f"Licit wallet count: {licit_count}")
print(f"Unknown wallet count: {unknown_count}")

# Sanity check: the three known classes should account for every row.
labelled_total = illicit_count + licit_count + unknown_count
assert labelled_total == len(wallet_classes), (
    f"{len(wallet_classes) - labelled_total} rows have a class outside {{1, 2, 3}}"
)

Illicit transaction count: 14266
Licit transaction count: 251088
Unknown transaction count: 557588


OK, so the illicit and licit counts check out. When we populate the labels over time steps, the counts should match up to the total number of rows in `wallet_features`.

#### 1.3. Let's inspect the address to address edges

In [17]:
# Load the wallet-to-wallet edge list from the dataset directory.
address_to_address = pd.read_csv(Path(DATA_DIR) / ADDRESS_TO_ADDRESS)

In [18]:
# Number of rows (edges) in the address-to-address edge list.
print(len(address_to_address))

2868964


In [19]:
# Preview the first five edges (head() defaults to n=5).
address_to_address.head()

Unnamed: 0,input_address,output_address
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA


OK, this is not enough: we want to have some edge features. Let's build the edge list ourselves and include transaction features such as the amount.

## 2. Prepare an address-address transaction list with extra features (amount, time, etc.)