In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(style="darkgrid")

np.random.seed(0)
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [33]:
# Last time step (inclusive) that belongs to the training split; every later
# time step goes to the test split.
CUTOFF = 40

# Address <-> transaction edge lists.
# AddrTx carries an `input_address` column, TxAddr an `output_address` column.
addr_tx_df = pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv")
tx_addr_df = pd.read_csv("../dataset/Elliptic++ Dataset/TxAddr_edgelist.csv")

# Only the transaction id and its time step are needed here, so avoid parsing
# the remaining feature columns of the file. The trailing selection pins the
# column order regardless of the order in the CSV.
tx_info = pd.read_csv(
    "../dataset/Elliptic++ Dataset/txs_features.csv",
    usecols=["txId", "Time step"],
)[["txId", "Time step"]]

# NOTE: the names are historical and easy to misread:
#   addr_to_ts      -> edges from AddrTx (the `input_address` side)
#   addr_to_ts_inp  -> edges from TxAddr (the `output_address` side)
# They are kept unchanged because other cells may refer to them.
addr_to_ts = (
    addr_tx_df
    .merge(tx_info, how="left", on="txId")
    .rename(columns={"input_address": "addrId"})
)

addr_to_ts_inp = (
    tx_addr_df
    .merge(tx_info, how="left", on="txId")
    .rename(columns={"output_address": "addrId"})
)

# Every (address, transaction, time step) edge, whichever side the address is on.
addr_to_ts_full = pd.concat([addr_to_ts_inp, addr_to_ts])

# Temporal split. Edges whose transaction has no entry in `tx_info` get a NaN
# time step from the left merge and therefore fall into neither split.
train_split = addr_to_ts_full[
    addr_to_ts_full["Time step"] <= CUTOFF
].drop_duplicates(["addrId", "txId"])

test_split = addr_to_ts_full[
    addr_to_ts_full["Time step"] > CUTOFF
].drop_duplicates(["addrId", "txId"])


In [34]:
# (rows, columns) of the train and test edge splits, in that order.
tuple(split.shape for split in (train_split, test_split))

((1012737, 3), (255523, 3))

In [35]:
# Number of distinct addresses in the train and test splits, in that order.
tuple(split["addrId"].nunique() for split in (train_split, test_split))

(646229, 185027)

In [36]:
# Raw per-wallet feature table plus the custom address->user mapping and user table.
wallets_features_raw, wallets_to_user, user_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/Elliptic++ Dataset/wallets_features_classes_combined.csv",
        "../dataset/custom/AddrUser.csv",
        "../dataset/custom/UserData.csv",
    )
)

In [37]:
def _gini_coefficient(values):
    """Return the Gini coefficient of a 1-D collection of numbers.

    Degenerate inputs (fewer than two values, or values that sum to zero)
    yield 0.0. Non-numeric input raises instead of being silently mapped to 0.
    """
    sorted_vals = np.sort(np.asarray(values, dtype=float))
    n = sorted_vals.size
    if n < 2:
        return 0.0
    total = sorted_vals.sum()
    if total == 0:
        return 0.0
    ranks = np.arange(1, n + 1)
    return 2 * (ranks * sorted_vals).sum() / (n * total) - (n + 1) / n


def get_split(
    valid_addr_tx,
    ts_cutoff,
    cutoff_type="left",
    wallets_features=None,
    addr_tx_edges=None,
    tx_features=None,
):
    """Build the wallet feature table for one side of the temporal split.

    Parameters
    ----------
    valid_addr_tx : DataFrame
        Must contain ``addrId`` and ``txId`` columns; only these addresses and
        transactions are considered.
    ts_cutoff : number
        Time-step boundary.
    cutoff_type : str, default "left"
        ``"left"`` keeps wallet rows with ``Time step <= ts_cutoff``; any other
        value keeps rows with ``Time step > ts_cutoff``.
    wallets_features : DataFrame, optional
        Wallet rows with at least ``address`` and ``Time step`` columns.
        Defaults to the notebook-level ``wallets_features_raw``.
    addr_tx_edges : DataFrame, optional
        Edge list with ``input_address`` and ``txId`` columns. Defaults to
        reading ``AddrTx_edgelist.csv``.
    tx_features : DataFrame, optional
        Transactions with ``txId``, ``fees`` and ``Time step`` columns.
        Defaults to reading those columns from ``txs_features.csv``.

    Returns
    -------
    DataFrame
        The selected wallet rows with ``address`` replaced by ``addrId`` and
        the original ``fees_median`` column (if any) replaced, plus:

        * ``addr_gini`` - Gini coefficient over the wallet's distinct time steps
        * ``whole_fees_5`` - per wallet, number of its transactions whose fee is
          a whole multiple of 1e-5
        * ``fees_median`` - per wallet, median fee over its transactions
        * ``whole_fees_in_ts_5`` - same count as ``whole_fees_5`` but per
          (wallet, time step)

        The passed-in frames are never modified.
    """
    # Fall back to the notebook-level table / on-disk files when no frame is given.
    if wallets_features is None:
        wallets_features = wallets_features_raw
    if addr_tx_edges is None:
        addr_tx_edges = pd.read_csv("../dataset/Elliptic++ Dataset/AddrTx_edgelist.csv")
    if tx_features is None:
        tx_features = pd.read_csv(
            "../dataset/Elliptic++ Dataset/txs_features.csv",
            usecols=["txId", "fees", "Time step"],
        )

    valid_addrs = valid_addr_tx["addrId"]
    valid_txs = valid_addr_tx["txId"]

    # --- wallet rows for this side of the cutoff ---------------------------
    time_steps = wallets_features["Time step"]
    in_window = (time_steps <= ts_cutoff) if cutoff_type == "left" else (time_steps > ts_cutoff)
    # assign/drop build a new frame, so the filtered slice is never written to;
    # `addrId` ends up as the last column, as before.
    wallets_with_features = (
        wallets_features[wallets_features["address"].isin(valid_addrs) & in_window]
        .assign(addrId=lambda frame: frame["address"])
        .drop(columns="address")
    )

    # --- Gini over the distinct time steps of each wallet ------------------
    wallet_to_gini = (
        wallets_with_features[["addrId", "Time step"]]
        .drop_duplicates()
        .groupby("addrId")["Time step"]
        .apply(_gini_coefficient)
        .reset_index()
        .rename(columns={"Time step": "addr_gini"})
    )

    # --- fee features -------------------------------------------------------
    wallets_to_tx = addr_tx_edges.rename(columns={"input_address": "addrId"})
    wallets_to_tx = wallets_to_tx[
        wallets_to_tx["addrId"].isin(valid_addrs)
        & wallets_to_tx["txId"].isin(valid_txs)
    ]
    tx_to_fee = tx_features[["txId", "fees", "Time step"]]
    tx_to_fee = tx_to_fee[tx_to_fee["txId"].isin(valid_txs)]

    # One merge feeds both the whole-fee counts and the median.
    addr_tx_fees = wallets_to_tx.merge(tx_to_fee, on="txId", how="left")

    fees_per_ts = addr_tx_fees.drop_duplicates(subset=["addrId", "txId", "Time step"])
    scaled_fees = (fees_per_ts["fees"] * (10**5)).to_numpy(dtype=float)
    # Exact float equality with floor() misses values such as 28.999999999999996,
    # so compare against the nearest integer with a small absolute tolerance.
    # NaN fees compare unequal and count as "not whole".
    # NOTE(review): atol=1e-6 assumes fees carry far fewer than 11 decimal
    # places (e.g. amounts with 8 decimals) - TODO confirm.
    fees_per_ts = fees_per_ts.assign(
        whole_fee_5=np.isclose(
            scaled_fees, np.round(scaled_fees), rtol=0.0, atol=1e-6
        ).astype(int)
    )

    addr_to_whole_in_ts = (
        fees_per_ts.groupby(["addrId", "Time step"])["whole_fee_5"]
        .sum()
        .reset_index()
        .rename(columns={"whole_fee_5": "whole_fees_in_ts_5"})
    )
    addr_to_whole_fee = (
        fees_per_ts.groupby(["addrId"])["whole_fee_5"]
        .sum()
        .reset_index()
        .rename(columns={"whole_fee_5": "whole_fees_5"})
    )
    wallets_to_fees_median = (
        addr_tx_fees.drop_duplicates(subset=["addrId", "txId"])
        .groupby(["addrId"])["fees"]
        .median()
        .reset_index()
        .rename(columns={"fees": "fees_median"})
    )

    # --- assemble -----------------------------------------------------------
    # The incoming `fees_median` (when present) is dropped before the
    # recomputed one is merged in, so the result has a single such column.
    wallets_to_whole_features = (
        wallets_with_features
        .merge(wallet_to_gini, on="addrId", how="left")
        .drop(columns="fees_median", errors="ignore")
        .merge(addr_to_whole_fee, on="addrId", how="left")
        .merge(wallets_to_fees_median, on="addrId", how="left")
        .merge(addr_to_whole_in_ts, on=["addrId", "Time step"], how="left")
    )

    # Wallets without any matching input-side transaction get a count of 0;
    # their `fees_median` stays NaN.
    return wallets_to_whole_features.fillna(
        {"whole_fees_5": 0, "whole_fees_in_ts_5": 0}
    )



In [38]:
# Training side: wallet rows at or before the cutoff time step.
train_data = get_split(valid_addr_tx=train_split, ts_cutoff=CUTOFF)

# Test side: wallet rows strictly after the cutoff time step.
test_data = get_split(valid_addr_tx=test_split, ts_cutoff=CUTOFF, cutoff_type="right")

In [39]:
# Label distribution and overall shape of the training feature table.
(train_data.loc[:, "class"].value_counts(), train_data.shape)

(class
 3    722647
 2    265709
 1     24381
 Name: count, dtype: int64,
 (1012737, 61))

In [40]:
# Label distribution and overall shape of the test feature table.
(test_data.loc[:, "class"].value_counts(), test_data.shape)

(class
 3    178141
 2     73162
 1      4220
 Name: count, dtype: int64,
 (255523, 61))

In [41]:
# (rows, columns) of the raw wallet table, for comparison with the split tables above.
(len(wallets_features_raw.index), len(wallets_features_raw.columns))

(1268260, 58)

In [None]:
# Persist both split tables (train first, then test) without the index column.
for split_frame, split_csv in (
    (train_data, "../dataset/custom/train_test_split/wallets_only/wallets_features__train.csv"),
    (test_data, "../dataset/custom/train_test_split/wallets_only/wallets_features__test.csv"),
):
    split_frame.to_csv(split_csv, index=False)

## Aggregating by wallet

In [8]:
# Reload the exported per-(wallet, time step) feature tables from disk.
wallets_feats_train, wallets_feats_test = (
    pd.read_csv(split_csv)
    for split_csv in (
        "../dataset/custom/train_test_split/wallets_only/wallets_features__train.csv",
        "../dataset/custom/train_test_split/wallets_only/wallets_features__test.csv",
    )
)
wallets_feats_train.head()

Unnamed: 0,Time step,class,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median,addrId,addr_gini,whole_fees_5,fees_median,whole_fees_in_ts_5
0,25,2,0.0,1.0,439586.0,439586.0,0.0,1.0,0.0,439586.0,...,24.0,1.0,1.0,1.0,1.0,111112TykSw72ztDN2WJger4cynzWYC5w,0.0,0.0,,0.0
1,25,3,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,0.100358,0.0,,0.0
2,29,3,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,0.100358,0.0,,0.0
3,39,3,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,0.100358,0.0,,0.0
4,39,3,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,8.0,1.0,1.0,1.0,1.0,1111DAYXhoxZx2tsRnzimfozo783x1yC2,0.100358,0.0,,0.0


In [13]:
# One row per wallet: keep the first row encountered for each address and drop
# the time-step column. Build the frame first, then write it; `to_csv(path)`
# returns None, so chaining it onto the assignment left `feats_train` as None.
# NOTE(review): row-level columns (e.g. `whole_fees_in_ts_5`) keep whatever value
# the first row of each wallet happened to have - confirm this is intended.
feats_train = wallets_feats_train.drop_duplicates("addrId").drop(columns=["Time step"])
feats_train.to_csv(
    "../dataset/custom/train_test_split/wallets_only/wallets_features_agg__train.csv",
    index=False,
)

In [14]:
# One row per wallet: keep the first row encountered for each address and drop
# the time-step column. Build the frame first, then write it; `to_csv(path)`
# returns None, so chaining it onto the assignment left `feats_test` as None.
# NOTE(review): row-level columns (e.g. `whole_fees_in_ts_5`) keep whatever value
# the first row of each wallet happened to have - confirm this is intended.
feats_test = wallets_feats_test.drop_duplicates("addrId").drop(columns=["Time step"])
feats_test.to_csv(
    "../dataset/custom/train_test_split/wallets_only/wallets_features_agg__test.csv",
    index=False,
)

In [17]:
# In-memory per-wallet tables: first row per address, time-step column removed.
feats_train, feats_test = (
    per_step_frame.drop_duplicates("addrId").drop(columns=["Time step"])
    for per_step_frame in (wallets_feats_train, wallets_feats_test)
)