In [8]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

# Silence every warning category for the rest of the session. The blanket
# "ignore" filter already matches FutureWarning, so no separate per-category
# filter is needed on top of it.
warnings.filterwarnings("ignore")

# Rendered Polars tables show at most 10 rows and 20 columns. The return values
# are bound to `_` so the cell produces no output.
_ = pl.Config.set_tbl_rows(10)
_ = pl.Config.set_tbl_cols(20)

In [9]:
from sdpc.data import (
    joined_dex_swaps_df,
    joined_token_transfers_df,
    joined_train_df,
    joined_transactions_df,
    test_data_df,
    wallet_addresses_df,
)

# Load every input table once through the project's loader helpers. Each name
# below is reused by later cells.
addresses: pl.DataFrame = wallet_addresses_df()
train_df: pl.DataFrame = joined_train_df()
transactions_df: pl.DataFrame = joined_transactions_df()
dex_swaps_df: pl.DataFrame = joined_dex_swaps_df()
token_transfers_df: pl.DataFrame = joined_token_transfers_df()
test_df: pl.DataFrame = test_data_df()

# Features start with addresses: `features_df` is the running feature table
# that the following cells extend by left-joining on "address". This is a plain
# rebinding, so at this point both names refer to the same DataFrame object.
features_df = addresses

In [10]:
# Print stats of unique addresses in train and test datasets
train_addresses = train_df.select("address").unique()
test_addresses = test_df.select("address").unique()

print(f"Number of unique addresses in train dataset: {train_addresses.height}")
print(f"Number of unique addresses in test dataset: {test_addresses.height}")

# Check for any overlap between train and test addresses
overlap = train_addresses.join(test_addresses, on="address", how="inner")
print(f"Number of addresses that appear in both train and test: {overlap.height}")

Number of unique addresses in train dataset: 99067
Number of unique addresses in test dataset: 20369
Number of addresses that appear in both train and test: 2


## Graph Features

In [11]:
# Community assignment per address, extended with the number of (non-null)
# addresses that share each community.
community_assignments = pl.read_csv("../data/processed/addresses_community_simple.csv")
community_size = pl.col("address").count().over("community").alias("community_size")
community_data = community_assignments.with_columns(community_size)

# Attach the community columns to the running feature table.
features_df = features_df.join(community_data, how="left", on="address")

In [12]:
# Load network metrics data
network_metrics = pl.read_csv("../data/processed/network_metrics.csv").drop(
    "label", "split"
)

# Join network metrics with features dataframe
features_df = features_df.join(network_metrics, on="address", how="left")


In [13]:
# Load node2vec embeddings
node2vec_embeddings = pl.read_parquet(
    "../data/processed/node2vec_embeddings.parquet"
).drop("label", "split")

# Join node2vec embeddings with features dataframe
features_df = features_df.join(node2vec_embeddings, on="address", how="left")


## Flipside Data

In [14]:
# Flipside address labels, attached to the feature table by address. Addresses
# without a match keep nulls in the label columns (left join).
flipside_labels_path = "../data/external/flipside_address_labels.parquet"
flipside_addresses_labels: pl.DataFrame = pl.read_parquet(flipside_labels_path)

features_df = features_df.join(flipside_addresses_labels, how="left", on="address")

In [15]:
# Flipside contract table: one column holds contract addresses, another the
# addresses that created them.
flipside_contracts_path = "../data/external/flipside_contracts_data.parquet"
flipside_contracts_labels: pl.DataFrame = pl.read_parquet(flipside_contracts_path)

flipside_contracts = flipside_contracts_labels["address"]
flipside_contracts_creators = flipside_contracts_labels["creator_address"]

# Boolean membership flags: is the wallet a known contract / a known creator?
wallet_address = pl.col("address")
features_df = features_df.with_columns(
    flipside_is_contract=wallet_address.is_in(flipside_contracts),
    flipside_is_contract_creator=wallet_address.is_in(flipside_contracts_creators),
)


## Known Sybil List Hits

In [16]:
# ZK cluster list: keep only the wallet column, renamed to "address", and only
# the rows whose value starts with "0x".
zk_cluster_raw = pl.read_csv("../data/external/zk_cluster_list.csv", ignore_errors=True)
zk_cluster_list = zk_cluster_raw.select(address=pl.col("Wallet Address")).filter(
    pl.col("address").str.starts_with("0x")
)

# Flag the addresses that appear in the list.
zk_cluster_addresses = zk_cluster_list.get_column("address")
features_df = features_df.with_columns(
    zk_cluster_list_hit=pl.col("address").is_in(zk_cluster_addresses)
)

In [17]:
# Read the ZKSync sybil list
all_zksync_sybil_list = pl.DataFrame()
for file in [
    "../data/external/zksync_sybil_list_0.csv",
    "../data/external/zksync_sybil_list_1.csv",
    "../data/external/zksync_sybil_list_2.csv",
]:
    zksync_sybil_list = pl.read_csv(file)
    all_zksync_sybil_list = pl.concat([all_zksync_sybil_list, zksync_sybil_list])

features_df = features_df.with_columns(
    pl.col("address")
    .is_in(all_zksync_sybil_list.get_column("userId"))
    .alias("zksync_sybil_list_hit")
)

In [18]:
# Read the Layer Zero wallet list
layer_zero_wallet_list = pl.read_csv("../data/external/layer_zero_wallet_list.csv")
layer_zero_wallet_addresses = layer_zero_wallet_list.get_column("ADDRESS")

# Add a column indicating if the address is in the Layer Zero wallet list
features_df = features_df.with_columns(
    pl.col("address")
    .is_in(layer_zero_wallet_addresses)
    .alias("layer_zero_wallet_list_hit")
)

In [19]:
# Read the CT App LZ list
ct_app_lz_list = pl.read_parquet("../data/external/ct_app_lz_list.parquet")

# Add a column indicating if the address is in the CT App LZ list
features_df = features_df.with_columns(
    pl.col("address")
    .is_in(ct_app_lz_list.get_column("Line"))
    .alias("ct_app_lz_list_hit")
)

# Read the CT App LZ list
ct_app_bn_wl = pl.read_parquet("../data/external/ct_app_bn_wl.parquet")

# Add a column indicating if the address is in the CT App LZ list
features_df = features_df.with_columns(
    pl.col("address")
    .is_in(ct_app_bn_wl.get_column("Address"))
    .alias("ct_app_bn_wl_hit")
)


In [20]:
# LZ initial list; the wallet addresses live in its "ADDRESS" column.
lz_initial_list = pl.read_parquet("../data/external/lz_initial_list.parquet")
lz_initial_addresses = lz_initial_list["ADDRESS"]

# Flag the addresses that appear in the LZ initial list.
features_df = features_df.with_columns(
    lz_initial_list_hit=pl.col("address").is_in(lz_initial_addresses)
)


In [21]:
# Read the LZ provisional sybil list
lz_provisional_sybil_list = pl.read_parquet(
    "../data/external/lz_provisional_sybil_list.parquet"
)

features_df = features_df.with_columns(
    pl.col("address")
    .is_in(lz_provisional_sybil_list.get_column("address"))
    .alias("lz_provisional_sybil_list_hit")
)

In [22]:
# Read the Hop sybil list
hop_sybil_list = pl.read_csv("../data/external/hop_sybils.csv")

# Add a column indicating if the address is in the Hop sybil list
features_df = features_df.with_columns(
    pl.col("address")
    .is_in(hop_sybil_list.get_column("address"))
    .alias("hop_sybil_list_hit")
)

# Read the Hop sybil list
hop_all_data_filtered = pl.read_csv("../data/external/hop_all_data_filtered.csv")

# Add a column indicating if the address is in the Hop sybil list
features_df = features_df.with_columns(
    pl.col("address")
    .is_in(hop_all_data_filtered.get_column("Wallet"))
    .alias("hop_all_data_filtered_hit")
)


## Sybil Labeling

In [23]:
# Attach the training label to each address. Because this is a left join,
# addresses that are absent from the train table end up with a null "label".
train_labels = train_df.select("address", "label")
features_df = features_df.join(train_labels, how="left", on="address")

## Transactions

In [24]:
# Raw transaction columns to keep, in output order. Each one is exposed under
# its lower-cased name.
transaction_source_columns = [
    "BLOCK_NUMBER",
    "BLOCK_TIMESTAMP",
    "TX_HASH",
    "FROM_ADDRESS",
    "TO_ADDRESS",
    "VALUE",
    "TX_FEE",
    "GAS_PRICE",
    "GAS_LIMIT",
    "GAS_USED",
    "INPUT_DATA",
    "NETWORK",
]
transactions_slim = transactions_df.select(
    [
        pl.col(source_name).alias(source_name.lower())
        for source_name in transaction_source_columns
    ]
)

# First join: features of the sender, added under their original column names.
transactions_with_sender = transactions_slim.join(
    features_df,
    how="left",
    left_on="from_address",
    right_on="address",
)

# Second join: features of the receiver. Their names collide with the sender's
# columns, so they come out with the "_to" suffix.
transactions_df = transactions_with_sender.join(
    features_df,
    how="left",
    left_on="to_address",
    right_on="address",
    suffix="_to",
)

In [25]:
# Aggregations shared by every grouping of the transactions table below
# (per sender, per receiver, and per address/network pair).
# "community" and "community_size" are the unsuffixed columns, i.e. the ones
# joined on from_address in the previous cell.
# NOTE(review): when grouping by from_address these two describe the group's
# own address, so "unique_communities" is presumably constant per sender --
# confirm whether the "_to" columns were intended for that grouping.
common_aggregations = [
    pl.col("block_number").n_unique().alias("unique_block_numbers"),
    pl.col("block_timestamp").min().alias("min_block_timestamp"),
    pl.col("block_timestamp").max().alias("max_block_timestamp"),
    pl.col("tx_hash").n_unique().alias("unique_tx_hashes"),
    pl.col("value").sum().alias("total_value"),
    pl.col("value").mean().alias("avg_value"),
    pl.col("value").max().alias("max_value"),
    pl.col("value").min().alias("min_value"),
    pl.col("tx_fee").sum().alias("total_tx_fee"),
    pl.col("tx_fee").mean().alias("avg_tx_fee"),
    pl.col("gas_price").sum().cast(pl.Int64).alias("total_gas_price"),
    pl.col("gas_price").mean().cast(pl.Int64).alias("avg_gas_price"),
    pl.col("gas_limit").sum().cast(pl.Int64).alias("total_gas_limit"),
    pl.col("gas_limit").mean().cast(pl.Int64).alias("avg_gas_limit"),
    pl.col("gas_used").sum().cast(pl.Int64).alias("total_gas_used"),
    pl.col("gas_used").mean().cast(pl.Int64).alias("avg_gas_used"),
    pl.col("network").n_unique().alias("unique_networks"),
    pl.col("community").n_unique().alias("unique_communities"),
    pl.col("community_size").mean().alias("avg_community_size"),
]

# Sender-side aggregations. They are applied when grouping by to_address, so
# they summarise the senders seen by each receiver.
from_aggregations = [
    pl.col("from_address").n_unique().alias("unique_from_addresses"),
    # Value of the earliest transaction in the group (ordered by block_timestamp).
    pl.col("value").sort_by("block_timestamp").first().alias("first_tx_from_value"),
    # pl.col("passport_stamps_score").mean().alias("avg_passport_stamps_score"),
    pl.col("flipside_address_name").n_unique().alias("address_name_count"),
    pl.col("flipside_is_contract").mean().alias("avg_flipside_is_contract"),
    pl.col("flipside_is_contract").sum().alias("flipside_is_contract_count"),
]

# Receiver-side aggregations. They are applied when grouping by from_address
# and read the "_to"-suffixed columns that came from the to_address join.
to_aggregations = [
    pl.col("to_address").n_unique().alias("unique_to_addresses"),
    # Value of the earliest transaction in the group (ordered by block_timestamp).
    pl.col("value").sort_by("block_timestamp").first().alias("first_tx_to_value"),
    # pl.col("passport_stamps_score_to").mean().alias("avg_passport_stamps_score_to"),
    pl.col("flipside_address_name_to").n_unique().alias("address_name_count_to"),
    pl.col("flipside_is_contract_to").mean().alias("avg_flipside_is_contract_to"),
    pl.col("flipside_is_contract_to").sum().alias("flipside_is_contract_count_to"),
]

# One row per sender: shared metrics, receiver-side metrics, and counts of
# transactions / distinct receivers whose receiver label ("label_to") equals 1.
from_all_metrics_df = (
    transactions_df.group_by(["from_address"])
    .agg(
        *common_aggregations,
        *to_aggregations,
        pl.col("tx_hash")
        .filter(pl.col("label_to") == 1)
        .n_unique()
        .alias("num_transactions_to_sybil"),
        pl.col("to_address")
        .filter(pl.col("label_to") == 1)
        .n_unique()
        .alias("num_unique_to_sybil_addresses"),
    )
    .rename({"from_address": "address"})
)

# One row per receiver: shared metrics, sender-side metrics, and counts of
# transactions / distinct senders whose sender label ("label") equals 1.
to_all_metrics_df = (
    transactions_df.group_by(["to_address"])
    .agg(
        *common_aggregations,
        *from_aggregations,
        pl.col("tx_hash")
        .filter(pl.col("label") == 1)
        .n_unique()
        .alias("num_transactions_from_sybil"),
        pl.col("from_address")
        .filter(pl.col("label") == 1)
        .n_unique()
        .alias("num_unique_from_sybil_addresses"),
    )
    .rename({"to_address": "address"})
)

# Per-sender metrics broken down by network: aggregate per (sender, network),
# then pivot so that every metric gets one column per network value.
# Presumably the pivoted columns are named "<metric>_<network>"; a later cell
# reads "unique_tx_hashes_base" -- verify against the installed Polars version.
from_network_transactions_metrics_df = (
    transactions_df.group_by(["from_address", "network"])
    .agg(
        *common_aggregations,
        pl.col("to_address").n_unique().alias("unique_to_addresses"),
    )
    .pivot(on="network", index="from_address")
).rename({"from_address": "address"})

# Same per-network breakdown, keyed by receiver instead of sender.
to_network_transactions_metrics_df = (
    transactions_df.group_by(["to_address", "network"])
    .agg(
        *common_aggregations,
        pl.col("from_address").n_unique().alias("unique_from_addresses"),
    )
    .pivot(on="network", index="to_address")
).rename({"to_address": "address"})


## DEX Swaps

In [26]:
def _dex_field(raw_name: str, dtype=None) -> pl.Expr:
    """Select a raw DEX column under its lower-cased name, casting it first if a dtype is given."""
    field = pl.col(raw_name)
    if dtype is not None:
        field = field.cast(dtype)
    return field.alias(raw_name.lower())


# Keep the swap columns of interest; the two USD amounts are cast to Int64.
dex_swaps_slim = dex_swaps_df.select(
    _dex_field("BLOCK_NUMBER"),
    _dex_field("BLOCK_TIMESTAMP"),
    _dex_field("TX_HASH"),
    _dex_field("ORIGIN_FROM_ADDRESS"),
    _dex_field("ORIGIN_TO_ADDRESS"),
    _dex_field("CONTRACT_ADDRESS"),
    _dex_field("POOL_NAME"),
    _dex_field("AMOUNT_IN_USD", pl.Int64),
    _dex_field("AMOUNT_OUT_USD", pl.Int64),
    _dex_field("SENDER"),
    _dex_field("TX_TO"),
    _dex_field("PLATFORM"),
    _dex_field("TOKEN_IN"),
    _dex_field("TOKEN_OUT"),
    _dex_field("SYMBOL_IN"),
    _dex_field("SYMBOL_OUT"),
    _dex_field("NETWORK"),
)

# First join: features of the originating address, under their original names.
dex_swaps_with_origin = dex_swaps_slim.join(
    features_df,
    how="left",
    left_on="origin_from_address",
    right_on="address",
)

# Second join: features of the origin "to" address. Colliding column names get
# the "_to" suffix.
dex_swaps_df = dex_swaps_with_origin.join(
    features_df,
    how="left",
    left_on="origin_to_address",
    right_on="address",
    suffix="_to",
)

In [27]:
# Preview the enriched transactions table (sender features unsuffixed,
# receiver features carrying the "_to" suffix).
transactions_df

block_number,block_timestamp,tx_hash,from_address,to_address,value,tx_fee,gas_price,gas_limit,gas_used,…,zk_cluster_list_hit_to,zksync_sybil_list_hit_to,layer_zero_wallet_list_hit_to,ct_app_lz_list_hit_to,ct_app_bn_wl_hit_to,lz_initial_list_hit_to,lz_provisional_sybil_list_hit_to,hop_sybil_list_hit_to,hop_all_data_filtered_hit_to,label_to
"decimal[38,0]",datetime[ms],str,str,str,f64,f64,f64,"decimal[38,0]","decimal[38,0]",…,bool,bool,bool,bool,bool,bool,bool,bool,bool,"decimal[1,0]"
11059545,2020-10-15 09:29:52,"""0xa708fd5c7bb25d6bd7f4ef9d8878…","""0x8fd00f170fdf3772c5ebdcd90bf2…","""0x3ada4c7efe5b8d636ec146270e74…",0.114535,0.000126,6.0,100000,21000,…,false,false,false,false,false,false,false,false,false,0
11061738,2020-10-15 17:26:38,"""0x5cac2e3f829a60564c749422d11f…","""0x00de4b13153673bcae2616b67bf8…","""0x14d33bee5c6ec6c9540b86a24da1…",35.5,0.001155,55.0,21000,21000,…,,,,,,,,,,
11059434,2020-10-15 09:01:31,"""0x94436ef6efb8d4d3e87a4171a01b…","""0x2e1eec9908c9e324551b6aaba3e8…","""0xe1531c9c5eb5c07af24aa8fb0b1f…",0.2,0.002856,136.0,21000,21000,…,,,,,,,,,,
11061593,2020-10-15 16:53:27,"""0xf51f0b918a543cf6314111ed4530…","""0x7caf480cc01c14d66b680146a50d…","""0x69ae0b74d23a741a25a6e997de64…",0.005139,0.0019635,93.5,21000,21000,…,,,,,,,,,,
11063007,2020-10-15 22:00:22,"""0x6f1f945ccf2313595cdb305e99fb…","""0xcc0837a34af3ef10b8f95c65bca1…","""0x7a250d5630b4cf539739df2c5dac…",2.0,0.010686,84.0,168107,127212,…,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
12731122,2024-04-04 17:26:31,"""0xf68a1d675a0c1a1cc86b3ef42465…","""0x25e58696fbc8eb88301ee06a57f7…","""0xaa7b1a92384d5acc9afcc3edc2d5…",0.000777,0.000007,0.060159,195484,123745,…,,,,,,,,,,
12715153,2024-04-04 08:34:13,"""0x188d0efc2e172344fd46a74948dc…","""0x119bb0c3ddb2495cf69f8cc0a70a…","""0xe3b53af74a4bf62ae55110552908…",0.000335,6.9756e-7,0.003341,218822,208794,…,,,,,,,,,,
12729411,2024-04-04 16:29:29,"""0x2dbd056cdc0c7298519b94cb452a…","""0xf1ef8477bbdfc733b54fc66bf911…","""0x3fc91a3afd70395cd496c647d5a6…",0.005,0.000018,0.162292,187592,109621,…,,,,,,,,,,
12715310,2024-04-04 08:39:27,"""0x6eba51362d5b929d4b587eb71f90…","""0x76bf0b7d61878ec164618ae00f2b…","""0x1195cf65f83b3a5768f3c496d3a0…",0.000075,6.8308e-7,0.003459,205800,197485,…,,,,,,,,,,


In [28]:
# Aggregations shared by both DEX groupings below (per origin "from" address
# and per origin "to" address).
common_aggregations = [
    pl.col("tx_hash").n_unique().alias("unique_tx_hashes"),
    pl.col("contract_address").n_unique().alias("unique_contract_addresses"),
    pl.col("block_timestamp").min().alias("min_block_timestamp"),
    pl.col("block_timestamp").max().alias("max_block_timestamp"),
    pl.col("pool_name").n_unique().alias("unique_pool_names"),
    pl.col("amount_in_usd").sum().alias("total_amount_in_usd"),
    pl.col("amount_in_usd").mean().alias("avg_amount_in_usd"),
    pl.col("amount_in_usd").max().alias("max_amount_in_usd"),
    pl.col("amount_in_usd").min().alias("min_amount_in_usd"),
    pl.col("amount_out_usd").sum().alias("total_amount_out_usd"),
    pl.col("amount_out_usd").mean().alias("avg_amount_out_usd"),
    pl.col("amount_out_usd").max().alias("max_amount_out_usd"),
    pl.col("amount_out_usd").min().alias("min_amount_out_usd"),
    pl.col("platform").n_unique().alias("unique_platforms"),
    # value_counts() only orders its output by frequency when sort=True (counts
    # descending). Without it, head(1) returns an arbitrary platform rather
    # than the most common one. Ties are still broken arbitrarily.
    pl.col("platform")
    .value_counts(sort=True)
    .head(1)
    .struct.field("platform")
    .first()
    .alias("most_common_platform"),
    pl.col("community").n_unique().alias("unique_communities"),
    pl.col("community_size").mean().alias("avg_community_size"),
]

# Counterparty counts: distinct origin "from" addresses (used in the per-"to"
# grouping) and distinct origin "to" addresses (used in the per-"from" grouping).
from_aggregations = [
    pl.col("origin_from_address").n_unique().alias("unique_origin_from_addresses")
]
to_aggregations = [
    pl.col("origin_to_address").n_unique().alias("unique_origin_to_addresses")
]

# One row per origin "from" address.
dex_from_all_metrics_df = (
    dex_swaps_df.group_by(["origin_from_address"])
    .agg(
        *common_aggregations,
        *to_aggregations,
    )
    .rename({"origin_from_address": "address"})
)

# One row per origin "to" address.
dex_to_all_metrics_df = (
    dex_swaps_df.group_by(["origin_to_address"])
    .agg(
        *common_aggregations,
        *from_aggregations,
    )
    .rename({"origin_to_address": "address"})
)

## Token Transfers

In [30]:
# Keep the token-transfer columns of interest under lower-case names, then
# attach the feature table twice: once for the sender and once for the receiver.
token_transfers_df = (
    token_transfers_df.select(
        pl.col("BLOCK_NUMBER").alias("block_number"),
        pl.col("BLOCK_TIMESTAMP").alias("block_timestamp"),
        pl.col("TX_HASH").alias("tx_hash"),
        pl.col("ORIGIN_FROM_ADDRESS").alias("origin_from_address"),
        pl.col("ORIGIN_TO_ADDRESS").alias("origin_to_address"),
        pl.col("CONTRACT_ADDRESS").alias("contract_address"),
        pl.col("FROM_ADDRESS").alias("from_address"),
        pl.col("TO_ADDRESS").alias("to_address"),
        # NOTE(review): wrap_numerical=True presumably lets values that do not
        # fit in Int64 wrap around instead of failing the cast, which would
        # silently produce meaningless amounts -- confirm the dtype and range
        # of AMOUNT_USD and whether a float cast is more appropriate.
        pl.col("AMOUNT_USD").cast(pl.Int64, wrap_numerical=True).alias("amount_usd"),
        pl.col("SYMBOL").alias("symbol"),
        pl.col("NETWORK").alias("network"),
    )
    # Sender features: joined on from_address, columns keep their original names.
    .join(
        features_df,
        left_on="from_address",
        right_on="address",
        how="left",
    )
    # Receiver features: joined on to_address; colliding column names get the
    # "_to" suffix.
    .join(
        features_df,
        left_on="to_address",
        right_on="address",
        how="left",
        suffix="_to",
    )
)


In [31]:
# Aggregations shared by both token-transfer groupings below (per sender and
# per receiver).
common_aggregations = [
    pl.col("tx_hash").n_unique().alias("unique_tx_hashes"),
    pl.col("contract_address").n_unique().alias("unique_contract_addresses"),
    pl.col("block_timestamp").min().alias("min_block_timestamp"),
    pl.col("block_timestamp").max().alias("max_block_timestamp"),
    pl.col("symbol").n_unique().alias("unique_symbols"),
    pl.col("amount_usd").sum().alias("total_amount_usd"),
    pl.col("amount_usd").mean().alias("avg_amount_usd"),
    pl.col("amount_usd").max().alias("max_amount_usd"),
    pl.col("amount_usd").min().alias("min_amount_usd"),
    pl.col("network").n_unique().alias("unique_networks"),
    # value_counts() only orders its output by frequency when sort=True (counts
    # descending). Without it, head(1) returns an arbitrary symbol rather than
    # the most common one. Ties are still broken arbitrarily.
    pl.col("symbol")
    .value_counts(sort=True)
    .head(1)
    .struct.field("symbol")
    .first()
    .alias("most_common_symbol"),
]

# Distinct receivers; used in the per-sender grouping.
to_aggregations = [
    pl.col("to_address").n_unique().alias("unique_to_addresses"),
]

# Distinct senders; used in the per-receiver grouping.
from_aggregations = [
    pl.col("from_address").n_unique().alias("unique_from_addresses"),
]

# One row per token-transfer sender.
token_transfers_from_all_metrics_df = (
    token_transfers_df.group_by(["from_address"])
    .agg(
        *common_aggregations,
        *to_aggregations,
    )
    .rename({"from_address": "address"})
)

# One row per token-transfer receiver.
token_transfers_to_all_metrics_df = (
    token_transfers_df.group_by(["to_address"])
    .agg(
        *common_aggregations,
        *from_aggregations,
    )
    .rename({"to_address": "address"})
)


## Aggregate Events

In [32]:
# Per-address metric tables in join order, each paired with the suffix given
# to its columns when a column of the same name already exists in features_df.
# The suffix is applied only on such collisions, so columns from the first
# table with a given name keep their plain name.
metric_tables = [
    (from_all_metrics_df, "_from_all"),
    (to_all_metrics_df, "_to_all"),
    (from_network_transactions_metrics_df, "_from_network"),
    (to_network_transactions_metrics_df, "_to_network"),
    (dex_from_all_metrics_df, "_dex_from_all"),
    (dex_to_all_metrics_df, "_dex_to_all"),
    (token_transfers_from_all_metrics_df, "_token_transfers_from_all"),
    (token_transfers_to_all_metrics_df, "_token_transfers_to_all"),
]

# Left-join every table onto the running feature table, one after another.
for metrics_df, collision_suffix in metric_tables:
    features_df = features_df.join(
        metrics_df,
        how="left",
        on="address",
        suffix=collision_suffix,
    )

## Extra Features

In [33]:
import polars.selectors as cs

# Names of every datetime-typed column currently in the feature table.
datetime_columns = features_df.select(cs.datetime()).columns

# Fixed reference date against which the "days since" features are measured.
reference_date = pl.datetime(2025, 4, 26)

# Earliest and latest timestamp across all datetime columns, per address.
features_df = features_df.with_columns(
    first_interaction=pl.min_horizontal(datetime_columns),
    last_interaction=pl.max_horizontal(datetime_columns),
)

# Whole-day spans derived from those two timestamps.
first_seen = pl.col("first_interaction")
last_seen = pl.col("last_interaction")
features_df = features_df.with_columns(
    interaction_duration=(last_seen - first_seen).dt.total_days(),
    days_since_first_interaction=(reference_date - first_seen).dt.total_days(),
    days_since_last_interaction=(reference_date - last_seen).dt.total_days(),
)

In [34]:
# Number of null feature values per address. "label" is excluded: it comes from
# a left join against the train table, so it is null for every address outside
# that table. Counting it would make null_count differ between train and
# non-train rows for a reason unrelated to the address's features.
features_df = features_df.with_columns(
    pl.sum_horizontal(pl.all().exclude("label").is_null()).alias("null_count")
)

In [35]:
def _zero_filled(column: str) -> pl.Expr:
    """Return the named column with nulls replaced by 0."""
    return pl.col(column).fill_null(0)


def _guarded_ratio(numerator: str, denominator: str) -> pl.Expr:
    """Divide two zero-filled columns, adding 1e-9 to the denominator to avoid dividing by zero."""
    return _zero_filled(numerator) / (_zero_filled(denominator) + 1e-9)


# Outgoing transaction count across native, DEX and token activity.
outgoing_tx_count = (
    _zero_filled("unique_tx_hashes")
    + _zero_filled("unique_tx_hashes_dex_from_all")
    + _zero_filled("unique_tx_hashes_token_transfers_from_all")
)

features_df = features_df.with_columns(
    # 1. Total outgoing transactions (native, DEX, token) per day of interaction
    (outgoing_tx_count / (_zero_filled("interaction_duration") + 1.0)).alias(
        "tx_per_day"
    ),
    # 2. Average ETH value per outgoing native transaction
    _guarded_ratio("total_value", "unique_tx_hashes").alias(
        "avg_native_eth_out_value"
    ),
    # 3. Average ETH value per incoming native transaction
    _guarded_ratio("total_value_to_all", "unique_tx_hashes_to_all").alias(
        "avg_native_eth_in_value"
    ),
    # 4. Ratio of outgoing to incoming native ETH transaction counts
    _guarded_ratio("unique_tx_hashes", "unique_tx_hashes_to_all").alias(
        "native_tx_flow_ratio"
    ),
    # 5. Ratio of total native transaction fees to total native ETH value sent
    _guarded_ratio("total_tx_fee", "total_value").alias(
        "fee_to_value_ratio_native_out"
    ),
    # 6. Proportion of outgoing native transactions sent to addresses labeled as Sybil
    _guarded_ratio("num_transactions_to_sybil", "unique_tx_hashes").alias(
        "outgoing_to_sybil_tx_ratio"
    ),
    # 7. Average USD value of assets swapped in DEXs by the address
    _guarded_ratio("total_amount_in_usd", "unique_tx_hashes_dex_from_all").alias(
        "avg_dex_swap_in_usd_value"
    ),
    # 8. Average USD value of tokens transferred out by the address
    _guarded_ratio(
        "total_amount_usd", "unique_tx_hashes_token_transfers_from_all"
    ).alias("avg_token_transfer_out_usd_value"),
    # 9. Proportion of outgoing native transactions on Base network
    _guarded_ratio("unique_tx_hashes_base", "unique_tx_hashes").alias(
        "proportion_base_native_tx_out"
    ),
    # 10. Average number of unique recipients per outgoing native transaction
    _guarded_ratio("unique_to_addresses", "unique_tx_hashes").alias(
        "recipient_diversity_native_out"
    ),
)

## Postprocessing

In [36]:
# Get columns that have only nulls in the train set
train_df = features_df.filter(pl.col("split") == "train")
train_null_cols = [
    col
    for col in train_df.columns
    if (train_df[col].is_null().sum() == train_df.height)
]

features_df = features_df.drop(train_null_cols)

In [37]:
# Shrink the frame's memory footprint, then persist it as a zstd-compressed
# parquet file.
features_compact = features_df.shrink_to_fit()
features_compact.write_parquet(
    "../data/processed/features_df.parquet",
    compression="zstd",
)

: 