In [8]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

# Silence every warning category for the whole session. FutureWarning is a
# subclass of Warning, so the second filter is already covered by the first;
# it is kept to make that intent explicit.
warnings.filterwarnings(action="ignore")
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Limit rendered polars tables to 10 rows and 20 columns.
_ = pl.Config.set_tbl_rows(10).set_tbl_cols(20)

In [40]:
from sdpc.data import joined_train_df, test_data_df

# Pre-computed per-address features that are attached to both splits.
features_df = pl.read_parquet("../data/processed/features_df.parquet")

# The training frame already carries `label`, so the copy stored in
# `features_df` is dropped before joining.
train_df = joined_train_df().join(
    features_df.drop("label"),
    on="address",
    how="left",
)

# NOTE(review): the test join keeps the `label` column from `features_df`;
# a later cell drops "label" from `test_df` — confirm this is intentional.
test_df = test_data_df().join(features_df, on="address", how="left")

In [41]:
# Sanity check: (rows, columns) of the training frame after the feature join.
print(f"Train set shape: {train_df.shape}")

Train set shape: (99067, 267)


### Train Cleaning

In [42]:
# Preview the training rows whose `flipside_is_contract` flag is False.
# Rows where the flag is null are excluded by the filter as well.
is_contract_flag = pl.col("flipside_is_contract")
train_df.filter(is_contract_flag.eq(False))

address,label,split,community,community_size,degree,degree_centrality,pagerank,eigenvector_centrality,clustering_coefficient,…,tx_per_day,avg_native_eth_out_value,avg_native_eth_in_value,native_tx_flow_ratio,fee_to_value_ratio_native_out,outgoing_to_sybil_tx_ratio,avg_dex_swap_in_usd_value,avg_token_transfer_out_usd_value,proportion_base_native_tx_out,recipient_diversity_native_out
str,"decimal[1,0]",str,i64,u32,i64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""0xd4bd9ccf9339d4c7cf216520fd20…",0,"""train""",273690,1,0,0.0,0.000001,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""0xc504596cc84dda3c5445721de880…",1,"""train""",2,26657,2,0.000003,0.000002,0.0,0.0,…,0.023632,0.024608,0.046297,1.833333,0.050961,0.0,15.0,17.2,0.181818,0.909091
"""0x53901b834d8121c512185d182163…",0,"""train""",174853,1,0,0.0,0.000001,0.0,0.0,…,3.0,9.6188e-8,0.0,2.0000e9,2.451349,0.0,0.0,0.0,1.0,1.0
"""0xc52021c8457cf9976b7103034ffd…",0,"""train""",204897,1,0,0.0,0.000001,0.0,0.0,…,0.01506,0.029446,0.031335,1.0,0.028883,0.0,0.0,235.0,0.0,1.0
"""0x083b59dcb9655d2d58443d1ea228…",0,"""train""",143224,1,0,0.0,0.000001,0.0,0.0,…,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,2877.999997,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""0x09057f206f565a29bb9cc8846453…",0,"""train""",175979,2,1,0.000001,0.000002,0.0,0.0,…,0.012153,0.0,0.216264,0.0,0.0,0.0,0.0,2300.285714,0.0,0.0
"""0x292d726143c8a8ed80191cff8c79…",0,"""train""",106657,1,0,0.0,0.000001,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""0x96990bf33d7878ef734eaccb23c8…",0,"""train""",292668,1,0,0.0,0.000001,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""0x100b1f4670e63c6da4a4b78d7cd8…",0,"""train""",84648,1,0,0.0,0.000001,0.0,0.0,…,2.0,1.0000e-8,0.000002,1.0,34.309469,0.0,0.0,0.0,1.0,1.0


In [43]:
print(f"Train set shape: {train_df.shape}")

contract_flag = pl.col("flipside_is_contract")
label_type = pl.col("flipside_label_type")

# Step 1: drop addresses flagged as contracts.
n_contracts = train_df.filter(contract_flag).height
print(f"Removing {n_contracts} contracts")
train_df = train_df.filter(contract_flag == False)  # noqa: E712

# Step 2: drop addresses labelled as CEX. The count is taken after the contract
# filter, so it only covers rows that are still present. Rows with a null
# label type are kept explicitly, because `!=` alone evaluates to null for them.
n_cex = train_df.filter(label_type == "cex").height
print(f"Removing {n_cex} cex addresses")
train_df = train_df.filter(label_type.is_null() | (label_type != "cex"))

Train set shape: (99067, 267)
Removing 6294 contracts
Removing 10912 cex addresses


In [None]:
import polars.selectors as cs

# Columns that are not model inputs: the identifier, the split marker and the target.
# (An optional `cs.starts_with("n2v_")` exclusion was left disabled.)
common_drop_cols = ["address", "split", "label"]

# Target as a float series; features are everything except the dropped columns.
y_train = train_df.get_column("label").cast(float)
X_train = train_df.drop(common_drop_cols)

X_test = test_df.drop(common_drop_cols)

for split_name, frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} set shape: {frame.shape}")

Train set shape: (81861, 264)
Test set shape: (20369, 264)


In [46]:
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.pipeline import FeatureUnion, Pipeline
from skrub import AggTarget, MinHashEncoder, TableVectorizer

# Seed shared by every stochastic estimator in this cell so that repeated runs
# (cross-validation scores and the final submission) are reproducible.
RANDOM_STATE = 42

# Column names grouped by dtype. A selector can be passed to `select` directly;
# wrapping the result in a second `select` is unnecessary.
datetime_cols = X_train.select(cs.datetime()).columns
categorical_cols = X_train.select(cs.string()).columns

# scikit-learn / skrub are fed pandas objects.
X = X_train.shrink_to_fit().to_pandas()
y = y_train.shrink_to_fit().to_pandas()

# String columns are converted to pandas' categorical dtype.
for col in categorical_cols:
    X[col] = X[col].astype("category")

# One AggTarget per key column: each aggregates the target (mean and median)
# grouped by that column; FeatureUnion concatenates the results.
target_encodings = FeatureUnion([
    (
        f"target_encoding_{column}",
        AggTarget(main_key=column, operations=["mean", "median"]),
    )
    for column in [
        "zk_cluster_list_hit",
        "zksync_sybil_list_hit",
        "layer_zero_wallet_list_hit",
        "ct_app_lz_list_hit",
        "ct_app_bn_wl_hit",
        "lz_initial_list_hit",
        "lz_provisional_sybil_list_hit",
        "hop_sybil_list_hit",
        # "num_transactions_to_sybil",
        # "num_transactions_from_sybil",
        # "most_common_platform",
        "community_size",
        "degree",
    ]
])

# Columns with more than 50 distinct values are encoded with MinHashEncoder.
vectorizer = TableVectorizer(
    drop_null_fraction=1,
    cardinality_threshold=50,
    high_cardinality=MinHashEncoder(),  # type: ignore
    n_jobs=-1,
)

# Final classifier; seeded so that fits are repeatable.
model = HistGradientBoostingClassifier(max_iter=500, random_state=RANDOM_STATE)

pipeline = Pipeline([
    ("target_encoding", target_encodings),
    ("vectorizer", vectorizer),
    # Default threshold removes zero-variance (constant) features.
    ("variance_threshold", VarianceThreshold()),
    (
        "feature_selection",
        # Keep features whose random-forest importance is at or above the median.
        SelectFromModel(
            estimator=RandomForestClassifier(
                n_jobs=-1,
                n_estimators=300,
                random_state=RANDOM_STATE,
            ),
            threshold="median",
        ),
    ),
    ("model", model),
])

In [None]:
import numpy as np
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the full pipeline, scored by ROC AUC.
cv_results = cross_validate(pipeline, X, y, cv=5, scoring="roc_auc")
fold_scores = cv_results["test_score"]
print(f"ROC AUC: {np.mean(fold_scores)}")

  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.

## Test Predictions

In [7]:
# Refit the whole pipeline on the full cleaned training set before predicting on test.
pipeline.fit(X, y)

In [8]:
# Class probabilities for the test rows; column 1 is used as the prediction
# score in the submission cells.
# NOTE(review): unlike the training frame, string columns are not cast to
# "category" here — confirm the vectorizer handles both dtypes consistently.
X_test_pd = X_test.to_pandas()
pobabilities = pipeline.predict_proba(X_test_pd)



In [9]:
from datetime import datetime

# Timestamped file name so that successive submissions never overwrite each other.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
name = "final-pipeline-flipside-vec-fs-hgbc"

# Raw (not post-processed) model scores, one row per test address.
raw_submission = test_df.select(
    ADDRESS=pl.col("address"),
    PRED=pl.lit(pobabilities[:, 1]),
)
raw_submission.write_csv(f"../data/submissions/{current_datetime}-{name}.csv")

## Postprocessing

In [24]:
# passport_model_scores = pl.read_parquet(
#     "../data/external/passport_model_scores.parquet"
# )

def _override_pred(condition: pl.Expr, value: int) -> pl.Expr:
    """Build a ``PRED`` column that equals ``value`` wherever ``condition`` is true.

    Rows where the condition is false or null keep their current ``PRED``.
    """
    return pl.when(condition).then(value).otherwise(pl.col("PRED")).alias("PRED")


# Addresses listed in the external Uniswap file; their score is forced to 0 below.
# NOTE(review): `is_in` is an exact string match — confirm both sources use the
# same address casing.
uniswap_verified_wallets = pl.read_csv(
    "../data/external/uniswap_verified_wallets.csv"
).get_column("address")

# TODO: layer_zero_wallet_list has also a large amount of accurate hits, same as lz_provisional_sybil_list


# Rule-based overrides on top of the model scores. The steps run in order,
# so a later rule wins over an earlier one.
preds_df = (
    test_df.with_columns(
        ADDRESS=pl.col("address"),
        PRED=pl.lit(pobabilities[:, 1]),
    )
    # Addresses flagged as contracts -> 0.
    .with_columns(_override_pred(pl.col("flipside_is_contract") == 1, 0))
    # Addresses that carry any flipside label -> 0.
    .with_columns(_override_pred(pl.col("flipside_label").is_not_null(), 0))
    # Addresses hit by the zk cluster list -> 1 (overrides the two rules above).
    .with_columns(_override_pred(pl.col("zk_cluster_list_hit"), 1))
    # .join(passport_model_scores, left_on="ADDRESS", right_on="address", how="left")
    # .with_columns(
    #     pl.when(pl.col("score") > 30).then(0).otherwise(pl.col("PRED")).alias("PRED")
    # )
    # .with_columns(
    #     pl.when(pl.col("score").is_not_null())
    #     .then(pl.col("PRED") * (1 - pl.col("score") / 100))
    #     .otherwise(pl.col("PRED"))
    #     .alias("PRED")
    # )
    # Addresses in the Uniswap list -> 0 (applied last, so it wins over every rule).
    .with_columns(_override_pred(pl.col("ADDRESS").is_in(uniswap_verified_wallets), 0))
    .select("ADDRESS", "PRED")
)

In [25]:
# Fresh timestamp for the post-processed submission file.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
post_submission_path = f"../data/submissions/{current_datetime}-{name}-post.csv"

# Write scores in plain decimal notation (no scientific formatting).
preds_df.select("ADDRESS", "PRED").write_csv(
    post_submission_path,
    float_scientific=False,
)