In [6]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

# Silence all warnings in this kernel. FutureWarning is registered explicitly
# as well, although the blanket `Warning` filter already covers it.
for ignored_category in (Warning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Limits for how many rows / columns polars prints when a frame is displayed.
# The results are bound to `_` so the cell does not echo a return value.
_ = pl.Config.set_tbl_rows(10)
_ = pl.Config.set_tbl_cols(20)

In [7]:
from sdpc.data import joined_train_df, test_data_df

# Pre-computed per-address feature table.
FEATURES_PATH = "../data/processed/features_df.parquet"
features_df = pl.read_parquet(FEATURES_PATH)

# Left-join the features onto both splits by address.
# - Train: the features table's own "label" column is dropped before the join,
#   so the only "label" column on the train frame is the one it already had.
# - Test: the features table is joined as-is; any "label" column it brings in
#   is removed later together with the other non-feature columns.
train_df = joined_train_df().join(
    features_df.drop("label"),
    on="address",
    how="left",
)
test_df = test_data_df().join(
    features_df,
    on="address",
    how="left",
)

In [8]:
# Sanity check: (rows, columns) of the training frame after the feature join.
print(f"Train set shape: {train_df.shape}")

Train set shape: (99067, 267)


### Train Cleaning

In [9]:
print(f"Train set shape: {train_df.shape}")

# Remove contracts.
# Rows where the flag is null are dropped as well, because `filter` only keeps
# rows whose predicate evaluates to true.
is_contract = pl.col("flipside_is_contract")
n_contracts = train_df.filter(is_contract).height
print(f"Removing {n_contracts} contracts")
train_df = train_df.filter(~is_contract)

# Remove CEX addresses.
# Rows without a label type are kept explicitly: a bare `!= "cex"` comparison
# yields null for them and `filter` would otherwise drop those rows.
label_type = pl.col("flipside_label_type")
n_cex = train_df.filter(label_type == "cex").height
print(f"Removing {n_cex} cex addresses")
train_df = train_df.filter(label_type.is_null() | (label_type != "cex"))

Train set shape: (99067, 267)
Removing 6294 contracts
Removing 10912 cex addresses


In [10]:
import polars.selectors as cs

# Non-feature columns removed from both the train and the test frame.
# Further columns can be appended here to exclude them from modelling, e.g.
# the `n2v_`-prefixed columns via cs.starts_with("n2v_") (currently disabled).
common_drop_cols = ["address", "split", "label"]

# Target as floats; features are everything except the columns listed above.
y_train = train_df.get_column("label").cast(pl.Float64)
X_train = train_df.drop(common_drop_cols)

X_test = test_df.drop(common_drop_cols)

for split_name, frame in (("Train", X_train), ("Test", X_test)):
    print(f"{split_name} set shape: {frame.shape}")

Train set shape: (81861, 264)
Test set shape: (20369, 264)


In [11]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.pipeline import FeatureUnion, Pipeline
from skrub import AggTarget, MinHashEncoder, TableVectorizer

# Fixed seed for the stochastic estimators (feature-selection forest and the
# final LightGBM model) so repeated runs of the notebook are reproducible.
SEED = 42

# Columns whose values are used as grouping keys for target encoding: for each
# key, the mean and median of the target are aggregated per key value.
TARGET_ENCODING_KEYS = [
    "zk_cluster_list_hit",
    "zksync_sybil_list_hit",
    "layer_zero_wallet_list_hit",
    "ct_app_lz_list_hit",
    "ct_app_bn_wl_hit",
    "lz_initial_list_hit",
    "lz_provisional_sybil_list_hit",
    "hop_sybil_list_hit",
    # Disabled keys:
    # "num_transactions_to_sybil",
    # "num_transactions_from_sybil",
    # "most_common_platform",
    "community_size",
    "degree",
]

# Column names by dtype family, selected directly with polars selectors.
datetime_cols = X_train.select(cs.datetime()).columns
categorical_cols = X_train.select(cs.string()).columns

# scikit-learn / skrub operate on pandas objects here.
X = X_train.shrink_to_fit().to_pandas()
y = y_train.shrink_to_fit().to_pandas()

# String columns are converted to pandas categoricals in one pass.
X = X.astype({col: "category" for col in categorical_cols})

# One AggTarget per key, combined side by side.
# NOTE(review): FeatureUnion concatenates the full output of every member; if
# each AggTarget returns the input columns plus its aggregates, the base
# columns are repeated once per key (the LightGBM logs report ~2.1k used
# features for 264 input columns) -- TODO confirm and consider chaining the
# encoders instead.
target_encodings = FeatureUnion([
    (
        f"target_encoding_{column}",
        AggTarget(main_key=column, operations=["mean", "median"]),
    )
    for column in TARGET_ENCODING_KEYS
])

# Columns with at least 50 distinct values are min-hash encoded; the remaining
# columns use the vectorizer's default encoders.
vectorizer = TableVectorizer(
    drop_null_fraction=1,
    cardinality_threshold=50,
    high_cardinality=MinHashEncoder(),  # type: ignore
    n_jobs=-1,
)

model = LGBMClassifier(
    n_estimators=500,
    force_col_wise=True,
    n_jobs=-1,
    random_state=SEED,
)

# Full modelling pipeline: target encoding -> vectorization -> removal of
# constant features -> keep features whose random-forest importance is at
# least the median -> LightGBM classifier.
pipeline = Pipeline([
    ("target_encoding", target_encodings),
    ("vectorizer", vectorizer),
    ("variance_threshold", VarianceThreshold()),
    (
        "feature_selection",
        SelectFromModel(
            estimator=RandomForestClassifier(
                n_jobs=-1,
                n_estimators=200,
                random_state=SEED,
            ),
            threshold="median",
        ),
    ),
    ("model", model),
])

In [12]:
import numpy as np
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline, scored by ROC AUC. Every
# pipeline step is refit inside each fold.
# NOTE(review): the recorded run printed a mean ROC AUC of ~0.9997; a score
# this close to 1 is worth double-checking for target leakage in the features.
cv_results = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    cv=5,
    scoring="roc_auc",
)
mean_test_auc = np.mean(cv_results["test_score"])
print(f"ROC AUC: {mean_test_auc}")

  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.

[LightGBM] [Info] Number of positive: 2022, number of negative: 63466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.689871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 392249
[LightGBM] [Info] Number of data points in the train set: 65488, number of used features: 2175
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030876 -> initscore=-3.446417
[LightGBM] [Info] Start training from score -3.446417


  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.

[LightGBM] [Info] Number of positive: 2023, number of negative: 63466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.731452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 383033
[LightGBM] [Info] Number of data points in the train set: 65489, number of used features: 2150
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030891 -> initscore=-3.445923
[LightGBM] [Info] Start training from score -3.445923


  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.

[LightGBM] [Info] Number of positive: 2023, number of negative: 63466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.117247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 390866
[LightGBM] [Info] Number of data points in the train set: 65489, number of used features: 2175
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030891 -> initscore=-3.445923
[LightGBM] [Info] Start training from score -3.445923


  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.

[LightGBM] [Info] Number of positive: 2022, number of negative: 63467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.206414 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 392422
[LightGBM] [Info] Number of data points in the train set: 65489, number of used features: 2170
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030875 -> initscore=-3.446433
[LightGBM] [Info] Start training from score -3.446433


  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.

[LightGBM] [Info] Number of positive: 2022, number of negative: 63467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.266308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 377571
[LightGBM] [Info] Number of data points in the train set: 65489, number of used features: 2145
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030875 -> initscore=-3.446433
[LightGBM] [Info] Start training from score -3.446433


  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)


ROC AUC: 0.9997313018363488


## Test Predictions

In [13]:
# Refit the complete pipeline on the full training data (X, y); the fitted
# pipeline is what the following cells use to score the test set.
pipeline.fit(X, y)

  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)


[LightGBM] [Info] Number of positive: 2528, number of negative: 79333
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.325189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 394296
[LightGBM] [Info] Number of data points in the train set: 81861, number of used features: 2180
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030882 -> initscore=-3.446226
[LightGBM] [Info] Start training from score -3.446226


In [14]:
# Class-probability predictions for the test set, one row per test address in
# the same order as `test_df` (X_test is `test_df` minus the dropped columns).
# Column 1 is the column that downstream cells export as the prediction.
probabilities = pipeline.predict_proba(X_test.to_pandas())

# Misspelled alias kept for backward compatibility: later cells still read
# `pobabilities`.
pobabilities = probabilities

  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)
  return col.replace(r"^\s*$", "", regex=True)


In [15]:
from datetime import datetime

# Timestamp (second resolution) used as a file-name prefix so successive runs
# write to distinct submission files.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Experiment tag embedded in the submission file names.
name = "final-pipeline-all-hit-te-vec-fs-lgbm"

# Raw (not post-processed) submission: test addresses alongside column 1 of
# the predicted probabilities.
test_df.select(
    pl.col("address").alias("ADDRESS"),
    pl.lit(pobabilities[:, 1]).alias("PRED"),
).write_csv(
    f"../data/submissions/{current_datetime}-{name}.csv",
    # Same setting as the post-processed export: always write plain decimal
    # notation, so very small probabilities are not serialised as e.g. 1e-7.
    float_scientific=False,
)

## Postprocessing

In [16]:
# External address lists used by the override rules below.
ct_app_s3_1_all_sybils = pl.read_csv(
    "../data/external/ct_app_s3_1_all_sybils.csv"
).get_column("Line")

uniswap_verified_wallets = pl.read_csv(
    "../data/external/uniswap_verified_wallets.csv"
).get_column("address")

# TODO: layer_zero_wallet_list has also a large amount of accurate hits, same as lz_provisional_sybil_list


def _override_pred(condition: pl.Expr, value: int) -> pl.Expr:
    """Build a replacement for the ``PRED`` column with a hard override.

    Args:
        condition: Boolean expression selecting the rows to override.
        value: Value written to ``PRED`` where ``condition`` is true.

    Returns:
        An expression aliased ``PRED`` that equals ``value`` where
        ``condition`` is true and the current ``PRED`` everywhere else
        (including rows where ``condition`` is null).
    """
    return pl.when(condition).then(value).otherwise(pl.col("PRED")).alias("PRED")


# Start from the model probabilities and apply the manual rules one after the
# other. Each rule reads the PRED produced by the previous one, so when several
# rules match the same address the LAST matching rule wins.
preds_df = (
    test_df.with_columns(
        pl.col("address").alias("ADDRESS"),
        pl.lit(pobabilities[:, 1]).alias("PRED"),
    )
    # Contracts -> 0.
    .with_columns(_override_pred(pl.col("flipside_is_contract") == 1, 0))
    # Addresses that carry any flipside label -> 0.
    .with_columns(_override_pred(pl.col("flipside_label").is_not_null(), 0))
    # Addresses flagged by the zk cluster list -> 1.
    .with_columns(_override_pred(pl.col("zk_cluster_list_hit"), 1))
    # Disabled experiment (was applied at this position): read
    # "../data/external/passport_model_scores.parquet", left-join it with
    # left_on="ADDRESS", right_on="address", set PRED to 0 where score > 30,
    # and scale PRED by (1 - score / 100) where a score is present.
    # Uniswap-verified wallets -> 0.
    .with_columns(
        _override_pred(pl.col("ADDRESS").is_in(uniswap_verified_wallets), 0)
    )
    # Addresses on the ct_app_s3_1 sybil list -> 1.
    .with_columns(
        _override_pred(pl.col("ADDRESS").is_in(ct_app_s3_1_all_sybils), 1)
    )
    .select("ADDRESS", "PRED")
)

In [17]:
# Fresh timestamp for the post-processed file (second resolution).
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
post_submission_path = f"../data/submissions/{current_datetime}-{name}-post.csv"

# Export only the two submission columns, with floats written in plain
# decimal notation rather than scientific notation.
post_submission = preds_df.select("ADDRESS", "PRED")
post_submission.write_csv(post_submission_path, float_scientific=False)