In [6]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

# Silence all warnings for the rest of the notebook. The first filter has no
# category restriction, so it already covers FutureWarning; the second call is
# kept only to make that intent explicit.
warnings.filterwarnings(action="ignore")
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Limit rendered polars tables to 10 rows and 20 columns. Each setter returns
# the Config class, so the calls chain; assigning to `_` keeps the cell from
# displaying the returned class.
_ = pl.Config.set_tbl_rows(10).set_tbl_cols(20)

In [7]:
from sdpc.data import joined_train_df, test_data_df

# Base frames from the project data helpers, plus the precomputed per-address
# feature table.
train_base_df = joined_train_df()
test_base_df = test_data_df()
features_df = pl.read_parquet("../data/processed/features_df.parquet")

# The feature table's own `label` column is removed before the train join, so
# the `label` read from train_df later is the one supplied by
# joined_train_df(). (Presumably this avoids a duplicate column - confirm.)
train_features_df = features_df.drop("label")

# Left joins on `address` keep every row of the base frames; addresses without
# a match in the feature table get nulls in the feature columns.
train_df = train_base_df.join(train_features_df, on="address", how="left")
test_df = test_base_df.join(features_df, on="address", how="left")

### Train Cleaning

In [8]:
# Sanity check: (rows, columns) of the training frame after the feature join.
train_shape = train_df.shape
print(f"Train set shape: {train_shape}")

Train set shape: (99520, 164)


In [9]:
# Columns that are not model inputs: the row key, the split marker and the
# target itself. The same list is dropped from both frames.
common_drop_cols = ["address", "split", "label"]

# Target as a float column; the model inputs are everything that is left.
y_train = train_df["label"].cast(pl.Float64)
X_train = train_df.drop(common_drop_cols)

X_test = test_df.drop(common_drop_cols)

# Both shapes are printed so a column-count mismatch between the two feature
# matrices is visible immediately.
print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Train set shape: (99520, 161)
Test set shape: (21542, 161)


In [22]:
import polars.selectors as cs
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.pipeline import Pipeline
from skrub import TableVectorizer

# One seed for every stochastic estimator in the pipeline, so the selected
# feature set and the cross-validation scores are reproducible across runs.
RANDOM_STATE = 42

# Column names grouped by dtype. A selector passed straight to `select` is
# enough; wrapping the result in a second `select` was redundant.
# The pipeline below does not use these lists.
datetime_cols = X_train.select(cs.datetime()).columns
categorical_cols = X_train.select(cs.string()).columns

# scikit-learn / skrub are fed pandas objects here.
X = X_train.shrink_to_fit().to_pandas()
y = y_train.shrink_to_fit().to_pandas()

vectorizer = TableVectorizer(
    drop_null_fraction=1,
    cardinality_threshold=50,
    # high_cardinality=MinHashEncoder(),  # type: ignore
    n_jobs=-1,
)

model = HistGradientBoostingClassifier(max_iter=500, random_state=RANDOM_STATE)

# Steps: vectorize the table -> drop constant features (VarianceThreshold
# default) -> keep features whose random-forest importance is at least
# 0.8 * median importance -> gradient-boosted classifier.
pipeline = Pipeline([
    ("vectorizer", vectorizer),
    ("variance_threshold", VarianceThreshold()),
    (
        "feature_selection",
        SelectFromModel(
            estimator=RandomForestClassifier(
                n_jobs=-1,
                n_estimators=200,
                random_state=RANDOM_STATE,
            ),
            threshold="0.8*median",
        ),
    ),
    ("model", model),
])

In [23]:
import numpy as np
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline, scored by ROC AUC. The
# vectorizer and feature selection are part of the pipeline, so they are
# refit inside every fold.
cv_results = cross_validate(pipeline, X, y, scoring="roc_auc", cv=5)

# Report the mean of the per-fold test scores.
mean_test_auc = np.mean(cv_results["test_score"])
print(f"ROC AUC: {mean_test_auc}")



KeyboardInterrupt: 

## Test Predictions

In [7]:
# Refit the full pipeline on all training rows before scoring the test set.
pipeline.fit(X=X, y=y)

In [8]:
# Class-probability matrix for the test rows; column 1 is what the submission
# cells read as the prediction.
# NOTE: the misspelled name `pobabilities` is referenced by the submission and
# post-processing cells; rename it in all of them together or not at all.
X_test_pandas = X_test.to_pandas()
pobabilities = pipeline.predict_proba(X_test_pandas)



In [9]:
from datetime import datetime

# Timestamp prefix so each export gets its own file name.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Run label embedded in the file name; the post-processed export reuses it.
name = "final-pipeline-flipside-vec-fs-hgbc"

# Raw (not post-processed) submission: one row per test address with the
# probability from column 1 of the predict_proba output.
# float_scientific=False writes very small probabilities as plain decimals
# instead of exponent notation, the same setting the post-processed export in
# this notebook uses.
test_df.select(
    pl.col("address").alias("ADDRESS"),
    pl.lit(pobabilities[:, 1]).alias("PRED"),
).write_csv(
    f"../data/submissions/{current_datetime}-{name}.csv",
    float_scientific=False,
)

## Postprocessing

In [24]:
# Disabled experiment (kept for reference): load
# "../data/external/passport_model_scores.parquet", left-join it on the
# address, then either force PRED to 0 where score > 30 or scale PRED by
# (1 - score / 100) where a score exists.

# Addresses from the external Uniswap verified-wallet list.
uniswap_verified_wallets = (
    pl.read_csv("../data/external/uniswap_verified_wallets.csv")
    .select("address")
    .to_series()
)

# TODO: layer_zero_wallet_list has also a large amount of accurate hits, same as lz_provisional_sybil_list

# Conditions behind the override rules, named once so the chain below reads as
# a list of rules.
current_pred = pl.col("PRED")
is_contract = pl.col("flipside_is_contract") == 1
has_no_flipside_label = pl.col("flipside_label").is_null()
in_zk_cluster_list = pl.col("zk_cluster_list_hit")
is_uniswap_verified = pl.col("ADDRESS").is_in(uniswap_verified_wallets)

# Rules run in sequence and each one overwrites PRED, so later rules win:
#   1. contract addresses        -> 0
#   2. any flipside label set    -> 0
#   3. zk cluster list hit       -> 1 (overrides rules 1 and 2)
#   4. Uniswap verified wallet   -> 0 (overrides everything above)
preds_df = (
    test_df.with_columns(
        ADDRESS=pl.col("address"),
        PRED=pl.lit(pobabilities[:, 1]),
    )
    .with_columns(
        pl.when(is_contract).then(0).otherwise(current_pred).alias("PRED")
    )
    .with_columns(
        pl.when(has_no_flipside_label).then(current_pred).otherwise(0).alias("PRED")
    )
    .with_columns(
        pl.when(in_zk_cluster_list).then(1).otherwise(current_pred).alias("PRED")
    )
    .with_columns(
        pl.when(is_uniswap_verified).then(0).otherwise(current_pred).alias("PRED")
    )
    .select("ADDRESS", "PRED")
)

In [25]:
# New timestamp for the post-processed export. `datetime` and `name` are
# defined in the raw-submission cell, which must have run first.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
post_submission_path = f"../data/submissions/{current_datetime}-{name}-post.csv"

# Write the two submission columns; float_scientific=False keeps the values in
# plain decimal notation.
preds_df.select("ADDRESS", "PRED").write_csv(
    post_submission_path,
    float_scientific=False,
)