In [1]:
%reload_ext autoreload
%autoreload 2

import warnings

import polars as pl

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)

_ = pl.Config.set_tbl_rows(10)
_ = pl.Config.set_tbl_cols(20)

In [2]:
from sdpc.data import joined_train_df, test_data_df

train_df = joined_train_df()
test_df_original = test_data_df()
features_df = pl.read_parquet("../data/processed/features_df_graph_jorge.parquet")

train_df = train_df.join(features_df.drop("label"), on="address", how="left")
test_df = test_df_original.join(features_df, on="address", how="left")

In [3]:
print(f"Train set shape: {train_df.shape}")

Train set shape: (99067, 250)


### Train Cleaning

In [4]:
print(f"Train set shape: {train_df.shape}")

# Remove contracts
print(f"Removing {train_df.filter(pl.col('flipside_is_contract')).shape[0]} contracts")
train_df = train_df.filter(pl.col("flipside_is_contract") == False)  # noqa: E712

# # Remove CEX addresses
print(
    f"Removing {train_df.filter(pl.col('flipside_label_type') == 'cex').shape[0]} cex addresses"
)
train_df = train_df.filter(
    (pl.col("flipside_label_type") != "cex") | (pl.col("flipside_label_type").is_null())
)

Train set shape: (99067, 250)
Removing 6294 contracts
Removing 10912 cex addresses


In [5]:
# STRING

# Drop columns with string (Utf8) dtype from train_df
str_cols = [col for col, dtype in zip(train_df.columns, train_df.dtypes) if dtype == pl.Utf8]
train_df = train_df.drop(str_cols)
test_df = test_df.drop(str_cols)
print(str_cols)

['address', 'split', 'flipside_address_name', 'flipside_label_type', 'flipside_label', 'most_common_platform', 'most_common_symbol', 'most_common_symbol_token_transfers_to_all']


In [6]:
# DATETIME

# drop columns that are datetime

datetime_cols = [col for col, dtype in zip(train_df.columns, train_df.dtypes) if dtype == pl.Datetime]
train_df = train_df.drop(datetime_cols)
test_df = test_df.drop(datetime_cols)

In [7]:
import polars.selectors as cs

common_drop_cols = [
    #"address",
    #"split",
    "label",
] + [
    cs.starts_with("n2v_")  # Embeddings
]

X_train = train_df.drop(common_drop_cols)
y_train = train_df["label"].cast(float)

X_test = test_df.drop(common_drop_cols)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Train set shape: (81861, 157)
Test set shape: (20369, 157)


# Model

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
import numpy as np
from imblearn.under_sampling import NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold

# Split train_df into features (X) and target (y)
#X_original = X_train.drop("label")  # Drop the target column
#y_original = y_train["label"].cast(float)

# Convert to pandas for imblearn compatibility
X_pd = X_train.to_pandas()
y_np = y_train.to_pandas()
test_df_pd = X_test.to_pandas()

# Ensure all columns are numeric
X_pd = X_pd.fillna(-1)
test_df_pd = test_df_pd.fillna(-1)

In [None]:
# Apply Tomek Links cleaning before NearMiss
tl = TomekLinks()
X_pd, y_np = tl.fit_resample(X_pd, y_np)

# Apply NearMiss under-sampling after Tomek Links
nm = NearMiss(version=3)
X, y = nm.fit_resample(X_pd, y_np)

#X,y = X_pd, y_np

# Initialize the Random Forest Classifier with OOB scoring enabled
clf = RandomForestClassifier(
    n_estimators=300,  # Number of trees
    random_state=42,
    #class_weight= "balanced_subsample",
    oob_score=True,  # Enable Out-of-Bag scoring
    n_jobs=-1  # Use all available cores
)

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=skf, scoring="roc_auc")

# Train the model on the entire training set
clf.fit(X, y)

# Print OOB score
print(f"OOB Score: {clf.oob_score_:.4f}")

# Print Cross-Validation scores
print(f"Cross-Validation AUC Scores: {cv_scores}")
print(f"Mean CV AUC Score: {np.mean(cv_scores):.4f}")

# Feature importance
feature_importances = clf.feature_importances_
feature_names = X.columns
sorted_indices = np.argsort(feature_importances)[::-1]

print("\nFeature Importances:")
for idx in sorted_indices:
    print(f"{feature_names[idx]}: {feature_importances[idx]:.4f}")

OOB Score: 0.9965
Cross-Validation AUC Scores: [0.99974525 0.99988978 0.99995552 0.99982597 1.        ]
Mean CV AUC Score: 0.9999

Feature Importances:
native_tx_flow_ratio: 0.0959
unique_communities: 0.0494
avg_gas_price_to_all: 0.0494
min_value_to_all: 0.0468
unique_tx_hashes_to_all: 0.0438
avg_value_to_all: 0.0405
total_gas_price_to_all: 0.0391
total_gas_limit_to_all: 0.0373
total_value_to_all: 0.0345
max_value_to_all: 0.0342
total_tx_fee_to_all: 0.0341
num_transactions_from_sybil: 0.0340
unique_networks_to_all: 0.0338
unique_block_numbers_to_all: 0.0316
avg_tx_fee_to_all: 0.0309
flipside_is_contract_count: 0.0280
address_name_count: 0.0280
first_tx_from_value: 0.0278
total_gas_used_to_all: 0.0278
avg_native_eth_in_value: 0.0249
unique_from_addresses: 0.0228
num_unique_from_sybil_addresses: 0.0124
unique_tx_hashes_ethereum_to_network: 0.0117
unique_from_addresses_ethereum: 0.0088
avg_gas_price_ethereum_to_network: 0.0088
total_gas_price_ethereum_to_network: 0.0088
max_value_ethereum

In [10]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(clf, X, y, cv=skf)  # 5-fold CV predictions
y_proba = cross_val_predict(clf, X, y, cv=skf, method='predict_proba')[:, 1]
# Calculate ROC AUC score
roc_auc = roc_auc_score(y, y_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

# Generate the classification report
report = classification_report(y, y_pred, target_names=["Class 0", "Class 1"])
print(report)

ROC AUC Score: 0.9999
              precision    recall  f1-score   support

     Class 0       0.99      1.00      0.99      2543
     Class 1       1.00      0.99      0.99      2543

    accuracy                           0.99      5086
   macro avg       0.99      0.99      0.99      5086
weighted avg       0.99      0.99      0.99      5086



In [11]:
test_proba = clf.predict_proba(test_df_pd)[:, 1]
test_pred = clf.predict(test_df_pd)

In [12]:
y.mean(), y_pred.mean(), test_pred.mean(), test_proba.mean()

(0.5, 0.49705072748721985, 0.9899847807943444, 0.9879981016904773)

## Postprocessig 

In [13]:
# passport_model_scores = pl.read_parquet(
#     "../data/external/passport_model_scores.parquet"
# )

# uniswap_verified_wallets = (
#     pl.read_csv("../data/external/uniswap_verified_wallets.csv")
#     .select(pl.col("address"))
#     .to_series()
# )


# add to test_df_original the probabilities as PRED
sub = test_df_original.with_columns(pl.Series("PRED", test_proba))
# rename address to ADDRESS
sub = sub.rename({"address": "ADDRESS"})
# include timestamp in the filename
import datetime
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d-%H%M%S")
name = "limon-rf-" + timestamp + ".csv"
print(name)

# sub = (sub.with_columns(
#         pl.when(pl.col("flipside_is_contract") == 1)
#         .then(0)
#         .otherwise(pl.col("PRED"))
#         .alias("PRED"))
#     .with_columns(
#         pl.when(pl.col("flipside_flipside_label").is_null())
#         .then(pl.col("PRED"))
#         .otherwise(0)
#         .alias("PRED")
#     )
#     # .with_columns(
#     #     pl.when(pl.col("zk_cluster_list_hit"))
#     #     .then(1)
#     #     .otherwise(pl.col("PRED"))
#     #     .alias("PRED")
#     # )
#     # .join(passport_model_scores, left_on="ADDRESS", right_on="address", how="left")
#     # .with_columns(
#     #     pl.when(pl.col("score") > 30).then(0).otherwise(pl.col("PRED")).alias("PRED")
#     # )
#     # .with_columns(
#     #     pl.when(pl.col("score").is_not_null())
#     #     .then(pl.col("PRED") / pl.col("score"))
#     #     .otherwise(pl.col("PRED"))
#     #     .alias("PRED")
#     # )
#     .with_columns(
#         pl.when(pl.col("ADDRESS").is_in(uniswap_verified_wallets))
#         .then(0)
#         .otherwise(pl.col("PRED"))
#         .alias("PRED")
#    )
#     .select(
#         pl.col("ADDRESS"),
#         pl.col("PRED"),
#     )
# )
sub = sub.select(
        pl.col("ADDRESS"),
        pl.col("PRED"),
    )
# save the submission file in data/submissions as csv
sub.write_csv("../data/submissions/" + name)

limon-rf-20250514-225942.csv


In [14]:
# mean of sub
mean = sub["PRED"].mean()
print(f"Mean of submission: {mean:.4f}")

Mean of submission: 0.9880
