<a href="https://colab.research.google.com/github/farahBassoumi/defi-sandwich-attack-detection/blob/main/sybil_detection_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import joblib
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import time

import matplotlib
matplotlib.rcParams['font.family'] = 'Arial'  # Avoid glyph warnings


class InteractiveSybilDetectionPipeline:
    def __init__(self, df):
        self.df = df.copy()
        self.processed_df = None
        self.feature_importance = None
        self.scaler = StandardScaler()
        self.encoders = {}
        self.step_history = []
        self.model = None
        self.selected_features = None
        self.predictions_df = None
        self.training_feature_order = None

        print("=" * 60)
        print("SYBIL DETECTION PIPELINE INITIALIZED")
        print("=" * 60)
        self._show_dataset_overview(self.df, "Original Dataset")

    def _show_dataset_overview(self, df, title):
        print(f"\n{title.upper()}:")
        print("-" * 40)
        print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")

        if 'is_sybil' in df.columns:
            dist = df['is_sybil'].value_counts()
            total = len(df)
            print("Target Distribution:")
            non_sybil_count = dist.get(False, 0)
            sybil_count = dist.get(True, 0)
            print(f"  • Non-Sybil: {non_sybil_count:,} ({non_sybil_count / total * 100:.1f}%)")
            print(f"  • Sybil:     {sybil_count:,} ({sybil_count / total * 100:.1f}%)")
        else:
            print("Target Distribution: 'is_sybil' column not found.")

        missing_count = df.isnull().sum().sum()
        if missing_count > 0:
            print(f"Missing Values: {missing_count:,}")
        else:
            print("Missing Values: None")


    def aggregate_features(self, inference_mode=False):
        print("\n" + "=" * 60)
        print("PHASE 1: FEATURE AGGREGATION")
        print("=" * 60)

        # Preprocessing steps
        preprocessing_tasks = [
            ("Converting detecttime to datetime format",
             lambda: pd.to_datetime(self.df['detecttime'], errors='coerce') if 'detecttime' in self.df.columns else None),
            ("Creating detect_date column",
             lambda: setattr(self.df, 'detect_date', self.df['detecttime'].dt.date) if 'detect_date' not in self.df.columns and 'detecttime' in self.df.columns else None),
            ("Converting value column to numeric",
             lambda: pd.to_numeric(self.df['value'], errors='coerce') if 'value' in self.df.columns else None)
        ]

        print("\nPreprocessing Data...")
        with tqdm(total=len(preprocessing_tasks), desc="Preprocessing",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
                  colour='blue') as pbar:
            for desc, func in preprocessing_tasks:
                pbar.set_description(f"Preprocessing: {desc}")
                result = func()
                if result is not None:
                    if 'detecttime' in desc.lower():
                        self.df['detecttime'] = result
                    elif 'value' in desc.lower():
                        self.df['value'] = result
                pbar.update(1)
                time.sleep(0.1)

        # Numeric feature aggregation
        numeric_cols = [
            'nonce', 'gas', 'gasprice', 'value', 'blockspending',
            'time_pending_by_blocknative', 'gasused', 'maxpriorityfeepergas',
            'maxfeepergas', 'basefeepergas', 'time_pending',
            'was_evicted', 'was_rejected'
        ]

        available_numeric_cols = [col for col in numeric_cols if col in self.df.columns and pd.api.types.is_numeric_dtype(self.df[col])]
        aggregated = []

        print(f"\nAggregating {len(available_numeric_cols)} numeric features...")
        with tqdm(total=len(available_numeric_cols), desc="Numeric Aggregation",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
                  colour='green') as pbar:
            for col in available_numeric_cols:
                pbar.set_description(f"Aggregating: {col}")
                agg = self.df.groupby('fromaddress')[col].agg(['count', 'mean', 'std', 'min', 'max', 'median'])
                aggregated.append(agg.add_prefix(f"{col}_"))
                pbar.update(1)
                time.sleep(0.05)

        # Categorical and temporal aggregation
        group_dict = {
            'hash': 'count',
            'status': lambda x: (x == 'confirmed').sum(),
            'failurereason': lambda x: (x != 'none').sum(),
            'toaddress': 'nunique',
            'type': lambda x: x.mode().iloc[0] if not x.mode().empty else 'unknown',
            'region': lambda x: x.mode().iloc[0] if not x.mode().empty else 'unknown',
            'drop_reason': lambda x: x.notna().sum(),
            'detecttime': lambda x: (x.max() - x.min()).total_seconds() if x.notna().any() else 0,
            'detect_date': 'nunique',
        }
        if not inference_mode and 'is_sybil' in self.df.columns:
            group_dict['is_sybil'] = 'any'

        print("\nAggregating transaction statistics...")
        tx_stats = self.df.groupby('fromaddress').agg(group_dict).rename(columns={
            'hash': 'total_transactions',
            'status': 'successful_transactions',
            'failurereason': 'failed_transactions',
            'toaddress': 'unique_recipients',
            'type': 'primary_tx_type',
            'region': 'primary_region',
            'drop_reason': 'dropped_transactions',
            'detecttime': 'activity_timespan_seconds',
            'detect_date': 'active_days',
            'is_sybil': 'is_sybil' if not inference_mode else None
        })

        aggregated.append(tx_stats)
        self.processed_df = pd.concat(aggregated, axis=1)

        # Create derived features
        print("Creating derived ratio features...")
        total_tx = self.processed_df['total_transactions'].replace(0, np.nan)
        self.processed_df['success_rate'] = self.processed_df['successful_transactions'] / total_tx
        self.processed_df['failure_rate'] = self.processed_df['failed_transactions'] / total_tx
        self.processed_df['unique_recipient_ratio'] = self.processed_df['unique_recipients'] / total_tx
        self.processed_df['drop_rate'] = self.processed_df['dropped_transactions'] / total_tx
        self.processed_df['tx_frequency'] = self.processed_df['total_transactions'] / np.maximum(self.processed_df['active_days'], 1)

        # Fill any NaNs created by division
        self.processed_df.fillna(0, inplace=True)

        print(f"\nFeature aggregation completed successfully.")
        print(f"Result: {len(self.processed_df):,} addresses with {self.processed_df.shape[1]} features")

    def handle_missing_values(self, threshold_pct=0):
        print("\n" + "=" * 60)
        print("PHASE 2: MISSING VALUE HANDLING")
        print("=" * 60)

        missing_pct = (self.processed_df.isnull().mean()) * 100
        cols_to_fill = missing_pct[missing_pct > threshold_pct].index.tolist()

        if not cols_to_fill:
            print("No missing values found - skipping this phase.")
            return

        print(f"Processing {len(cols_to_fill)} columns with missing values...")

        with tqdm(total=len(cols_to_fill), desc="Filling Missing Values",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
                  colour='yellow') as pbar:
            for col in cols_to_fill:
                pbar.set_description(f"Filling: {col[:30]}")
                if self.processed_df[col].dtype == 'object':
                    mode = self.processed_df[col].mode()
                    fill_val = mode.iloc[0] if not mode.empty else 'unknown'
                else:
                    fill_val = self.processed_df[col].median()
                self.processed_df[col] = self.processed_df[col].fillna(fill_val)
                pbar.update(1)

        remaining_nulls = self.processed_df.isnull().sum().sum()
        print(f"Missing value handling completed. Remaining nulls: {remaining_nulls}")

    def encode_categorical_features(self, inference_mode=False):
        print("\n" + "=" * 60)
        print("PHASE 3: CATEGORICAL ENCODING")
        print("=" * 60)

        categorical_columns = self.processed_df.select_dtypes(include=['object']).columns.tolist()
        categorical_columns = [col for col in categorical_columns if col != 'is_sybil']

        if not categorical_columns:
            print("No categorical features found - skipping this phase.")
            return

        print(f"Encoding {len(categorical_columns)} categorical features...")

        with tqdm(total=len(categorical_columns), desc="Encoding Categories",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
                  colour='cyan') as pbar:
            for col in categorical_columns:
                pbar.set_description(f"Encoding: {col}")
                if inference_mode:
                    if col in self.encoders:
                        le = self.encoders[col]
                        self.processed_df[col] = self.processed_df[col].map(
                            lambda x: le.transform([x])[0] if x in le.classes_ else -1
                        )
                    else:
                        self.processed_df[col] = -1
                else:
                    le = LabelEncoder()
                    self.processed_df[col] = le.fit_transform(self.processed_df[col].astype(str))
                    self.encoders[col] = le
                pbar.update(1)

        print(f"Categorical encoding completed for {len(categorical_columns)} features.")

    def remove_low_variance_features(self, threshold=0.01, inference_mode=False):
        print("\n" + "=" * 60)
        print("PHASE 4: LOW VARIANCE FEATURE REMOVAL")
        print("=" * 60)

        numeric_df = self.processed_df.select_dtypes(include=[np.number])
        if not inference_mode and 'is_sybil' in numeric_df.columns:
            numeric_df = numeric_df.drop(columns='is_sybil')

        print("Analyzing feature variance...")
        low_var_cols = numeric_df.var()[numeric_df.var() < threshold].index.tolist()

        if low_var_cols:
            self.processed_df.drop(columns=low_var_cols, inplace=True)
            print(f"Removed {len(low_var_cols)} low variance features:")
            for col in low_var_cols[:5]:  # Show first 5
                print(f"  • {col}")
            if len(low_var_cols) > 5:
                print(f"  • ... and {len(low_var_cols) - 5} more")
        else:
            print("No low variance features found.")

    def prepare_for_modeling(self, test_size=0.2, random_state=42):
        """Split ``self.processed_df`` into scaled train/test sets.

        The column order of the feature matrix is recorded in
        ``self.training_feature_order``. The data is split first and
        ``self.scaler`` is fitted on the training rows only, so test-set
        statistics do not leak into the scaling; the test rows are then
        transformed with that fitted scaler.

        Args:
            test_size: Fraction of rows held out for testing.
            random_state: Seed for the stratified split.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``. The X frames are
            scaled and keep the row index of ``self.processed_df``, matching
            the index of the corresponding y series.

        Raises:
            KeyError: If ``self.processed_df`` has no ``is_sybil`` column.
        """
        print("\n" + "=" * 60)
        print("PHASE 5: DATA PREPARATION FOR MODELING")
        print("=" * 60)

        X = self.processed_df.drop(columns='is_sybil')
        y = self.processed_df['is_sybil']
        self.training_feature_order = X.columns.tolist()

        print("Splitting dataset...")
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=random_state
        )

        print("Scaling features...")
        # Fit on the training rows only; the test rows are merely transformed.
        X_train = pd.DataFrame(
            self.scaler.fit_transform(X_train_raw),
            columns=X.columns,
            index=X_train_raw.index,
        )
        X_test = pd.DataFrame(
            self.scaler.transform(X_test_raw),
            columns=X.columns,
            index=X_test_raw.index,
        )

        print(f"Dataset prepared successfully:")
        print(f"  • Training set: {len(X_train):,} samples")
        print(f"  • Test set: {len(X_test):,} samples")
        print(f"  • Features: {len(X.columns)}")

        return X_train, X_test, y_train, y_test

    def run_model_with_top_17_features(self, X_train, X_test, y_train, y_test, threshold=0.7):
        """Select features, balance the classes, train XGBoost and print test metrics.

        Up to 17 features are chosen by mutual information (fewer when the
        input has fewer columns) and stored in ``self.selected_features``.
        SMOTE is applied to the training split only. The fitted classifier is
        stored in ``self.model`` and the per-row test predictions in
        ``self.predictions_df``.

        Args:
            X_train, X_test: Feature DataFrames sharing the same columns.
            y_train, y_test: Target series for the two splits.
            threshold: Probability at or above which a test row is predicted
                Sybil. Defaults to 0.7; note that ``_align_and_predict`` uses
                0.80 at inference time, so the metrics printed here describe a
                different operating point unless the two are set equal.
        """
        # Local import: functools is stdlib and only needed here.
        from functools import partial

        print("\n" + "=" * 60)
        print("PHASE 6: MODEL TRAINING & EVALUATION")
        print("=" * 60)

        # SelectKBest raises when k exceeds the number of columns.
        n_selected = min(17, X_train.shape[1])
        print(f"Selecting top {n_selected} features using mutual information...")
        # mutual_info_classif is randomised; seeding it keeps the selected
        # feature set stable between runs.
        selector = SelectKBest(
            score_func=partial(mutual_info_classif, random_state=42),
            k=n_selected,
        )
        X_train_sel = selector.fit_transform(X_train, y_train)
        X_test_sel = selector.transform(X_test)
        self.selected_features = X_train.columns[selector.get_support()].tolist()

        print("\nSelected Features:")
        for i, feature in enumerate(self.selected_features, 1):
            print(f"  {i:2d}. {feature}")

        print("\nApplying SMOTE for class balancing...")
        smote = SMOTE(random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train_sel, y_train)

        print(f"Training set after SMOTE: {len(X_train_res):,} samples")

        print("\nTraining XGBoost classifier...")
        self.model = XGBClassifier(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,
            verbosity=0  # Suppress XGBoost output
        )

        with tqdm(total=1, desc="Training Model",
                  bar_format="{l_bar}{bar}| {elapsed}",
                  colour='magenta') as pbar:
            self.model.fit(X_train_res, y_train_res)
            pbar.update(1)

        print("\nGenerating predictions...")
        y_probs = self.model.predict_proba(X_test_sel)[:, 1]
        y_pred = (y_probs >= threshold).astype(int)

        self.predictions_df = pd.DataFrame({
            'y_true': y_test.values,
            'y_pred': y_pred,
            'y_prob': y_probs
        }).reset_index(drop=True)

        print("\n" + "=" * 60)
        print(f"MODEL EVALUATION RESULTS (Threshold = {threshold})")
        print("=" * 60)
        print(classification_report(y_test, y_pred))
        print(f"ROC AUC Score: {roc_auc_score(y_test, y_probs):.4f}")

    def predict_sybil_addresses(self, new_df):
        print("\n" + "=" * 60)
        print("INFERENCE PIPELINE")
        print("=" * 60)

        self.df = new_df.copy()

        inference_steps = [
            ("Feature Aggregation", self.aggregate_features, True),
            ("Missing Value Handling", self.handle_missing_values, None),
            ("Categorical Encoding", self.encode_categorical_features, True),
            ("Low Variance Removal", self.remove_low_variance_features, True),
            ("Feature Alignment & Prediction", self._align_and_predict, None)
        ]

        with tqdm(total=len(inference_steps), desc="Inference Progress",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
                  colour='red') as pbar:
            for step_name, func, *args in inference_steps:
                pbar.set_description(f"Running: {step_name}")
                if args and args[0] is not None:
                    func(args[0])
                else:
                    func()
                pbar.update(1)
                time.sleep(0.2)

        return self.inference_result

    def _align_and_predict(self):
        if not self.selected_features:
            raise ValueError("Model not trained. No selected features found.")
        if not self.training_feature_order:
            raise ValueError("Training feature order missing.")

        current_features = set(self.processed_df.columns)
        required_features = set(self.selected_features)

        missing_features = required_features - current_features
        extra_features = current_features - required_features

        print("\nFeature Alignment Analysis:")
        print(f"  • Model features: {len(self.selected_features)}")
        print(f"  • Available features: {len(self.processed_df.columns)}")
        if missing_features:
            print(f"  • Missing features (filled with 0): {len(missing_features)}")
        if extra_features:
            print(f"  • Extra features (ignored): {len(extra_features)}")

        print("\nAligning features with training schema...")
        aligned_df = pd.DataFrame(index=self.processed_df.index)

        with tqdm(total=len(self.training_feature_order), desc="Feature Alignment",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}",
                  colour='orange') as pbar:
            for feat in self.training_feature_order:
                if feat in self.processed_df.columns:
                    aligned_df[feat] = self.processed_df[feat]
                else:
                    aligned_df[feat] = 0
                pbar.update(1)

        prediction_steps = ["Scaling features", "Selecting model features", "Generating predictions"]

        with tqdm(total=len(prediction_steps), desc="Prediction",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}",
                  colour='purple') as pred_pbar:

            pred_pbar.set_description("Scaling features")
            X_scaled = pd.DataFrame(
                self.scaler.transform(aligned_df),
                columns=self.training_feature_order,
                index=aligned_df.index
            )
            pred_pbar.update(1)

            pred_pbar.set_description("Selecting features")
            X_model = X_scaled[self.selected_features]
            pred_pbar.update(1)

            pred_pbar.set_description("Making predictions")
            probs = self.model.predict_proba(X_model)[:, 1]

            # Set a custom threshold to control sensitivity
            custom_threshold = 0.80
            preds = (probs >= custom_threshold).astype(int)
            pred_pbar.update(1)

        self.inference_result = pd.DataFrame({
            'fromaddress': self.processed_df.index,
            'predicted_is_sybil': preds,
            'sybil_probability': probs
        })

        sybil_count = self.inference_result['predicted_is_sybil'].sum()
        total_count = len(self.inference_result)
        sybil_pct = (sybil_count / total_count) * 100

        print(f"\nPrediction Results:")
        print(f"  • Total addresses: {total_count:,}")
        print(f"  • Predicted Sybil: {sybil_count:,} ({sybil_pct:.1f}%)")
        print(f"  • Predicted Legitimate: {total_count - sybil_count:,} ({100 - sybil_pct:.1f}%)")

    def save_model_artifacts(self, directory="model_artifacts"):
        print(f"\nSaving model artifacts to '{directory}'...")
        os.makedirs(directory, exist_ok=True)

        artifacts = [
            ("Model", self.model, "sybil_model.pkl"),
            ("Selected Features", self.selected_features, "selected_features.pkl"),
            ("Scaler", self.scaler, "scaler.pkl"),
            ("Encoders", self.encoders, "encoders.pkl"),
            ("Feature Order", self.training_feature_order, "training_feature_order.pkl")
        ]

        with tqdm(total=len(artifacts), desc="Saving Artifacts",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}",
                  colour='green') as pbar:
            for name, obj, filename in artifacts:
                pbar.set_description(f"Saving: {name}")
                joblib.dump(obj, f"{directory}/{filename}")
                pbar.update(1)

        print("Model artifacts saved successfully.")

    def load_model_artifacts(self, directory="model_artifacts"):
        print(f"\nLoading model artifacts from '{directory}'...")

        artifacts = [
            ("Model", "sybil_model.pkl"),
            ("Selected Features", "selected_features.pkl"),
            ("Scaler", "scaler.pkl"),
            ("Encoders", "encoders.pkl"),
            ("Feature Order", "training_feature_order.pkl")
        ]

        with tqdm(total=len(artifacts), desc="Loading Artifacts",
                  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}",
                  colour='blue') as pbar:
            for name, filename in artifacts:
                pbar.set_description(f"Loading: {name}")
                if name == "Model":
                    self.model = joblib.load(f"{directory}/{filename}")
                elif name == "Selected Features":
                    self.selected_features = joblib.load(f"{directory}/{filename}")
                elif name == "Scaler":
                    self.scaler = joblib.load(f"{directory}/{filename}")
                elif name == "Encoders":
                    self.encoders = joblib.load(f"{directory}/{filename}")
                elif name == "Feature Order":
                    self.training_feature_order = joblib.load(f"{directory}/{filename}")
                pbar.update(1)

        print("Model artifacts loaded successfully.")

    def run_full_pipeline(self):
        print("\n" + "=" * 60)
        print("EXECUTING FULL SYBIL DETECTION PIPELINE")
        print("=" * 60)

        pipeline_start = time.time()

        self.aggregate_features()
        self.handle_missing_values()
        self.encode_categorical_features()
        self.remove_low_variance_features()
        X_train, X_test, y_train, y_test = self.prepare_for_modeling()
        self.run_model_with_top_17_features(X_train, X_test, y_train, y_test)

        pipeline_duration = time.time() - pipeline_start

        print("\n" + "=" * 60)
        print("PIPELINE EXECUTION COMPLETED")
        print("=" * 60)
        print(f"Total execution time: {pipeline_duration:.2f} seconds")
        print("Pipeline ready for inference or model saving.")

In [None]:
# Labelled transactions; the unlabelled copy is the same frame minus the target.
cleaned_df = pd.read_csv('cleaned_df.csv', low_memory=False)
unlabeled_merged = cleaned_df.drop('is_sybil', axis=1)

In [None]:
# Column dtypes and non-null counts, followed by a preview of the first rows.
cleaned_df.info()
cleaned_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266619 entries, 0 to 2266618
Data columns (total 28 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Unnamed: 0                   int64  
 1   detecttime                   object 
 2   hash                         object 
 3   status                       object 
 4   region                       object 
 5   curblocknumber               float64
 6   failurereason                object 
 7   blockspending                float64
 8   time_pending_by_blocknative  float64
 9   nonce                        float64
 10  gas                          float64
 11  gasprice                     float64
 12  value                        object 
 13  toaddress                    object 
 14  fromaddress                  object 
 15  type                         float64
 16  maxpriorityfeepergas         float64
 17  maxfeepergas                 float64
 18  basefeepergas                float64
 19  

Unnamed: 0.1,Unnamed: 0,detecttime,hash,status,region,curblocknumber,failurereason,blockspending,time_pending_by_blocknative,nonce,gas,gasprice,value,toaddress,fromaddress,type,maxpriorityfeepergas,maxfeepergas,basefeepergas,stuck,gasused,detect_date,was_evicted,drop_reason,was_rejected,rejection_reason,time_pending,is_sybil
0,0,2024-01-15 00:00:25.570000+00:00,0x0faeff4cb2ed1d589cd3c45cfdf64a9e0f037834b738...,failed,us-east-1,19008566.0,"Reverted: \0x849eaf98\""""",2.0,11248.0,890.0,215884.0,,163000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0xe5b427d12ce20ddb478f5c15bde4ed74414ed0ba,2.0,100000000.0,26247100000.0,21314960000.0,False,177381.0,2024-01-15,,,,,,False
1,1,2024-01-15 00:00:25.570000+00:00,0xf52d98d43063a4a11205ee4d3d033da746806322478d...,failed,us-east-1,19008566.0,"Reverted: \0x849eaf98\""""",1.0,4895.0,21.0,242490.0,,380000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0x2a40d415f217a5ff50bb92885aa55c8898b6981f,2.0,100000000.0,24829840000.0,21314960000.0,False,199552.0,2024-01-15,,,,,,False
2,2,2024-01-15 00:00:25.955000+00:00,0xf52d98d43063a4a11205ee4d3d033da746806322478d...,failed,eu-central-1,19008566.0,"Reverted: \0x849eaf98\""""",,,21.0,242490.0,,380000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0x2a40d415f217a5ff50bb92885aa55c8898b6981f,2.0,100000000.0,24829840000.0,21314960000.0,False,199552.0,2024-01-15,,,,,,False
3,3,2024-01-15 00:00:25.955000+00:00,0x0faeff4cb2ed1d589cd3c45cfdf64a9e0f037834b738...,failed,eu-central-1,19008566.0,"Reverted: \0x849eaf98\""""",,,890.0,215884.0,,163000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0xe5b427d12ce20ddb478f5c15bde4ed74414ed0ba,2.0,100000000.0,26247100000.0,21314960000.0,False,177381.0,2024-01-15,,,,,,False
4,4,2024-01-15 00:00:26.081000+00:00,0x0faeff4cb2ed1d589cd3c45cfdf64a9e0f037834b738...,failed,ap-southeast-1,19008566.0,"Reverted: \0x849eaf98\""""",,,890.0,215884.0,,163000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0xe5b427d12ce20ddb478f5c15bde4ed74414ed0ba,2.0,100000000.0,26247100000.0,21314960000.0,False,177381.0,2024-01-15,,,,,,False


In [None]:
# Train on the labelled frame, then persist the fitted artifacts to the
# default 'model_artifacts' directory.
pipeline = InteractiveSybilDetectionPipeline(cleaned_df)
pipeline.run_full_pipeline()
pipeline.save_model_artifacts()

SYBIL DETECTION PIPELINE INITIALIZED

ORIGINAL DATASET:
----------------------------------------
Shape: 2,266,620 rows × 27 columns
Target Distribution:
  • Non-Sybil: 2,241,012 (98.9%)
  • Sybil:     25,608 (1.1%)
Missing Values: 5,560,202

EXECUTING FULL SYBIL DETECTION PIPELINE

PHASE 1: FEATURE AGGREGATION

Preprocessing Data...


Preprocessing:   0%|          | 0/3 [00:00<?]


Aggregating 13 numeric features...


Numeric Aggregation:   0%|          | 0/13 [00:00<?]


Aggregating transaction statistics...
Creating derived ratio features...

Feature aggregation completed successfully.
Result: 126,718 addresses with 93 features

PHASE 2: MISSING VALUE HANDLING
No missing values found - skipping this phase.

PHASE 3: CATEGORICAL ENCODING
Encoding 1 categorical features...


Encoding Categories:   0%|          | 0/1 [00:00<?]

Categorical encoding completed for 1 features.

PHASE 4: LOW VARIANCE FEATURE REMOVAL
Analyzing feature variance...
Removed 2 low variance features:
  • success_rate
  • failure_rate

PHASE 5: DATA PREPARATION FOR MODELING
Scaling features...
Splitting dataset...
Dataset prepared successfully:
  • Training set: 101,374 samples
  • Test set: 25,344 samples
  • Features: 90

PHASE 6: MODEL TRAINING & EVALUATION
Selecting top 17 features using mutual information...

Selected Features:
   1. nonce_mean
   2. gas_mean
   3. gas_min
   4. gas_max
   5. gas_median
   6. value_mean
   7. value_min
   8. value_max
   9. value_median
  10. gasused_mean
  11. gasused_min
  12. gasused_max
  13. gasused_median
  14. maxfeepergas_count
  15. maxfeepergas_mean
  16. maxfeepergas_max
  17. basefeepergas_max

Applying SMOTE for class balancing...
Training set after SMOTE: 193,482 samples

Training XGBoost classifier...


Training Model:   0%|          | 00:00


Generating predictions...

MODEL EVALUATION RESULTS (Threshold = 0.7)
              precision    recall  f1-score   support

       False       0.99      0.95      0.97     24186
        True       0.44      0.76      0.56      1158

    accuracy                           0.94     25344
   macro avg       0.71      0.86      0.76     25344
weighted avg       0.96      0.94      0.95     25344

ROC AUC Score: 0.9565

PIPELINE EXECUTION COMPLETED
Total execution time: 122.44 seconds
Pipeline ready for inference or model saving.

Saving model artifacts to 'model_artifacts'...


Saving Artifacts:   0%|          | 0/5

Model artifacts saved successfully.


In [None]:
# Preview of the inference input: same columns as cleaned_df without 'is_sybil'.
unlabeled_merged.head()

Unnamed: 0,detecttime,hash,status,region,curblocknumber,failurereason,blockspending,time_pending_by_blocknative,nonce,gas,gasprice,value,toaddress,fromaddress,type,maxpriorityfeepergas,maxfeepergas,basefeepergas,stuck,gasused,detect_date,was_evicted,drop_reason,was_rejected,rejection_reason,time_pending
0,2024-01-15 00:00:25.570000+00:00,0x0faeff4cb2ed1d589cd3c45cfdf64a9e0f037834b738...,failed,us-east-1,19008566.0,"Reverted: \0x849eaf98\""""",2.0,11248.0,890.0,215884.0,,163000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0xe5b427d12ce20ddb478f5c15bde4ed74414ed0ba,2.0,100000000.0,26247100000.0,21314960000.0,False,177381.0,2024-01-15,,,,,
1,2024-01-15 00:00:25.570000+00:00,0xf52d98d43063a4a11205ee4d3d033da746806322478d...,failed,us-east-1,19008566.0,"Reverted: \0x849eaf98\""""",1.0,4895.0,21.0,242490.0,,380000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0x2a40d415f217a5ff50bb92885aa55c8898b6981f,2.0,100000000.0,24829840000.0,21314960000.0,False,199552.0,2024-01-15,,,,,
2,2024-01-15 00:00:25.955000+00:00,0xf52d98d43063a4a11205ee4d3d033da746806322478d...,failed,eu-central-1,19008566.0,"Reverted: \0x849eaf98\""""",,,21.0,242490.0,,380000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0x2a40d415f217a5ff50bb92885aa55c8898b6981f,2.0,100000000.0,24829840000.0,21314960000.0,False,199552.0,2024-01-15,,,,,
3,2024-01-15 00:00:25.955000+00:00,0x0faeff4cb2ed1d589cd3c45cfdf64a9e0f037834b738...,failed,eu-central-1,19008566.0,"Reverted: \0x849eaf98\""""",,,890.0,215884.0,,163000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0xe5b427d12ce20ddb478f5c15bde4ed74414ed0ba,2.0,100000000.0,26247100000.0,21314960000.0,False,177381.0,2024-01-15,,,,,
4,2024-01-15 00:00:26.081000+00:00,0x0faeff4cb2ed1d589cd3c45cfdf64a9e0f037834b738...,failed,ap-southeast-1,19008566.0,"Reverted: \0x849eaf98\""""",,,890.0,215884.0,,163000000000000000,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,0xe5b427d12ce20ddb478f5c15bde4ed74414ed0ba,2.0,100000000.0,26247100000.0,21314960000.0,False,177381.0,2024-01-15,,,,,


In [None]:
# A fresh pipeline instance (rebinding `pipeline`) restores the saved artifacts
# from disk and scores the unlabelled frame.
pipeline = InteractiveSybilDetectionPipeline(unlabeled_merged)

pipeline.load_model_artifacts()
# predict_sybil_addresses() replaces the pipeline's data with a copy of its argument.
predictions = pipeline.predict_sybil_addresses(unlabeled_merged)

SYBIL DETECTION PIPELINE INITIALIZED

ORIGINAL DATASET:
----------------------------------------
Shape: 2,266,620 rows × 26 columns
Target Distribution: 'is_sybil' column not found.
Missing Values: 5,560,202

Loading model artifacts from 'model_artifacts'...


Loading Artifacts:   0%|          | 0/5

Model artifacts loaded successfully.

INFERENCE PIPELINE


Inference Progress:   0%|          | 0/5 [00:00<?]


PHASE 1: FEATURE AGGREGATION

Preprocessing Data...


Preprocessing:   0%|          | 0/3 [00:00<?]


Aggregating 13 numeric features...


Numeric Aggregation:   0%|          | 0/13 [00:00<?]


Aggregating transaction statistics...
Creating derived ratio features...

Feature aggregation completed successfully.
Result: 126,718 addresses with 92 features

PHASE 2: MISSING VALUE HANDLING
No missing values found - skipping this phase.

PHASE 3: CATEGORICAL ENCODING
Encoding 1 categorical features...


Encoding Categories:   0%|          | 0/1 [00:00<?]

Categorical encoding completed for 1 features.

PHASE 4: LOW VARIANCE FEATURE REMOVAL
Analyzing feature variance...
Removed 16 low variance features:
  • was_evicted_mean
  • was_evicted_std
  • was_evicted_max
  • was_evicted_median
  • was_rejected_mean
  • ... and 11 more

Feature Alignment Analysis:
  • Model features: 17
  • Available features: 76
  • Extra features (ignored): 59

Aligning features with training schema...


Feature Alignment:   0%|          | 0/90

Prediction:   0%|          | 0/3


Prediction Results:
  • Total addresses: 126,718
  • Predicted Sybil: 4,351 (3.4%)
  • Predicted Legitimate: 122,367 (96.6%)


In [None]:
# NOTE(review): head(-1) returns every row except the last one, so this renders
# the (display-truncated) whole frame rather than a short preview — confirm
# that is intended; predictions.head() would show only the first five rows.
predictions.head(-1)

Unnamed: 0,fromaddress,predicted_is_sybil,sybil_probability
0,0x0000000000000f25a072efa232d8efc0b5ce2436,0,0.045589
1,0x000000000002e33d9a86567c6dfe6d92f6777d1e,0,0.010782
2,0x0000000000055772357e58581a2a2c5c6d9e8f64,0,0.005391
3,0x0000000000211b7cdd33049a9f1985013babb784,0,0.032927
4,0x0000000000234a48603574189845c2d27028dad3,0,0.046289
...,...,...,...
126712,0xfffe352a7d2c2a2bfde15199b9ba0a891d76dc57,0,0.194112
126713,0xfffe567b190edabd46e95db356e4e8a7331ea7d3,0,0.006410
126714,0xffff8298631efa764238485543fcff82b878ce1e,0,0.025891
126715,0xffffab07392dbd555c8d46429fe14018ec71a5a3,0,0.087937


In [None]:
# Share of addresses flagged as Sybil. The mean of the boolean mask is the
# fraction of rows with predicted_is_sybil == 1.
percentage_sybil = (predictions['predicted_is_sybil'] == 1).mean() * 100
print(f"🔎 {percentage_sybil:.2f}% of addresses were predicted as Sybil (1).")


🔎 4.98% of addresses were predicted as Sybil (1).
