# Random Forest baseline

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Step 1: Load data
features_df = pd.read_csv('elliptic_txs_features.csv', header=None)
classes_df  = pd.read_csv('elliptic_txs_classes.csv')

# Step 2: Process columns
features_df.columns = ['txId', 'time_step'] + [f'feature_{i}' for i in range(features_df.shape[1] - 2)]
classes_df.columns = ['txId', 'class']

# Step 3: Clean txId
features_df['txId'] = features_df['txId'].astype(str).str.strip()
classes_df['txId'] = classes_df['txId'].astype(str).str.strip()

# Step 4: Process labels
classes_df['class'] = classes_df['class'].replace('unknown', np.nan)
classes_df['class'] = classes_df['class'].astype(float)

# Step 5: Merge
data_df = pd.merge(features_df, classes_df, on='txId')
data_df['time_step'] = data_df['time_step'].astype(int)
labeled_df = data_df[data_df['class'].isin([1, 2])]

# Step 6: Split train/test
train_df = labeled_df[labeled_df['time_step'] <= 34]
test_df  = labeled_df[labeled_df['time_step'] >= 35]

X_train = train_df.drop(columns=['txId', 'time_step', 'class'])
y_train = (train_df['class'] == 1).astype(int)

X_test = test_df.drop(columns=['txId', 'time_step', 'class'])
y_test = (test_df['class'] == 1).astype(int)

# Step 7: Run RF 5 times with different random seeds
f1_scores = []

seeds = [0, 1, 2, 3, 4]

for seed in seeds:
    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=seed,
        n_jobs=-1
    )
    rf_clf.fit(X_train, y_train)
    y_pred = rf_clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    f1_scores.append(f1)
    print(f"Run with random_state={seed} - Illicit F1-score: {f1:.4f}")

# Step 8: Median F1-score
median_f1 = np.median(f1_scores)
print("\nFinal RF Baseline - Median Illicit F1-score over 5 runs: {:.4f}".format(median_f1))


Run with random_state=0 - Illicit F1-score: 0.8253
Run with random_state=1 - Illicit F1-score: 0.8215
Run with random_state=2 - Illicit F1-score: 0.8255
Run with random_state=3 - Illicit F1-score: 0.8235
Run with random_state=4 - Illicit F1-score: 0.8212

Final RF Baseline - Median Illicit F1-score over 5 runs: 0.8235


# XGBoost Baseline

In [8]:
# --- XGBoost Baseline: 5-run Median Version ---

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score

# Step 1: Load labeled data
labeled_df = pd.read_csv('elliptic_txs_features_labeled.csv')

# Debug
print("Loaded labeled CSV rows:", labeled_df.shape[0])
print("Unique time_steps:", sorted(labeled_df['time_step'].unique()))
print("Label distribution:\n", labeled_df['class'].value_counts())

# Step 2: Split train/test
train_df = labeled_df[labeled_df['time_step'] <= 34]
test_df  = labeled_df[labeled_df['time_step'] >= 35]

print("Train samples:", len(train_df))
print("Test samples:", len(test_df))
print("Train label dist:\n", train_df['class'].value_counts())
print("Test label dist:\n", test_df['class'].value_counts())

# Step 3: Prepare features and labels
X_train = train_df.drop(columns=['txId', 'time_step', 'class'])
y_train = (train_df['class'] == 1).astype(int)

X_test = test_df.drop(columns=['txId', 'time_step', 'class'])
y_test = (test_df['class'] == 1).astype(int)

# Step 4: Run XGBoost 5 times with different random states
f1_scores = []
seeds = [0, 1, 2, 3, 4]

for seed in seeds:
    xgb_clf = xgb.XGBClassifier(
        tree_method='hist',
        # use_label_encoder=False,
        eval_metric='logloss',
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=seed
    )
    
    xgb_clf.fit(X_train, y_train)
    y_pred = xgb_clf.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, pos_label=1)
    f1_scores.append(f1)
    print(f"Run with random_state={seed} - Illicit F1-score: {f1:.4f}")

# Step 5: Median F1-score
median_f1 = np.median(f1_scores)
print("\nFinal XGBoost Baseline - Median Illicit F1-score over 5 runs: {:.4f}".format(median_f1))


Loaded labeled CSV rows: 46564
Unique time_steps: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
Label distribution:
 class
2.0    42019
1.0     4545
Name: count, dtype: int64
Train samples: 29894
Test samples: 16670
Train label dist:
 class
2.0    26432
1.0     3462
Name: count, dtype: int64
Test label dist:
 class
2.0    15587
1.0     1083
Name: count, dtype: int64
Run with random_state=0 - Illicit F1-score: 0.8014
Run with random_state=1 - Illicit F1-score: 0.8014
Run with random_state=2 - Illicit F1-score: 0.8014
Run with random_state=3 - Illicit F1-score: 0.8014
Run with random_state=4 - Illicit F1-score: 0.8014

Final XGBoost Baseline - Median Illicit F1-score over 5 runs: 0.8014


# Logistic Regression Baseline

In [11]:
# --- Logistic Regression Baseline: 5-run Median Version ---

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Step 1: Load data
features_df = pd.read_csv('elliptic_txs_features.csv', header=None)
classes_df  = pd.read_csv('elliptic_txs_classes.csv')

# Step 2: Assign column names
features_df.columns = ['txId', 'time_step'] + [f'feature_{i}' for i in range(features_df.shape[1] - 2)]
classes_df.columns = ['txId', 'class']

# Step 3: Clean txId
features_df['txId'] = features_df['txId'].astype(str).str.strip()
classes_df['txId'] = classes_df['txId'].astype(str).str.strip()

# Step 4: Clean class column
classes_df['class'] = classes_df['class'].replace('unknown', np.nan)
classes_df['class'] = classes_df['class'].astype(float)

# Step 5: Merge and clean
data_df = pd.merge(features_df, classes_df, on='txId')
data_df['time_step'] = data_df['time_step'].astype(int)

# Step 6: Keep labeled samples
labeled_df = data_df[data_df['class'].isin([1, 2])]

# Step 7: Split into train/test
train_df = labeled_df[labeled_df['time_step'] <= 34]
test_df  = labeled_df[labeled_df['time_step'] >= 35]

X_train = train_df.drop(columns=['txId', 'time_step', 'class'])
y_train = (train_df['class'] == 1).astype(int)

X_test = test_df.drop(columns=['txId', 'time_step', 'class'])
y_test = (test_df['class'] == 1).astype(int)

# Step 8: Train and evaluate with 5 different random seeds
f1_scores = []
seeds = [0, 1, 2, 3, 4]

for seed in seeds:
    lr_clf = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=seed)
    lr_clf.fit(X_train, y_train)
    y_pred = lr_clf.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, pos_label=1)
    f1_scores.append(f1)
    print(f"Run with random_state={seed} - Illicit F1-score: {f1:.4f}")

# Step 9: Median F1-score
median_f1 = np.median(f1_scores)
print("\nFinal Logistic Regression Baseline - Median Illicit F1-score over 5 runs: {:.4f}".format(median_f1))


Run with random_state=0 - Illicit F1-score: 0.4437
Run with random_state=1 - Illicit F1-score: 0.4437
Run with random_state=2 - Illicit F1-score: 0.4437
Run with random_state=3 - Illicit F1-score: 0.4437
Run with random_state=4 - Illicit F1-score: 0.4437

Final Logistic Regression Baseline - Median Illicit F1-score over 5 runs: 0.4437
