In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore') 

In [None]:
# =========================== #
# 1. LOAD DATA
# =========================== #
# Replace the file paths below if necessary
features = pd.read_csv('elliptic_txs_features.csv', header=None)
# Explicit column naming for safety
features.columns = ['txId', 'time_step'] + [f'f{i}' for i in range(1, 166)]
# Confirm time_step range
print("time_step min:", features['time_step'].min(), "| max:", features['time_step'].max())
print("Examples of time_step values:", np.unique(features['time_step'])[:10])

classes = pd.read_csv('elliptic_txs_classes.csv', header=0)
classes.columns = ['txId', 'label']

# --- 2. CLEAN & MERGE ---
features['txId'] = features['txId'].astype(str)
classes['txId'] = classes['txId'].astype(str)
df = features.merge(classes, on='txId', how='inner')
df['label'] = df['label'].astype(str)
df['Y'] = np.where(df['label'] == '1', 1, 
                   np.where(df['label'] == '2', 0, np.nan))
df_labelled = df[df['Y'].notna()].copy()

# Make sure time_step is numeric
df['time_step'] = pd.to_numeric(df['time_step'], errors='coerce')

# ============= #
# 3. SPLIT DATA #
# ============= #
# Train: time_step <=34; Test: time_step >34
train_mask = df['time_step'] <= 34
test_mask = df['time_step'] > 34
df_train = df_labelled[train_mask].copy()
df_test = df_labelled[test_mask].copy()

feature_columns = [f'f{i}' for i in range(1, 166)]
X_train = df_train[feature_columns].values
y_train = df_train['Y'].values
X_test = df_test[feature_columns].values
y_test = df_test['Y'].values

print(f"Train samples: {len(X_train)} | Test samples: {len(X_test)}")
if len(X_train) == 0 or len(X_test) == 0:
    raise ValueError("No data in train or test split. Check your filtering logic.")

# ============================ #
# 4. FULL SUPERVISED XGBOOST   #
# ============================ #
print("\n===== FULL SUPERVISED XGBOOST =====")
f1_scores = []
for seed in range(5):
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    f1_scores.append(f1)
    print(f'Run {seed+1}: Illicit F1 = {f1:.4f}')
print(f'Mean illicit F1 over 5 runs: {np.mean(f1_scores):.4f}')

# ======================================== #
# 5. SIMULATED ACTIVE LEARNING LOOP (OPTIONAL)
# ======================================== #
BATCH_SIZE = 50
ACTIVE_LABEL_GOAL = 1500
N_RUNS = 5
all_active_results = []

print("\n===== SIMULATED ACTIVE LEARNING (Uncertainty Sampling) =====")
for repeat in range(N_RUNS):
    np.random.seed(repeat)
    labelled_pool = pd.DataFrame()
    unlabelled_pool = df_train.copy()
    found_illicit = False

    # Warm-up: ensure at least one illicit label to start
    while not found_illicit and len(unlabelled_pool) >= BATCH_SIZE:
        batch = unlabelled_pool.sample(BATCH_SIZE, random_state=np.random.randint(10000))
        labelled_pool = pd.concat([labelled_pool, batch], ignore_index=True)
        unlabelled_pool = unlabelled_pool.drop(batch.index)
        found_illicit = any(batch['Y'] == 1)
    if not found_illicit:
        print(f"No illicit found in warm-up for run {repeat}!")
        continue

    run_scores = []
    for i in range(0, ACTIVE_LABEL_GOAL, BATCH_SIZE):
        X_lab = labelled_pool[feature_columns].values
        y_lab = labelled_pool['Y'].values
        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=repeat)
        model.fit(X_lab, y_lab)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        run_scores.append(f1)

        if len(labelled_pool) >= ACTIVE_LABEL_GOAL or len(unlabelled_pool) == 0:
            break

        # Uncertainty sampling: batch with predicted probability closest to 0.5
        X_unlab = unlabelled_pool[feature_columns].values
        if X_unlab.shape[0] == 0:
            break
        proba = model.predict_proba(X_unlab)[:, 1]
        select_count = min(BATCH_SIZE, X_unlab.shape[0])
        uncertain_idx = np.argsort(np.abs(proba - 0.5))[:select_count]
        new_batch = unlabelled_pool.iloc[uncertain_idx]
        labelled_pool = pd.concat([labelled_pool, new_batch], ignore_index=True)
        unlabelled_pool = unlabelled_pool.drop(new_batch.index)
    all_active_results.append(run_scores)

# Compute per-batch median and 95% CI
all_active_results = np.array([scores + [np.nan]*(30-len(scores)) for scores in all_active_results if len(scores) > 0])
medians = np.nanmedian(all_active_results, axis=0)
cis = 1.57 * np.nanstd(all_active_results, axis=0) / np.sqrt(N_RUNS)  # Approx 95% CI for 5 runs

for idx, (median, ci) in enumerate(zip(medians, cis)):
    print(f'Batch {((idx+1)*BATCH_SIZE):>4}: Median F1 = {median:.4f} ± {ci:.4f}')


time_step min: 1 | max: 49
Examples of time_step values: [ 1  2  3  4  5  6  7  8  9 10]
Train samples: 29894 | Test samples: 16670

===== FULL SUPERVISED XGBOOST =====
Run 1: Illicit F1 = 0.8041
Run 2: Illicit F1 = 0.8041
Run 3: Illicit F1 = 0.8041
Run 4: Illicit F1 = 0.8041
Run 5: Illicit F1 = 0.8041
Mean illicit F1 over 5 runs: 0.8041

===== SIMULATED ACTIVE LEARNING (Uncertainty Sampling) =====


In [3]:
# =========== #
# 6. ON UNLABELLED (unknown) SAMPLES: INFERENCE #
# =========== #

# First, get all samples with 'unknown' label
df_unknown = df[(df['label'] == 'unknown')].copy()
if df_unknown.shape[0] == 0:
    print("No unknown samples found in data.")
else:
    X_unknown = df_unknown[feature_columns].values

    print("\n===== FINAL MODEL INFERENCE ON UNKNOWN SAMPLES =====")
    # Use the current model (the final model) to make predictions on unknown samples
    y_pred_unknown = model.predict(X_unknown)
    y_proba_unknown = model.predict_proba(X_unknown)[:, 1]  # Probability of being illicit

    # Add the prediction results to the DataFrame
    df_unknown['predicted_label'] = y_pred_unknown
    df_unknown['predicted_illicit_proba'] = y_proba_unknown

    # You can print the first few rows to inspect the output
    print(df_unknown[['txId', 'predicted_label', 'predicted_illicit_proba']].head(20))

    # Optional: save the results
    df_unknown[['txId', 'predicted_label', 'predicted_illicit_proba']].to_csv('unknown_predictions.csv', index=False)


===== FINAL MODEL INFERENCE ON UNKNOWN SAMPLES =====
         txId  predicted_label  predicted_illicit_proba
0   230425980                0                 0.005387
1     5530458                0                 0.001445
2   232022460                0                 0.016954
4   230460314                0                 0.000033
5   230459870                0                 0.000542
6   230333930                0                 0.000103
7   230595899                0                 0.001869
8   232013274                0                 0.001425
12   36411953                0                 0.000105
13  230405052                0                 0.000905
14   34194980                0                 0.000021
15    5529846                0                 0.000057
18  230409257                0                 0.005339
19   32877982                0                 0.006651
20  230351738                0                 0.000493
21  195218118                0                 0.0