In [None]:
# --- Active Learning: IF + EMC + XGBoost ---
# Paper reproduction version: N=5 runs, median F1 + 95% CI

# Step 1: Import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import xgboost as xgb
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Step 2: Load labeled data
# The path is relative to the notebook's working directory.
labeled_df = pd.read_csv('elliptic_txs_features_labeled.csv')

# Check data
print("Loaded labeled CSV rows:", labeled_df.shape[0])
print("Unique time_steps:", sorted(labeled_df['time_step'].unique()))
print("Label distribution:\n", labeled_df['class'].value_counts())

# Step 3: Split train/test by time_step
# Temporal split: time steps <= 34 form the training pool, >= 35 the held-out test set.
train_df = labeled_df[labeled_df['time_step'] <= 34].reset_index(drop=True)
test_df  = labeled_df[labeled_df['time_step'] >= 35].reset_index(drop=True)

# Feature columns
# Every column whose name starts with 'f' is treated as a model input (this
# already includes any 'feature_*' column, so a separate prefix test is not needed).
feature_columns = [col for col in train_df.columns if col.startswith('f')]
if not feature_columns:
    raise ValueError("No feature columns found: expected column names starting with 'f'.")

# Prepare test set
# Select exactly the columns the models are trained on. Dropping
# ['txId', 'time_step', 'class'] instead only matches the training matrix when
# every remaining column happens to start with 'f'.
X_test = test_df[feature_columns]
# Class 1 is the positive class (reported as "illicit" by the F1 printouts below).
y_test = (test_df['class'] == 1).astype(int)

# Config
batch_size = 50        # samples labeled per query round (and per warm-up step)
max_iterations = 30    # maximum number of train/evaluate/query cycles per run
N_runs = 5             # independent repetitions, seeded 1..N_runs

# Storage for all runs
# One list of per-cycle F1-scores is appended per run.
all_f1_scores = []

# Step 4: Repeat N=5 runs
# Each run seeds Isolation Forest and XGBoost with run+1, performs an
# anomaly-score-driven warm-up, then runs the query loop and records the
# test-set F1-score of every retrained model.
for run in range(N_runs):
    seed = run + 1
    print("\n==========================")
    print(f"=== Run {run+1}/{N_runs}, Random State {seed}")
    print("==========================")

    f1_scores = []

    # Warm-up using Isolation Forest
    print("\n===== Warm-up stage: Isolation Forest =====")
    iso_forest = IsolationForest(random_state=seed)
    iso_forest.fit(train_df[feature_columns])
    # scikit-learn's score_samples is lower for more abnormal points; negating it
    # and sorting in descending order puts the most anomalous rows first.
    anomaly_scores = -iso_forest.score_samples(train_df[feature_columns])
    # The score is only a sort key, so it is dropped again right away and never
    # leaks into the labeled/unlabeled pools.
    ranked_pool = (
        train_df.assign(anomaly_score=anomaly_scores)
        .sort_values(by='anomaly_score', ascending=False)
        .drop(columns='anomaly_score')
        .reset_index(drop=True)
    )

    # Take batches from the top of the ranking until at least one class-1
    # (illicit) sample has been labeled. Only the cut-off position is searched
    # here; the pools are sliced once afterwards. This avoids concatenating onto
    # an empty, object-dtype DataFrame, which would put the numeric dtypes of
    # the feature columns at risk.
    illicit_mask = (ranked_pool['class'] == 1).to_numpy()
    pointer = 0
    warmup_illicit = 0
    while warmup_illicit == 0 and pointer < len(ranked_pool):
        pointer += batch_size
        warmup_illicit = int(illicit_mask[:pointer].sum())

    # Split the ranked pool at the warm-up cut-off.
    labeled_pool = ranked_pool.iloc[:pointer].reset_index(drop=True)
    unlabeled_pool = ranked_pool.iloc[pointer:].reset_index(drop=True)

    print(f"Warm-up selected {len(labeled_pool)} samples.")
    print(f"Warm-up found {warmup_illicit} illicit samples.")

    # Active Learning loop
    iteration = 1
    print("\n===== Active Learning loop: Expected Model Change =====")

    while len(unlabeled_pool) > 0 and iteration <= max_iterations:
        X_train = labeled_pool[feature_columns]
        y_train = (labeled_pool['class'] == 1).astype(int)

        # A fresh model is trained from scratch on the current labeled pool.
        xgb_clf = xgb.XGBClassifier(
            tree_method='hist',
            eval_metric='logloss',
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=seed
        )
        xgb_clf.fit(X_train, y_train)

        # Evaluate on the fixed test set before querying new labels.
        y_pred = xgb_clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        f1_scores.append(f1)
        print(f"[Run {run+1}] XGBoost_Model_{iteration:02d} illicit F1-score: {f1:.4f}")

        # EMC sampling
        # NOTE(review): p*(1-p) equals 0.25 - (p-0.5)**2, a monotone transform of
        # |p-0.5|, so this ranks candidates the same way as plain uncertainty
        # sampling (up to tie-breaking). Confirm this proxy is the intended
        # "expected model change" criterion.
        X_unlabeled = unlabeled_pool[feature_columns]
        proba = xgb_clf.predict_proba(X_unlabeled)[:, 1]
        emc_score = proba * (1 - proba)

        # Highest score first; positions are valid labels because the pool index
        # is reset after every removal.
        select_idx = np.argsort(-emc_score)[:batch_size]
        new_batch = unlabeled_pool.iloc[select_idx]

        labeled_pool = pd.concat([labeled_pool, new_batch], ignore_index=True)
        unlabeled_pool = unlabeled_pool.drop(unlabeled_pool.index[select_idx]).reset_index(drop=True)

        iteration += 1

    # Save F1-scores of this run
    all_f1_scores.append(f1_scores)

# Step 5: Aggregate results — median + 95% CI
# Pad runs with np.nan to same length (rows = runs, columns = cycles).
max_len = max(len(scores) for scores in all_f1_scores)
f1_matrix = np.full((len(all_f1_scores), max_len), np.nan)
for i, scores in enumerate(all_f1_scores):
    f1_matrix[i, :len(scores)] = scores

# Median & CI
median_f1 = np.nanmedian(f1_matrix, axis=0)
# Retained in case later cells read it; no longer used for the CI band.
std_f1 = np.nanstd(f1_matrix, axis=0)
# The 1.57 factor belongs to the notch approximation for a median's 95% CI,
# median ± 1.57 * IQR / sqrt(n). It must be paired with the inter-quartile
# range, not the standard deviation, and n is the number of runs that actually
# reached the cycle (NaN padding excluded).
n_valid = np.sum(~np.isnan(f1_matrix), axis=0)
q1, q3 = np.nanpercentile(f1_matrix, [25, 75], axis=0)
ci_95 = 1.57 * (q3 - q1) / np.sqrt(n_valid)  # Approx 95% CI of the median

# Print Mean F1-score per cycle (across all runs)
mean_f1 = np.nanmean(f1_matrix, axis=0)

# NOTE(review): "(i+1)*batch_size" equals the true labeled-set size only when
# the warm-up stage consumed exactly one batch; the warm-up may take several.
print(f"\n===== Mean F1-score across {N_runs} runs per cycle =====")
for i, f1 in enumerate(mean_f1):
    print(f"Cycle {i+1:02d}, Labeled {(i+1)*batch_size} samples: Mean F1 = {f1:.4f}")

# Step 6: Plot
x_axis = np.arange(1, len(median_f1)+1) * batch_size

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x_axis, median_f1, marker='o', label=f'Median F1 (N={N_runs} runs)')
ax.fill_between(x_axis, median_f1 - ci_95, median_f1 + ci_95, alpha=0.2, label='~95% CI')
ax.set_title(f"Active Learning (EMC) N={N_runs} runs — Median F1 + 95% CI")
ax.set_xlabel("Number of Labeled Samples")
ax.set_ylabel("Illicit F1-score")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# --- Active Learning: IF + Uncertainty Sampling + XGBoost ---
# Paper reproduction version: N=5 runs, median F1 + 95% CI

# Step 1: Import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import xgboost as xgb
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Step 2: Load labeled data
# The path is relative to the notebook's working directory.
labeled_df = pd.read_csv('elliptic_txs_features_labeled.csv')

# Check data
print("Loaded labeled CSV rows:", labeled_df.shape[0])
print("Unique time_steps:", sorted(labeled_df['time_step'].unique()))
print("Label distribution:\n", labeled_df['class'].value_counts())

# Step 3: Split train/test by time_step
# Temporal split: time steps <= 34 form the training pool, >= 35 the held-out test set.
train_df = labeled_df[labeled_df['time_step'] <= 34].reset_index(drop=True)
test_df  = labeled_df[labeled_df['time_step'] >= 35].reset_index(drop=True)

# Feature columns
# Every column whose name starts with 'f' is treated as a model input (this
# already includes any 'feature_*' column, so a separate prefix test is not needed).
feature_columns = [col for col in train_df.columns if col.startswith('f')]
if not feature_columns:
    raise ValueError("No feature columns found: expected column names starting with 'f'.")

# Prepare test set
# Select exactly the columns the models are trained on. Dropping
# ['txId', 'time_step', 'class'] instead only matches the training matrix when
# every remaining column happens to start with 'f'.
X_test = test_df[feature_columns]
# Class 1 is the positive class (reported as "illicit" by the F1 printouts below).
y_test = (test_df['class'] == 1).astype(int)

# Config
batch_size = 50        # samples labeled per query round (and per warm-up step)
max_iterations = 30    # 30 iterations = up to 1500 queried samples after warm-up
N_runs = 5             # independent repetitions, seeded 1..N_runs

# Storage for all runs
# One list of per-cycle F1-scores is appended per run.
all_f1_scores = []

# Step 4: Repeat N=5 runs
# Each run seeds Isolation Forest and XGBoost with run+1, performs an
# anomaly-score-driven warm-up, then runs the query loop and records the
# test-set F1-score of every retrained model.
for run in range(N_runs):
    seed = run + 1
    print("\n==========================")
    print(f"=== Run {run+1}/{N_runs}, Random State {seed}")
    print("==========================")

    f1_scores = []

    # Warm-up using Isolation Forest
    print("\n===== Warm-up stage: Isolation Forest =====")
    iso_forest = IsolationForest(random_state=seed)
    iso_forest.fit(train_df[feature_columns])
    # scikit-learn's score_samples is lower for more abnormal points; negating it
    # and sorting in descending order puts the most anomalous rows first.
    anomaly_scores = -iso_forest.score_samples(train_df[feature_columns])
    # The score is only a sort key, so it is dropped again right away and never
    # leaks into the labeled/unlabeled pools.
    ranked_pool = (
        train_df.assign(anomaly_score=anomaly_scores)
        .sort_values(by='anomaly_score', ascending=False)
        .drop(columns='anomaly_score')
        .reset_index(drop=True)
    )

    # Take batches from the top of the ranking until at least one class-1
    # (illicit) sample has been labeled. Only the cut-off position is searched
    # here; the pools are sliced once afterwards. This avoids concatenating onto
    # an empty, object-dtype DataFrame, which would put the numeric dtypes of
    # the feature columns at risk.
    illicit_mask = (ranked_pool['class'] == 1).to_numpy()
    pointer = 0
    warmup_illicit = 0
    while warmup_illicit == 0 and pointer < len(ranked_pool):
        pointer += batch_size
        warmup_illicit = int(illicit_mask[:pointer].sum())

    # Split the ranked pool at the warm-up cut-off.
    labeled_pool = ranked_pool.iloc[:pointer].reset_index(drop=True)
    unlabeled_pool = ranked_pool.iloc[pointer:].reset_index(drop=True)

    print(f"Warm-up selected {len(labeled_pool)} samples.")
    print(f"Warm-up found {warmup_illicit} illicit samples.")

    # Active Learning loop
    iteration = 1
    print("\n===== Active Learning loop: Uncertainty Sampling =====")

    while len(unlabeled_pool) > 0 and iteration <= max_iterations:
        X_train = labeled_pool[feature_columns]
        y_train = (labeled_pool['class'] == 1).astype(int)

        # A fresh model is trained from scratch on the current labeled pool.
        xgb_clf = xgb.XGBClassifier(
            tree_method='hist',
            eval_metric='logloss',
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=seed
        )
        xgb_clf.fit(X_train, y_train)

        # Evaluate on the fixed test set before querying new labels.
        y_pred = xgb_clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        f1_scores.append(f1)
        print(f"[Run {run+1}] XGBoost_Model_{iteration:02d} illicit F1-score: {f1:.4f}")

        # Uncertainty sampling
        # Distance of the predicted class-1 probability from 0.5; the smaller
        # the distance, the less certain the model is about the sample.
        X_unlabeled = unlabeled_pool[feature_columns]
        proba = xgb_clf.predict_proba(X_unlabeled)[:, 1]
        uncertainty = np.abs(proba - 0.5)

        # Smallest distance first; positions are valid labels because the pool
        # index is reset after every removal.
        select_idx = np.argsort(uncertainty)[:batch_size]
        new_batch = unlabeled_pool.iloc[select_idx]

        labeled_pool = pd.concat([labeled_pool, new_batch], ignore_index=True)
        unlabeled_pool = unlabeled_pool.drop(unlabeled_pool.index[select_idx]).reset_index(drop=True)

        iteration += 1

    # Save F1-scores of this run
    all_f1_scores.append(f1_scores)

# Step 5: Aggregate results — median + 95% CI
# Pad runs with np.nan to same length (rows = runs, columns = cycles).
max_len = max(len(scores) for scores in all_f1_scores)
f1_matrix = np.full((len(all_f1_scores), max_len), np.nan)
for i, scores in enumerate(all_f1_scores):
    f1_matrix[i, :len(scores)] = scores

# Step 5b: Print Mean F1-score across all runs per cycle
mean_f1 = np.nanmean(f1_matrix, axis=0)

# NOTE(review): "(i+1)*batch_size" equals the true labeled-set size only when
# the warm-up stage consumed exactly one batch; the warm-up may take several.
print(f"\n===== Mean F1-score across {N_runs} runs per cycle =====")
for i, f1 in enumerate(mean_f1):
    print(f"Cycle {i+1:02d}, Labeled {(i+1)*batch_size} samples: Mean F1 = {f1:.4f}")

# Median & CI
median_f1 = np.nanmedian(f1_matrix, axis=0)
# Retained in case later cells read it; no longer used for the CI band.
std_f1 = np.nanstd(f1_matrix, axis=0)
# The 1.57 factor belongs to the notch approximation for a median's 95% CI,
# median ± 1.57 * IQR / sqrt(n). It must be paired with the inter-quartile
# range, not the standard deviation, and n is the number of runs that actually
# reached the cycle (NaN padding excluded).
n_valid = np.sum(~np.isnan(f1_matrix), axis=0)
q1, q3 = np.nanpercentile(f1_matrix, [25, 75], axis=0)
ci_95 = 1.57 * (q3 - q1) / np.sqrt(n_valid)  # Approx 95% CI of the median

# Step 6: Plot
x_axis = np.arange(1, len(median_f1)+1) * batch_size

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x_axis, median_f1, marker='o', label=f'Median F1 (N={N_runs} runs)')
ax.fill_between(x_axis, median_f1 - ci_95, median_f1 + ci_95, alpha=0.2, label='~95% CI')
ax.set_title(f"Active Learning (Uncertainty Sampling) N={N_runs} runs — Median F1 + 95% CI")
ax.set_xlabel("Number of Labeled Samples")
ax.set_ylabel("Illicit F1-score")
ax.legend()
ax.grid(True)
plt.show()