In [28]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd


In [29]:
df_m = pd.read_csv('../dataset/minority.csv')

In [30]:
df = pd.read_csv('../dataset/majority.csv')

In [31]:
# Step 1: Prepare X and y
X_labeled = df_m.drop(columns=['isFraud']).values  # frauds
y_labeled = df_m['isFraud'].values

# Sample a small number of legit rows to seed the model
df_seed = df.sample(n=20, random_state=42)
X_seed = df_seed.drop(columns=['isFraud']).values
y_seed = df_seed['isFraud'].values

# Combine with minority (fraud) set
X_initial = np.vstack([X_labeled, X_seed])
y_initial = np.concatenate([y_labeled, y_seed])

# Remove seed rows from majority df and pool
df = df.drop(df_seed.index).reset_index(drop=True)
X_pool = df.drop(columns=['isFraud']).values
y_pool = df['isFraud'].values


In [32]:
learner = ActiveLearner(
    estimator=LogisticRegression(max_iter=200),
    query_strategy=uncertainty_sampling,
    X_training=X_initial,
    y_training=y_initial
)


In [33]:
# Step 3: Iteratively query most uncertain majority samples
B = int(0.05 * len(df_m))  # 5% of minority size
T = 20  # total iterations

selected_batches = []
selected_indices = []

In [23]:

for t in range(T):
    print(f"Iteration {t+1}/{T}")

    # Query most uncertain legitimate samples
    query_idx, _ = learner.query(X_pool, n_instances=B)

    # Save batch
    batch = df.iloc[query_idx].copy()
    selected_batches.append(batch)

    # Teach the model (optional if you want model to evolve)
    learner.teach(X_pool[query_idx], y_pool[query_idx])

    # Remove queried samples from the pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)
    df = df.drop(df.index[query_idx]).reset_index(drop=True)


Iteration 1/20




Iteration 2/20




Iteration 3/20




Iteration 4/20




Iteration 5/20




Iteration 6/20




Iteration 7/20




Iteration 8/20




Iteration 9/20




Iteration 10/20




Iteration 11/20




Iteration 12/20




Iteration 13/20




Iteration 14/20




Iteration 15/20




Iteration 16/20




Iteration 17/20




Iteration 18/20




Iteration 19/20




Iteration 20/20




In [24]:
final = pd.concat(selected_batches)
final.shape

(8200, 6)

In [25]:
df_m.shape

(8213, 6)

In [26]:
final.to_csv('../dataset/majority_informative.csv', index=False)

In [27]:
final.head()

Unnamed: 0.1,Unnamed: 0,type,oldbalanceOrg,newbalanceOrig,oldbalanceDest,isFraud
2438739,2441017,0,28488.83,35434.0,955.51,0
2299733,2301877,0,5393405.29,5405360.93,280003.74,0
4601619,4605340,0,10579.0,21995.77,20104.93,0
2765162,2767579,0,39659.0,47338.39,54399.4,0
1557452,1559211,0,4922014.34,4931816.15,41993.56,0
