In [3]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd


In [4]:
df_m = pd.read_csv('../dataset/minority.csv')

In [5]:
df = pd.read_csv('../dataset/majority.csv')

In [6]:
# Step 1: Prepare X and y
X_labeled = df_m.drop(columns=['isFraud']).values  # frauds
y_labeled = df_m['isFraud'].values

# Sample a small number of legit rows to seed the model
df_seed = df.sample(n=20, random_state=42)
X_seed = df_seed.drop(columns=['isFraud']).values
y_seed = df_seed['isFraud'].values

# Combine with minority (fraud) set
X_initial = np.vstack([X_labeled, X_seed])
y_initial = np.concatenate([y_labeled, y_seed])

# Remove seed rows from majority df and pool
df = df.drop(df_seed.index).reset_index(drop=True)
X_pool = df.drop(columns=['isFraud']).values
y_pool = df['isFraud'].values


In [7]:
learner = ActiveLearner(
    estimator=LogisticRegression(max_iter=200),
    query_strategy=uncertainty_sampling,
    X_training=X_initial,
    y_training=y_initial
)


In [8]:
# Step 3: Iteratively query most uncertain majority samples
B = int(0.05 * len(df_m))  # 5% of minority size
T = 20  # total iterations

selected_batches = []
selected_indices = []

In [9]:

for t in range(T):
    print(f"Iteration {t+1}/{T}")

    # Query most uncertain legitimate samples
    query_idx, _ = learner.query(X_pool, n_instances=B)

    # Save batch
    batch = df.iloc[query_idx].copy()
    selected_batches.append(batch)

    # Teach the model (optional if you want model to evolve)
    learner.teach(X_pool[query_idx], y_pool[query_idx])

    # Remove queried samples from the pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)
    df = df.drop(df.index[query_idx]).reset_index(drop=True)


Iteration 1/20




Iteration 2/20




Iteration 3/20




Iteration 4/20




Iteration 5/20




Iteration 6/20




Iteration 7/20




Iteration 8/20




Iteration 9/20




Iteration 10/20




Iteration 11/20




Iteration 12/20




Iteration 13/20




Iteration 14/20




Iteration 15/20




Iteration 16/20




Iteration 17/20




Iteration 18/20


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Iteration 19/20




Iteration 20/20


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
final = pd.concat(selected_batches)
final.shape

(8200, 8)

In [11]:
df_m.shape

(8213, 8)

In [12]:
final.to_csv('../dataset/majority_informative.csv', index=False)

In [13]:
final.head()

Unnamed: 0.1,Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
2431590,2433864,0,8311.8,871553.85,879865.65,14865.15,6553.34,0
6062716,6068447,0,31931.85,10320041.27,10351973.12,70876.85,38944.99,0
5987285,5992034,0,17255.72,10946.0,28201.72,3665216.93,3647961.21,0
3579880,3582834,0,15558.61,3658693.32,3674251.93,173519.39,157960.78,0
3636412,3639537,0,9442.86,0.0,9442.86,89555.86,80113.01,0
