# Setup & Imports

In [1]:
import os
import pandas as pd
import json
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import torch

# Set paths
PROJECT_DIR = r"C:\Users\ghosh\Desktop\Predictive-Transaction-intelligence-for-bfsi"
DATASET_PATH = os.path.join(PROJECT_DIR, "Dataset", "Fraud.csv")  # Your fraud dataset
OUTPUT_DIR = os.path.join(PROJECT_DIR, "models", "phi3-fraud-detector")
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Setup complete!")


Setup complete!


# Load & Explore Dataset

In [2]:
df = pd.read_csv(DATASET_PATH)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Fraud rate: {df['fraud_flag'].mean():.4f}" if 'fraud_flag' in df.columns else "No fraud_flag!")

df.head()

Dataset shape: (6362620, 11)
Columns: ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
No fraud_flag!


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Preprocess → JSONL

In [4]:

import pandas as pd
import json
from tqdm import tqdm
import os

# ------------------------------------------------------------------
# Paths
PROJECT_DIR = r"C:\Users\ghosh\Desktop\Predictive-Transaction-intelligence-for-bfsi"
DATASET_PATH = os.path.join(PROJECT_DIR, "Dataset", "Fraud.csv")
JSONL_PATH   = os.path.join(PROJECT_DIR, "Dataset", "fraud_train.jsonl")

# ------------------------------------------------------------------
# 1. Load data
df = pd.read_csv(DATASET_PATH)
print(f"Loaded {df.shape[0]:,} rows, {df.shape[1]} columns")
print("Columns:", df.columns.tolist())

# ------------------------------------------------------------------
# 2. Map PaySim columns → the fields we need in the prompt
#    - isFraud          → label
#    - type             → transaction type (CASH-OUT, TRANSFER, etc.)
#    - amount           → amount
#    - nameOrig         → sender (use as "user")
#    - nameDest         → receiver (use as "merchant")
#    - step             → hour of the day (step % 24)
#    - oldbalanceOrg    → sender balance before
#    - newbalanceOrig   → sender balance after
#    - oldbalanceDest   → receiver balance before
#    - newbalanceDest   → receiver balance after
#    - isFlaggedFraud   → not used for training

# Add derived fields
df["user_id"]   = df["nameOrig"]
df["merchant"]  = df["nameDest"]
df["category"]  = df["type"]
df["time"]      = (df["step"] % 24).astype(str) + ":00"   # hour only
df["fraud_flag"]= df["isFraud"]

# ------------------------------------------------------------------
# 3. Balanced sampling (7 500 fraud + 7 500 safe)
fraud = df[df["isFraud"] == 1].sample(7_500, random_state=42)
safe  = df[df["isFraud"] == 0].sample(7_500, random_state=42)
df_sample = pd.concat([fraud, safe]).sample(frac=1, random_state=42).reset_index(drop=True)

# ------------------------------------------------------------------
# 4. Compute average spend **per sender** (user_id)
user_avg = df_sample.groupby("user_id")["amount"].mean().to_dict()

# ------------------------------------------------------------------
# 5. Helper: clean time string
def clean_time(t):
    return "unknown" if pd.isna(t) else str(t)

# ------------------------------------------------------------------
# 6. Build instruction-style example
def make_example(row):
    avg = user_avg.get(row["user_id"], 100.0)
    return {
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a transaction safety expert. Classify the transaction as FRAUD or SAFE "
                    "and explain using amount, time, type, sender/receiver balances, and average user spend."
                )
            },
            {
                "role": "user",
                "content": (
                    f"Sender: {row['user_id'][:10]}… (balance before: ${row['oldbalanceOrg']:.2f}, "
                    f"after: ${row['newbalanceOrig']:.2f}). "
                    f"Avg spend: ${avg:.2f}. "
                    f"Transaction: ${row['amount']:.2f} of type {row['type']} to {row['merchant'][:10]}… "
                    f"at {clean_time(row['time'])}. Fraud?"
                )
            },
            {
                "role": "assistant",
                "content": (
                    f"{'FRAUD' if row['isFraud'] else 'SAFE'}. "
                    f"Amount: ${row['amount']:.2f}, Type: {row['type']}, "
                    f"Time: {clean_time(row['time'])}, "
                    f"Sender balance change: ${row['oldbalanceOrg']-row['newbalanceOrig']:.2f}, "
                    f"Avg spend: ${avg:.2f}"
                )
            }
        ]
    }

# ------------------------------------------------------------------
# 7. Write JSONL
os.makedirs(os.path.dirname(JSONL_PATH), exist_ok=True)
with open(JSONL_PATH, "w", encoding="utf-8") as f:
    for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Writing JSONL"):
        f.write(json.dumps(make_example(row)) + "\n")

print(f"\nSaved {len(df_sample):,} examples → {JSONL_PATH}")

Loaded 6,362,620 rows, 11 columns
Columns: ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']


Writing JSONL: 100%|██████████| 15000/15000 [00:02<00:00, 6911.81it/s]


Saved 15,000 examples → C:\Users\ghosh\Desktop\Predictive-Transaction-intelligence-for-bfsi\Dataset\fraud_train.jsonl





# Load Model & Tokenizer (4-bit)

In [3]:
model_name = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.float16,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4"
    },
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)
print("Model loaded in 4-bit!")

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded in 4-bit!


# Setup LoRA

In [4]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 3,822,652,416 || trainable%: 0.0411


# Load Dataset & Format

In [5]:
jsonl_path= os.path.join(PROJECT_DIR, "Dataset", "fraud_train.jsonl")
dataset = Dataset.from_json(jsonl_path)

def format_chat(example):
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

dataset = dataset.map(format_chat, remove_columns=dataset.column_names)
dataset = dataset.train_test_split(test_size=0.1)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 13500
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1500
    })
})


# Training Arguments & Trainer

In [6]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=50,
    max_steps=800,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    optim="paged_adamw_8bit",
    report_to=[],
    disable_tqdm=False
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=512,
    peft_config=peft_config,
    args=training_args,
)

print("Trainer ready!")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/13500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Trainer ready!


# START TRAINING

In [15]:
# === CELL: RESUME WITH COMPATIBLE RNG LOADER (PYTORCH 2.0+ FIX) ===
from trl import SFTTrainer
import os
import torch
import numpy as np
import random
from torch.serialization import add_safe_globals
from transformers import Trainer

# === 1. ALLOW NUMPY (AS BEFORE) ===
add_safe_globals([np.core.multiarray._reconstruct])
add_safe_globals([np.ndarray])
add_safe_globals([np.dtype])
for name in dir(np):
    obj = getattr(np, name)
    if isinstance(obj, type) and issubclass(obj, np.generic):
        add_safe_globals([obj])

print("NumPy allowlisted")

# === 2. COMPATIBLE RNG PATCH (WORKS WITH PYTORCH 2.0–2.6+) ===
def compatible_load_rng_state(self, checkpoint):
    """Patched RNG loader for PyTorch compatibility"""
    rng_file = os.path.join(checkpoint, "rng_state.pth")
    if os.path.isfile(rng_file):
        checkpoint_rng_state = torch.load(rng_file, weights_only=False)  # Safe for your checkpoint
        
        # Python RNG
        random.setstate(checkpoint_rng_state["python"])
        
        # NumPy RNG
        np.random.set_state(checkpoint_rng_state["numpy"])
        
        # Torch CPU RNG (standard method)
        if "torch" in checkpoint_rng_state:
            torch.set_rng_state(checkpoint_rng_state["torch"])
        
        # Torch CUDA RNG (standard method)
        if "torch_cuda" in checkpoint_rng_state:
            torch.cuda.set_rng_state(checkpoint_rng_state["torch_cuda"])
        
        # For multi-device (if PyTorch 2.1+ and function exists)
        if hasattr(torch, 'set_rng_state_all') and "torch" in checkpoint_rng_state:
            try:
                torch.set_rng_state_all(checkpoint_rng_state["torch"])
            except:
                pass  # Fallback to standard above
        
        print("RNG states restored (CPU + GPU)")
    else:
        print("No RNG file found — continuing without RNG restore")

# Apply patch
Trainer._load_rng_state = compatible_load_rng_state

print("RNG loader patched (compatible with PyTorch 2.0+)")

# === 3. CHECKPOINT ===
OUTPUT_DIR = r"C:\Users\ghosh\Desktop\Predictive-Transaction-intelligence-for-bfsi\models\phi3-fraud-detector"
checkpoints = [d for d in os.listdir(OUTPUT_DIR) if d.startswith("checkpoint-")]
latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
resume_from = os.path.join(OUTPUT_DIR, latest_checkpoint)

print(f"Resuming from: {resume_from} (step {latest_checkpoint.split('-')[-1]})")

# === 4. RECREATE SFTTrainer (IGNORE DEPRECATION WARNINGS — THEY'RE HARMLESS) ===
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=512,
    peft_config=peft_config,
)

# Fix padding_side warning (harmless but fixes overflow)
tokenizer.padding_side = "right"

# === 5. RESUME ===
trainer.train(resume_from_checkpoint=resume_from)

print(f"Training resumed from step {trainer.state.global_step}")
print("Training complete!")

NumPy allowlisted
RNG loader patched (compatible with PyTorch 2.0+)
Resuming from: C:\Users\ghosh\Desktop\Predictive-Transaction-intelligence-for-bfsi\models\phi3-fraud-detector\checkpoint-400 (step 400)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/800 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


RNG states restored (CPU + GPU)


  return fn(*args, **kwargs)
You are not running the flash-attention implementation, expect numerical differences.


: 

# Merge & Save Final Model

In [None]:
from peft import AutoPeftModelForCausalLM

merged_model = AutoPeftModelForCausalLM.from_pretrained(
    os.path.join(OUTPUT_DIR, "final"),
    device_map="cpu",
    torch_dtype=torch.float16
)
merged_model = merged_model.merge_and_unload()

final_path = os.path.join(PROJECT_DIR, "models", "phi3-fraud-merged")
merged_model.save_pretrained(final_path)
tokenizer.save_pretrained(final_path)

print(f"Final merged model saved to {final_path}")

# Test Inference

In [None]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=final_path,
    tokenizer=final_path,
    max_new_tokens=120,
    temperature=0.3,
    device=0  # GPU
)

prompt = [
    {"role": "system", "content": "You are a fraud detection expert."},
    {"role": "user", "content": "User: 35yo male, USA. Avg spend: $120. Transaction: $980 on Electronics at 2:45 AM. IP in Nigeria. Fraud?"}
]

input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
output = pipe(input_text)[0]["generated_text"]
print(output.split("assistant")[-1].strip())

# Export to ONNX

In [None]:
# === FINAL CELL: EXPORT TO ONNX → api/ DIRECTORY ===
import os, torch, subprocess, shutil
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# ------------------------------------------------------------------
# Paths
PROJECT_DIR = r"C:\Users\ghosh\Desktop\Predictive-Transaction-intelligence-for-bfsi"
LORA_DIR    = os.path.join(PROJECT_DIR, "models", "phi3-fraud-detector", "final")
API_DIR     = os.path.join(PROJECT_DIR, "api")
ONNX_DIR    = os.path.join(API_DIR, "onnx_model")
os.makedirs(ONNX_DIR, exist_ok=True)

# ------------------------------------------------------------------
# 1. Load & merge LoRA model
print("Loading and merging LoRA model...")
model = AutoPeftModelForCausalLM.from_pretrained(
    LORA_DIR, device_map="cpu", torch_dtype=torch.float16
)
merged = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(LORA_DIR, trust_remote_code=True)

# Save merged model temporarily (required for ONNX export)
TEMP_DIR = os.path.join(PROJECT_DIR, "temp_phi3_merged")
merged.save_pretrained(TEMP_DIR)
tokenizer.save_pretrained(TEMP_DIR)

# ------------------------------------------------------------------
# 2. Export to ONNX
print("Exporting to ONNX...")
onnx_path = os.path.join(ONNX_DIR, "phi3_fraud_detector.onnx")
cmd = [
    "python", "-m", "transformers.onnx",
    "--model", TEMP_DIR,
    "--feature=causal-lm",
    "--atol=1e-3",
    ONNX_DIR
]
result = subprocess.run(cmd, capture_output=True, text=True)

if result.returncode == 0:
    print("ONNX export successful!")
else:
    raise RuntimeError(f"ONNX export failed:\n{result.stderr}")

# ------------------------------------------------------------------
# 3. Copy tokenizer files
for fname in ["tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
    src = os.path.join(TEMP_DIR, fname)
    dst = os.path.join(ONNX_DIR, fname)
    if os.path.exists(src):
        shutil.copy(src, dst)

# ------------------------------------------------------------------
# 4. Create onnx_predict.py helper
helper_code = '''import onnxruntime as ort
import numpy as np
from transformers import AutoTokenizer
import os

class Phi3ONNXFraudDetector:
    def __init__(self, model_dir):
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        model_path = os.path.join(model_dir, "phi3_fraud_detector.onnx")
        self.session = ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
        )
    
    def predict(self, prompt: str, max_length: int = 128):
        inputs = self.tokenizer(prompt, return_tensors="np")
        input_ids = inputs["input_ids"].astype(np.int64)
        generated = input_ids.copy()
        for _ in range(max_length):
            outputs = self.session.run(None, {"input_ids": generated})
            next_token = np.argmax(outputs[0][:, -1, :], axis=-1, keepdims=True)
            generated = np.concatenate([generated, next_token], axis=1)
            if next_token.item() == self.tokenizer.eos_token_id:
                break
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)
'''

helper_path = os.path.join(API_DIR, "onnx_predict.py")
with open(helper_path, "w", encoding="utf-8") as f:
    f.write(helper_code)

# ------------------------------------------------------------------
# 5. Cleanup
shutil.rmtree(TEMP_DIR, ignore_errors=True)

print("\nONNX EXPORT COMPLETE!")
print(f"ONNX model   : {onnx_path}")
print(f"Helper script: {helper_path}")
print("Use in FastAPI: from onnx_predict import Phi3ONNXFraudDetector")