In [67]:
pip install pandas numpy scikit-learn imbalanced-learn transformers datasets torch





In [9]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [11]:
# ============== CONFIG ==============
CHUNK_SIZE = 50000
DATA_DIR = ""
OUTPUT_FILE = "final_dataset.csv"
SYNTHETIC_DATA_SIZE = 5000

In [13]:
# ============== 1. LOAD & PROCESS DATA ==============
def load_csv_efficiently(file_path, usecols=None):
    """Loads a CSV in chunks to avoid MemoryError."""
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=CHUNK_SIZE, usecols=usecols, low_memory=False):
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True)

print("🔄 Loading datasets...")
credit_card_df = load_csv_efficiently(os.path.join(DATA_DIR, "creditcard.csv"), usecols=["Amount", "Class"])
paysim_df = load_csv_efficiently(os.path.join(DATA_DIR, "PS_20174392719_1491204439457_log.csv"), usecols=["amount", "isFraud"])
transaction_df = load_csv_efficiently(os.path.join(DATA_DIR, "train_transaction.csv"), usecols=["TransactionID", "isFraud", "TransactionAmt", "card1", "card2"])
identity_df = load_csv_efficiently(os.path.join(DATA_DIR, "train_identity.csv"), usecols=["TransactionID", "DeviceType"])

# ✅ Load SEC EDGAR Financial Data (NEW)
sec_num_df = load_csv_efficiently(os.path.join(DATA_DIR, "sec_num_data.csv"), usecols=["adsh", "value"])
sec_sub_df = load_csv_efficiently(os.path.join(DATA_DIR, "sec_sub_data.csv"), usecols=["adsh", "sic", "form"])

print("✅ Datasets loaded successfully!")


🔄 Loading datasets...
✅ Datasets loaded successfully!


In [14]:
# Optimize memory
def optimize_memory(df):
    """Converts data types to reduce memory usage."""
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype(np.float32)
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = df[col].astype(np.int32)
    return df

credit_card_df = optimize_memory(credit_card_df)
paysim_df = optimize_memory(paysim_df)
transaction_df = optimize_memory(transaction_df)
identity_df = optimize_memory(identity_df)
sec_num_df = optimize_memory(sec_num_df)
sec_sub_df = optimize_memory(sec_sub_df)

In [15]:
# Merge and process data
credit_card_df.rename(columns={"Class": "isFraud", "Amount": "TransactionAmt"}, inplace=True)
paysim_df.rename(columns={"amount": "TransactionAmt"}, inplace=True)
ieee_merged_df = transaction_df.merge(identity_df, on="TransactionID", how="left")

# ✅ Merge SEC Data on `adsh` Column
sec_merged_df = sec_num_df.merge(sec_sub_df, on="adsh", how="left")

# Merge all datasets
transaction_data = pd.concat([credit_card_df, paysim_df, ieee_merged_df, sec_merged_df], axis=0, ignore_index=True)
transaction_data.drop(columns=["TransactionID", "adsh"], inplace=True, errors="ignore")

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Separate numeric and categorical columns
numeric_cols = transaction_data.select_dtypes(include=['number']).columns
categorical_cols = transaction_data.select_dtypes(exclude=['number']).columns

# Handle missing values for numeric columns using median
num_imputer = SimpleImputer(strategy="median")
transaction_data[numeric_cols] = num_imputer.fit_transform(transaction_data[numeric_cols])

# Handle missing values for categorical columns using most frequent value
cat_imputer = SimpleImputer(strategy="most_frequent")
transaction_data[categorical_cols] = cat_imputer.fit_transform(transaction_data[categorical_cols])




In [17]:


# Encode categorical variables
cat_cols = transaction_data.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    transaction_data[col] = le.fit_transform(transaction_data[col].astype(str))

In [18]:
# Feature Scaling
scaler = StandardScaler()
num_cols = transaction_data.select_dtypes(include=["int32", "float32"]).columns
transaction_data[num_cols] = scaler.fit_transform(transaction_data[num_cols])

In [19]:
# ============== 2. SYNTHETIC DATA USING VAE ==============
print("🤖 Generating synthetic fraud transactions using Variational Autoencoder (VAE)...")

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim=16):
        super(VAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

fraud_data = transaction_data[transaction_data["isFraud"] == 1].drop(columns=["isFraud"])
input_dim = fraud_data.shape[1]
X_train_tensor = torch.tensor(fraud_data.values, dtype=torch.float32)
dataset = TensorDataset(X_train_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

vae = VAE(input_dim)
optimizer = optim.Adam(vae.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(10):
    for batch in dataloader:
        optimizer.zero_grad()
        reconstructed = vae(batch[0])
        loss = loss_fn(reconstructed, batch[0])
        loss.backward()
        optimizer.step()

latent_space = torch.randn(SYNTHETIC_DATA_SIZE, 16)
synthetic_data = vae.decoder(latent_space).detach().numpy()
synthetic_df = pd.DataFrame(synthetic_data, columns=fraud_data.columns)
synthetic_df["isFraud"] = 1

print(f"✅ Generated {SYNTHETIC_DATA_SIZE} synthetic fraud transactions!")

🤖 Generating synthetic fraud transactions using Variational Autoencoder (VAE)...
✅ Generated 5000 synthetic fraud transactions!


In [20]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.


In [None]:
# ============== 3. SYNTHETIC DATA USING GEN AI (Mistral 7B) ==============
print("🤖 Generating additional synthetic transactions using Mistral 7B...")
import requests
import json

# Hugging Face API Key (replace with your key)
HUGGINGFACE_API_KEY = "***"
API_URL = "**"
HEADERS = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}

def generate_fraudulent_transaction():
    """Generates synthetic financial fraud transactions using Hugging Face API."""
    prompt = """
    Generate a JSON representation of a fraudulent financial transaction with realistic attributes:
    - TransactionAmt: A realistic amount (e.g., 1000.45)
    - Card1, Card2: Numeric values representing card IDs
    - DeviceType: "desktop" or "mobile"
    - isFraud: Always 1
    """

    payload = {"inputs": prompt, "parameters": {"max_length": 150, "temperature": 0.7}}
    response = requests.post(API_URL, headers=HEADERS, json=payload)

    if response.status_code == 200:
        try:
            json_str = response.json()[0]['generated_text']
            synthetic_transaction = eval(json_str)  # Convert string to dictionary
            return synthetic_transaction
        except:
            return None  # Skip invalid output
    else:
        print(f"❌ API Error: {response.text}")
        return None

# Generate 1000 synthetic transactions
synthetic_gen_ai_data = []
for _ in range(50):
    transaction = generate_fraudulent_transaction()
    if transaction:
        synthetic_gen_ai_data.append(transaction)

synthetic_gen_ai_df = pd.DataFrame(synthetic_gen_ai_data)

print(f"✅ Generated {len(synthetic_gen_ai_df)} transactions using Gen AI via Hugging Face API!")


🤖 Generating additional synthetic transactions using Mistral 7B...
✅ Generated 0 transactions using Gen AI via Hugging Face API!


In [22]:
# ============== 4. FINAL DATASET & SAVE ==============
final_balanced_data = pd.concat([transaction_data, synthetic_df, synthetic_gen_ai_df], ignore_index=True)

print(f"✅ Final dataset shape: {final_balanced_data.shape}")
final_balanced_data.to_csv(OUTPUT_FILE, index=False)
print(f"🎉 Final dataset saved as {OUTPUT_FILE}!")

✅ Final dataset shape: (10296973, 8)
🎉 Final dataset saved as final_dataset.csv!


In [23]:
## 3️⃣ Feature Engineering using Gen AI

In [21]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.decomposition import PCA
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import time
import requests

In [None]:
# ============== CONFIG ==============
UPLOAD_FILE = "uploaded_data.csv"  # User uploads this file dynamically
OUTPUT_FILE = "processed_data.csv"
HUGGING_FACE_API_URL = "Add your url" #replaced to push
HEADERS = {"Authorization": "Bearer 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"}  # Replace with your Hugging Face API key



# ============== 1. LOAD DATASET ==============
def wait_for_file(file_path, timeout=300):
    """
    Waits for the user to upload the CSV file. Checks every 5 seconds for a max of `timeout` seconds.
    """
    print("🔄 Waiting for user to upload dataset...")

    elapsed_time = 0
    while not os.path.exists(file_path):
        if elapsed_time >= timeout:
            print(f"🚨 No file uploaded within {timeout} seconds. Exiting.")
            exit()
        time.sleep(5)  # Check every 5 seconds
        elapsed_time += 5
        print(f"⏳ Waiting... ({elapsed_time}/{timeout} sec)")

    print(f"✅ File '{file_path}' detected! Loading...")

def load_csv(file_path):
    """Loads a CSV file efficiently after user uploads it."""
    return pd.read_csv(file_path, low_memory=False)

# **Wait for user to upload file**
wait_for_file(UPLOAD_FILE)

# **Load the uploaded file**
try:
    uploaded_df = load_csv(UPLOAD_FILE)
    print(f"✅ Uploaded dataset loaded with shape: {uploaded_df.shape}")
except Exception as e:
    print(f"❌ Error loading CSV: {e}")
    exit()


🔄 Waiting for user to upload dataset...
⏳ Waiting... (5/300 sec)
⏳ Waiting... (10/300 sec)
⏳ Waiting... (15/300 sec)
⏳ Waiting... (20/300 sec)
⏳ Waiting... (25/300 sec)
⏳ Waiting... (30/300 sec)
⏳ Waiting... (35/300 sec)
⏳ Waiting... (40/300 sec)
⏳ Waiting... (45/300 sec)
⏳ Waiting... (50/300 sec)
⏳ Waiting... (55/300 sec)
⏳ Waiting... (60/300 sec)
⏳ Waiting... (65/300 sec)
⏳ Waiting... (70/300 sec)
⏳ Waiting... (75/300 sec)
⏳ Waiting... (80/300 sec)
⏳ Waiting... (85/300 sec)
⏳ Waiting... (90/300 sec)
⏳ Waiting... (95/300 sec)
⏳ Waiting... (100/300 sec)
⏳ Waiting... (105/300 sec)
⏳ Waiting... (110/300 sec)
⏳ Waiting... (115/300 sec)
⏳ Waiting... (120/300 sec)
⏳ Waiting... (125/300 sec)
⏳ Waiting... (130/300 sec)
⏳ Waiting... (135/300 sec)
⏳ Waiting... (140/300 sec)
⏳ Waiting... (145/300 sec)
⏳ Waiting... (150/300 sec)
⏳ Waiting... (155/300 sec)
⏳ Waiting... (160/300 sec)
⏳ Waiting... (165/300 sec)
⏳ Waiting... (170/300 sec)
⏳ Waiting... (175/300 sec)
⏳ Waiting... (180/300 sec)
⏳ Waitin

In [11]:
# ============== 2. HANDLE MISSING VALUES ==============
def handle_missing_values(df):
    """Handles missing values: median for numerical, most-frequent for categorical."""
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = df.select_dtypes(include=["object"]).columns
    
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy="median")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])
    
    if len(cat_cols) > 0:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    
    return df

uploaded_df = handle_missing_values(uploaded_df)

In [15]:
# ============== 3. ENCODE CATEGORICAL FEATURES ==============
def encode_categorical(df):
    """Encodes categorical variables using Label Encoding."""
    cat_cols = df.select_dtypes(include=["object"]).columns
    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    return df

uploaded_df = encode_categorical(uploaded_df)

# ============== 4. FEATURE SELECTION ==============
def select_features(df):
    """Automatically selects best features for anomaly detection."""
    print("📊 Selecting important features...")

    # 4.1 Remove low variance features
    selector = VarianceThreshold(threshold=0.01)
    df_selected = selector.fit_transform(df)

    # 4.2 Mutual Information for Feature Importance
    X = df.drop(columns=["isFraud"], errors="ignore")
    y = df["isFraud"] if "isFraud" in df.columns else None

    selected_features = X.columns  # ✅ Define it before the condition

    if y is not None:
        mi_scores = mutual_info_classif(X, y, discrete_features="auto")
        feature_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
        selected_features = feature_scores.index[:10]  # Take top 10 features

    df_selected = df[selected_features]  # ✅ Always defined

    print(f"✅ Selected {df_selected.shape[1]} features.")
    return df_selected

selected_df = select_features(uploaded_df)

📊 Selecting important features...
✅ Selected 9 features.


In [23]:
# ============== 5. GEN AI FEATURE ANALYSIS (Hugging Face API) ==============
print("🤖 Using Hugging Face API to analyze feature importance...")

def analyze_features_with_genai(feature_names):
    """Calls Hugging Face API to determine the best features for anomaly detection."""
    prompt = f"""
    The following are column names from a financial transactions dataset:
    {', '.join(feature_names)}
    
    Based on your expertise, suggest the top 5 features that are most relevant for anomaly detection in financial fraud cases.
    """

    response = requests.post(HUGGING_FACE_API_URL, headers=HEADERS, json={"inputs": prompt})
    
    if response.status_code == 200:
        output_text = response.json()[0]["generated_text"]
        
        # Extract features from Gen AI response
        suggested_features = []
        for feature in feature_names:
            if feature in output_text:
                suggested_features.append(feature)

        print(f"🤖 Gen AI suggests using these features: {suggested_features}")
        return suggested_features[:5]  # Take top 5 from Gen AI suggestion
    else:
        print(f"⚠️ Error: {response.status_code} - {response.text}")
        return feature_names[:5]  # Default to first 5 features if API fails

genai_selected_features = analyze_features_with_genai(selected_df.columns)
selected_df = selected_df[genai_selected_features]

🤖 Using Hugging Face API to analyze feature importance...
🤖 Gen AI suggests using these features: ['As of Date', 'Company', 'Account', 'AU', 'Currency', 'Primary Account', 'Secondary Account', 'GL Balance', 'iHub Balance']


In [25]:
# ============== 6. FALLBACK MECHANISM ==============
if len(genai_selected_features) == 0:
    print("⚠️ Gen AI did not return any valid features, falling back to statistical selection.")
    selected_df = select_features(uploaded_df)  # Use original statistical feature selection
else:
    selected_df = selected_df[genai_selected_features]
# ============== 6. FEATURE SCALING ==============
scaler = StandardScaler()
selected_df = pd.DataFrame(scaler.fit_transform(selected_df), columns=selected_df.columns)

# ============== 7. SAVE FINAL DATASET ==============
selected_df.to_csv(OUTPUT_FILE, index=False)
print(f"🎉 Processed dataset saved as {OUTPUT_FILE}!")


🎉 Processed dataset saved as processed_data.csv!


In [27]:
selected_df.shape


(100, 5)

In [None]:
import pandas as pd
import numpy as np
import os
import requests
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
import joblib

# ============== CONFIG ==============
TRAIN_FILE = "final_dataset.csv"  # For model training
USER_UPLOAD_FILE = "uploaded_data.csv"  # User's input file for real-time detection
PROCESSED_TRAIN_OUTPUT = "processed_train_data_final.csv"  # Final training data
PROCESSED_USER_OUTPUT = "processed_data.csv"  # Processed user data for API
HUGGING_FACE_API_URL = "Add your url" #replaced to push
HEADERS = {"Authorization": "Bearer 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"}  # Replace with your Hugging Face API key

# ============== 1. LOAD DATASET ==============
def load_csv(file_path):
    """Loads a CSV file dynamically, waits for user upload if missing."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"🚨 No file found! Please upload '{file_path}' first.")
    return pd.read_csv(file_path, low_memory=False)

print("🔄 Loading training and user-uploaded datasets...")
try:
    train_df = load_csv(TRAIN_FILE)  # Training dataset
    print(f"✅ Training dataset loaded with shape: {train_df.shape}")
    
    if os.path.exists(USER_UPLOAD_FILE):
        user_df = load_csv(USER_UPLOAD_FILE)  # User-uploaded dataset
        print(f"✅ User dataset loaded with shape: {user_df.shape}")
    else:
        user_df = None  # No user upload yet
    
except FileNotFoundError as e:
    print(e)
    exit()

# ============== 2. HANDLE MISSING VALUES ==============
def handle_missing_values(df):
    """Handles missing values: median for numerical, most-frequent for categorical."""
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = df.select_dtypes(include=["object"]).columns
    
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy="median")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])
    
    if len(cat_cols) > 0:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    
    return df

train_df = handle_missing_values(train_df)
if user_df is not None:
    user_df = handle_missing_values(user_df)

# ============== 3. ENCODE CATEGORICAL FEATURES ==============
def encode_categorical(df):
    """Encodes categorical variables using Label Encoding."""
    cat_cols = df.select_dtypes(include=["object"]).columns
    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    return df

train_df = encode_categorical(train_df)
if user_df is not None:
    user_df = encode_categorical(user_df)

# ============== 4. FEATURE SELECTION ==============
def select_features(df):
    """Automatically selects best features for anomaly detection, ensuring non-empty data."""
    if df.empty:
        print("⚠️ Dataset is empty! Skipping feature selection.")
        return df  # Return unchanged

    print("📊 Selecting important features...")

    # Store original column names
    original_columns = df.columns

    # Remove low variance features
    selector = VarianceThreshold(threshold=0.01)
    df_selected = selector.fit_transform(df)

    # Restore column names for selected features
    selected_mask = selector.get_support()  # Get True/False mask of selected features
    selected_columns = original_columns[selected_mask]  # Apply mask to get column names

    df_selected = pd.DataFrame(df_selected, columns=selected_columns)  # Convert back to DataFrame

    # Mutual Information for Feature Importance (Only if "isFraud" column exists)
    if "isFraud" in df.columns:
        X = df_selected
        y = df["isFraud"]
        
        mi_scores = mutual_info_classif(X, y, discrete_features="auto")
        feature_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
        top_features = feature_scores.index[:10]  # Take top 10 features
        df_selected = df_selected[top_features]

    print(f"✅ Selected {df_selected.shape[1]} features.")
    return df_selected


train_df = select_features(train_df)
if user_df is not None:
    user_df = select_features(user_df)

# ============== 5. GEN AI FEATURE ANALYSIS (Hugging Face API) ==============
print("🤖 Using Hugging Face API to analyze feature importance...")

def analyze_features_with_genai(feature_names):
    """Calls Hugging Face API to determine the best features for anomaly detection."""
    prompt = f"""
    The following are column names from a financial transactions dataset:
    {', '.join(feature_names)}
    
    Based on your expertise, suggest the top 5 features that are most relevant for anomaly detection in financial fraud cases.
    """

    response = requests.post(HUGGING_FACE_API_URL, headers=HEADERS, json={"inputs": prompt})
    
    if response.status_code == 200:
        output_text = response.json()[0]["generated_text"]
        
        # Extract features from Gen AI response
        suggested_features = []
        for feature in feature_names:
            if feature in output_text:
                suggested_features.append(feature)

        print(f"🤖 Gen AI suggests using these features: {suggested_features}")
        return suggested_features[:5]  # Take top 5 from Gen AI suggestion
    else:
        print(f"⚠️ Error: {response.status_code} - {response.text}")
        return feature_names[:5]  # Default to first 5 features if API fails

genai_selected_features = analyze_features_with_genai(train_df.columns)


# ✅ Handle missing columns more robustly
available_features = [f for f in genai_selected_features if f in train_df.columns]

if not available_features:
    print("⚠️ No Gen AI suggested features found in dataset. Using first 5 available features instead.")
    available_features = train_df.columns[:5].tolist()  # Convert to list to avoid indexing errors

# Ensure selected columns exist before applying selection
available_features = [f for f in available_features if f in train_df.columns]

if not available_features:
    raise ValueError("🚨 Critical Error: No valid features found for training. Please check dataset integrity!")

train_df = train_df[available_features]

if user_df is not None:
    user_df = user_df[[f for f in available_features if f in user_df.columns]]


# Ensure dataset is not empty before scaling
if not train_df.empty:
    scaler = StandardScaler()
    train_df = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
    joblib.dump(scaler, "scaler.pkl")  # Save scaler for later use
else:
    print("⚠️ Train dataset is empty! Skipping scaling.")

if user_df is not None and not user_df.empty:
    user_df = pd.DataFrame(scaler.transform(user_df), columns=user_df.columns)
else:
    print("⚠️ User dataset is empty or missing! Skipping scaling.")
    


# ============== 7. SAVE FINAL DATASET ==============
train_df.to_csv(PROCESSED_TRAIN_OUTPUT, index=False)
print(f"🎉 Processed training dataset saved as {PROCESSED_TRAIN_OUTPUT}!")

if user_df is not None:
    user_df.to_csv(PROCESSED_USER_OUTPUT, index=False)
    print(f"🎉 Processed user dataset saved as {PROCESSED_USER_OUTPUT}!")
else:
    print("📢 No user dataset found, waiting for upload.")


🔄 Loading training and user-uploaded datasets...
✅ Training dataset loaded with shape: (10296973, 8)
✅ User dataset loaded with shape: (100, 9)
📊 Selecting important features...
✅ Selected 7 features.
📊 Selecting important features...
✅ Selected 7 features.
🤖 Using Hugging Face API to analyze feature importance...
🤖 Gen AI suggests using these features: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1', 'card2', 'sic']
⚠️ User dataset is empty or missing! Skipping scaling.
🎉 Processed training dataset saved as processed_train_data_final.csv!
🎉 Processed user dataset saved as processed_data.csv!


In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from pyod.models.hbos import HBOS
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ============== CONFIG ==============
PROCESSED_TRAIN_FILE = "processed_train_data_final.csv"
MODEL_SAVE_PATH = "hybrid_anomaly_model.pkl"
AUTOENCODER_SAVE_PATH = "autoencoder_model.h5"
SCALER_PATH = "scaler.pkl"

# ============== 1. LOAD PROCESSED DATA ==============
def load_data(file_path):
    """Loads processed training dataset."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"🚨 No file found! Please run Step 3 first: '{file_path}'")
    return pd.read_csv(file_path)

print("🔄 Loading processed dataset...")
df = load_data(PROCESSED_TRAIN_FILE)
print(f"✅ Loaded dataset with shape: {df.shape}")

# Ensure no missing values remain
df = df.fillna(0)

# ============== 2. FEATURE SCALING ==============
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
joblib.dump(scaler, SCALER_PATH)  # Save scaler for real-time detection

# ============== 3. TRAIN-TEST SPLIT ==============
X_train, X_test = train_test_split(df_scaled, test_size=0.2, random_state=42)

# ============== 4. AUTOENCODER MODEL ==============
def build_autoencoder(input_dim):
    """Builds and compiles an autoencoder model."""
    encoder = keras.Sequential([
        keras.layers.Dense(64, activation="relu", input_shape=(input_dim,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(16, activation="relu")
    ])

    decoder = keras.Sequential([
        keras.layers.Dense(32, activation="relu", input_shape=(16,)),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(input_dim, activation="sigmoid")
    ])

    autoencoder = keras.Sequential([encoder, decoder])
    autoencoder.compile(optimizer="adam", loss="mse")
    return autoencoder, encoder

print("🔧 Training Autoencoder...")
autoencoder, encoder = build_autoencoder(X_train.shape[1])
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, validation_data=(X_test, X_test), verbose=1)
autoencoder.save(AUTOENCODER_SAVE_PATH)  # Save model
print("✅ Autoencoder training complete!")

# Compute reconstruction errors
train_errors = np.mean(np.abs(autoencoder.predict(X_train) - X_train), axis=1)
threshold = np.percentile(train_errors, 95)  # 95th percentile as anomaly threshold
print(f"📊 Autoencoder Threshold: {threshold}")

# ============== 5. TRAIN OTHER ANOMALY DETECTION MODELS ==============
print("🔧 Training Isolation Forest...")
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)

print("🔧 Training Local Outlier Factor (LOF)...")
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
lof.fit(X_train)

print("🔧 Training HBOS...")
hbos = HBOS(contamination=0.05)
hbos.fit(X_train)

# ============== 6. HYBRID MODEL PREDICTION ==============
def predict_anomalies(X):
    """Runs data through all models and combines results."""
    autoencoder_preds = np.mean(np.abs(autoencoder.predict(X) - X), axis=1) > threshold
    iso_preds = iso_forest.predict(X) == -1  # -1 means anomaly
    lof_preds = lof.fit_predict(X) == -1  # -1 means anomaly
    hbos_preds = hbos.predict(X) == 1  # 1 means anomaly in HBOS

    # Majority Voting
    hybrid_preds = (autoencoder_preds.astype(int) + iso_preds.astype(int) +
                    lof_preds.astype(int) + hbos_preds.astype(int)) >= 2  # Majority vote
    return hybrid_preds

# ============== 7. EVALUATION METRICS ==============
def evaluate_model(y_true, y_pred):
    """Calculates evaluation metrics."""
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    print(f"📊 Model Performance: Accuracy={acc:.2f}, Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}")

# Since we don't have labels in anomaly detection, assume top 5% highest reconstruction errors are anomalies
y_test_true = (np.mean(np.abs(autoencoder.predict(X_test) - X_test), axis=1) > threshold).astype(int)
y_test_pred = predict_anomalies(X_test)

evaluate_model(y_test_true, y_test_pred)

# ============== 8. SAVE MODELS ==============
joblib.dump({"iso_forest": iso_forest, "lof": lof, "hbos": hbos, "threshold": threshold}, MODEL_SAVE_PATH)
print(f"✅ Hybrid Model saved to {MODEL_SAVE_PATH}")


🔄 Loading processed dataset...
✅ Loaded dataset with shape: (10296973, 5)
🔧 Training Autoencoder...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m485s[0m 2ms/step - loss: 0.7439 - val_loss: 0.6339
Epoch 2/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 2ms/step - loss: 0.8971 - val_loss: 0.6339
Epoch 3/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m481s[0m 2ms/step - loss: 0.9368 - val_loss: 0.6339
Epoch 4/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 2ms/step - loss: 0.8903 - val_loss: 0.6339
Epoch 5/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m515s[0m 2ms/step - loss: 1.0166 - val_loss: 0.6339
Epoch 6/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m510s[0m 2ms/step - loss: 0.8584 - val_loss: 0.6339
Epoch 7/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m506s[0m 2ms/step - loss: 0.9459 - val_loss: 0.6339
Epoch 8/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m513s[0m 2ms/step - loss: 0.



✅ Autoencoder training complete!
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 914us/step
📊 Autoencoder Threshold: 0.7459511914998989
🔧 Training Isolation Forest...
🔧 Training Local Outlier Factor (LOF)...


In [None]:
import pandas as pd
import numpy as np
import joblib
import requests
import json
from tensorflow import keras

# ============== CONFIG ==============
PROCESSED_USER_FILE = "processed_data.csv"  # User-uploaded data after Step 3 processing
MODEL_PATH = "hybrid_anomaly_model.pkl"  # Trained hybrid model
AUTOENCODER_PATH = "autoencoder_model.h5"  # Trained autoencoder model
SCALER_PATH = "scaler.pkl"  # Feature scaler
HUGGING_FACE_API_URL = "Add your url" #replaced to push
HEADERS = {"Authorization": "Bearer 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"}  # Replace with your Hugging Face API key

# ============== 1. LOAD MODELS & DATA ==============
print("🔄 Loading trained models and user-uploaded dataset...")

# Load processed user-uploaded data
def load_csv(file_path):
    """Loads a CSV file dynamically."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"🚨 No file found! Please upload '{file_path}' first.")
    return pd.read_csv(file_path, low_memory=False)

user_df = load_csv(PROCESSED_USER_FILE)
scaler = joblib.load(SCALER_PATH)  # Load scaler

# Load trained hybrid model components
model_components = joblib.load(MODEL_PATH)
iso_forest = model_components["iso_forest"]
lof = model_components["lof"]
hbos = model_components["hbos"]
threshold = model_components["threshold"]

# Load trained Autoencoder model
autoencoder = keras.models.load_model(AUTOENCODER_PATH)

# Scale user data
user_df_scaled = scaler.transform(user_df)

# ============== 2. PREDICT ANOMALIES ==============
print("🔍 Detecting anomalies...")

def predict_anomalies(X):
    """Runs data through all models and combines results."""
    autoencoder_preds = np.mean(np.abs(autoencoder.predict(X) - X), axis=1) > threshold
    iso_preds = iso_forest.predict(X) == -1
    lof_preds = lof.fit_predict(X) == -1
    hbos_preds = hbos.predict(X) == 1  # 1 means anomaly in HBOS

    # Majority Voting
    hybrid_preds = (autoencoder_preds.astype(int) + iso_preds.astype(int) +
                    lof_preds.astype(int) + hbos_preds.astype(int)) >= 2  # Majority vote
    return hybrid_preds

user_df["Anomaly"] = predict_anomalies(user_df_scaled)
anomalies = user_df[user_df["Anomaly"] == 1]  # Filter detected anomalies

print(f"📢 Detected {len(anomalies)} anomalies!")

# ============== 3. ROOT CAUSE ANALYSIS USING GEN AI ==============
def get_root_cause_analysis(row):
    """Calls Hugging Face API to determine the root cause of an anomaly."""
    input_text = f"""
    A financial transaction was flagged as an anomaly. Here are the details:
    {json.dumps(row.to_dict(), indent=2)}
    
    Can you analyze the possible root cause of this anomaly based on common fraud detection patterns?
    """
    response = requests.post(HUGGING_FACE_API_URL, headers=HEADERS, json={"inputs": input_text})
    
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    else:
        return "⚠️ Root Cause Analysis unavailable due to API error."

if len(anomalies) > 0:
    print("🤖 Performing Root Cause Analysis on anomalies...")
    anomalies["Root_Cause"] = anomalies.apply(get_root_cause_analysis, axis=1)

# ============== 4. AUTOMATED CORRECTIVE ACTIONS USING GEN AI ==============
def get_corrective_action(row):
    """Suggests automated corrective actions based on anomaly details."""
    input_text = f"""
    A transaction was detected as an anomaly with these details:
    {json.dumps(row.to_dict(), indent=2)}
    
    Based on AI-driven insights, what are the best corrective actions that can be taken?
    """
    response = requests.post(HUGGING_FACE_API_URL, headers=HEADERS, json={"inputs": input_text})
    
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    else:
        return "⚠️ Corrective Actions unavailable due to API error."

if len(anomalies) > 0:
    print("🤖 Generating corrective actions for anomalies...")
    anomalies["Corrective_Action"] = anomalies.apply(get_corrective_action, axis=1)

# ============== 5. SAVE OUTPUT ==============
anomalies.to_csv("anomalies_with_analysis.csv", index=False)
print(f"🎉 Anomaly report saved as 'anomalies_with_analysis.csv'!")

# Display example anomaly
if len(anomalies) > 0:
    print("\n📢 Sample Anomaly Report:")
    print(anomalies.head(1).to_string())


In [35]:
pip install pyod


Collecting pyod
  Downloading pyod-2.0.4.tar.gz (169 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py): started
  Building wheel for pyod (setup.py): finished with status 'done'
  Created wheel for pyod: filename=pyod-2.0.4-py3-none-any.whl size=200540 sha256=94c00f5c18a4387dad9ce64a577d021939fa00aff44b5c4907dc7108f81ed47c
  Stored in directory: c:\users\sahan\appdata\local\pip\cache\wheels\c1\f3\c3\67f847c010f2e3bb0515531e8f6ad3735eb1518c0f08165447
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-2.0.4
Note: you may need to restart the kernel to use updated packages.


In [37]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from pyod.models.hbos import HBOS
import joblib

# ============== 1. LOAD TRAINING DATASET (Step 1 & Step 2 Output) ==============
TRAIN_FILE = "final_dataset.csv"  # This should contain historical + synthetic fraud data
df = pd.read_csv(TRAIN_FILE)

# ============== 2. PREPARE FEATURES ==============
X = df.drop(columns=["isFraud"], errors="ignore")  # Drop target variable if it exists
y = df["isFraud"] if "isFraud" in df.columns else None  # Target variable

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler for later use
joblib.dump(scaler, "scaler.pkl")

# ============== 3. AUTOENCODER TRAINING ==============
def build_autoencoder(input_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(32, activation="relu")(input_layer)
    encoded = Dense(16, activation="relu")(encoded)
    encoded = Dense(8, activation="relu")(encoded)
    
    decoded = Dense(16, activation="relu")(encoded)
    decoded = Dense(32, activation="relu")(decoded)
    decoded = Dense(input_dim, activation="linear")(decoded)
    
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer="adam", loss="mse")
    
    return autoencoder

autoencoder = build_autoencoder(X_scaled.shape[1])

# Train the autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=10, batch_size=64, shuffle=True, verbose=1)

# Save the trained autoencoder
autoencoder.save("autoencoder_model.h5")

# Get reconstruction errors
X_reconstructed = autoencoder.predict(X_scaled)
reconstruction_error = np.mean(np.abs(X_scaled - X_reconstructed), axis=1)

# ============== 4. TRAIN ISOLATION FOREST ==============
isolation_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
isolation_forest.fit(X_scaled)
iso_preds = isolation_forest.predict(X_scaled)

# Convert predictions to 0 (normal) & 1 (anomaly)
iso_preds = np.where(iso_preds == -1, 1, 0)

# ============== 5. TRAIN LOCAL OUTLIER FACTOR (LOF) ==============
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
lof_preds = lof.fit_predict(X_scaled)

# Convert predictions to 0 (normal) & 1 (anomaly)
lof_preds = np.where(lof_preds == -1, 1, 0)

# ============== 6. TRAIN HBOS (Histogram-Based Outlier Detection) ==============
hbos = HBOS(contamination=0.05)
hbos.fit(X_scaled)
hbos_preds = hbos.predict(X_scaled)

# Convert predictions to 0 (normal) & 1 (anomaly)
hbos_preds = np.where(hbos_preds == 1, 1, 0)

# Save models
joblib.dump(isolation_forest, "isolation_forest.pkl")
joblib.dump(lof, "lof.pkl")
joblib.dump(hbos, "hbos.pkl")

# ============== 7. COMBINE ALL MODELS INTO A HYBRID SYSTEM ==============
df["Autoencoder_Error"] = reconstruction_error
df["IsolationForest_Pred"] = iso_preds
df["LOF_Pred"] = lof_preds
df["HBOS_Pred"] = hbos_preds

# Majority Voting: If 2 or more models detect an anomaly → Flag as anomaly
df["Hybrid_Anomaly_Label"] = (df[["IsolationForest_Pred", "LOF_Pred", "HBOS_Pred"]].sum(axis=1) >= 2).astype(int)

# Save the hybrid model's predictions
df.to_csv("trained_hybrid_model.csv", index=False)

print("🎉 Hybrid anomaly detection model trained and saved successfully!")


Epoch 1/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 2ms/step - loss: 0.0795
Epoch 2/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1933s[0m 12ms/step - loss: 0.2416
Epoch 3/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 2ms/step - loss: 0.0839
Epoch 4/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 2ms/step - loss: 0.2268
Epoch 5/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 2ms/step - loss: 0.2084
Epoch 6/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 2ms/step - loss: 0.1705
Epoch 7/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 2ms/step - loss: 0.2518
Epoch 8/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 2ms/step - loss: 0.2196
Epoch 9/10
[1m160891/160891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 2ms/step - loss: 0.0648
Epoch 10/10
[1m1



[1m321781/321781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 887us/step
🎉 Hybrid anomaly detection model trained and saved successfully!


In [118]:
import joblib

# Select the final features actually used in training
selected_features = list(df.columns)  # After preprocessing, ensure these are used

# Save these selected features for inference (overwrite Step 3's `selected_features.pkl`)
joblib.dump(selected_features, "selected_features.pkl")
print(f"✅ Saved selected features for model training: {selected_features}")

✅ Saved selected features for model training: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']


In [1]:
import pandas as pd
import numpy as np
import os
import requests
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
import joblib

# ============== CONFIG ==============
TRAIN_FILE = "final_dataset.csv"  # Training dataset from Step 1 & 2
PROCESSED_TRAIN_OUTPUT = "processed_train_data_final.csv"  # Final processed training data
FEATURES_FILE = "selected_features.pkl"  # Save selected features for consistency
SCALER_FILE = "scaler.pkl"  # Save scaler for real-time use
HUGGING_FACE_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1"
HEADERS = {"Authorization": "Bearer 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"}  # Replace with your Hugging Face API key

# ============== 1. LOAD TRAINING DATASET ==============
def load_csv(file_path):
    """Loads a CSV file dynamically, waits for user upload if missing."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"🚨 No file found! Please upload '{file_path}' first.")
    return pd.read_csv(file_path, low_memory=False)

print("🔄 Loading training dataset...")
try:
    train_df = load_csv(TRAIN_FILE)  # Training dataset
    print(f"✅ Training dataset loaded with shape: {train_df.shape}")
except FileNotFoundError as e:
    print(e)
    exit()

# ============== 2. HANDLE MISSING VALUES ==============
def handle_missing_values(df):
    """Handles missing values: median for numerical, most-frequent for categorical."""
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = df.select_dtypes(include=["object"]).columns
    
    if len(num_cols) > 0:
        num_imputer = SimpleImputer(strategy="median")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])
    
    if len(cat_cols) > 0:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    
    return df

train_df = handle_missing_values(train_df)

# ============== 3. ENCODE CATEGORICAL FEATURES ==============
def encode_categorical(df):
    """Encodes categorical variables using Label Encoding."""
    cat_cols = df.select_dtypes(include=["object"]).columns
    for col in cat_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    return df

train_df = encode_categorical(train_df)

# ============== 4. FEATURE SELECTION ==============
def select_features(df):
    """Automatically selects best features for anomaly detection."""
    print("📊 Selecting important features...")

    # Remove low variance features
    selector = VarianceThreshold(threshold=0.01)
    df_selected = pd.DataFrame(selector.fit_transform(df), columns=df.columns[selector.get_support()])

    # Mutual Information for Feature Importance
    if "isFraud" in df.columns:
        X = df.drop(columns=["isFraud"])
        y = df["isFraud"]
        mi_scores = mutual_info_classif(X, y, discrete_features="auto")
        feature_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
        selected_features = feature_scores.index[:min(10, len(feature_scores))]  # Handle cases with <10 features
        df_selected = df[selected_features]
    else:
        df_selected = df  # If no labels, keep original

    print(f"✅ Selected {df_selected.shape[1]} features.")
    
    # Save selected features for real-time processing consistency
    joblib.dump(df_selected.columns.tolist(), FEATURES_FILE)
    
    return df_selected

train_df = select_features(train_df)

# ============== 5. GEN AI FEATURE ANALYSIS (Hugging Face API) ==============
print("🤖 Using Hugging Face API to analyze feature importance...")

def analyze_features_with_genai(feature_names):
    """Calls Hugging Face API to determine the best features for anomaly detection."""
    prompt = f"""
    The following are column names from a financial transactions dataset:
    {', '.join(feature_names)}
    
    Based on your expertise, suggest the top 5 features that are most relevant for anomaly detection in financial fraud cases.
    """

    response = requests.post(HUGGING_FACE_API_URL, headers=HEADERS, json={"inputs": prompt})
    
    if response.status_code == 200:
        output_text = response.json()[0]["generated_text"]
        
        # Extract features from Gen AI response
        suggested_features = [feature for feature in feature_names if feature in output_text]

        print(f"🤖 Gen AI suggests using these features: {suggested_features}")
        return suggested_features[:min(5, len(suggested_features))]  # Handle cases with <5 features
    else:
        print(f"⚠️ Error: {response.status_code} - {response.text}")
        return feature_names[:min(5, len(feature_names))]  # Default to available features

genai_selected_features = analyze_features_with_genai(train_df.columns)

# ✅ **Fix: Handle missing columns dynamically**
available_features = [f for f in genai_selected_features if f in train_df.columns]

if not available_features:
    print("⚠️ No Gen AI suggested features found in dataset. Using first available features instead.")
    available_features = train_df.columns[:min(5, len(train_df.columns))]

train_df = train_df[available_features]

# ============== 6. FEATURE SCALING ==============
scaler = StandardScaler()
train_df = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)

# Save the scaler for real-time processing
joblib.dump(scaler, SCALER_FILE)

# ============== 7. SAVE FINAL DATASET ==============
train_df.to_csv(PROCESSED_TRAIN_OUTPUT, index=False)
print(f"🎉 Processed training dataset saved as {PROCESSED_TRAIN_OUTPUT}!")


🔄 Loading training dataset...
✅ Training dataset loaded with shape: (10296973, 8)
📊 Selecting important features...
✅ Selected 7 features.
🤖 Using Hugging Face API to analyze feature importance...
🤖 Gen AI suggests using these features: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1', 'card2', 'sic']
🎉 Processed training dataset saved as processed_train_data_final.csv!


In [116]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import joblib
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from pyod.models.hbos import HBOS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ============== CONFIG ==============
PROCESSED_TRAIN_FILE = "processed_train_data_final.csv"
MODEL_SAVE_PATH = "hybrid_anomaly_model.pkl"
AUTOENCODER_SAVE_PATH = "autoencoder_model.h5"
SCALER_PATH = "scaler.pkl"

# ============== 1. LOAD PROCESSED DATA ==============
def load_data(file_path):
    """Loads processed training dataset."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"🚨 No file found! Please run Step 3 first: '{file_path}'")
    return pd.read_csv(file_path)

print("🔄 Loading processed dataset...")
df = load_data(PROCESSED_TRAIN_FILE)
print(f"✅ Loaded dataset with shape: {df.shape}")

# Ensure no missing values remain
df = df.fillna(0)


🔄 Loading processed dataset...
✅ Loaded dataset with shape: (10296973, 5)


In [23]:
# ============== 2. FEATURE SCALING ==============
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Save the scaler for real-time detection
joblib.dump(scaler, SCALER_PATH)
print("✅ Feature Scaling Complete! Scaler Saved.")


✅ Feature Scaling Complete! Scaler Saved.


In [25]:
# ============== 3. TRAIN-TEST SPLIT ==============
X_train, X_test = train_test_split(df_scaled, test_size=0.2, random_state=42)
print(f"✅ Data split complete! Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")
# ============== 4. AUTOENCODER MODEL ==============
def build_autoencoder(input_dim):
    """Builds and compiles an autoencoder model."""
    encoder = keras.Sequential([
        keras.layers.Dense(64, activation="relu", input_shape=(input_dim,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(16, activation="relu")
    ])

    decoder = keras.Sequential([
        keras.layers.Dense(32, activation="relu", input_shape=(16,)),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(input_dim, activation="sigmoid")
    ])

    autoencoder = keras.Sequential([encoder, decoder])
    autoencoder.compile(optimizer="adam", loss="mse")
    return autoencoder, encoder

print("🔧 Training Autoencoder...")
autoencoder, encoder = build_autoencoder(X_train.shape[1])

# Train Autoencoder
autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, validation_data=(X_test, X_test), verbose=1)

# Save Model
autoencoder.save(AUTOENCODER_SAVE_PATH)
print("✅ Autoencoder training complete!")

# Compute reconstruction errors
train_errors = np.mean(np.abs(autoencoder.predict(X_train) - X_train), axis=1)
threshold = np.percentile(train_errors, 95)  # 95th percentile as anomaly threshold
print(f"📊 Autoencoder Threshold: {threshold}")

# ============== 5. TRAIN OTHER MODELS ==============

# Optimize Isolation Forest & LOF using subsampling for speed
subsample_size = min(50000, len(X_train))  # Use max 50K samples for training (speeds up process)
X_train_sampled = X_train[np.random.choice(len(X_train), subsample_size, replace=False)]

print("🔧 Training Isolation Forest...")
iso_forest = IsolationForest(contamination=0.05, random_state=42, n_jobs=-1)
iso_forest.fit(X_train_sampled)

print("🔧 Training Local Outlier Factor (LOF)...")
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05, n_jobs=-1)
lof.fit(X_train_sampled)

print("🔧 Training HBOS...")
hbos = HBOS(contamination=0.05)
hbos.fit(X_train_sampled)

print("✅ Model Training Complete!")
# ============== 6. HYBRID MODEL PREDICTION ==============
def predict_anomalies(X):
    """Runs data through all models and combines results."""
    autoencoder_preds = np.mean(np.abs(autoencoder.predict(X) - X), axis=1) > threshold
    iso_preds = iso_forest.predict(X) == -1  # -1 means anomaly
    lof_preds = lof.fit_predict(X) == -1  # -1 means anomaly
    hbos_preds = hbos.predict(X) == 1  # 1 means anomaly in HBOS

    # Majority Voting
    hybrid_preds = (autoencoder_preds.astype(int) + iso_preds.astype(int) +
                    lof_preds.astype(int) + hbos_preds.astype(int)) >= 2  # Majority vote
    return hybrid_preds
# ============== 7. EVALUATION METRICS ==============
def evaluate_model(y_true, y_pred):
    """Calculates evaluation metrics."""
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    print(f"📊 Model Performance: Accuracy={acc:.2f}, Precision={precision:.2f}, Recall={recall:.2f}, F1={f1:.2f}")

# Assume top 5% highest reconstruction errors are anomalies
y_test_true = (np.mean(np.abs(autoencoder.predict(X_test) - X_test), axis=1) > threshold).astype(int)
y_test_pred = predict_anomalies(X_test)

evaluate_model(y_test_true, y_test_pred)
# ============== 8. SAVE MODELS ==============
joblib.dump({
    "iso_forest": iso_forest,
    "lof": lof,
    "hbos": hbos,
    "threshold": threshold
}, MODEL_SAVE_PATH)

print(f"✅ Hybrid Model saved to {MODEL_SAVE_PATH}")



✅ Data split complete! Train Shape: (8237578, 5), Test Shape: (2059395, 5)
🔧 Training Autoencoder...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m476s[0m 2ms/step - loss: 0.7526 - val_loss: 0.6339
Epoch 2/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m471s[0m 2ms/step - loss: 0.8282 - val_loss: 0.6339
Epoch 3/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m500s[0m 2ms/step - loss: 0.9895 - val_loss: 0.6339
Epoch 4/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m459s[0m 2ms/step - loss: 0.8695 - val_loss: 0.6353
Epoch 5/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 2ms/step - loss: 0.6653 - val_loss: 0.6339
Epoch 6/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 2ms/step - loss: 0.9454 - val_loss: 0.6339
Epoch 7/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 2ms/step - loss: 0.8789 - val_loss: 0.6339
Epoch 8/10
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 2ms/step - loss: 0.



✅ Autoencoder training complete!
[1m257425/257425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 793us/step
📊 Autoencoder Threshold: 0.7459511914998989
🔧 Training Isolation Forest...
🔧 Training Local Outlier Factor (LOF)...
🔧 Training HBOS...
✅ Model Training Complete!
[1m64357/64357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 722us/step
[1m64357/64357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 741us/step
📊 Model Performance: Accuracy=0.99, Precision=0.92, Recall=0.81, F1=0.86
✅ Hybrid Model saved to hybrid_anomaly_model.pkl


In [120]:
import joblib

# Select the final features actually used in training
selected_features = list(df.columns)  # After preprocessing, ensure these are used

# Save these selected features for inference (overwrite Step 3's `selected_features.pkl`)
joblib.dump(selected_features, "selected_features.pkl")
print(f"✅ Saved selected features for model training: {selected_features}")

✅ Saved selected features for model training: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']


In [52]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [122]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from pyod.models.hbos import HBOS
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.losses import MeanSquaredError
import requests

# ============== 1. LOAD TRAINED MODELS & SCALER ==============
print("🔄 Loading trained models...")
models = joblib.load("hybrid_anomaly_model.pkl")  # Load trained models
scaler = joblib.load("scaler.pkl")  # Load trained scaler
trained_features = joblib.load("selected_features.pkl")  # Load feature names used in training

# Explicitly define the MSE loss function
mse = MeanSquaredError()
autoencoder = keras.models.load_model("autoencoder_model.h5", custom_objects={"mse": mse})

iso_forest = models["iso_forest"]
lof = models["lof"]
hbos = models["hbos"]
threshold = models["threshold"]

# ============== 2. LOAD USER-UPLOADED DATA ==============
def load_user_data(file_path):
    """Loads user-uploaded dataset."""
    df_user = pd.read_csv(file_path)
    print(f"✅ User dataset loaded: {df_user.shape}")
    return df_user

df_user = load_user_data("uploaded_data.csv")

# ============== 3. ENSURE FEATURE MATCHING ==============
def match_features(df_user, trained_features):
    """
    Ensures user-uploaded dataset has **exactly** the same columns as the trained model.
    - Removes extra columns
    - Adds missing columns (filled with zero)
    - Ensures correct order
    """
    user_columns = df_user.columns.tolist()

    # Remove extra columns that are not in the trained model
    df_user = df_user[[col for col in trained_features if col in user_columns]]

    # Add missing columns with zero values
    for col in trained_features:
        if col not in df_user.columns:
            df_user[col] = 0  # Add missing column with zero values

    # Ensure correct column order
    df_user = df_user[trained_features]

    print(f"✅ Finalized Feature Set for Scaling: {df_user.columns.tolist()}")
    return df_user

# Apply feature matching
df_user_selected = match_features(df_user, trained_features)

# **FIX: Now the dataset has EXACTLY the same features as training**
df_user_scaled = scaler.transform(df_user_selected)

# ============== 4. PREDICT ANOMALIES USING HYBRID MODEL ==============
def predict_anomalies(X):
    """Runs data through all models and combines results."""
    autoencoder_preds = np.mean(np.abs(autoencoder.predict(X) - X), axis=1) > threshold
    iso_preds = iso_forest.predict(X) == -1  # -1 means anomaly
    lof_preds = lof.fit_predict(X) == -1  # -1 means anomaly
    hbos_preds = hbos.predict(X) == 1  # 1 means anomaly in HBOS

    # Majority Voting
    hybrid_preds = (autoencoder_preds.astype(int) + iso_preds.astype(int) +
                    lof_preds.astype(int) + hbos_preds.astype(int)) >= 2  # Majority vote
    return hybrid_preds

df_user["Anomaly"] = predict_anomalies(df_user_scaled)

# ============== 5. ROOT CAUSE ANALYSIS USING GEN AI (Hugging Face API) ==============
def analyze_root_cause(row):
    """Uses Hugging Face API for root cause analysis."""
    payload = {"inputs": f"Analyze anomaly in this data: {row.to_dict()}"}
    response = requests.post("https://api-inference.huggingface.co/models/mistralai/Mistral-7B", json=payload)
    
    if response.status_code == 200:
        return response.json()[0]["generated_text"]
    return "No insight available"

# Apply Gen AI for insights
df_user["Root_Cause_Analysis"] = df_user.apply(lambda row: analyze_root_cause(row) if row["Anomaly"] else "Normal", axis=1)

# ============== 6. SAVE OUTPUT & DISPLAY RESULTS ==============
df_user.to_csv("anomaly_results.csv", index=False)
print(f"✅ Anomaly detection completed! Results saved to 'anomaly_results.csv'.")
df_user[["Anomaly", "Root_Cause_Analysis"]].head(10)  # Show sample results


🔄 Loading trained models...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user[col] = 0  # Add missing column with zero values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user[col] = 0  # Add missing column with zero values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user[col] = 0  # Add missing column with zero values
A value is trying to be set on a copy

✅ User dataset loaded: (100, 9)
✅ Finalized Feature Set for Scaling: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
✅ Anomaly detection completed! Results saved to 'anomaly_results.csv'.


Unnamed: 0,Anomaly,Root_Cause_Analysis
0,False,Normal
1,False,Normal
2,False,Normal
3,False,Normal
4,False,Normal
5,False,Normal
6,False,Normal
7,False,Normal
8,False,Normal
9,False,Normal


In [132]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from pyod.models.hbos import HBOS
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.losses import MeanSquaredError
import requests
import os  # Import the os module

# ============== 1. LOAD TRAINED MODELS & SCALER ==============
print("🔄 Loading trained models...")
models = joblib.load("hybrid_anomaly_model.pkl")  # Load trained models
scaler = joblib.load("scaler.pkl")  # Load trained scaler

trained_features = ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']  # correct trained features.

# Explicitly define the MSE loss function
mse = MeanSquaredError()

# Load the autoencoder with the custom loss function
autoencoder = keras.models.load_model("autoencoder_model.h5", custom_objects={"mse": mse})
iso_forest = models["iso_forest"]
lof = models["lof"]
hbos = models["hbos"]
threshold = models["threshold"]

# ============== 2. LOAD USER-UPLOADED DATA ==============
def load_user_data(file_path):
    """Loads user-uploaded dataset."""
    try:
        df_user = pd.read_csv(file_path)
        print(f"✅ User dataset loaded: {df_user.shape}")
        return df_user
    except FileNotFoundError:
        print(f"❌ Error: File not found at {file_path}")
        return None  # Return None if file not found
    except Exception as e:
        print(f"❌ Error loading CSV: {e}")
        return None

# ============== 3. FEATURE SELECTION & MAPPING ==============
def ensure_column_compatibility(df_user, trained_features):
    """Ensures the user-uploaded dataset matches the trained model's feature names."""
    user_columns = df_user.columns.tolist()

    # Semantic Mapping Rules
    rename_mapping = {}
    for user_col in user_columns:
        if user_col.lower() in ["account", "customer id", "custid", "account number"]:
            rename_mapping[user_col] = "card1"
        elif user_col.lower() in ["device", "device type", "access unit"]:
            rename_mapping[user_col] = "DeviceType"
        elif user_col.lower() in ["transaction amount", "amount", "gl balance"]:
            rename_mapping[user_col] = "TransactionAmt"
        elif user_col.lower() in ["value", "ihub balance"]:
            rename_mapping[user_col] = "value"
        elif user_col.lower() in ["form"]:
            rename_mapping[user_col] = "form"

    df_user = df_user.rename(columns=rename_mapping)
    user_columns = df_user.columns.tolist()

    matching_cols = [col for col in trained_features if col in user_columns]

    if not matching_cols:
        print("⚠️ Renaming failed. Cannot match all required columns.")
        print("Required columns:", trained_features)
        print("Renamed columns:", user_columns)
        return None

    df_user = df_user[matching_cols]

    for col in trained_features:
        if col not in df_user.columns:
            df_user[col] = 0  # Neutral value for missing columns

    df_user = df_user[trained_features]
    print(f"✅ Finalized Feature Set: {df_user.columns.tolist()}")
    return df_user

# ============== 4. PREPROCESS NEW DATA ==============
def preprocess_new_data(file_path, trained_features, scaler):
    """Preprocesses new data for anomaly detection."""
    df_user = load_user_data(file_path)
    if df_user is None:
        return None

    df_user_numerical = df_user.select_dtypes(include=[np.number])
    df_user_compatible = ensure_column_compatibility(df_user_numerical, trained_features)

    if df_user_compatible is None:
        return None

    try:
        df_user_scaled = scaler.transform(df_user_compatible)
        print("✅ Features scaled successfully.")
        return df_user_scaled
    except ValueError as e:
        print(f"❌ Error during scaling: {e}")
        print("Please ensure your uploaded data's columns match the trained model's required columns.")
        return None

# ============== 5. PREDICT ANOMALIES USING HYBRID MODEL ==============
def predict_anomalies(X):
    """Runs data through all models and combines results."""
    autoencoder_preds = np.mean(np.abs(autoencoder.predict(X) - X), axis=1) > threshold
    iso_preds = iso_forest.predict(X) == -1  # -1 means anomaly
    lof_preds = lof.fit_predict(X) == -1  # -1 means anomaly
    hbos_preds = hbos.predict(X) == 1  # 1 means anomaly in HBOS

    # Majority Voting
    hybrid_preds = (autoencoder_preds.astype(int) + iso_preds.astype(int) +
                    lof_preds.astype(int) + hbos_preds.astype(int)) >= 2  # Majority vote
    return hybrid_preds

# ============== 6. ROOT CAUSE ANALYSIS USING GEN AI (Hugging Face API) ==============
def analyze_root_cause(row):
    """Uses Hugging Face API for root cause analysis."""
    # Get Hugging Face API token from environment variable
    api_token = "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"

    if not api_token:
        print("❌ Error: Hugging Face API token not found in environment variables.")
        return "No insight available"

    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {"inputs": f"Analyze anomaly in this data: {row.to_dict()}"}
    try:
        response = requests.post("https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1", json=payload, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()[0]["generated_text"]
    except requests.exceptions.RequestException as e:
        print(f"❌ Error during Hugging Face API call: {e}")
        return "No insight available"

# ============== 7. MAIN EXECUTION ==============
file_path = "uploaded_data.csv"
scaled_data = preprocess_new_data(file_path, trained_features, scaler)

if scaled_data is not None:
    df_user = load_user_data(file_path)
    df_user["Anomaly"] = predict_anomalies(scaled_data)
    df_user["Root_Cause_Analysis"] = df_user.apply(lambda row: analyze_root_cause(row) if row["Anomaly"] else "Normal", axis=1)
    df_user.to_csv("anomaly_results.csv", index=False)
    print(f"✅ Anomaly detection completed! Results saved to 'anomaly_results.csv'.")
    print(df_user[["Anomaly", "Root_Cause_Analysis"]].head(10))
else:
    print("❌ Anomaly detection failed due to preprocessing errors.")

🔄 Loading trained models...




✅ User dataset loaded: (100, 9)
✅ Finalized Feature Set: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']
✅ Features scaled successfully.
✅ User dataset loaded: (100, 9)
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step 
❌ Error during Hugging Face API call: 402 Client Error: Payment Required for url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
❌ Error during Hugging Face API call: 402 Client Error: Payment Required for url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
❌ Error during Hugging Face API call: 402 Client Error: Payment Required for url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
❌ Error during Hugging Face API call: 402 Client Error: Payment Required for url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
❌ Error during Hugging Face API call: 402 Client Error: Payment Required for url: https://api-inference

In [148]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from pyod.models.hbos import HBOS
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.losses import MeanSquaredError
import requests
import os  # Import the os module

# ============== 1. LOAD TRAINED MODELS & SCALER ==============
print("🔄 Loading trained models...")
models = joblib.load("hybrid_anomaly_model.pkl")  # Load trained models
scaler = joblib.load("scaler.pkl")  # Load trained scaler

trained_features = ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']  # correct trained features.

# Explicitly define the MSE loss function
mse = MeanSquaredError()

# Load the autoencoder with the custom loss function
autoencoder = keras.models.load_model("autoencoder_model.h5", custom_objects={"mse": mse})
iso_forest = models["iso_forest"]
lof = models["lof"]
hbos = models["hbos"]
threshold = models["threshold"]

# ============== 2. LOAD USER-UPLOADED DATA ==============
def load_user_data(file_path):
    """Loads user-uploaded dataset."""
    try:
        df_user = pd.read_csv(file_path)
        print(f"✅ User dataset loaded: {df_user.shape}")
        return df_user
    except FileNotFoundError:
        print(f"❌ Error: File not found at {file_path}")
        return None  # Return None if file not found
    except Exception as e:
        print(f"❌ Error loading CSV: {e}")
        return None

# ============== 3. FEATURE SELECTION & MAPPING ==============
def ensure_column_compatibility(df_user, trained_features):
    """Ensures the user-uploaded dataset matches the trained model's feature names."""
    user_columns = df_user.columns.tolist()

    # Semantic Mapping Rules
    rename_mapping = {}
    for user_col in user_columns:
        if user_col.lower() in ["account", "customer id", "custid", "account number"]:
            rename_mapping[user_col] = "card1"
        elif user_col.lower() in ["device", "device type", "access unit"]:
            rename_mapping[user_col] = "DeviceType"
        elif user_col.lower() in ["transaction amount", "amount", "gl balance"]:
            rename_mapping[user_col] = "TransactionAmt"
        elif user_col.lower() in ["value", "ihub balance"]:
            rename_mapping[user_col] = "value"
        elif user_col.lower() in ["form"]:
            rename_mapping[user_col] = "form"

    df_user = df_user.rename(columns=rename_mapping)
    user_columns = df_user.columns.tolist()

    matching_cols = [col for col in trained_features if col in user_columns]

    if not matching_cols:
        print("⚠️ Renaming failed. Cannot match all required columns.")
        print("Required columns:", trained_features)
        print("Renamed columns:", user_columns)
        return None

    df_user = df_user[matching_cols]

    for col in trained_features:
        if col not in df_user.columns:
            df_user[col] = 0  # Neutral value for missing columns

    df_user = df_user[trained_features]
    print(f"✅ Finalized Feature Set: {df_user.columns.tolist()}")
    return df_user

# ============== 4. PREPROCESS NEW DATA ==============
def preprocess_new_data(file_path, trained_features, scaler):
    """Preprocesses new data for anomaly detection."""
    df_user = load_user_data(file_path)
    if df_user is None:
        return None

    df_user_numerical = df_user.select_dtypes(include=[np.number])
    df_user_compatible = ensure_column_compatibility(df_user_numerical, trained_features)

    if df_user_compatible is None:
        return None

    try:
        df_user_scaled = scaler.transform(df_user_compatible)
        print("✅ Features scaled successfully.")
        return df_user_scaled
    except ValueError as e:
        print(f"❌ Error during scaling: {e}")
        print("Please ensure your uploaded data's columns match the trained model's required columns.")
        return None

# ============== 5. PREDICT ANOMALIES USING HYBRID MODEL ==============
def predict_anomalies(X):
    """Runs data through all models and combines results."""
    autoencoder_preds = np.mean(np.abs(autoencoder.predict(X) - X), axis=1) > threshold
    iso_preds = iso_forest.predict(X) == -1  # -1 means anomaly
    lof_preds = lof.fit_predict(X) == -1  # -1 means anomaly
    hbos_preds = hbos.predict(X) == 1  # 1 means anomaly in HBOS

    # Majority Voting
    hybrid_preds = (autoencoder_preds.astype(int) + iso_preds.astype(int) +
                    lof_preds.astype(int) + hbos_preds.astype(int)) >= 2  # Majority vote
    return hybrid_preds

# ============== 6. ROOT CAUSE ANALYSIS USING GEN AI (Hugging Face API) ==============
def analyze_root_cause(row):
    """Uses Hugging Face API for root cause analysis."""
    api_token = os.getenv("HUGGING_FACE_API_TOKEN", "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")

    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {"inputs": f"Analyze anomaly in this data: {row.to_dict()}"}
    
    try:
        response = requests.post("https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1", json=payload, headers=headers)
        response.raise_for_status()
        return response.json()[0]["generated_text"]
    except requests.exceptions.RequestException as e:
        print(f"❌ Error during API call: {e}")
        return "No insight available"

# ============== 7. AUTOMATED CORRECTIVE ACTIONS USING AGENTIC AI ==============
def suggest_corrective_action(row):
    """Uses Hugging Face API to generate corrective actions for anomalies."""
    api_token = os.getenv("HUGGING_FACE_API_TOKEN", "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")

    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {"inputs": f"Suggest corrective action for this anomaly: {row.to_dict()}"}
    
    try:
        response = requests.post("https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1", json=payload, headers=headers)
        response.raise_for_status()
        return response.json()[0]["generated_text"]
    except requests.exceptions.RequestException as e:
        print(f"❌ Error during API call: {e}")
        return "No corrective action available"

# ============== 8. MAIN EXECUTION ==============
file_path = "uploaded_data.csv"
scaled_data = preprocess_new_data(file_path, trained_features, scaler)

if scaled_data is not None:
    df_user = load_user_data(file_path)
    df_user["Anomaly"] = predict_anomalies(scaled_data)
    df_user["Root_Cause_Analysis"] = df_user.apply(lambda row: analyze_root_cause(row) if row["Anomaly"] else "Normal", axis=1)
    df_user["Corrective_Action"] = df_user.apply(lambda row: suggest_corrective_action(row) if row["Anomaly"] else "None", axis=1)
    
    df_user.to_csv("anomaly_results.csv", index=False)
    print(f"✅ Anomaly detection & corrective actions completed! Results saved to 'anomaly_results.csv'.")
    print(df_user[["Anomaly", "Root_Cause_Analysis", "Corrective_Action"]].head(10))
else:
    print("❌ Process failed due to preprocessing errors.")


🔄 Loading trained models...




✅ User dataset loaded: (10, 9)
✅ Finalized Feature Set: ['form', 'DeviceType', 'TransactionAmt', 'value', 'card1']
✅ Features scaled successfully.
✅ User dataset loaded: (10, 9)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step




✅ Anomaly detection & corrective actions completed! Results saved to 'anomaly_results.csv'.
   Anomaly                                Root_Cause_Analysis  \
0     True  Analyze anomaly in this data: {'As of Date': '...   
1     True  Analyze anomaly in this data: {'As of Date': '...   
2     True  Analyze anomaly in this data: {'As of Date': '...   
3     True  Analyze anomaly in this data: {'As of Date': '...   
4     True  Analyze anomaly in this data: {'As of Date': '...   
5     True  Analyze anomaly in this data: {'As of Date': '...   
6     True  Analyze anomaly in this data: {'As of Date': '...   
7     True  Analyze anomaly in this data: {'As of Date': '...   
8     True  Analyze anomaly in this data: {'As of Date': '...   
9     True  Analyze anomaly in this data: {'As of Date': '...   

                                   Corrective_Action  
0  Suggest corrective action for this anomaly: {'...  
1  Suggest corrective action for this anomaly: {'...  
2  Suggest corrective actio