In [48]:
"""Retrainable_Anomaly_Detection.ipynb

This script performs anomaly detection using Isolation Forest and Local Outlier Factor (LOF) 
on financial transactions. It preprocesses data, trains models dynamically, and allows retraining 
when feedback is received. This sets up the foundation for integrating an LLM-based feedback loop.
"""

'Retrainable_Anomaly_Detection.ipynb\n\nThis script performs anomaly detection using Isolation Forest and Local Outlier Factor (LOF) \non financial transactions. It preprocesses data, trains models dynamically, and allows retraining \nwhen feedback is received. This sets up the foundation for integrating an LLM-based feedback loop.\n'

In [5]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
import ollama  # Mistral integration

In [6]:
# ========== STEP 1: Load & Preprocess Data ==========
def load_data(file_path):
    """Read a reconciliation CSV and normalise its column names and date column.

    Header names are stripped of surrounding whitespace, the known
    space-separated headers are renamed to their underscore form, and
    ``As_of_Date`` is parsed into datetimes.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame with cleaned column names and a datetime ``As_of_Date``.
    """
    spaced_headers = (
        "As of Date",
        "GL Balance",
        "Ihub Balance",
        "Balance Difference",
        "Match Status",
        "Primary Account",
        "Secondary Account",
    )
    # Every known header maps to the same text with spaces turned into underscores.
    header_map = {header: header.replace(" ", "_") for header in spaced_headers}

    frame = pd.read_csv(file_path)
    frame.columns = frame.columns.str.strip()

    return frame.rename(columns=header_map).assign(
        As_of_Date=lambda df: pd.to_datetime(df["As_of_Date"])
    )

In [7]:
# ========== STEP 2: Train Anomaly Detection Models ==========
def train_anomaly_models(historical_data, model_dir="models/", contamination=0.05,
                         n_neighbors=5, random_state=42):
    """Fit the scaler, Isolation Forest and (optionally) LOF, and persist them.

    All models are trained on the single feature ``Balance_Difference_Scaled``,
    which is written back onto ``historical_data`` as a new column (the caller's
    frame is modified in place).

    Args:
        historical_data: DataFrame with a ``Balance_Difference`` column and the
            entity columns ``Account``, ``AU``, ``Currency``, ``Primary_Account``.
        model_dir: Directory that receives ``isolation_forest.pkl``,
            ``lof_model.pkl`` and ``scaler.pkl``; created if missing.
        contamination: Expected outlier share passed to ``IsolationForest``.
        n_neighbors: Neighbourhood size passed to ``LocalOutlierFactor``.
        random_state: Seed passed to ``IsolationForest``.

    Returns:
        The ``historical_data`` frame grouped by the entity columns, for later
        per-entity look-ups.

    Raises:
        ValueError: If ``historical_data`` has no rows.
    """
    # Fail early with a readable message instead of an error from deep inside the scaler.
    if len(historical_data) == 0:
        raise ValueError("historical_data is empty; cannot train anomaly models.")

    os.makedirs(model_dir, exist_ok=True)

    # Scale the single feature; flatten so exactly one column is assigned.
    scaler = StandardScaler()
    scaled = scaler.fit_transform(historical_data[["Balance_Difference"]])
    historical_data["Balance_Difference_Scaled"] = np.asarray(scaled).ravel()
    features = historical_data[["Balance_Difference_Scaled"]]

    # Isolation Forest is trained on ALL historical rows.
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    iso_forest.fit(features)

    # LOF is only fitted when there are more rows than neighbours requested;
    # novelty=True is what allows calling predict() on unseen rows later.
    lof_model = None
    if len(historical_data) > n_neighbors:
        lof_model = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True)
        lof_model.fit(features)

    # lof_model may be None; it is still dumped so the loader always finds the file.
    joblib.dump(iso_forest, os.path.join(model_dir, "isolation_forest.pkl"))
    joblib.dump(lof_model, os.path.join(model_dir, "lof_model.pkl"))
    joblib.dump(scaler, os.path.join(model_dir, "scaler.pkl"))

    print("Models saved successfully.")

    # Group after scaling so the grouped view is built from the final frame.
    entity_cols = ["Account", "AU", "Currency", "Primary_Account"]
    return historical_data.groupby(entity_cols)

In [13]:
# ========== STEP 4: Query Mistral for Anomaly Explanation ==========
def query_mistral(historical_data, real_time_entity, classification_result):
    """Ask the local Mistral model (via ollama) to explain a flagged record.

    Args:
        historical_data: DataFrame of historical rows for the entity; rendered
            in full into the prompt.
        real_time_entity: The record being explained, either a Series (one row,
            field names in its index) or a DataFrame.
        classification_result: The classification outcome to show the model.

    Returns:
        The text content of the model's reply.
    """
    # Convert historical data to a readable format
    historical_text = historical_data.to_string(index=False)

    # A Series keeps its field names in the index, so dropping the index would
    # leave the model with a column of unlabeled values. Keep the labels there.
    if isinstance(real_time_entity, pd.Series):
        entity_text = real_time_entity.to_string()
    else:
        entity_text = real_time_entity.to_string(index=False)

    # Construct LLM prompt
    prompt = f"""
    You are an AI analyzing financial anomalies.

    Real-time entity data:
    {entity_text}

    Classification result:
    {classification_result}

    Historical records for this entity:
    {historical_text}

    Based on the historical trends and patterns, give a concise comment as to why this real-time entity is an anomaly.
    """

    # Query Mistral
    response = ollama.chat(model='mistral', messages=[{"role": "user", "content": prompt}])

    return response['message']['content']

In [14]:
# ========== STEP 3: Load Models & Predict Anomalies ==========
def predict_anomalies(real_time_file, historical_grouped, model_dir="models/",
                      output_path="results.csv"):
    """Score real-time rows with the saved models and explain the anomalies.

    Rows whose entity (Account, AU, Currency, Primary_Account) has no group in
    ``historical_grouped`` are skipped. A remaining row is marked as an anomaly
    when either Isolation Forest or LOF (if one was saved) predicts ``-1``;
    each anomaly gets a comment from ``query_mistral``.

    Args:
        real_time_file: CSV path passed to ``load_data``.
        historical_grouped: Historical data grouped by the four entity columns.
        model_dir: Directory holding ``isolation_forest.pkl``, ``lof_model.pkl``
            and ``scaler.pkl``. These are unpickled by joblib, so only point
            this at a trusted location.
        output_path: Where the results CSV is written.

    Returns:
        DataFrame of the scored rows with added ``Anomaly`` ("Yes"/"No") and
        ``Comment`` columns; empty (but with all columns) if nothing matched.
    """
    real_time_data = load_data(real_time_file)

    # Load models
    iso_forest = joblib.load(os.path.join(model_dir, "isolation_forest.pkl"))
    lof_model = joblib.load(os.path.join(model_dir, "lof_model.pkl"))
    scaler = joblib.load(os.path.join(model_dir, "scaler.pkl"))

    # Scale the feature; skip the scaler entirely when there are no rows to score.
    if real_time_data.empty:
        real_time_data["Balance_Difference_Scaled"] = pd.Series(dtype=float)
    else:
        scaled = scaler.transform(real_time_data[["Balance_Difference"]])
        real_time_data["Balance_Difference_Scaled"] = np.asarray(scaled).ravel()

    # Keep only rows whose entity has historical records.
    entity_cols = ["Account", "AU", "Currency", "Primary_Account"]
    known_entities = historical_grouped.groups
    entity_keys = list(zip(*(real_time_data[col] for col in entity_cols)))
    has_history = np.array([key in known_entities for key in entity_keys], dtype=bool)
    matched = real_time_data[has_history]
    matched_keys = [key for key, keep in zip(entity_keys, has_history) if keep]

    results = []
    if not matched.empty:
        # One batched call per model, on the named column, instead of one call
        # per row on a bare array.
        features = matched[["Balance_Difference_Scaled"]]
        iso_preds = iso_forest.predict(features)  # -1 (Anomaly) or 1 (Normal)
        if lof_model is not None:
            lof_preds = lof_model.predict(features)
        else:
            # No LOF model was saved: treat every row as normal for LOF.
            lof_preds = np.ones(len(matched), dtype=int)

        scored_rows = zip(matched.iterrows(), matched_keys, iso_preds, lof_preds)
        for (_, row), entity_key, iso_pred, lof_pred in scored_rows:
            # Final decision: either model flagging the row is enough.
            is_anomaly = iso_pred == -1 or lof_pred == -1
            final_anomaly = "Yes" if is_anomaly else "No"

            # The entity history is only needed when an explanation is requested.
            comment = ""
            if is_anomaly:
                entity_historical_data = historical_grouped.get_group(entity_key)
                comment = query_mistral(entity_historical_data, row, final_anomaly)

            result = row.to_dict()
            result["Anomaly"] = final_anomaly
            result["Comment"] = comment
            results.append(result)

    # Explicit columns keep the header (and frame schema) even with zero results.
    result_columns = [*real_time_data.columns, "Anomaly", "Comment"]
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")

    return results_df

In [15]:
# ========== STEP 5: Run the Full Pipeline ==========
if __name__ == "__main__":
    # Input files, relative to the notebook's working directory.
    historical_file = "Dataset.csv"
    real_time_file = "real_time_dataset.csv"

    # Step 1: Load Historical Data
    historical_data = load_data(historical_file)

    # Step 2: Train & Save Models, and get grouped historical data
    historical_grouped = train_anomaly_models(historical_data)

    # Step 3: Load Real-Time Data & Predict Anomalies (with Mistral explanations)
    results_df = predict_anomalies(real_time_file, historical_grouped)

    # Display Results
    if not results_df.empty:
        print("\nReal-Time Data Anomaly Predictions:")
        print(results_df[["As_of_Date", "Account", "AU", "Currency", "Primary_Account", "Balance_Difference", "Anomaly", "Comment"]])
    else:
        # The results include normal rows as well, so an empty frame means no
        # real-time row could be scored at all, not that no anomaly was found.
        print("No real-time rows matched an entity in the historical data; nothing to report.")

Models saved successfully.




Results saved to results.csv

Real-Time Data Anomaly Predictions:
    As_of_Date  Account     AU Currency  Primary_Account  Balance_Difference  \
0   2023-09-09  1642845   7566      USD  ALL OTHER LOANS         1115.308922   
1   2023-09-09  1660793  60475      GBP  ALL OTHER LOANS        -1290.400819   
2   2023-09-09  1662496  48750      EUR  ALL OTHER LOANS         -709.655900   
3   2023-09-09  1666765  66894      EUR  ALL OTHER LOANS        -3429.767358   
4   2023-09-09  1669440   5683      GBP  ALL OTHER LOANS        19228.414370   
..         ...      ...    ...      ...              ...                 ...   
358 2023-09-09  1615118   9977      USD  ALL OTHER LOANS         -322.135157   
359 2023-09-09  1641777  57556      EUR  ALL OTHER LOANS        -5962.477324   
360 2023-09-09  1623838   9579      GBP  ALL OTHER LOANS        -5529.969951   
361 2023-09-09  1696688   8207      EUR  ALL OTHER LOANS       -21844.436090   
362 2023-09-09  1640243  47806      EUR  ALL OTHER LOA