# 📌 Machine Learning-Based Fraud Detection
This notebook uses machine learning (ML) to detect hidden fraud patterns, rather than relying on rule-based detection (e.g., flagging every transaction above $1,000).


## Step 1: Connecting to PostgreSQL

In [64]:
import os

import pandas as pd
from sqlalchemy import create_engine

def connect_db(db_url=None):
    """Create a SQLAlchemy engine for the fraud-detection database.

    The connection URL is taken from the caller or from the environment so
    that the database password is not stored in the notebook.

    Args:
        db_url: Optional SQLAlchemy connection URL. When omitted, the URL is
            read from the ``FRAUD_DB_URL`` environment variable, e.g.
            ``postgresql://user:password@localhost:5432/fraud_db``.

    Returns:
        The engine returned by ``create_engine``. The caller is responsible
        for calling ``dispose()`` on it when finished.

    Raises:
        RuntimeError: If no URL is passed and ``FRAUD_DB_URL`` is unset or
            empty.
    """
    url = db_url or os.environ.get("FRAUD_DB_URL")
    if not url:
        # Fail early with an actionable message instead of a driver error.
        raise RuntimeError(
            "Database URL not configured: set the FRAUD_DB_URL environment "
            "variable (e.g. postgresql://user:password@localhost:5432/fraud_db)."
        )
    return create_engine(url)

## Step 2: Load Data from PostgreSQL

In [66]:
def load_data():
    """Load one row per transaction, enriched with per-customer aggregates.

    Every returned row carries the transaction's ``customer_id`` and
    ``amount`` plus statistics computed over all transactions of the same
    customer:

    * ``transaction_count``      - number of transactions
    * ``avg_transaction_amount`` - mean ``amount``
    * ``unique_locations``       - number of distinct non-NULL ``location`` values
    * ``high_value_txn``         - number of transactions with ``amount > 1000``
    * ``fraud_label``            - max of ``is_fraud::int`` over the customer's rows

    Transactions whose ``customer_id`` is NULL are not returned, because the
    join on ``customer_id`` cannot match NULL keys.

    Returns:
        pandas.DataFrame with the columns described above.
    """
    # PostgreSQL does not allow DISTINCT inside an aggregate used as a window
    # function ("DISTINCT is not implemented for window functions"), so the
    # per-customer statistics are computed with GROUP BY in a CTE and joined
    # back onto the individual transactions.
    query = """
        SET search_path TO fraud_schema;  -- Ensure correct schema is used
        WITH customer_stats AS (
            SELECT customer_id,
                   COUNT(*) AS transaction_count,
                   AVG(amount) AS avg_transaction_amount,
                   COUNT(DISTINCT location) AS unique_locations,
                   COUNT(*) FILTER (WHERE amount > 1000) AS high_value_txn,
                   MAX(is_fraud::int) AS fraud_label
            FROM transactions
            GROUP BY customer_id
        )
        SELECT t.customer_id, t.amount,
               s.transaction_count,
               s.avg_transaction_amount,
               s.unique_locations,
               s.high_value_txn,
               s.fraud_label
        FROM transactions AS t
        JOIN customer_stats AS s USING (customer_id);
    """
    engine = connect_db()
    try:
        df = pd.read_sql(query, engine)
    finally:
        # Release pooled connections even when the query fails.
        engine.dispose()
    return df

## Step 3: Train Fraud Detection Model

In [74]:
def train_model(df):
    """Train a random-forest fraud classifier and print its hold-out accuracy.

    Args:
        df: DataFrame containing the feature columns ``amount``,
            ``transaction_count``, ``avg_transaction_amount``,
            ``unique_locations``, ``high_value_txn`` and the target column
            ``fraud_label``. If a ``customer_id`` column is present it is
            used to keep each customer's rows on one side of the split.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # NOTE(review): no import of train_test_split, RandomForestClassifier or
    # accuracy_score is visible in this notebook - confirm they are imported
    # in an earlier cell, otherwise this fails on a fresh kernel.
    features = ['amount', 'transaction_count', 'avg_transaction_amount', 'unique_locations', 'high_value_txn']
    X = df[features]
    y = df['fraud_label']

    if 'customer_id' in df.columns and df['customer_id'].nunique() >= 2:
        # Four of the five features and the label are per-customer
        # aggregates, so rows of one customer are near-duplicates. A
        # row-level random split would put the same customer in both sets
        # and inflate the reported accuracy; split on customers instead.
        customer_ids = df['customer_id'].drop_duplicates()
        _, test_ids = train_test_split(customer_ids, test_size=0.2, random_state=42)
        in_test = df['customer_id'].isin(test_ids)
        X_train, X_test = X[~in_test], X[in_test]
        y_train, y_test = y[~in_test], y[in_test]
    else:
        # No usable grouping key: fall back to a plain row-level split.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Model Accuracy: {accuracy:.2f}')
    return model


## Step 4: Predict Fraud for All Transactions and Store the Results in PostgreSQL

In [76]:
def predict_fraud(model):
    """Score all transactions and store one fraud flag per customer.

    The data returned by ``load_data()`` is scored with ``model`` and the
    result is written to ``fraud_schema.transactions.predicted_fraud``.
    Because the UPDATE is keyed on ``customer_id``, every transaction of a
    customer receives the same value: the maximum prediction over that
    customer's rows (flagged if any of their transactions is predicted
    fraudulent).

    Args:
        model: Fitted estimator exposing ``predict`` for the five feature
            columns used below.
    """
    feature_cols = ['amount', 'transaction_count', 'avg_transaction_amount', 'unique_locations', 'high_value_txn']
    df = load_data()
    df['fraud_prediction'] = model.predict(df[feature_cols])

    # Collapse to one deterministic value per customer instead of issuing an
    # UPDATE per transaction row, where the last row processed would win.
    per_customer = df.groupby('customer_id')['fraud_prediction'].max()
    # .tolist() yields native Python scalars; DBAPI drivers generally cannot
    # adapt numpy integer types.
    # NOTE(review): assumes predicted_fraud accepts the model's numeric
    # output - confirm the column type.
    params = list(zip(per_customer.tolist(), per_customer.index.tolist()))

    engine = connect_db()
    try:
        # A SQLAlchemy Engine has no cursor()/commit()/close(); obtain the
        # underlying DBAPI connection, which supports the %s placeholders
        # used in the statement below (presumably psycopg2 - confirm).
        conn = engine.raw_connection()
        try:
            with conn.cursor() as cur:
                cur.executemany(
                    """
                    UPDATE fraud_schema.transactions
                    SET predicted_fraud = %s
                    WHERE customer_id = %s;
                    """,
                    params,
                )
            conn.commit()
        finally:
            conn.close()
    finally:
        engine.dispose()
    print("Predictions stored in PostgreSQL.")

In [44]:
# Run the fraud detection pipeline end to end:
#   1. load the per-transaction feature table from PostgreSQL,
#   2. train the classifier (prints its hold-out accuracy),
#   3. score the loaded transactions and write the predictions back.
# NOTE(review): this cell's execution count (44) is lower than those of the
# cells defining the functions (64-76), and the saved output below shows a
# different query (FROM fraud_schema.transactions, read via `conn`), so the
# output looks stale - re-run after Restart Kernel -> Run All.
df = load_data()
model = train_model(df)
predict_fraud(model)


  df = pd.read_sql(query, conn)


DatabaseError: Execution failed on sql '
        SELECT customer_id, amount, COUNT(*) OVER (PARTITION BY customer_id) AS transaction_count,
               AVG(amount) OVER (PARTITION BY customer_id) AS avg_transaction_amount,
               COUNT(DISTINCT location) OVER (PARTITION BY customer_id) AS unique_locations,
               COUNT(*) FILTER(WHERE amount > 1000) OVER (PARTITION BY customer_id) AS high_value_txn,
               MAX(is_fraud::int) OVER (PARTITION BY customer_id) AS fraud_label
        FROM fraud_schema.transactions;
    ': relation "fraud_schema.transactions" does not exist
LINE 7:         FROM fraud_schema.transactions;
                     ^
