In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load dataset
data = pd.read_csv("modified_test1.csv")

In [31]:
print(data.columns)


Index(['customer_id', 'customer_account_balance', 'transaction_amount',
       'reported_amount', 'currency', 'country', 'transaction_date',
       'risk_score', 'remediation_action'],
      dtype='object')


In [32]:
# Handling date format issue with correct column name
data['transaction_date'] = pd.to_datetime(data['transaction_date'], format='%d-%m-%Y', errors='coerce')

# Extracting year, month, and day
data['year'] = data['transaction_date'].dt.year
data['month'] = data['transaction_date'].dt.month
data['day'] = data['transaction_date'].dt.day

# Dropping the original date column if needed
data.drop(columns=['transaction_date'], inplace=True)


In [34]:
# Splitting features and target variable
X = data.drop(columns=['fraud_label'])  # Adjust column name based on your dataset
y = data['fraud_label']


KeyError: "['fraud_label'] not found in axis"

In [8]:
# Convert transaction_date to numerical values
def process_date(date):
    try:
        return datetime.strptime(date, "%Y-%m-%d")
    except ValueError:
        try:
            return datetime.strptime(date, "%d-%m-%Y")
        except ValueError:
            return np.nan  # Handle incorrect formats

In [9]:
df["transaction_date"] = df["transaction_date"].apply(process_date)
df["year"] = df["transaction_date"].dt.year
df["month"] = df["transaction_date"].dt.month
df["day"] = df["transaction_date"].dt.day
df.drop(columns=["transaction_date"], inplace=True)  # Remove original date column

In [10]:
# Convert categorical columns to numerical (One-Hot Encoding)
categorical_cols = ["currency", "country", "remediation_action"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [13]:
df["is_fraud"] = (df["risk_score"] > 0.7).astype(int)


In [14]:
print(df.columns)


Index(['customer_account_balance', 'transaction_amount', 'reported_amount',
       'risk_score', 'year', 'month', 'day', 'currency_GBP', 'currency_INR',
       'currency_JPY', 'currency_USD', 'country_Germany', 'country_India',
       'country_Russia', 'country_UK', 'country_USA',
       'remediation_action_No Action',
       'remediation_action_Require additional validation', 'is_fraud'],
      dtype='object')


In [15]:
X = df.drop(columns=["is_fraud"])
y = df["is_fraud"]


In [16]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Standardize all numeric features

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print(f"Training Data Shape: {X_train.shape}, Testing Data Shape: {X_test.shape}")

Training Data Shape: (200, 18), Testing Data Shape: (50, 18)


In [19]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


In [20]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [21]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [22]:
import joblib

joblib.dump(model, "fraud_detection_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [23]:
model = joblib.load("fraud_detection_model.pkl")
scaler = joblib.load("scaler.pkl")

# Example prediction
sample_data = [[2000, 500, 400, 0.8, 2025, 3, 15, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]]  # Example input
sample_data_scaled = scaler.transform(sample_data)

prediction = model.predict(sample_data_scaled)
print("Fraudulent Transaction" if prediction[0] == 1 else "Legit Transaction")


Legit Transaction




In [24]:
from sklearn.ensemble import IsolationForest

# Train the Isolation Forest model
iso_forest = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)  # Adjust contamination based on expected fraud ratio
iso_forest.fit(X_train)

# Predict anomalies (fraud cases)
y_pred_iso = iso_forest.predict(X_test)

# Convert predictions (-1 = fraud, 1 = legit)
y_pred_iso = [1 if pred == -1 else 0 for pred in y_pred_iso]

# Evaluate model
from sklearn.metrics import accuracy_score, classification_report

print(f"Isolation Forest Accuracy: {accuracy_score(y_test, y_pred_iso):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred_iso))


Isolation Forest Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [25]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Define the Autoencoder model
input_dim = X_train.shape[1]
encoding_dim = 8  # Size of compressed representation

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation="relu")(input_layer)
decoded = Dense(input_dim, activation="sigmoid")(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer="adam", loss="mse")

# Train the Autoencoder
autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_test, X_test))

# Compute reconstruction loss
reconstructed = autoencoder.predict(X_test)
loss = ((X_test - reconstructed) ** 2).mean(axis=1)

# Set a threshold for fraud detection
threshold = loss.mean() + 2 * loss.std()  # Adjust threshold
y_pred_auto = [1 if l > threshold else 0 for l in loss]

# Evaluate
print(f"Autoencoder Accuracy: {accuracy_score(y_test, y_pred_auto):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred_auto))





Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Autoencoder Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        50
           1       0.00      0.00      0.00         0

    accuracy                           0.98        50
   macro avg       0.50      0.49      0.49        50
weighted avg       1.00      0.98      0.99        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
import joblib

# Save the Random Forest Model
joblib.dump(RandomForestClassifier, "fraud_model.pkl")

# Save the Scaler (for normalizing new data)
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']