Import Necessary Libraries

In [25]:
# Data Handling & Processing
import pandas as pd
import numpy as np
!pip install mlflow
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Train-Test Split
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Deep Learning Models (Neural Networks)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Flatten

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# MLOps (Experiment Tracking)
import mlflow
import mlflow.sklearn

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')




Load Preprocessed Data

In [3]:
# Read both preprocessed datasets from the runtime's /content directory
# (adjust the two paths below if the files live somewhere else).
fraud_csv_path = '/content/Preprocessed_Fraud_Data.csv'
creditcard_csv_path = '/content/Preprocessed_Creditcard_Data.csv'

fraud_data = pd.read_csv(fraud_csv_path)
creditcard_data = pd.read_csv(creditcard_csv_path)

# Optional sanity check: print the first rows of each frame, fraud data first.
for loaded_frame in (fraud_data, creditcard_data):
    print(loaded_frame.head())


   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11       -0.160204   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54       -1.142592   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45       -1.197169   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50        0.385567   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53        0.112681   

       device_id  source  browser  sex  age       ip_address  class  \
0  QVPSPJUOCKZAR       2        0    1   39   73275836879972      0   
1  EOGFQPIZPYXFZ       0        0    0   53  350311387865908      0   
2  YSSKYOSJHPPLJ       2        3    1   53  262147382011095      1   
3  ATGTXKYKUDUQN       2        4    1   41  384054244391396      0   
4  NAUITBZFJKHWW       0        4    1   45  415583117452712      0   

   transaction_frequency  transaction_velocity  hour_of_day  day_of_week  
0                    0.0                   0.0            2      

 Feature and Target Separation

In [4]:
# Split each dataset into a feature matrix (X_*) and a target vector (y_*).

# Fraud data: the target is the lowercase 'class' column.
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop('class', axis=1)

# Credit card data: the target is the capitalised 'Class' column.
y_creditcard = creditcard_data['Class']
X_creditcard = creditcard_data.drop('Class', axis=1)


Train-Test Split



In [40]:
# Hold out 20% of each dataset as a test split. Both splits share the same
# seed (42) and are stratified on their own target column.
shared_split_options = {'test_size': 0.2, 'random_state': 42}

# Fraud data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, stratify=y_fraud, **shared_split_options
)

# Credit card data
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_creditcard, y_creditcard, stratify=y_creditcard, **shared_split_options
)


 Feature Scaling

In [36]:
# Summarise the fraud training features (column names, non-null counts, dtypes).
X_train_fraud.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120889 entries, 50481 to 120195
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                120889 non-null  int64  
 1   purchase_value         120889 non-null  float64
 2   source                 120889 non-null  int64  
 3   browser                120889 non-null  int64  
 4   sex                    120889 non-null  int64  
 5   age                    120889 non-null  int64  
 6   ip_address             120889 non-null  int64  
 7   transaction_frequency  120889 non-null  float64
 8   transaction_velocity   120889 non-null  float64
 9   hour_of_day            120889 non-null  int64  
 10  day_of_week            120889 non-null  int64  
 11  signup_time_year       120889 non-null  int32  
 12  signup_time_month      120889 non-null  int32  
 13  signup_time_day        120889 non-null  int32  
 14  signup_time_hour       120889 non-nul

In [37]:
# Standardise the credit card features (only this dataset is scaled here).
# The scaler is fitted on the training split alone and then applied, with the
# same fitted parameters, to both the training and the test split.
scaler = StandardScaler().fit(X_train_cc)

X_train_cc_scaled = scaler.transform(X_train_cc)
X_test_cc_scaled = scaler.transform(X_test_cc)



Model Selection and Training for the Credit Card Dataset


Logistic Regression (Baseline Model)

In [38]:
# Baseline model: logistic regression fitted on the standardised credit card
# features (up to 1000 solver iterations).
log_reg = LogisticRegression(max_iter=1000).fit(X_train_cc_scaled, y_train_cc)

# Predict on the standardised test split and print the per-class report.
y_pred_lr = log_reg.predict(X_test_cc_scaled)
print("Logistic Regression Performance (Credit Card Data):")
lr_report = classification_report(y_test_cc, y_pred_lr)
print(lr_report)


Logistic Regression Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.85      0.58      0.69        95

    accuracy                           1.00     56746
   macro avg       0.92      0.79      0.84     56746
weighted avg       1.00      1.00      1.00     56746



Decision Tree Classifier



In [22]:
# Decision tree fitted on the unscaled credit card features (seeded with 42).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_cc, y_train_cc)

# Predict on the unscaled test split and print the per-class report.
y_pred_dt = dt_model.predict(X_test_cc)
print("Decision Tree Performance (Credit Card Data):")
dt_report = classification_report(y_test_cc, y_pred_dt)
print(dt_report)


Decision Tree Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.72      0.71      0.71        95

    accuracy                           1.00     56746
   macro avg       0.86      0.85      0.86     56746
weighted avg       1.00      1.00      1.00     56746



Random Forest Classifier

In [23]:
# Random forest of 100 trees fitted on the unscaled credit card features
# (seeded with 42).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_cc, y_train_cc
)

# Predict on the unscaled test split and print the per-class report.
y_pred_rf = rf_model.predict(X_test_cc)
print("Random Forest Performance (Credit Card Data):")
rf_report = classification_report(y_test_cc, y_pred_rf)
print(rf_report)


Random Forest Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.97      0.73      0.83        95

    accuracy                           1.00     56746
   macro avg       0.99      0.86      0.92     56746
weighted avg       1.00      1.00      1.00     56746



Gradient Boosting with XGBoost

In [26]:
# Gradient-boosted trees (XGBoost) fitted on the unscaled credit card features.
#
# No `pip install` in this cell: XGBClassifier is imported in the notebook's
# first cell, so xgboost must already be installed before this cell can run.
# The deprecated `use_label_encoder` argument is not passed either; the target
# is already encoded as 0/1 (see the classification reports in this notebook).
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_cc, y_train_cc)

# Predict on the unscaled test split and print the per-class report.
y_pred_xgb = xgb_model.predict(X_test_cc)
print("XGBoost Performance (Credit Card Data):")
print(classification_report(y_test_cc, y_pred_xgb))


XGBoost Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.96      0.75      0.84        95

    accuracy                           1.00     56746
   macro avg       0.98      0.87      0.92     56746
weighted avg       1.00      1.00      1.00     56746



Deep Learning Models

Multi-Layer Perceptron (MLP)

An MLP is a basic feed-forward neural network suitable for tabular data.

In [27]:
# Feed-forward network (MLP) on the standardised credit card features:
# two ReLU hidden layers (64 and 32 units) followed by a single sigmoid
# output unit for binary classification.
mlp_input_width = X_train_cc_scaled.shape[1]

mlp_model = Sequential()
mlp_model.add(Dense(64, activation='relu', input_shape=(mlp_input_width,)))
mlp_model.add(Dense(32, activation='relu'))
mlp_model.add(Dense(1, activation='sigmoid'))

# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric.
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 32. The test split is passed as the
# validation data, so the val_* values in the log are computed on it.
mlp_history = mlp_model.fit(
    x=X_train_cc_scaled,
    y=y_train_cc,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_cc_scaled, y_test_cc),
)


Epoch 1/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9958 - loss: 0.0205 - val_accuracy: 0.9994 - val_loss: 0.0044
Epoch 2/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0031 - val_accuracy: 0.9994 - val_loss: 0.0039
Epoch 3/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0032 - val_accuracy: 0.9994 - val_loss: 0.0046
Epoch 4/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0026 - val_accuracy: 0.9993 - val_loss: 0.0044
Epoch 5/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0021 - val_accuracy: 0.9995 - val_loss: 0.0038
Epoch 6/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0019 - val_accuracy: 0.9994 - val_loss: 0.0043
Epoch 7/10

Convolutional Neural Network (CNN)



Since CNNs are built for spatial data, we reshape the data so that each feature becomes like a “pixel” in a sequence.

In [28]:
# Add a trailing axis of length 1 so the standardised matrices have shape
# (samples, features, 1), matching the input_shape given to Conv1D below.
X_train_cc_cnn = X_train_cc_scaled.reshape(*X_train_cc_scaled.shape, 1)
X_test_cc_cnn = X_test_cc_scaled.reshape(*X_test_cc_scaled.shape, 1)

# 1-D CNN: one Conv1D layer (32 filters, kernel size 3), flattened into a
# 32-unit ReLU layer and a single sigmoid output unit.
cnn_sequence_length = X_train_cc_scaled.shape[1]

cnn_model = Sequential()
cnn_model.add(
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(cnn_sequence_length, 1))
)
cnn_model.add(Flatten())
cnn_model.add(Dense(32, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 32, validating on the reshaped test split.
cnn_history = cnn_model.fit(
    x=X_train_cc_cnn,
    y=y_train_cc,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_cc_cnn, y_test_cc),
)


Epoch 1/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.9985 - loss: 0.0122 - val_accuracy: 0.9994 - val_loss: 0.0041
Epoch 2/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 3ms/step - accuracy: 0.9994 - loss: 0.0028 - val_accuracy: 0.9993 - val_loss: 0.0038
Epoch 3/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.9992 - loss: 0.0030 - val_accuracy: 0.9993 - val_loss: 0.0044
Epoch 4/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 4ms/step - accuracy: 0.9993 - loss: 0.0034 - val_accuracy: 0.9994 - val_loss: 0.0037
Epoch 5/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4ms/step - accuracy: 0.9995 - loss: 0.0028 - val_accuracy: 0.9994 - val_loss: 0.0039
Epoch 6/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0023 - val_accuracy: 0.9994 - val_loss: 0.0037
Epoch 7/10

Recurrent Neural Network (RNN)

RNNs are used for sequential data. We use the same reshaped data as for the CNN.

In [29]:
from tensorflow.keras.layers import SimpleRNN

# Simple recurrent network: one 32-unit ReLU SimpleRNN layer followed by a
# single sigmoid output unit. It is trained on the (samples, features, 1)
# arrays prepared in the CNN cell (X_train_cc_cnn / X_test_cc_cnn).
rnn_sequence_length = X_train_cc_scaled.shape[1]

rnn_model = Sequential()
rnn_model.add(SimpleRNN(32, activation='relu', input_shape=(rnn_sequence_length, 1)))
rnn_model.add(Dense(1, activation='sigmoid'))

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 32, validating on the reshaped test split.
rnn_history = rnn_model.fit(
    x=X_train_cc_cnn,
    y=y_train_cc,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_cc_cnn, y_test_cc),
)


Epoch 1/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 7ms/step - accuracy: 0.9984 - loss: 0.0270 - val_accuracy: 0.9990 - val_loss: 0.0049
Epoch 2/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 7ms/step - accuracy: 0.9991 - loss: 0.0050 - val_accuracy: 0.9986 - val_loss: 0.0057
Epoch 3/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 7ms/step - accuracy: 0.9987 - loss: 0.0062 - val_accuracy: 0.9990 - val_loss: 0.0046
Epoch 4/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 7ms/step - accuracy: 0.9992 - loss: 0.0045 - val_accuracy: 0.9992 - val_loss: 0.0047
Epoch 5/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 7ms/step - accuracy: 0.9989 - loss: 0.0052 - val_accuracy: 0.9992 - val_loss: 0.0045
Epoch 6/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 8ms/step - accuracy: 0.9990 - loss: 0.0049 - val_accuracy: 0.9989 - val_loss: 0.0063
Epoch 7/10

Long Short-Term Memory (LSTM)

LSTM networks are a special kind of RNN capable of learning long-term dependencies.

In [30]:
# Stacked LSTM: the first 50-unit layer returns its full output sequence so
# the second 50-unit layer can consume it; a single sigmoid unit produces the
# prediction. Trained on the (samples, features, 1) arrays from the CNN cell.
lstm_sequence_length = X_train_cc_scaled.shape[1]

lstm_model = Sequential()
lstm_model.add(LSTM(50, return_sequences=True, input_shape=(lstm_sequence_length, 1)))
lstm_model.add(LSTM(50))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 32, validating on the reshaped test split.
lstm_history = lstm_model.fit(
    x=X_train_cc_cnn,
    y=y_train_cc,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_cc_cnn, y_test_cc),
)


Epoch 1/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 31ms/step - accuracy: 0.9980 - loss: 0.0165 - val_accuracy: 0.9994 - val_loss: 0.0039
Epoch 2/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 31ms/step - accuracy: 0.9993 - loss: 0.0041 - val_accuracy: 0.9991 - val_loss: 0.0060
Epoch 3/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 31ms/step - accuracy: 0.9993 - loss: 0.0042 - val_accuracy: 0.9991 - val_loss: 0.0045
Epoch 4/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 31ms/step - accuracy: 0.9993 - loss: 0.0035 - val_accuracy: 0.9994 - val_loss: 0.0037
Epoch 5/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 30ms/step - accuracy: 0.9994 - loss: 0.0033 - val_accuracy: 0.9992 - val_loss: 0.0045
Epoch 6/10
[1m7094/7094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 32ms/step - accuracy: 0.9994 - loss: 0.0033 - val_accuracy: 0.9994 - val_loss: 0.003

MLOps – Experiment Tracking with MLflow
MLflow lets us log parameters, metrics, and even the trained model so that we can track our experiments over time.

In [31]:
# MLflow must be installed for this cell; if it is missing, run the following
# in a separate cell first:
#   !pip install mlflow

import mlflow
import mlflow.sklearn

# Select the experiment by name (MLflow creates it when it does not exist yet).
mlflow.set_experiment(experiment_name="Fraud Detection Experiment")

# Example run: record the random forest experiment.
with mlflow.start_run():
    # Record which model family this run belongs to.
    mlflow.log_param(key="model", value="RandomForest")

    # Accuracy of the random forest predictions on the credit card test split.
    accuracy_rf = accuracy_score(y_test_cc, y_pred_rf)
    mlflow.log_metric(key="accuracy", value=accuracy_rf)

    # Attach the fitted random forest to the run as a model artifact.
    mlflow.sklearn.log_model(rf_model, "random_forest_model")

    print("Logged Random Forest Model with accuracy:", accuracy_rf)


2025/02/09 20:18:14 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection Experiment' does not exist. Creating a new experiment.


Logged Random Forest Model with accuracy: 0.9995065731505305


In [42]:
import json

# Recompute the logistic regression predictions on the standardised test split.
y_pred_lr = log_reg.predict(X_test_cc_scaled)

# Test-set accuracy of each classical model, keyed by its display name.
experiment_results = {
    model_label: accuracy_score(y_test_cc, model_predictions)
    for model_label, model_predictions in (
        ("Logistic Regression", y_pred_lr),
        ("Decision Tree", y_pred_dt),
        ("Random Forest", y_pred_rf),
        ("XGBoost", y_pred_xgb),
    )
}

# Write the scores to a JSON file in the working directory.
with open("experiment_results.json", "w") as results_file:
    json.dump(experiment_results, results_file)


NameError: name 'files' is not defined

Saving the Model



In [32]:
import joblib

# Serialise the fitted random forest (rf_model) to a pickle file in the
# working directory; the returned list of written filenames is the cell output.
joblib.dump(value=rf_model, filename='random_forest_model.pkl')


['random_forest_model.pkl']

Save & Download Processed Data

In [44]:
import pandas as pd

# Write every train/test split to a CSV file in the working directory, leaving
# out the index column. Credit card splits come first, then the fraud splits.
splits_by_filename = {
    "X_train_credit.csv": X_train_cc,
    "X_test_credit.csv": X_test_cc,
    "y_train_credit.csv": y_train_cc,
    "y_test_credit.csv": y_test_cc,
    "X_train_fraud.csv": X_train_fraud,
    "X_test_fraud.csv": X_test_fraud,
    "y_train_fraud.csv": y_train_fraud,
    "y_test_fraud.csv": y_test_fraud,
}

for csv_filename, split_data in splits_by_filename.items():
    split_data.to_csv(csv_filename, index=False)


 Save & Download ML Models (Sklearn-based)

In [48]:
import joblib

# Persist each fitted classical model to its own pickle file in the working
# directory: logistic regression, decision tree, random forest and XGBoost.
# The last call is the cell's final expression, so its return value (the list
# of written filenames) is what the notebook displays.
joblib.dump(log_reg, "logistic_regression.pkl")
joblib.dump(dt_model, "decision_tree.pkl")
joblib.dump(rf_model, "random_forest.pkl")
joblib.dump(xgb_model, "xgboost.pkl")




['xgboost.pkl']

Save & Download Deep Learning Models (TensorFlow/Keras)

In [50]:
# Save each Keras network to its own .h5 file in the working directory,
# in the order MLP, CNN, RNN, LSTM.
for h5_filename, network in (
    ("mlp_model.h5", mlp_model),
    ("cnn_model.h5", cnn_model),
    ("rnn_model.h5", rnn_model),
    ("lstm_model.h5", lstm_model),
):
    network.save(h5_filename)



