Import Necessary Libraries

In [1]:
# Data Handling & Processing
import pandas as pd
import numpy as np
!pip install mlflow
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Train-Test Split
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Deep Learning Models (Neural Networks)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPooling1D, Flatten

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# MLOps (Experiment Tracking)
import mlflow
import mlflow.sklearn

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')


Collecting mlflow
  Downloading mlflow-2.20.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.1 (from mlflow)
  Downloading mlflow_skinny-2.20.1-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.1->mlflow)
  Downloading databricks_sdk-0.43.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Colle

load Preprocessed Data

In [2]:
# Load your preprocessed datasets (ensure the path is correct)
fraud_data = pd.read_csv('/content/Preprocessed_Fraud_Data.csv')
creditcard_data = pd.read_csv('/content/Preprocessed_Creditcard_Data.csv')

# Optional: Check the first few rows
print(fraud_data.head())
print(creditcard_data.head())


   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11       -0.160204   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54       -1.142592   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45       -1.197169   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50        0.385567   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53        0.112681   

       device_id  source  browser  sex  age       ip_address  class  \
0  QVPSPJUOCKZAR       2        0    1   39   73275836879972      0   
1  EOGFQPIZPYXFZ       0        0    0   53  350311387865908      0   
2  YSSKYOSJHPPLJ       2        3    1   53  262147382011095      1   
3  ATGTXKYKUDUQN       2        4    1   41  384054244391396      0   
4  NAUITBZFJKHWW       0        4    1   45  415583117452712      0   

   transaction_frequency  transaction_velocity  hour_of_day  day_of_week  
0                    0.0                   0.0            2      

 Feature and Target Separation

In [15]:
# For Fraud Data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

# For Credit Card Data
X_creditcard = creditcard_data.drop(columns=['Class'])
y_creditcard = creditcard_data['Class']


Train-Test Split



In [8]:
# For Fraud Data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)



 Feature Scaling

In [9]:
X_test_fraud.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30223 entries, 79867 to 140297
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                30223 non-null  int64  
 1   signup_time            30223 non-null  object 
 2   purchase_time          30223 non-null  object 
 3   purchase_value         30223 non-null  float64
 4   device_id              30223 non-null  object 
 5   source                 30223 non-null  int64  
 6   browser                30223 non-null  int64  
 7   sex                    30223 non-null  int64  
 8   age                    30223 non-null  int64  
 9   ip_address             30223 non-null  int64  
 10  transaction_frequency  30223 non-null  float64
 11  transaction_velocity   30223 non-null  float64
 12  hour_of_day            30223 non-null  int64  
 13  day_of_week            30223 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 3.5+ 

In [10]:
# Feature Engineering on Train/Test Splits (For Fraud Data)
for df in [X_train_fraud, X_test_fraud]:
    for col in ['signup_time', 'purchase_time']:
        df[col] = pd.to_datetime(df[col])
        df[col + '_year'] = df[col].dt.year
        df[col + '_month'] = df[col].dt.month
        df[col + '_day'] = df[col].dt.day
        df[col + '_hour'] = df[col].dt.hour

# Drop Original Datetime and Categorical Columns (For Fraud Data)
for df in [X_train_fraud, X_test_fraud]:
    df.drop(columns=['signup_time', 'purchase_time', 'device_id'], inplace=True)

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling
X_train_fraud_scaled = scaler.fit_transform(X_train_fraud)
X_test_fraud_scaled = scaler.transform(X_test_fraud)


Model Selection and Training for for Y dataset


Logistic Regression (Baseline Model)

In [11]:
# Logistic Regression requires scaled data
X_fraud = fraud_data.drop(columns=['class', 'signup_time', 'purchase_time', 'device_id'])
y_fraud = fraud_data['class']

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_fraud, y_train_fraud,)

# Predict and Evaluate
y_pred_lr = log_reg.predict(X_test_fraud)  # Predictions on test set
print("Logistic Regression Performance fraud data:")
print(classification_report(y_test_fraud, y_pred_lr))


Logistic Regression Performance fraud data:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223



Decision Tree Classifier



In [13]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_fraud,y_train_fraud,)

y_pred_lr = dt_model.predict(X_test_fraud)
print("Decision Tree Performance (Credit Card Data):")
print(classification_report(y_test_fraud, y_pred_lr))


Decision Tree Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27393
           1       0.49      0.56      0.52      2830

    accuracy                           0.90     30223
   macro avg       0.72      0.75      0.73     30223
weighted avg       0.91      0.90      0.91     30223



Random Forest Classifier

In [14]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_fraud, y_train_fraud)

y_pred_lr = rf_model.predict(X_test_fraud)
print("Random Forest Performance (Credit Card Data):")
print(classification_report(y_test_fraud, y_pred_lr ))


Random Forest Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       0.98      0.53      0.69      2830

    accuracy                           0.95     30223
   macro avg       0.97      0.76      0.83     30223
weighted avg       0.96      0.95      0.95     30223



Gradient Boosting with XGBoost

In [16]:
!pip install xgboost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_fraud, y_train_fraud)

y_pred_xgb = xgb_model.predict(X_test_fraud)
print("XGBoost Performance (Credit Card Data):")
print(classification_report(y_test_fraud, y_pred_xgb))


XGBoost Performance (Credit Card Data):
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       0.98      0.53      0.69      2830

    accuracy                           0.95     30223
   macro avg       0.97      0.76      0.83     30223
weighted avg       0.96      0.95      0.95     30223



Deep Learning Models

Multi-Layer Perceptron (MLP)

An MLP is a basic feed-forward neural network suitable for tabular data.

In [25]:
# Define the MLP Model
mlp_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_fraud_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the model
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
mlp_history = mlp_model.fit(X_train_fraud_scaled, y_train_fraud,
                            epochs=10, batch_size=32,
                            validation_data=(X_test_fraud_scaled, y_test_fraud))


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.9267 - loss: 0.2401 - val_accuracy: 0.9541 - val_loss: 0.1875
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.9562 - loss: 0.1794 - val_accuracy: 0.9550 - val_loss: 0.1832
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - accuracy: 0.9568 - loss: 0.1767 - val_accuracy: 0.9549 - val_loss: 0.1822
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - accuracy: 0.9568 - loss: 0.1763 - val_accuracy: 0.9549 - val_loss: 0.1825
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5ms/step - accuracy: 0.9559 - loss: 0.1789 - val_accuracy: 0.9550 - val_loss: 0.1825
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step - accuracy: 0.9561 - loss: 0.1784 - val_accuracy: 0.9551 - val_loss: 0.1831
Epoch 7/10

Convolutional Neural Network (CNN)



Since CNNs are built for spatial data, we reshape the data so that each feature becomes like a “pixel” in a sequence.

In [27]:
# Reshape for CNN: (samples, features, 1)
X_train_fr_cnn = X_train_fraud_scaled.reshape(X_train_fraud_scaled.shape[0], X_train_fraud_scaled.shape[1], 1)
X_test_fr_cnn = X_test_fraud_scaled.reshape(X_test_fraud_scaled.shape[0], X_test_fraud_scaled.shape[1], 1)

# Define the CNN Model
cnn_model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train_fraud_scaled.shape[1], 1)),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

cnn_history = cnn_model.fit(X_train_fr_cnn, y_train_fraud,
                            epochs=10, batch_size=32,
                            validation_data=(X_test_fr_cnn,  y_test_fraud))


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.9408 - loss: 0.2228 - val_accuracy: 0.9521 - val_loss: 0.1934
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.9526 - loss: 0.1908 - val_accuracy: 0.9523 - val_loss: 0.1894
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.9538 - loss: 0.1861 - val_accuracy: 0.9524 - val_loss: 0.1891
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.9549 - loss: 0.1829 - val_accuracy: 0.9534 - val_loss: 0.1882
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 5ms/step - accuracy: 0.9559 - loss: 0.1796 - val_accuracy: 0.9537 - val_loss: 0.1857
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 6ms/step - accuracy: 0.9554 - loss: 0.1807 - val_accuracy: 0.9538 - val_loss: 0.1851
Epoch 7/10

Recurrent Neural Network (RNN)

RNNs are used for sequential data. We use the same reshaped data as for the CNN.

In [28]:
from tensorflow.keras.layers import SimpleRNN

rnn_model = Sequential([
    SimpleRNN(32, activation='relu', input_shape=(X_train_fraud_scaled.shape[1], 1)),
    Dense(1, activation='sigmoid')
])

rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

rnn_history = rnn_model.fit(X_train_fr_cnn, y_train_fraud,
                            epochs=10, batch_size=32,
                            validation_data=(X_test_fr_cnn, y_test_fraud))


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 6ms/step - accuracy: 0.9342 - loss: 0.2380 - val_accuracy: 0.9524 - val_loss: 0.1899
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 6ms/step - accuracy: 0.9547 - loss: 0.1843 - val_accuracy: 0.9533 - val_loss: 0.1864
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/step - accuracy: 0.9545 - loss: 0.1841 - val_accuracy: 0.9525 - val_loss: 0.1875
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 6ms/step - accuracy: 0.9570 - loss: 0.1762 - val_accuracy: 0.9545 - val_loss: 0.1838
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 6ms/step - accuracy: 0.9568 - loss: 0.1769 - val_accuracy: 0.9539 - val_loss: 0.1845
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 7ms/step - accuracy: 0.9560 - loss: 0.1789 - val_accuracy: 0.9549 - val_loss: 0.1827
Epoch 7/10

Long Short-Term Memory (LSTM)

LSTM networks are a special kind of RNN capable of learning long-term dependencies.

In [30]:
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train_fraud_scaled.shape[1], 1)),
    LSTM(50),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_history = lstm_model.fit(X_train_fr_cnn, y_train_fraud,
                              epochs=10, batch_size=32,
                              validation_data=(X_test_fr_cnn, y_test_fraud))


Epoch 1/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 24ms/step - accuracy: 0.9312 - loss: 0.2396 - val_accuracy: 0.9508 - val_loss: 0.1932
Epoch 2/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 23ms/step - accuracy: 0.9549 - loss: 0.1821 - val_accuracy: 0.9541 - val_loss: 0.1848
Epoch 3/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 23ms/step - accuracy: 0.9566 - loss: 0.1775 - val_accuracy: 0.9542 - val_loss: 0.1850
Epoch 4/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 24ms/step - accuracy: 0.9559 - loss: 0.1792 - val_accuracy: 0.9546 - val_loss: 0.1826
Epoch 5/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 25ms/step - accuracy: 0.9560 - loss: 0.1789 - val_accuracy: 0.9550 - val_loss: 0.1824
Epoch 6/10
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 22ms/step - accuracy: 0.9567 - loss: 0.1764 - val_accuracy: 0.9551 - val_loss: 0.1814
Ep

MLOps – Experiment Tracking with MLflow
MLflow lets us log parameters, metrics, and even the trained model so that you can track your experiments over time.

In [35]:
# Install MLflow if you haven't already (uncomment the next line if needed)
# !pip install mlflow

import mlflow
import mlflow.sklearn

mlflow.set_experiment("Fraud Detection Experiment")
# Predict using Random Forest model:
y_pred_rf = rf_model.predict(X_test_fraud_scaled)  # Get predictions for Random Forest

# MLflow Experiment Tracking
mlflow.set_experiment("Fraud Detection Experiment")

with mlflow.start_run():
    mlflow.log_param("model", "RandomForest")
    accuracy_rf = accuracy_score(y_test_fraud, y_pred_rf)  # Use y_pred_rf here
    mlflow.log_metric("accuracy", accuracy_rf)
    mlflow.sklearn.log_model(rf_model, "random_forest_model")
    print("Logged Random Forest Model with accuracy:", accuracy_rf)



Logged Random Forest Model with accuracy: 0.21271879032524899


In [39]:
import json
y_pred_lr = log_reg.predict(X_test_fraud_scaled)

# Store model performance
experiment_results = {
    "Logistic Regression": accuracy_score(y_test_fraud , y_pred_lr),
    "Decision Tree": accuracy_score(y_test_fraud , y_pred_rf ),
    "Random Forest": accuracy_score(y_test_fraud,  y_pred_rf),
    "XGBoost": accuracy_score(y_test_fraud , y_pred_xgb),
}
# Save JSON
with open("experiment_results.json", "w") as f:
    json.dump(experiment_results, f)


Saving the Model



In [40]:
import joblib

# Assume rf_model is your trained Random Forest model
joblib.dump(rf_model, 'fraud_random_forest_model.pkl')


['fraud_random_forest_model.pkl']

Save & Download Processed Data

 Save & Download ML Models (Sklearn-based)

In [41]:
import joblib

# Save Logistic Regression model
joblib.dump(log_reg, "fraud_logistic_regression.pkl")
joblib.dump(dt_model, "fraud_decision_tree.pkl")
joblib.dump(rf_model, "fraud_random_forest.pkl")
joblib.dump(xgb_model, "fraud_xgboost.pkl")




['fraud_xgboost.pkl']

Save & Download Deep Learning Models (TensorFlow/Keras)

In [None]:
# Save MLP Model
mlp_model.save("fraud_mlp_model.h5")

# Save CNN Model
cnn_model.save("fraud_cnn_model.h5")

# Save RNN Model
rnn_model.save("fraud_rnn_model.h5")


# Save LSTM Model
lstm_model.save("fraud_lstm_model.h5")



