## Model Building and Training

### Data Preparation

In [23]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn  # For sklearn models
import mlflow.keras    # For Keras models
import warnings

warnings.filterwarnings("ignore")


In [24]:
fraud_df = pd.read_csv("../Data/fraud_dataset.csv")
creditcard_df = pd.read_csv("../Data/creditcard.csv")

In [25]:
fraud_df.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,country,transaction_count,transaction_velocity,hour_of_day,day_of_week,browser_FireFox,browser_IE,browser_Opera,browser_Safari,source_Direct,source_SEO,sex_M
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,39,732758400.0,0,Japan,0.0,0.0,2,5,False,False,False,False,False,True,True
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,53,350311400.0,0,United States,0.0,0.0,1,0,False,False,False,False,False,False,False
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,53,2621474000.0,1,Unknown,0.0,0.0,18,3,False,False,True,False,False,True,True
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,41,3840542000.0,0,Unknown,0.0,0.0,13,0,False,False,False,True,False,True,True
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,45,415583100.0,0,United States,0.0,0.0,18,2,False,False,False,True,False,False,True


In [26]:
creditcard_df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               151112 non-null  int64  
 1   signup_time           151112 non-null  object 
 2   purchase_time         151112 non-null  object 
 3   purchase_value        151112 non-null  float64
 4   device_id             151112 non-null  object 
 5   age                   151112 non-null  int64  
 6   ip_address            151112 non-null  float64
 7   class                 151112 non-null  int64  
 8   country               151112 non-null  object 
 9   transaction_count     151112 non-null  float64
 10  transaction_velocity  151112 non-null  float64
 11  hour_of_day           151112 non-null  int64  
 12  day_of_week           151112 non-null  int64  
 13  browser_FireFox       151112 non-null  bool   
 14  browser_IE            151112 non-null  bool   
 15  

#### Feature and Target Separation

In [27]:
# Correcting data types
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])
fraud_df['ip_address'] = fraud_df['ip_address'].fillna(0).astype(int)

In [28]:
# Create a new feature: time difference between signup and purchase (in minutes)
fraud_df['time_diff_minutes'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds() / 60.0

# Extract additional features from 'purchase_time'
fraud_df['purchase_hour'] = fraud_df['purchase_time'].dt.hour
fraud_df['purchase_day'] = fraud_df['purchase_time'].dt.day
fraud_df['purchase_month'] = fraud_df['purchase_time'].dt.month

In [29]:
# Drop the original 'signup_time' and 'purchase_time' columns
fraud_df = fraud_df.drop(columns=['signup_time','purchase_time'])

### Preprocessing

In [30]:
from sklearn.preprocessing import LabelEncoder

#  Label encoding for categorical columns in Fraud_Data
label_encoder = LabelEncoder()
fraud_df['device_id'] = label_encoder.fit_transform(fraud_df['device_id'])
fraud_df['country'] = label_encoder.fit_transform(fraud_df['country'])

In [31]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns in fraud_df and creditcard_df
fraud_df_encoded = fraud_df.copy()
creditcard_df_encoded = creditcard_df.copy()

# Encoding the 'country' column in fraud_df (and any other categorical columns, if present)
label_encoder = LabelEncoder()
fraud_df_encoded['country'] = label_encoder.fit_transform(fraud_df_encoded['country'])

In [32]:
# For Fraud_Data.csv
X_fraud = fraud_df.drop(columns=['class'])
y_fraud = fraud_df[['class']]

# For creditcard.csv
X_creditcard = creditcard_df.drop(columns=['Class'])
y_creditcard = creditcard_df[['Class']]

### SMOTE To balance X_fraud and y_fraud datasets

In [33]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance both datasets
smote = SMOTE(random_state=42)

# Balance fraud_df
X_fraud_balanced, y_fraud_balanced = smote.fit_resample(X_fraud, y_fraud)

# Balance creditcard_df
X_creditcard_balanced, y_creditcard_balanced = smote.fit_resample(X_creditcard, y_creditcard)

In [34]:
Number_Fraud = len(y_fraud_balanced[y_fraud_balanced == 1])
Number_notFraudlen = len(y_fraud_balanced[y_fraud_balanced == 0])
# print(f"Number of fraud data: {Number_Fraud}\nNumber of not fraud data: {Number_notFraudlen}")

Number of fraud data: 273922
Number of not fraud data: 273922


In [38]:
import mlflow
import mlflow.sklearn 
from sklearn.model_selection import train_test_split

# Set up an experiment
mlflow.set_experiment("Frauds Detection Experiment")

# Split Fraud Data
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud_balanced, y_fraud_balanced, test_size=0.2, random_state=42)

# Split Credit Card Data
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(X_creditcard_balanced, y_creditcard_balanced, test_size=0.2, random_state=42)
def split_train_test():
    return X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test,X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test


### Model Selection

#### Traditional Models:

#### Extract useful features from the datetime columns

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}


# Train and log results for sklearn models (Fraud Data)
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Create a pipeline with preprocessing and model
        # clf = Pipeline(steps=[('preprocessor', preprocessor),
        #                   ('classifier', model)])
        model.fit(X_fraud_train, y_fraud_train)
        y_pred_fraud = model.predict(X_fraud_test)
        
        # Log parameters and metrics
        mlflow.log_params(model.get_params())
        accuracy = accuracy_score(y_fraud_test, y_pred_fraud)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, name)

        # Save the model as a .pkl file
        joblib.dump(model, f"{name}.pkl")

        # Output classification report
        print(f"{name} - Fraud Data:")
        print(classification_report(y_fraud_test, y_pred_fraud))
        print("Accuracy:", accuracy)
        
        mlflow.end_run()  # End the current run



Logistic Regression - Fraud Data:
              precision    recall  f1-score   support

           0       0.69      0.72      0.70     27349
           1       0.71      0.68      0.69     27436

    accuracy                           0.70     54785
   macro avg       0.70      0.70      0.70     54785
weighted avg       0.70      0.70      0.70     54785

Accuracy: 0.6971616318335311




Decision Tree - Fraud Data:
              precision    recall  f1-score   support

           0       0.88      0.85      0.87     27349
           1       0.86      0.89      0.87     27436

    accuracy                           0.87     54785
   macro avg       0.87      0.87      0.87     54785
weighted avg       0.87      0.87      0.87     54785

Accuracy: 0.8694715706854066




Random Forest - Fraud Data:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93     27349
           1       0.97      0.88      0.93     27436

    accuracy                           0.93     54785
   macro avg       0.93      0.93      0.93     54785
weighted avg       0.93      0.93      0.93     54785

Accuracy: 0.9295427580542119




Gradient Boosting - Fraud Data:
              precision    recall  f1-score   support

           0       0.80      0.97      0.88     27349
           1       0.96      0.76      0.85     27436

    accuracy                           0.87     54785
   macro avg       0.88      0.87      0.87     54785
weighted avg       0.88      0.87      0.87     54785

Accuracy: 0.867481974993155


In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, LSTM, Embedding, Dropout
from sklearn.preprocessing import StandardScaler


# Scaling data for neural networks
scaler = StandardScaler()
X_creditcard_train_scaled = scaler.fit_transform(X_creditcard_train)
X_creditcard_test_scaled = scaler.transform(X_creditcard_test)

# # Reshape for CNN (assuming 2D structure, modify according to your features)
# X_creditcard_cnn = X_creditcard_scaled.reshape(-1, X_creditcard_scaled.shape[1], 1)

# MLP Model
def create_mlp_model(input_shape):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

mlp_model = create_mlp_model(X_creditcard.shape[1])
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Start a new MLflow run for MLP
with mlflow.start_run(run_name="MLP"):
    mlflow.log_param("model_type", "MLP")
    mlflow.log_param("epochs", 10)
    mlp_model.fit(X_creditcard_train_scaled, y_creditcard_train, epochs=10, batch_size=32, validation_data=(X_creditcard_test_scaled, y_creditcard_test))

    # Log model and accuracy
    accuracy = mlp_model.evaluate(X_creditcard_test_scaled, y_creditcard_test)[1]
    # Save the model as a .pkl file
    joblib.dump(model, "mlp_model.pkl")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.keras.log_model(mlp_model, "mlp_model")
    mlflow.end_run()  # End the current run
# mlp_model.fit(X_creditcard_train, y_creditcard_train, epochs=10, batch_size=32, validation_data=(X_creditcard_test, y_creditcard_test))


Epoch 1/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 3ms/step - accuracy: 0.9689 - loss: 0.0886 - val_accuracy: 0.9950 - val_loss: 0.0166
Epoch 2/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3ms/step - accuracy: 0.9910 - loss: 0.0270 - val_accuracy: 0.9971 - val_loss: 0.0109
Epoch 3/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3ms/step - accuracy: 0.9931 - loss: 0.0212 - val_accuracy: 0.9966 - val_loss: 0.0116
Epoch 4/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3ms/step - accuracy: 0.9941 - loss: 0.0182 - val_accuracy: 0.9980 - val_loss: 0.0083
Epoch 5/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 3ms/step - accuracy: 0.9949 - loss: 0.0159 - val_accuracy: 0.9983 - val_loss: 0.0072
Epoch 6/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 3ms/step - accuracy: 0.9952 - loss: 0.0150 - val_accuracy: 0.9980 - val_loss: 0.008



In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense

# Scale data for Neural Networks
scaler = StandardScaler()
X_creditcard_scaled = scaler.fit_transform(X_creditcard)
X_creditcard_train = scaler.fit_transform(X_creditcard_train)
X_creditcard_test = scaler.fit_transform(X_creditcard_test)

# X_creditcard_cnn = X_creditcard_train.reshape(-1, X_creditcard_scaled.shape[1], 1)
# X_creditcard_cnn_test = X_creditcard_test.reshape(-1, X_creditcard_scaled.shape[1], 1)
# Reshape data for CNN
X_creditcard_train_cnn = X_creditcard_train_scaled.reshape(-1, X_creditcard_train_scaled.shape[1], 1)
X_creditcard_test_cnn = X_creditcard_test_scaled.reshape(-1, X_creditcard_test_scaled.shape[1], 1)


# Adjust the CNN model for 1D data
cnn_model = Sequential()

# Assuming X_creditcard_cnn has shape (samples, 30 features, 1 channel)
cnn_model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(30, 1)))
cnn_model.add(Flatten())
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Start a new MLflow run for CNN
with mlflow.start_run(run_name="CNN"):
    cnn_model.fit(X_creditcard_train_cnn, y_creditcard_train, epochs=10, batch_size=32)
    
    # Log model and accuracy
    accuracy = cnn_model.evaluate(X_creditcard_test_cnn, y_creditcard_test)[1]
    # Save the model as a .pkl file
    joblib.dump(model, "cnn_model.pkl")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.keras.log_model(cnn_model, "cnn_model")
    mlflow.end_run()
# Train the model
# cnn_model.fit(X_creditcard_cnn, y_creditcard_train, epochs=10, batch_size=32)


Epoch 1/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3ms/step - accuracy: 0.9722 - loss: 0.0787
Epoch 2/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 3ms/step - accuracy: 0.9887 - loss: 0.0332
Epoch 3/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 3ms/step - accuracy: 0.9904 - loss: 0.0278
Epoch 4/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3ms/step - accuracy: 0.9915 - loss: 0.0256
Epoch 5/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 3ms/step - accuracy: 0.9921 - loss: 0.0243
Epoch 6/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 3ms/step - accuracy: 0.9928 - loss: 0.0218
Epoch 7/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 3ms/step - accuracy: 0.9932 - loss: 0.0202
Epoch 8/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3ms/step - accuracy: 0.9939 - loss: 0.0189




In [24]:
# LSTM Model (assuming sequential data)
lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(X_creditcard.shape[1], 1)))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Start a new MLflow run for LSTM
with mlflow.start_run(run_name="LSTM"):
    lstm_model.fit(X_creditcard_train_cnn, y_creditcard_train, epochs=10, batch_size=32)
    
    # Log model and accuracy
    accuracy = lstm_model.evaluate(X_creditcard_test_cnn, y_creditcard_test)[1]
    mlflow.log_metric("accuracy", accuracy)
    mlflow.keras.log_model(lstm_model, "lstm_model")
    mlflow.end_run()
# lstm_model.fit(X_creditcard_train, y_creditcard_train, epochs=10, batch_size=32)

Epoch 1/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 14ms/step - accuracy: 0.8515 - loss: 0.6964
Epoch 2/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 16ms/step - accuracy: 0.9767 - loss: 0.0694
Epoch 3/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 16ms/step - accuracy: 0.9861 - loss: 0.0416
Epoch 4/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 16ms/step - accuracy: 0.9916 - loss: 0.0238
Epoch 5/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 17ms/step - accuracy: 0.9311 - loss: 0.1903
Epoch 6/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 16ms/step - accuracy: 0.9757 - loss: 0.0778
Epoch 7/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 15ms/step - accuracy: 0.9879 - loss: 0.0334
Epoch 8/10
[1m14216/14216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 16ms/step - accuracy: 0.9751



In [25]:
# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Thresholding for binary classification
    print(classification_report(y_test, y_pred_classes))
    print("Accuracy:", accuracy_score(y_test, y_pred_classes))

# Evaluate the MLP model
evaluate_model(mlp_model, X_creditcard_test, y_creditcard_test)

# Evaluate the CNN model
evaluate_model(cnn_model, X_creditcard_test, y_creditcard_test)

# Evaluate the LSTM model
evaluate_model(lstm_model, X_creditcard_test, y_creditcard_test)


[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56750
           1       1.00      1.00      1.00     56976

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726

Accuracy: 0.9987074195874295
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     56750
           1       1.00      0.99      0.99     56976

    accuracy                           0.99    113726
   macro avg       0.99      0.99      0.99    113726
weighted avg       0.99      0.99      0.99    113726

Accuracy: 0.9930007210312506
[1m3554/3554[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 7ms/step
              precision    recall  f1-score   support

 