## Model Building and Training

### Data Preparation

In [2]:
import pandas as pd

In [6]:
fraud_df = pd.read_csv("../Notebooks/fraud_dataset.csv")
creditcard_df = pd.read_csv("../data/creditcard.csv")

#### Feature and Target Separation

In [19]:
# Correcting data types
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

In [26]:
# Create a new feature: time difference between signup and purchase (in minutes)
fraud_df['time_diff_minutes'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds() / 60.0

# Extract additional features from 'purchase_time'
fraud_df['purchase_hour'] = fraud_df['purchase_time'].dt.hour
fraud_df['purchase_day'] = fraud_df['purchase_time'].dt.day
fraud_df['purchase_month'] = fraud_df['purchase_time'].dt.month

In [27]:
# Drop the original 'signup_time' and 'purchase_time' columns
fraud_df = fraud_df.drop(columns=['signup_time','purchase_time'])

In [28]:
# For Fraud_Data.csv
X_fraud = fraud_df.drop(columns=['class'])
y_fraud = fraud_df['class']

# For creditcard.csv
X_creditcard = creditcard_df.drop(columns=['Class'])
y_creditcard = creditcard_df['Class']

In [29]:
from sklearn.preprocessing import LabelEncoder

#  Label encoding for categorical columns in Fraud_Data
label_encoder = LabelEncoder()
X_fraud['device_id'] = label_encoder.fit_transform(X_fraud['device_id'])
# X_fraud['source'] = label_encoder.fit_transform(X_fraud['source'])
# X_fraud['browser'] = label_encoder.fit_transform(X_fraud['browser'])
# X_fraud['sex'] = label_encoder.fit_transform(X_fraud['sex'])

#### Train-Test Split:

In [30]:
from sklearn.model_selection import train_test_split

# Split Fraud Data
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

# Split Credit Card Data
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(X_creditcard, y_creditcard, test_size=0.2, random_state=42)

In [31]:
X_fraud_train.dtypes

user_id                   int64
purchase_value          float64
device_id                 int32
age                       int64
ip_address                int64
transaction_count       float64
transaction_velocity    float64
hour_of_day               int64
day_of_week               int64
browser_FireFox            bool
browser_IE                 bool
browser_Opera              bool
browser_Safari             bool
source_Direct              bool
source_SEO                 bool
sex_M                      bool
time_diff_minutes       float64
purchase_hour             int32
purchase_day              int32
purchase_month            int32
dtype: object

In [32]:
X_fraud_test.dtypes

user_id                   int64
purchase_value          float64
device_id                 int32
age                       int64
ip_address                int64
transaction_count       float64
transaction_velocity    float64
hour_of_day               int64
day_of_week               int64
browser_FireFox            bool
browser_IE                 bool
browser_Opera              bool
browser_Safari             bool
source_Direct              bool
source_SEO                 bool
sex_M                      bool
time_diff_minutes       float64
purchase_hour             int32
purchase_day              int32
purchase_month            int32
dtype: object

### Model Selection

#### Traditional Models:

#### Extract useful features from the datetime columns

In [25]:
fraud_df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,transaction_count,transaction_velocity,hour_of_day,day_of_week,browser_FireFox,browser_IE,browser_Opera,browser_Safari,source_Direct,source_SEO,sex_M
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.160204,QVPSPJUOCKZAR,39,732758368,0,0.0,0.0,2,5,False,False,False,False,False,True,True
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142592,EOGFQPIZPYXFZ,53,350311387,0,0.0,0.0,1,0,False,False,False,False,False,False,False
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.197169,YSSKYOSJHPPLJ,53,-2147483648,1,0.0,0.0,18,3,False,False,True,False,False,True,True
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385567,ATGTXKYKUDUQN,41,-2147483648,0,0.0,0.0,13,0,False,False,False,True,False,True,True
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,0.112681,NAUITBZFJKHWW,45,415583117,0,0.0,0.0,18,2,False,False,False,True,False,False,True


In [67]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Convert 'signup_time' and 'purchase_time' to datetime format if not already done
X_fraud_train['signup_time'] = pd.to_datetime(X_fraud_train['signup_time'])
X_fraud_train['purchase_time'] = pd.to_datetime(X_fraud_train['purchase_time'])

# Create numeric features from datetime columns
X_fraud_train['time_diff_minutes'] = (X_fraud_train['purchase_time'] - X_fraud_train['signup_time']).dt.total_seconds() / 60.0
X_fraud_train['purchase_hour'] = X_fraud_train['purchase_time'].dt.hour
X_fraud_train['purchase_day'] = X_fraud_train['purchase_time'].dt.day
X_fraud_train['purchase_month'] = X_fraud_train['purchase_time'].dt.month

# Drop the original datetime columns
X_fraud_train = X_fraud_train.drop(columns=['signup_time', 'purchase_time'])

# Encode categorical features
label_encoder = LabelEncoder()

# Encode 'device_id', 'source', 'browser', and 'sex'
X_fraud_train['device_id'] = label_encoder.fit_transform(X_fraud_train['device_id'])
X_fraud_train['source'] = label_encoder.fit_transform(X_fraud_train['source'])
X_fraud_train['browser'] = label_encoder.fit_transform(X_fraud_train['browser'])
X_fraud_train['sex'] = label_encoder.fit_transform(X_fraud_train['sex'])

# Verify all columns are now numeric
print(X_fraud_train.dtypes)

# # Now the data should be ready for model training
# model.fit(X_fraud_train, y_fraud_train)


user_id                int64
purchase_value         int64
device_id              int64
source                 int64
browser                int64
sex                    int64
age                    int64
ip_address           float64
time_diff_minutes    float64
purchase_hour          int32
purchase_day           int32
purchase_month         int32
dtype: object


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score


# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Train and evaluate models for Fraud Data
for name, model in models.items():
    
    model.fit(X_fraud_train, y_fraud_train)
    y_pred_fraud = model.predict(X_fraud_test)
    print(f"{name} - Fraud Data:")
    print(classification_report(y_fraud_test, y_pred_fraud))
    print("Accuracy:", accuracy_score(y_fraud_test, y_pred_fraud))
    print("\n")

Logistic Regression - Fraud Data:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       1.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.95      0.50      0.48     30223
weighted avg       0.91      0.91      0.86     30223

Accuracy: 0.905800218376733


Decision Tree - Fraud Data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27373
           1       0.50      0.57      0.53      2850

    accuracy                           0.91     30223
   macro avg       0.73      0.75      0.74     30223
weighted avg       0.91      0.91      0.91     30223

Accuracy: 0.9054031697713661


Random Forest - Fraud Data:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                   

In [34]:
print(X_fraud_train.dtypes)

user_id                   int64
purchase_value          float64
device_id                 int32
age                       int64
ip_address                int64
transaction_count       float64
transaction_velocity    float64
hour_of_day               int64
day_of_week               int64
browser_FireFox            bool
browser_IE                 bool
browser_Opera              bool
browser_Safari             bool
source_Direct              bool
source_SEO                 bool
sex_M                      bool
time_diff_minutes       float64
purchase_hour             int32
purchase_day              int32
purchase_month            int32
dtype: object


In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, LSTM, Embedding, Dropout
from sklearn.preprocessing import StandardScaler

# Scale data for Neural Networks
scaler = StandardScaler()
X_creditcard_scaled = scaler.fit_transform(X_creditcard)

# Reshape for CNN (assuming 2D structure, modify according to your features)
X_creditcard_cnn = X_creditcard_scaled.reshape(-1, X_creditcard_scaled.shape[1], 1)

# MLP Model
def create_mlp_model(input_shape):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

mlp_model = create_mlp_model(X_creditcard.shape[1])
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
mlp_model.fit(X_creditcard_train, y_creditcard_train, epochs=10, batch_size=32, validation_data=(X_creditcard_test, y_creditcard_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 5ms/step - accuracy: 0.9924 - loss: 44.4865 - val_accuracy: 0.9983 - val_loss: 0.6922
Epoch 2/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 6ms/step - accuracy: 0.9978 - loss: 0.2038 - val_accuracy: 0.9983 - val_loss: 0.0128
Epoch 3/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 6ms/step - accuracy: 0.9982 - loss: 0.0291 - val_accuracy: 0.9983 - val_loss: 0.0127
Epoch 4/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 6ms/step - accuracy: 0.9983 - loss: 0.0169 - val_accuracy: 0.9983 - val_loss: 0.0474
Epoch 5/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9983 - loss: 0.0295 - val_accuracy: 0.9983 - val_loss: 0.0127
Epoch 6/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9982 - loss: 0.0504 - val_accuracy: 0.9983 - val_loss: 0.0127
Epoch 7/1

<keras.src.callbacks.history.History at 0x1d63801d610>

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense

# Scale data for Neural Networks
scaler = StandardScaler()
X_creditcard_scaled = scaler.fit_transform(X_creditcard)
X_creditcard_train = scaler.fit_transform(X_creditcard_train)
X_creditcard_test = scaler.fit_transform(X_creditcard_test)

X_creditcard_cnn = X_creditcard_train.reshape(-1, X_creditcard_scaled.shape[1], 1)
X_creditcard_cnn_test = X_creditcard_test.reshape(-1, X_creditcard_scaled.shape[1], 1)

# Adjust the CNN model for 1D data
cnn_model = Sequential()

# Assuming X_creditcard_cnn has shape (samples, 30 features, 1 channel)
cnn_model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(30, 1)))
cnn_model.add(Flatten())
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_creditcard_cnn, y_creditcard_train, epochs=10, batch_size=32)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5ms/step - accuracy: 0.9947 - loss: 0.0234
Epoch 2/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0034
Epoch 3/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0032
Epoch 4/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0026
Epoch 5/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0025
Epoch 6/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0030
Epoch 7/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0023
Epoch 8/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0028
Epoch 9/10
[1m7

<keras.src.callbacks.history.History at 0x1d63eaa3e10>

In [37]:
# LSTM Model (assuming sequential data)
lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(X_creditcard.shape[1], 1)))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_creditcard_train, y_creditcard_train, epochs=10, batch_size=32)

  super().__init__(**kwargs)


Epoch 1/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 21ms/step - accuracy: 0.9932 - loss: 1066.4052
Epoch 2/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 17ms/step - accuracy: 0.9987 - loss: 0.0154
Epoch 3/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 15ms/step - accuracy: 0.9989 - loss: 0.0053
Epoch 4/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 15ms/step - accuracy: 0.9989 - loss: 0.0057
Epoch 5/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 17ms/step - accuracy: 0.9990 - loss: 0.0046
Epoch 6/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 17ms/step - accuracy: 0.9992 - loss: 0.0039
Epoch 7/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 16ms/step - accuracy: 0.9992 - loss: 13.5899
Epoch 8/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 14ms/step - accuracy: 0.9991 - loss: 0.0

<keras.src.callbacks.history.History at 0x1d6317b3950>

Logistic Regression - Fraud Data:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

Accuracy: 0.9057009562253913


c:\Users\lenovo\Documents\10acadmy\FraudDetection-for-E-commerce-Banking\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
c:\Users\lenovo\Documents\10acadmy\FraudDetection-for-E-commerce-Banking\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
c:\Users\lenovo\Documents\10acadmy\FraudDetection-for-E-commerce-Banking\.venv\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Decision Tree - Fraud Data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27373
           1       0.50      0.57      0.53      2850

    accuracy                           0.91     30223
   macro avg       0.73      0.76      0.74     30223
weighted avg       0.91      0.91      0.91     30223

Accuracy: 0.9065943155874665


Random Forest - Fraud Data:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

Accuracy: 0.956390828177216


Gradient Boosting - Fraud Data:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

Accuracy: 0.9564239155609966




In [38]:
# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)  # Thresholding for binary classification
    print(classification_report(y_test, y_pred_classes))
    print("Accuracy:", accuracy_score(y_test, y_pred_classes))

# Evaluate the MLP model
evaluate_model(mlp_model, X_creditcard_test, y_creditcard_test)

# Evaluate the CNN model
evaluate_model(cnn_model, X_creditcard_test, y_creditcard_test)

# Evaluate the LSTM model
evaluate_model(lstm_model, X_creditcard_test, y_creditcard_test)


[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.00      0.00      0.00        98

    accuracy                           1.00     56962
   macro avg       0.50      0.50      0.50     56962
weighted avg       1.00      1.00      1.00     56962

Accuracy: 0.9982795547909132
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.66      0.79        98

    accuracy                           1.00     56962
   macro avg       0.98      0.83      0.89     56962
weighted avg       1.00      1.00      1.00     56962

Accuracy: 0.999385555282469
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.80  