In [1]:
# Importing necessary lebraries
import numpy as np
import pandas as pd
import os
import sys
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


In [2]:
sys.path.append(os.path.join(os.path.abspath('..')))
# Import modules
from src import data_loading as dl

In [3]:

fraud_df = dl.load_data("processed/processed_fraud_data.csv")
credit_df = dl.load_data("creditcard.csv")

In [4]:
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 22 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Unnamed: 0                    151112 non-null  int64  
 1   user_id                       151112 non-null  int64  
 2   signup_time                   151112 non-null  object 
 3   purchase_time                 151112 non-null  object 
 4   purchase_value                151112 non-null  float64
 5   device_id                     151112 non-null  object 
 6   age                           151112 non-null  float64
 7   ip_address                    151112 non-null  int64  
 8   class                         151112 non-null  int64  
 9   hour_of_day                   151112 non-null  float64
 10  day_of_week                   151112 non-null  float64
 11  purchase_delay                151112 non-null  float64
 12  user_transaction_frequency    151112 non-nul

In [5]:
# Convert signup_time and purchase_time to datetime
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

In [6]:
# Feature engineering: Calculate the time difference between signup and purchase
fraud_df['time_diff'] = (fraud_df['purchase_time'] - fraud_df['signup_time']).dt.total_seconds()

In [7]:
# Drop unnecessary columns
fraud_data = fraud_df.drop(columns=['Unnamed: 0', 'signup_time', 'purchase_time', 'device_id', 'ip_address'])

In [8]:
fraud_data.head()

Unnamed: 0,user_id,purchase_value,age,class,hour_of_day,day_of_week,purchase_delay,user_transaction_frequency,device_transaction_frequency,user_transaction_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M,time_diff
0,22058,-0.160204,0.679914,0,-1.377455,0.99102,-0.136057,0.0,-0.261514,-0.230128,False,True,False,False,False,False,True,4506682.0
1,333320,-1.142592,2.304476,0,-1.522122,-1.501259,-1.571877,0.0,-0.261514,-0.229874,False,False,False,False,False,False,False,17944.0
2,1359,-1.197169,2.304476,1,0.937208,-0.005891,-1.577617,0.0,3.941861,4.345476,False,True,False,False,True,False,True,1.0
3,150084,0.385567,0.911994,0,0.213876,-1.501259,-1.420213,0.0,-0.261514,-0.23012,False,True,False,False,False,True,True,492085.0
4,221365,0.112681,1.376155,0,0.937208,-0.504347,-0.182509,0.0,-0.261514,-0.230128,False,False,False,False,False,True,True,4361461.0


In [20]:
duplicate_count = credit_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 1081


In [21]:
# Remove duplicates in the credit card data
credit_df = credit_df.drop_duplicates()
duplicate_count = credit_df.duplicated().sum()
print(f"Number of duplicate rows after drop duplicate : {duplicate_count}")

Number of duplicate rows after drop duplicate : 0


In [22]:

sys.path.append(os.path.abspath("../"))


from scripts.model import prepare_data, split_data

from scripts.logger import logger 

 Feature and Target Separation for creditcard.csv

In [23]:
X_credit, y_credit = prepare_data(credit_df, 'Class')

INFO:fraud_detection_logger:Preparing data by separating features and target column: Class


2025-02-07 17:27:28,260 - INFO - Preparing data by separating features and target column: Class


Train-Test Split for creditcard.csv

In [24]:

X_train_credit, X_test_credit, y_train_credit, y_test_credit = split_data(X_credit, y_credit)

INFO:fraud_detection_logger:Splitting data into train and test sets


2025-02-07 17:27:31,077 - INFO - Splitting data into train and test sets


Feature and Target Separation for Fraud_Data.csv

In [25]:

X_fraud, y_fraud = prepare_data(fraud_df, 'class')

INFO:fraud_detection_logger:Preparing data by separating features and target column: class


2025-02-07 17:27:33,676 - INFO - Preparing data by separating features and target column: class


Train-Test Split for Fraud_Data.csv

In [26]:

X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = split_data(X_fraud, y_fraud)

INFO:fraud_detection_logger:Splitting data into train and test sets


2025-02-07 17:27:35,731 - INFO - Splitting data into train and test sets


In [27]:
print("Credit Card Data Shapes:")
print("X_train_credit:", X_train_credit.shape)
print("X_test_credit:", X_test_credit.shape)
print("y_train_credit:", y_train_credit.shape)
print("y_test_credit:", y_test_credit.shape)

Credit Card Data Shapes:
X_train_credit: (226980, 30)
X_test_credit: (56746, 30)
y_train_credit: (226980,)
y_test_credit: (56746,)


In [28]:
print("Fraud Data Shapes:")
print("X_train_fraud:", X_train_fraud.shape)
print("X_test_fraud:", X_test_fraud.shape)
print("y_train_fraud:", y_train_fraud.shape)
print("y_test_fraud:", y_test_fraud.shape)

Fraud Data Shapes:
X_train_fraud: (120889, 22)
X_test_fraud: (30223, 22)
y_train_fraud: (120889,)
y_test_fraud: (30223,)


Set the tracking URI to the local server (ensure MLflow tracking server is running)

Train and evaluate Logistic Regression model for creditcard.csv

In [29]:
# Train and evaluate Logistic Regression model for creditcard.csv
with mlflow.start_run(run_name="Logistic Regression - Credit Card Data"):
    logistic_model = LogisticRegression(max_iter=200)  # Increased max_iter
    logistic_model.fit(X_train_credit, y_train_credit)
    y_pred_credit = logistic_model.predict(X_test_credit)
    
    # Generate classification report
    report_credit = classification_report(y_test_credit, y_pred_credit, output_dict=True)
    accuracy_credit = report_credit['accuracy']
    
    # Log parameters, metrics, and model
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_credit)
    mlflow.sklearn.log_model(logistic_model, "logistic_model_credit")
    
    # Print classification report
    print("Logistic Regression - Credit Card Data:\n", classification_report(y_test_credit, y_pred_credit))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Credit Card Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.76      0.63      0.69        90

    accuracy                           1.00     56746
   macro avg       0.88      0.82      0.85     56746
weighted avg       1.00      1.00      1.00     56746

🏃 View run Logistic Regression - Credit Card Data at: http://localhost:5000/#/experiments/0/runs/a4f2cd67e6b142f68d0ec4414de7cd16
🧪 View experiment at: http://localhost:5000/#/experiments/0


In [30]:
# Train and evaluate Logistic Regression model for Fraud_Data.csv
with mlflow.start_run(run_name="Logistic Regression - Fraud Data"):
    logistic_model = LogisticRegression()
    logistic_model.fit(X_train_fraud, y_train_fraud)
    y_pred_fraud = logistic_model.predict(X_test_fraud)
    
    # Generate classification report
    report_fraud = classification_report(y_test_fraud, y_pred_fraud, output_dict=True)
    accuracy_fraud = report_fraud['accuracy']
    
    # Log parameters, metrics, and model
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_fraud)
    mlflow.sklearn.log_model(logistic_model, "logistic_model_fraud")
    
    # Print classification report
    print("Logistic Regression - Fraud Data:\n", classification_report(y_test_fraud, y_pred_fraud))

🏃 View run Logistic Regression - Fraud Data at: http://localhost:5000/#/experiments/0/runs/565b3c32d7b74f8ca21208827f1677f9
🧪 View experiment at: http://localhost:5000/#/experiments/0


TypeError: Cannot cast DatetimeArray to dtype float64