In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
# Make project-local packages importable from this notebook by adding the
# parent of the current working directory to the import search path
# (presumably the project root that contains the `scripts` package — confirm).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so re-running this cell in the same kernel does not keep
# adding duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Suppress all warning messages
import warnings
warnings.filterwarnings('ignore')

In [2]:
from scripts.model_development_scripts import ModelPipeline

2024/10/20 13:40:48 INFO mlflow.tracking.fluent: Experiment with name 'Fraud_Detection_Experiment' does not exist. Creating a new experiment.


In [14]:
# Read both input CSVs (processed fraud data first, then the credit-card data).
fraud_data, credit_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/proccessed_fraud_data.csv',
        '../data/creditcard.csv',
    )
)

In [15]:
# Preview the first five rows of the processed fraud data.
fraud_data.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,ip_int,country,hour_of_day,day_of_week,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
0,247547,2015-06-28 03:00:34,2015-08-09 03:57:29,0.262069,KIXYSVCHIPQBR,0.206897,16778860.0,0,16778864,Australia,3,6,0,1,0,0,0,1,0
1,220737,2015-01-28 14:21:11,2015-02-11 20:28:28,0.041379,PKYOWQKWGJNJI,0.275862,16842050.0,0,16842045,Thailand,20,2,0,1,0,0,0,0,0
2,390400,2015-03-19 20:49:09,2015-04-11 23:41:23,0.241379,LVCSXLISZHVUO,0.189655,16843660.0,0,16843656,China,23,5,0,0,0,1,0,0,1
3,69592,2015-02-24 06:11:57,2015-05-23 16:40:14,0.317241,UHAUHNXXUADJE,0.206897,16938730.0,0,16938732,China,16,5,1,0,0,0,0,0,0
4,174987,2015-07-07 12:58:11,2015-11-03 04:04:30,0.289655,XPGPMOHIDRMGE,0.327586,16971980.0,0,16971984,Thailand,4,1,0,1,0,0,0,0,0


In [16]:
# Remove the identifier, IP and raw timestamp columns before modelling.
fraud_data = fraud_data.drop(
    columns=[
        'user_id',
        'device_id',
        'ip_address',
        'ip_int',
        'signup_time',
        'purchase_time',
    ]
)

In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical 'country' column as integer labels.
label_encoder = LabelEncoder()

# Append the encoded values as 'country_encoded' and drop the original
# string column in a single chained step.
fraud_data = (
    fraud_data
    .assign(country_encoded=label_encoder.fit_transform(fraud_data['country']))
    .drop(columns='country')
)


In [18]:
# Persist the fully preprocessed fraud data (without the index column) so the
# modelling pipeline can load it from disk.
fraud_data.to_csv(
    path_or_buf='../data/final_preprocessed_fraud_data.csv',
    index=False,
)

In [21]:
# Display the final preprocessed frame; the notebook renders a truncated view
# (first and last rows only).
fraud_data

Unnamed: 0,purchase_value,age,class,hour_of_day,day_of_week,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M,country_encoded
0,0.262069,0.206897,0,3,6,0,1,0,0,0,1,0,7
1,0.041379,0.275862,0,20,2,0,1,0,0,0,0,0,162
2,0.241379,0.189655,0,23,5,0,0,0,1,0,0,1,36
3,0.317241,0.206897,0,16,5,1,0,0,0,0,0,0,36
4,0.289655,0.327586,0,4,1,0,1,0,0,0,0,0,162
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129141,0.324138,0.000000,0,22,1,1,0,0,0,1,0,0,36
129142,0.186207,0.310345,0,22,3,1,0,0,0,0,0,0,36
129143,0.193103,0.413793,0,16,3,0,1,1,0,0,0,1,36
129144,0.000000,0.327586,0,9,4,1,0,0,0,0,0,0,36


### Model Development for Fraud Detection

In [3]:
# Build the modelling pipeline for the fraud dataset, pointing it at the
# preprocessed CSV written earlier in this notebook.
fraud_data_file_path = '../data/final_preprocessed_fraud_data.csv'
fraud_pipeline = ModelPipeline('fraud', fraud_data_file_path)

In [4]:
# Load the fraud CSV into the pipeline (progress is reported via log output).
fraud_pipeline.load_data()

INFO:root:Loading fraud data from ../data/final_preprocessed_fraud_data.csv...
INFO:root:Data loading complete.


In [5]:
# Split the loaded data into train and test sets; no split parameters are
# passed here, so presumably they are set inside ModelPipeline — confirm.
fraud_pipeline.split_data()

INFO:root:Data has been split into train and test sets.


In [6]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Candidate classifiers for the fraud dataset, as (estimator, display name)
# pairs. A fixed random_state makes the tree and ensemble models reproducible
# across kernel restarts; for LogisticRegression it only matters with the
# stochastic solvers and is passed for uniformity.
RANDOM_STATE = 42
models = [
    (LogisticRegression(random_state=RANDOM_STATE), 'Logistic Regression'),
    (DecisionTreeClassifier(random_state=RANDOM_STATE), 'Decision Tree'),
    (RandomForestClassifier(random_state=RANDOM_STATE), 'Random Forest'),
    (GradientBoostingClassifier(random_state=RANDOM_STATE), 'Gradient Boosting'),
]

In [7]:
# Fit, evaluate and log every candidate model on the fraud dataset.
for model, name in models:
    fraud_pipeline.train_model(model, name)
    report = fraud_pipeline.evaluate_model(model, name)
    fraud_pipeline.log_model(model, name, report)

INFO:root:Training Logistic Regression on fraud dataset...
INFO:root:Logistic Regression training complete.
INFO:root:Evaluating Logistic Regression on fraud dataset...
INFO:root:Logistic Regression evaluation report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     35104
           1       0.00      0.00      0.00      3640

    accuracy                           0.91     38744
   macro avg       0.45      0.50      0.48     38744
weighted avg       0.82      0.91      0.86     38744

INFO:root:Logging Logistic Regression to MLflow...
INFO:root:Logistic Regression has been logged and saved in MLflow.
INFO:root:Training Decision Tree on fraud dataset...
INFO:root:Decision Tree training complete.
INFO:root:Evaluating Decision Tree on fraud dataset...
INFO:root:Decision Tree evaluation report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     35104
           1       0.45      0.57 

### Model Development for Credit Card

In [8]:
# Build the modelling pipeline for the credit-card dataset, reading directly
# from the credit-card CSV.
creditcard_file_path = '../data/creditcard.csv'
creditcard_pipeline = ModelPipeline('creditcard', creditcard_file_path)

In [9]:
# Load the credit-card CSV into the pipeline (progress is reported via log output).
creditcard_pipeline.load_data()

INFO:root:Loading credit card data from ../data/creditcard.csv...
INFO:root:Data loading complete.


In [10]:
# Split the loaded data into train and test sets; no split parameters are
# passed here, so presumably they are set inside ModelPipeline — confirm.
creditcard_pipeline.split_data()

INFO:root:Data has been split into train and test sets.


In [11]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Candidate classifiers for the credit-card dataset, as (estimator, display
# name) pairs. These are fresh, unfitted instances, separate from the ones
# used for the fraud dataset. A fixed random_state makes the tree and ensemble
# models reproducible across kernel restarts; for LogisticRegression it only
# matters with the stochastic solvers and is passed for uniformity.
RANDOM_STATE = 42
credit_models = [
    (LogisticRegression(random_state=RANDOM_STATE), 'Logistic Regression'),
    (DecisionTreeClassifier(random_state=RANDOM_STATE), 'Decision Tree'),
    (RandomForestClassifier(random_state=RANDOM_STATE), 'Random Forest'),
    (GradientBoostingClassifier(random_state=RANDOM_STATE), 'Gradient Boosting'),
]

In [12]:
# Fit, evaluate and log every candidate model on the credit-card dataset.
# Iterate over `credit_models`, the list built for this dataset, rather than
# `models`, whose estimators were already trained on the fraud data.
for model, name in credit_models:
    creditcard_pipeline.train_model(model, name)
    report = creditcard_pipeline.evaluate_model(model, name)
    creditcard_pipeline.log_model(model, name, report)

INFO:root:Training Logistic Regression on creditcard dataset...
INFO:root:Logistic Regression training complete.
INFO:root:Evaluating Logistic Regression on creditcard dataset...
INFO:root:Logistic Regression evaluation report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.58      0.60      0.59       136

    accuracy                           1.00     85443
   macro avg       0.79      0.80      0.79     85443
weighted avg       1.00      1.00      1.00     85443

INFO:root:Logging Logistic Regression to MLflow...
INFO:root:Logistic Regression has been logged and saved in MLflow.
INFO:root:Training Decision Tree on creditcard dataset...
INFO:root:Decision Tree training complete.
INFO:root:Evaluating Decision Tree on creditcard dataset...
INFO:root:Decision Tree evaluation report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1  