In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import mlflow
import mlflow.sklearn
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Conv1D, LSTM, Dropout, Flatten


In [10]:

# Set up MLflow
mlflow.set_experiment("Fraud Detection")


2024/10/22 18:04:45 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Firew%20Ayele/Desktop/kifiya/Adey-Innovations-Inc/notebooks/mlruns/969604978361663413', creation_time=1729609485785, experiment_id='969604978361663413', last_update_time=1729609485785, lifecycle_stage='active', name='Fraud Detection', tags={}>

Load the datasets

In [11]:
# Load the datasets
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
credit_card_data = pd.read_csv('../data/creditcard.csv')

In [12]:
fraud_data

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,0


In [None]:
credit_card_data

In [13]:
# Inspect columns of fraud_data
print("Columns in fraud_data:", fraud_data.columns)

Columns in fraud_data: Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class'],
      dtype='object')


In [29]:
# Convert datetime strings to datetime objects
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Extract useful datetime components
fraud_data['signup_hour'] = fraud_data['signup_time'].dt.hour
fraud_data['signup_day'] = fraud_data['signup_time'].dt.dayofweek
fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.dayofweek

# Drop the original datetime columns
fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])

In [30]:
# Preprocess Fraud_Data.csv
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

In [31]:
# Preprocess creditcard.csv
X_cc = credit_card_data.drop(columns=['Class'])
y_cc = credit_card_data['Class']

In [32]:
# Train-test split
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud)
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_cc, y_cc, test_size=0.2, random_state=42, stratify=y_cc)


In [33]:
# Convert datetime columns if any
if 'TransactionDate' in X_train_fraud.columns:  # Replace with your datetime column
    X_train_fraud['TransactionDate'] = pd.to_datetime(X_train_fraud['TransactionDate'])
    X_train_fraud['TransactionYear'] = X_train_fraud['TransactionDate'].dt.year
    X_train_fraud['TransactionMonth'] = X_train_fraud['TransactionDate'].dt.month
    X_train_fraud['TransactionDay'] = X_train_fraud['TransactionDate'].dt.day
    X_train_fraud['TransactionHour'] = X_train_fraud['TransactionDate'].dt.hour
    # Drop the original datetime column
    X_train_fraud = X_train_fraud.drop('TransactionDate', axis=1)

# Repeat for X_test_fraud if needed

# Select only numeric columns for scaling
numeric_cols_fraud = X_train_fraud.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_cc = X_train_cc.select_dtypes(include=['float64', 'int64']).columns

# Feature scaling (standardization)
scaler = StandardScaler()

X_train_fraud_scaled = scaler.fit_transform(X_train_fraud[numeric_cols_fraud])
X_test_fraud_scaled = scaler.transform(X_test_fraud[numeric_cols_fraud])

X_train_cc_scaled = scaler.fit_transform(X_train_cc[numeric_cols_cc])
X_test_cc_scaled = scaler.transform(X_test_cc[numeric_cols_cc])


In [34]:
# Define Evaluation Function
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, roc_auc


In [36]:

def log_metrics(model_name, accuracy, precision, recall, f1, roc_auc):
    mlflow.log_param("Model", model_name)
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("Precision", precision)
    mlflow.log_metric("Recall", recall)
    mlflow.log_metric("F1 Score", f1)
    mlflow.log_metric("ROC-AUC", roc_auc)


In [38]:
# Train and Evaluate Models
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy, precision, recall, f1, roc_auc = evaluate_model(y_test, y_pred)
    
    with mlflow.start_run():
        mlflow.sklearn.log_model(model, model_name)
        log_metrics(model_name, accuracy, precision, recall, f1, roc_auc)
        
    print(f"{model_name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}, ROC-AUC: {roc_auc}")


In [39]:

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier(max_iter=500)
}


In [42]:
# Check for non-numeric columns
print(X_train_fraud.dtypes)


user_id             int64
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
signup_hour         int32
signup_day          int32
purchase_hour       int32
purchase_day        int32
dtype: object


In [43]:
from sklearn.preprocessing import LabelEncoder

# Label encode 'sex' column
label_encoder = LabelEncoder()
X_train_fraud['sex'] = label_encoder.fit_transform(X_train_fraud['sex'])
X_test_fraud['sex'] = label_encoder.transform(X_test_fraud['sex'])


In [45]:
# Frequency encoding for 'device_id', 'source', and 'browser'
for col in ['device_id', 'source', 'browser']:
    freq_encoding = X_train_fraud[col].value_counts(normalize=True)
    X_train_fraud[col] = X_train_fraud[col].map(freq_encoding)
    X_test_fraud[col] = X_test_fraud[col].map(freq_encoding)


In [46]:
from category_encoders import TargetEncoder

# Target encode 'device_id', 'source', and 'browser'
target_encoder = TargetEncoder(cols=['device_id', 'source', 'browser'])
X_train_fraud = target_encoder.fit_transform(X_train_fraud, y_train_fraud)
X_test_fraud = target_encoder.transform(X_test_fraud)


In [47]:
# Encode only the top N frequent categories and group others
N = 10  # Choose the number of top categories to keep

for col in ['device_id', 'source', 'browser']:
    top_categories = X_train_fraud[col].value_counts().nlargest(N).index
    X_train_fraud[col] = X_train_fraud[col].where(X_train_fraud[col].isin(top_categories), other='Other')
    X_test_fraud[col] = X_test_fraud[col].where(X_test_fraud[col].isin(top_categories), other='Other')

# Then, apply one-hot encoding after reducing categories
X_train_fraud = pd.get_dummies(X_train_fraud, columns=['device_id', 'source', 'browser'], drop_first=True)
X_test_fraud = pd.get_dummies(X_test_fraud, columns=['device_id', 'source', 'browser'], drop_first=True)



In [48]:

# Train and evaluate on Fraud_Data.csv
for model_name, model in models.items():
    train_and_evaluate(model, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, model_name)

# Train and evaluate on creditcard.csv
for model_name, model in models.items():
    train_and_evaluate(model, X_train_cc, X_test_cc, y_train_cc, y_test_cc, model_name)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression - Accuracy: 0.9063627039010026, Precision: 0.0, Recall: 0.0, F1: 0.0, ROC-AUC: 0.5




Decision Tree - Accuracy: 0.641498196737584, Precision: 0.15670297624153015, Recall: 0.6455830388692579, F1: 0.2521913175512458, ROC-AUC: 0.6433296131081953




Random Forest - Accuracy: 0.5926612182774708, Precision: 0.14252318829650856, Recall: 0.6678445229681979, F1: 0.23491392703996022, ROC-AUC: 0.6263692369887899




Gradient Boosting - Accuracy: 0.18654666975482248, Precision: 0.0882698058215678, Recall: 0.8240282685512368, F1: 0.1594584430236931, ROC-AUC: 0.4723580177494986


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


MLP - Accuracy: 0.9063627039010026, Precision: 0.0, Recall: 0.0, F1: 0.0, ROC-AUC: 0.5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression - Accuracy: 0.9990344440153085, Precision: 0.7009345794392523, Recall: 0.7653061224489796, F1: 0.7317073170731707, ROC-AUC: 0.8823716881237583




Decision Tree - Accuracy: 0.9990871107053826, Precision: 0.7346938775510204, Recall: 0.7346938775510204, F1: 0.7346938775510204, ROC-AUC: 0.8671183231311658




Random Forest - Accuracy: 0.9995611109160493, Precision: 0.9397590361445783, Recall: 0.7959183673469388, F1: 0.861878453038674, ROC-AUC: 0.8979152191264801




Gradient Boosting - Accuracy: 0.9983146659176293, Precision: 0.5294117647058824, Recall: 0.1836734693877551, F1: 0.2727272727272727, ROC-AUC: 0.5916960481435117




MLP - Accuracy: 0.9987886661282961, Precision: 0.6141732283464567, Recall: 0.7959183673469388, F1: 0.6933333333333334, ROC-AUC: 0.8975283311129741
