In [1]:
import sys
import os

sys.path.append(os.path.abspath('..'))

import pandas as pd
from modules.raw_data_handler import Raw_Data_Handler
from modules.dataset_design import Dataset_Designer
from modules.feature_extractor import Feature_Extractor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle
import os

In [2]:
raw_handler = Raw_Data_Handler()

customer_data, transaction_data, fraud_data = raw_handler.extract(
    '../data_sources/customer_release.csv', 
    '../data_sources/transactions_release.parquet', 
    '../data_sources/fraud_release.json'
)

In [3]:
cleaned_data = raw_handler.transform(customer_data, transaction_data, fraud_data)
raw_handler.load(cleaned_data, '../storage/dataset/cleaned_data.parquet')

In [4]:
dataset_designer = Dataset_Designer()

cleaned_data = dataset_designer.extract('../storage/dataset/cleaned_data.parquet')

train_data, test_data = dataset_designer.sample(cleaned_data)

dataset_designer.load([train_data, test_data], ['../storage/dataset/train_data.parquet', '../storage/dataset/test_data.parquet'])

In [5]:
feature_extractor = Feature_Extractor()

train_data, test_data = feature_extractor.extract('../storage/dataset/train_data.parquet', '../storage/dataset/test_data.parquet')

train_features, test_features = feature_extractor.transform(train_data, test_data)

In [6]:
train_features = pd.get_dummies(train_features, columns=['category', 'merchant'])
test_features = pd.get_dummies(test_features, columns=['category', 'merchant'])

X_train = train_features.drop(columns=['is_fraud'])
y_train = train_features['is_fraud']

X_test = test_features.drop(columns=['is_fraud'])
y_test = test_features['is_fraud']

In [7]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'LogisticRegression': LogisticRegression(max_iter=500),
    'NaiveBayes': GaussianNB(),
    'ExtraTrees': ExtraTreesClassifier(),
}

In [10]:
model_performance = {}
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    model_performance[model_name] = accuracy_score(y_test, y_pred)
    
    os.makedirs('../storage/models/artifacts/', exist_ok=True)
    with open(f'../storage/models/artifacts/{model_name}.pkl', 'wb') as f:
        pickle.dump(model, f)

In [11]:
for model_name, performance in model_performance.items():
    print(f"{model_name} Accuracy: {performance:.2f}")

for model_name, model in models.items():
    print(f"Classification Report for {model_name}:")
    print(classification_report(y_test, model.predict(X_test)))

LogisticRegression Accuracy: 1.00
NaiveBayes Accuracy: 0.25
ExtraTrees Accuracy: 1.00
Classification Report for LogisticRegression:




              precision    recall  f1-score   support

         0.0       1.00      0.31      0.47    328225
         1.0       0.01      0.90      0.01      1284

    accuracy                           0.31    329509
   macro avg       0.50      0.61      0.24    329509
weighted avg       0.99      0.31      0.47    329509

Classification Report for NaiveBayes:




              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    328225
         1.0       0.00      0.00      0.00      1284

    accuracy                           1.00    329509
   macro avg       0.50      0.50      0.50    329509
weighted avg       0.99      1.00      0.99    329509

Classification Report for ExtraTrees:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    328225
         1.0       0.00      0.00      0.00      1284

    accuracy                           1.00    329509
   macro avg       0.50      0.50      0.50    329509
weighted avg       0.99      1.00      0.99    329509



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Let me first address why I used Naive Bayes and Extra Trees classifiers, instead of the recommended Random Forest and SVM classfiers. My computer that I am using takes too long and can't run the Random Forest and SVM classifiers, so I chose the Naive Bayes classifier and Extra Forest classfiers as more lightweight options. However, I have defaulted to using the Logistic Regression given that it is one of the recommended classfiers and that it has a very high accuracy here.