In [1]:
import pandas as pd  
import json  
import os

In [2]:
import sys
# Make the project root importable so that `modules.*` resolves.
# Every data/storage path in this notebook is written as '../...', i.e. the
# notebook runs one level below the project root, so derive the root from the
# working directory instead of pinning one machine's absolute path.
project_root = os.path.abspath('..')
if project_root not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(project_root)



In [3]:
from modules.raw_data_handler import Raw_Data_Handler
import os
# The handler exposes the three pipeline stages used below: extract, transform, load.
handler = Raw_Data_Handler()

# Input and output locations, all relative to the notebook's working directory.
customer_file, transaction_file, fraud_file = (
    '../data_sources/customer_release.csv',
    '../data_sources/transactions_release.parquet',
    '../data_sources/fraud_release.json',
)
final_data_path = '../storage/dataset/final_data.parquet'

# Extract: one dataset comes back per source file, in the order the paths are given.
(customer_data,
 transaction_data,
 fraud_data) = handler.extract(customer_file, transaction_file, fraud_file)

# Transform: the three raw inputs are turned into a single dataset.
final_data = handler.transform(customer_data, transaction_data, fraud_data)

# Load: hand the result to the handler together with its destination path.
handler.load(final_data, final_data_path)
print(f"Cleaned data saved to: {final_data_path}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data['is_fraud'].fillna(0, inplace=True)  # Fill missing fraud labels with 0


Cleaned data saved to: ../storage/dataset/final_data.parquet


In [4]:
from modules.dataset_design import Dataset_Designer

dataset_designer = Dataset_Designer()

# Read back the dataset stored at the path the raw-data step wrote to.
final_data = dataset_designer.extract('../storage/dataset/final_data.parquet')

# Split into a training part and a testing part; the split policy lives in
# Dataset_Designer.sample and is not configured here.
train_data, test_data = dataset_designer.sample(final_data)

# load() receives two parallel lists: the datasets and their destination paths.
split_paths = [
    '../storage/dataset/train_data.parquet',
    '../storage/dataset/test_data.parquet',
]
dataset_designer.load([train_data, test_data], split_paths)

print("Training and testing datasets saved.")


Dataset saved to ../storage/dataset/train_data.parquet
Dataset saved to ../storage/dataset/test_data.parquet
Training and testing datasets saved.


In [5]:
from modules.feature_extractor import Feature_Extractor

feature_extractor = Feature_Extractor()

# Read the persisted train/test splits through the extractor.
train_data, test_data = feature_extractor.extract('../storage/dataset/train_data.parquet', '../storage/dataset/test_data.parquet')

# transform() returns one (features, labels) pair per split; the model-training
# cell in this notebook unpacks it in exactly this shape. Binding each pair to a
# single `*_features` name would leave a tuple behind a name that promises a
# feature matrix, so unpack features and labels separately here as well.
(train_features, train_labels), (test_features, test_labels) = feature_extractor.transform(train_data, test_data)

In [6]:
import pandas as pd
import os
import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from modules.feature_extractor import Feature_Extractor

# --- Locations -------------------------------------------------------------
# Resolved to absolute paths so the "saved to" messages below are unambiguous.
base_dataset_path = os.path.abspath('../storage/dataset')
base_model_path = os.path.abspath('../storage/models/artifacts')

# The artifacts directory may not exist on a fresh checkout.
os.makedirs(base_model_path, exist_ok=True)

train_data_path = os.path.join(base_dataset_path, 'train_data.parquet')
test_data_path = os.path.join(base_dataset_path, 'test_data.parquet')

# --- Features --------------------------------------------------------------
# Relies on the `feature_extractor` instance created in the feature-extraction
# cell; transform() yields one (features, labels) pair per split.
(train_features, train_labels), (test_features, test_labels) = feature_extractor.transform(
    pd.read_parquet(train_data_path),
    pd.read_parquet(test_data_path)
)

# The estimators below are fed dense arrays (GaussianNB does not accept sparse
# input). Convert only when the object actually offers `toarray()`, so an
# extractor that already returns dense arrays does not raise AttributeError.
if hasattr(train_features, "toarray"):
    train_features = train_features.toarray()
if hasattr(test_features, "toarray"):
    test_features = test_features.toarray()

# --- Candidate models ------------------------------------------------------
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# model name -> {metric name -> value}
performance_metrics = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(train_features, train_labels)
    y_pred = model.predict(test_features)
    # Column 1 of predict_proba is the score of the second class in
    # model.classes_ (the positive class for 0/1 labels — assumes binary labels).
    y_prob = model.predict_proba(test_features)[:, 1] if hasattr(model, "predict_proba") else None

    # zero_division=0 makes the fallback explicit: a model that predicts no
    # positives scores 0.0 (the same value the default yields) without
    # emitting UndefinedMetricWarning on every run.
    metrics = {
        'Accuracy': accuracy_score(test_labels, y_pred),
        'Precision': precision_score(test_labels, y_pred, zero_division=0),
        'Recall': recall_score(test_labels, y_pred, zero_division=0),
        'F1 Score': f1_score(test_labels, y_pred, zero_division=0),
        'AUC': roc_auc_score(test_labels, y_prob) if y_prob is not None else 'N/A'
    }
    performance_metrics[model_name] = metrics

    # One artifact per model, e.g. 'Logistic Regression' -> logistic_regression.pkl
    model_filename = f"{model_name.replace(' ', '_').lower()}.pkl"
    model_path = os.path.join(base_model_path, model_filename)
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

# --- Report ----------------------------------------------------------------
# Transpose so each row is a model and each column a metric.
metrics_df = pd.DataFrame(performance_metrics).T
metrics_filename = os.path.join(base_model_path, 'model_performance_metrics.csv')
metrics_df.to_csv(metrics_filename)
print(f"Performance metrics saved to: {metrics_filename}")


Training Logistic Regression...
Model saved to: c:\Users\shrin\OneDrive\Documents\GitHub\babel-shrinit\securebank\storage\models\artifacts\logistic_regression.pkl
Training Naive Bayes...
Model saved to: c:\Users\shrin\OneDrive\Documents\GitHub\babel-shrinit\securebank\storage\models\artifacts\naive_bayes.pkl
Performance metrics saved to: c:\Users\shrin\OneDrive\Documents\GitHub\babel-shrinit\securebank\storage\models\artifacts\model_performance_metrics.csv
