In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import mlflow

# Import custom functions from src folder
import sys
sys.path.append('../src')  # Add src folder to Python path
from data_preprocessing import (
    handle_missing_values,
    clean_data,
    perform_eda,
    merge_geolocation_data,
    feature_engineering,
    normalize_data,
    encode_categorical_features,
)
from model_training import (
    prepare_data,
    select_model,
    train_and_evaluate,
    log_experiment,
)

# Load datasets
# Paths are relative to the notebook's working directory (one level below the
# project root, next to the `data` folder).
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
ip_country_data = pd.read_csv('../data/IpAddress_to_Country.csv')
creditcard_data = pd.read_csv('../data/creditcard.csv')

# Display dataset summaries
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line after
# every summary.
print("Fraud Data Summary:")
fraud_data.info()
print("\nIP Country Data Summary:")
ip_country_data.info()
print("\nCredit Card Data Summary:")
creditcard_data.info()

In [None]:
# Task 1: Data Analysis and Preprocessing
# 1. Handle Missing Values
# Both frames are passed through the project's `handle_missing_values` helper
# with strategy='drop' and rebound to their own names so that every later cell
# works on the processed data. (The fraud result was previously bound to a
# misspelled name and therefore never used downstream.)
fraud_data = handle_missing_values(fraud_data, strategy='drop')
creditcard_data = handle_missing_values(creditcard_data, strategy='drop')

In [None]:
# 2. Data Cleaning
# Each frame is run through the project's `clean_data` helper and rebound to the
# same name, so this cell overwrites its own inputs.
# NOTE(review): what `clean_data` actually removes or fixes is not visible here —
# see src/data_preprocessing.py; re-running this cell is only safe if the helper
# is idempotent — confirm.
fraud_data = clean_data(fraud_data)
creditcard_data = clean_data(creditcard_data)


In [None]:
# 3. Exploratory Data Analysis (EDA)
# Run the project's `perform_eda` helper on each dataset in turn, printing a
# heading before each report so the two outputs can be told apart.
for eda_heading, eda_frame in (
    ("Fraud Data EDA:", fraud_data),
    ("\nCredit Card Data EDA:", creditcard_data),
):
    print(eda_heading)
    perform_eda(eda_frame)


In [None]:
# 4. Merge Datasets for Geolocation Analysis
# Combine the fraud transactions with the IP-to-country table via the project's
# `merge_geolocation_data` helper.
merged_fraud_data = merge_geolocation_data(
    fraud_data,
    ip_country_data,
)

# 5. Feature Engineering
# The same `feature_engineering` helper is applied to both datasets; each result
# replaces the frame it was computed from.
merged_fraud_data = feature_engineering(merged_fraud_data)
# Add time-based features if needed
creditcard_data = feature_engineering(creditcard_data)


In [None]:
# 6. Normalization and Scaling
# Columns handed to the project's `normalize_data` helper.
# NOTE(review): "transaction_frequency" and "transaction_velocity" are presumably
# produced by the feature-engineering step — confirm they exist at this point.
numerical_features = ["purchase_value", "age", "transaction_frequency", "transaction_velocity"]
merged_fraud_data = normalize_data(
    merged_fraud_data,
    numerical_features,
)

# 7. Encode Categorical Features
# Columns handed to the project's `encode_categorical_features` helper.
categorical_features = [
    "source",
    "browser",
    "sex",
]
merged_fraud_data = encode_categorical_features(
    merged_fraud_data,
    categorical_features,
)


In [None]:
# Task 2: Model Building and Training

# 1. Data Preparation
# `prepare_data` returns four values per dataset, unpacked here as train/test
# features and train/test labels. Note the differing target column names:
# "class" for the fraud data, "Class" for the credit card data.
(X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud) = prepare_data(
    merged_fraud_data,
    target_column="class",
)
(X_train_credit, X_test_credit, y_train_credit, y_test_credit) = prepare_data(
    creditcard_data,
    target_column="Class",
)

# 2. Model Selection
# Names understood by the project's `select_model` helper.
models = ["LogisticRegression", "DecisionTree", "RandomForest", "GradientBoosting", "MLP"]

# Every model is trained on both datasets, fraud data first. A fresh estimator
# is requested from `select_model` for each (model, dataset) pair so that no
# fitted state carries over between runs.
dataset_splits = (
    ("Fraud Data", X_train_fraud, y_train_fraud, X_test_fraud, y_test_fraud),
    ("Credit Card Data", X_train_credit, y_train_credit, X_test_credit, y_test_credit),
)

for model_name in models:
    for dataset_label, X_tr, y_tr, X_te, y_te in dataset_splits:
        print(f"\nTraining {model_name} on {dataset_label}:")
        model = select_model(model_name)
        train_and_evaluate(model, X_tr, y_tr, X_te, y_te)


In [None]:
# 3. Log Experiments with MLflow
# Train a dedicated RandomForest on the fraud data so that the logged model, the
# logged params and the logged metric all describe the same experiment. Reusing
# the training loop's leftover `model` variable would log whichever estimator
# the loop touched last (the final entry of `models`, fitted on the credit card
# split) under a "RandomForest / Fraud Data" label, scored on fraud features it
# was never trained on.
rf_fraud_model = select_model("RandomForest")
# NOTE(review): assumes `train_and_evaluate` fits the estimator in place (the
# previous version of this cell relied on the same assumption by calling
# `predict` on a model passed through it) — confirm in src/model_training.py.
train_and_evaluate(
    rf_fraud_model, X_train_fraud, y_train_fraud, X_test_fraud, y_test_fraud
)

# ROC AUC ranks samples by score, so feed it positive-class probabilities when
# the estimator exposes them; hard 0/1 predictions collapse the curve to a
# single operating point.
# NOTE(review): assumes a binary target whose positive class is the second
# column of `predict_proba` (scikit-learn convention) — confirm.
if hasattr(rf_fraud_model, "predict_proba"):
    fraud_scores = rf_fraud_model.predict_proba(X_test_fraud)[:, 1]
else:
    fraud_scores = rf_fraud_model.predict(X_test_fraud)

params = {"model": "RandomForest", "dataset": "Fraud Data"}
metrics = {"roc_auc": roc_auc_score(y_test_fraud, fraud_scores)}
log_experiment(rf_fraud_model, params, metrics)
