# In [11]:
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 🔹 1. Define paths correctly
def _locate_base_dir(start_dir, target_parts):
    """Return the directory that contains the dataset file.

    The parent of ``start_dir`` is tried first (the layout this script was
    written for: launched from a folder directly under the project root).
    If the file is not there, ``start_dir`` itself and then every higher
    ancestor are tried, so the script also works when launched from the
    project root or from a more deeply nested folder.

    Args:
        start_dir: Directory the search starts from (normally the cwd).
        target_parts: Path components of the dataset relative to the
            project root, e.g. ``("data", "cleaned_creditcard.csv")``.

    Returns:
        The first candidate directory containing the target. When no
        candidate contains it, the parent of ``start_dir`` is returned so
        the caller reports the same path it always has.
    """
    default_base = os.path.abspath(os.path.join(start_dir, ".."))
    candidates = [default_base, os.path.abspath(start_dir)]

    # Append the remaining ancestors, stopping at the filesystem root
    # (where dirname() returns its own argument).
    current = default_base
    while True:
        parent = os.path.dirname(current)
        if parent == current:
            break
        candidates.append(parent)
        current = parent

    for candidate in candidates:
        if os.path.exists(os.path.join(candidate, *target_parts)):
            return candidate
    return default_base


BASE_DIR = _locate_base_dir(os.getcwd(), ("data", "cleaned_creditcard.csv"))
DATA_PATH = os.path.join(BASE_DIR, "data", "cleaned_creditcard.csv")
MODEL_DIR = os.path.join(BASE_DIR, "models")

# 🔹 2. Check if dataset exists
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH}")

# Ensure the models directory exists. Done only after the dataset check so a
# failed run does not leave an empty models/ folder in the wrong place.
os.makedirs(MODEL_DIR, exist_ok=True)

# 🔹 3. Load dataset
df = pd.read_csv(DATA_PATH)

# If any cell in the frame is missing, warn and replace every missing cell
# with 0; a fully populated frame is left untouched.
has_missing = df.isna().any().any()
if has_missing:
    print("Warning: Missing values found! Filling with zeros.")
    df = df.fillna(value=0)

# 🔹 4. Separate features and target
X = df.drop(columns=["Class"])
y = df["Class"]

# 🔹 5. Split into 70% training / 30% test data
# stratify=y keeps the proportion of each "Class" value the same in both
# splits. Without it, a rare class (presumably the fraud label here — confirm
# against the dataset) can end up over- or under-represented in the test set,
# which makes the reported metrics unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Convert DataFrame to NumPy arrays for stability
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.values, y_test.values

# 🔹 6. Train Logistic Regression Model
# fit() returns the estimator itself, so construction and training are chained.
logistic_model = LogisticRegression(
    solver="liblinear",
    max_iter=200,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_test, y_pred)
print(f"🔹 Logistic Regression Accuracy: {logistic_accuracy:.4f}")
logistic_report = classification_report(y_test, y_pred)
print(logistic_report)

# 🔹 7. Train Random Forest Model (Optimized Hyperparameters)
# Hyperparameters are gathered in one mapping so they can be read at a glance.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "n_jobs": -1,
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report.
y_rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_rf_pred)
print(f"🔹 Random Forest Accuracy: {rf_accuracy:.4f}")
rf_report = classification_report(y_test, y_rf_pred)
print(rf_report)

# 🔹 8. Save models
logistic_model_path = os.path.join(MODEL_DIR, "logistic_regression.pkl")
rf_model_path = os.path.join(MODEL_DIR, "random_forest.pkl")

# Persist each fitted estimator to its own file, logistic regression first.
for fitted_model, target_path in (
    (logistic_model, logistic_model_path),
    (rf_model, rf_model_path),
):
    joblib.dump(fitted_model, target_path)

print(f"✅ Models saved successfully at:\n- {logistic_model_path}\n- {rf_model_path}")

# 🔹 9. Save feature column names for API
# The column names are taken from X (the feature frame) in their original order.
feature_columns = X.columns.tolist()
feature_columns_path = os.path.join(MODEL_DIR, "feature_columns.pkl")
joblib.dump(feature_columns, feature_columns_path)

print(f"✅ Feature columns saved successfully at {feature_columns_path}")


# Recorded output from a run of this cell:
# FileNotFoundError: Dataset not found: /Users/hadywehbi/credit_card_fraud_detection/notebooks/data/cleaned_creditcard.csv