# Imports & Config

In [1]:
import sys
import os
import pandas as pd

# Add project root to Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))  # if notebook is in a subfolder
# or
sys.path.append(os.path.abspath("."))  # if notebook is in project root

from src.task_5.data_split import split_data
from src.task_5.model_factory import get_models
from src.task_5.train import train_and_log_model


  return FileStore(store_uri, store_uri)


# Load Data

In [2]:
df = pd.read_csv("../data/processed/clean_data.csv")

# Assuming this is your data loading cell
# ... (imports) ...

# --- 1. Load Data ---
# df = pd.read_csv(DATA_PATH)
print(f"Data Loaded. Shape: {df.shape}")
print("-" * 30)

# --- 2. Check for Target Column Presence and Type ---
TARGET = "is_high_risk"

if TARGET not in df.columns:
    # CRITICAL CHECK: Does the column exist?
    print(f"FATAL ERROR: Column '{TARGET}' not found in DataFrame.")
    print(f"Available columns: {df.columns.tolist()}")
    raise KeyError("Missing target column.")

# --- 3. Check for Nulls/NaNs in the Target Column ---
target_nan_count = df[TARGET].isnull().sum()
print(f"NaN count in '{TARGET}': {target_nan_count}")

if target_nan_count == len(df):
    # CRITICAL CHECK: Is the entire column empty (NaN)?
    print("FATAL ERROR: Target column is entirely NaN. Check CSV encoding or preprocessing step.")
    raise ValueError("Target column is all null.")

# --- 4. Identify Unique Values (This is where your previous error occurred) ---
unique_targets = df[TARGET].unique()

# Filter out NaNs before checking length, just in case
valid_unique_targets = df[TARGET].dropna().unique()

print(f"Unique non-NaN values in '{TARGET}': {valid_unique_targets}")

if len(valid_unique_targets) != 2:
    print(f"ERROR: Expected 2 unique values, found {len(valid_unique_targets)}.")
    print("The error is likely due to the column being read as an object type (string) or having too many zeros/near-zero values.")
    
    # Check the data type of the target column
    print(f"Dtype of '{TARGET}': {df[TARGET].dtype}")
    
    # If the dtype is 'object', try converting it to numeric:
    if df[TARGET].dtype == 'object':
         df[TARGET] = pd.to_numeric(df[TARGET], errors='coerce')
         valid_unique_targets = df[TARGET].dropna().unique()
         print(f"After coercion, unique values: {valid_unique_targets}")
         if len(valid_unique_targets) != 2:
             raise ValueError("Failed to isolate 2 scaled target values.")
    else:
        # If it's already numeric but the unique list is empty/wrong, the data itself is flawed.
        raise ValueError("Target column data quality issue.")


# --- 5. Re-mapping Logic (Only run if the checks above pass) ---

sorted_targets = sorted(valid_unique_targets)
scaled_to_binary_map = {
    sorted_targets[0]: 0,
    sorted_targets[1]: 1
}
print(f"\nMapping used: {scaled_to_binary_map}")

df[TARGET] = df[TARGET].map(scaled_to_binary_map)

print(f"\nValue counts after re-mapping '{TARGET}':")
print(df[TARGET].value_counts(dropna=False))

# Now proceed to split_data(df, target=TARGET)



Data Loaded. Shape: (95662, 56)
------------------------------
NaN count in 'is_high_risk': 0
Unique non-NaN values in 'is_high_risk': [-0.36093702  2.77056645]

Mapping used: {np.float64(-0.3609370206423462): 0, np.float64(2.7705664501256675): 1}

Value counts after re-mapping 'is_high_risk':
is_high_risk
0    84636
1    11026
Name: count, dtype: int64


# Split Data

In [3]:
# --- CONTINUE HERE ---

# 4. (Continued) Re-mapping Logic is complete and successful.
print(f"\nTarget column is successfully re-mapped to integer classes (0, 1).")
print("-" * 30)

# 6. Data Split (Proceeding from the successful data preparation)
print("--- 6. Splitting Data for Training and Testing ---")

# Assuming your split_data utility function is available and correctly imported:
from src.task_5.data_split import split_data  # Ensure this import points to your utility file

TARGET = "is_high_risk"
# Use the utility function to split the data
X_train, X_test, y_train, y_test = split_data(df.copy(), target=TARGET)

print(f"Data Split Success:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Target split (Train 1s): {y_train.sum() / len(y_train) * 100:.2f}%")
print(f"Target split (Test 1s): {y_test.sum() / len(y_test) * 100:.2f}%")

# The data is now prepared. Proceed to call your training functions.
# Example:
# from src.train_models import train_and_track_model
# logreg_metrics, _ = train_and_track_model(model_name='LogisticRegression', X_train, X_test, y_train, y_test)


Target column is successfully re-mapped to integer classes (0, 1).
------------------------------
--- 6. Splitting Data for Training and Testing ---
Data Split Success:
X_train shape: (76529, 55)
X_test shape: (19133, 55)
Target split (Train 1s): 11.53%
Target split (Test 1s): 11.52%


# Load Models Configuration

In [18]:
models = get_models()
models


{'LogisticRegression': {'model': LogisticRegression(random_state=42, solver='liblinear'),
  'search': 'grid',
  'params': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}},
 'DecisionTree': {'model': DecisionTreeClassifier(random_state=42),
  'search': 'grid',
  'params': {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}},
 'RandomForest': {'model': RandomForestClassifier(random_state=42),
  'search': 'random',
  'params': {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, None]}},
 'GradientBoosting': {'model': GradientBoostingClassifier(random_state=42),
  'search': 'random',
  'params': {'n_estimators': [100, 200],
   'learning_rate': [0.01, 0.1],
   'max_depth': [3, 5]}}}

# Logistic Regression

In [20]:
logistic_config = models["LogisticRegression"]

logistic_results = train_and_log_model(
    model_name="LogisticRegression",
    model_config=logistic_config,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    # experiment_name="Credit_Risk_Task5"
)

logistic_results




✅ Model saved locally at: c:\Users\hp\Desktop\AI projects\bati-bank-credit-scoring\models\LogisticRegression_20251216_171228.pkl


  return FileStore(store_uri)
Successfully registered model 'credit_risk_model'.
Created version '1' of model 'credit_risk_model'.


{'accuracy': 0.884858621230335,
 'precision': 0.525,
 'recall': 0.009523809523809525,
 'f1': 0.018708240534521157,
 'roc_auc': 0.5879506615185457}

# Decision Tree

In [21]:
dt_config = models["DecisionTree"]

decision_tree_results = train_and_log_model(
    model_name="DecisionTree",
    model_config=dt_config,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    # experiment_name="Credit_Risk_Task5"
)

decision_tree_results




✅ Model saved locally at: c:\Users\hp\Desktop\AI projects\bati-bank-credit-scoring\models\DecisionTree_20251216_171628.pkl


Registered model 'credit_risk_model' already exists. Creating a new version of this model...
Created version '2' of model 'credit_risk_model'.


{'accuracy': 0.8855903412951445,
 'precision': 0.5851063829787234,
 'recall': 0.024943310657596373,
 'f1': 0.04784688995215311,
 'roc_auc': 0.6120003113091489}

#Random Forest

In [22]:
rf_config = models["RandomForest"]

random_forest_results = train_and_log_model(
    model_name="RandomForest",
    model_config=rf_config,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    # experiment_name="Credit_Risk_Task5"
)

random_forest_results


Registered model 'credit_risk_model' already exists. Creating a new version of this model...
Created version '3' of model 'credit_risk_model'.


✅ Model saved locally at: c:\Users\hp\Desktop\AI projects\bati-bank-credit-scoring\models\RandomForest_20251216_171930.pkl


{'accuracy': 0.8842314326033555,
 'precision': 0.47282608695652173,
 'recall': 0.03945578231292517,
 'f1': 0.07283382168271244,
 'roc_auc': 0.6411550560677958}

# Gradient Boosting (XGBoost)

In [23]:
xgb_config = models["GradientBoosting"]

xgboost_results = train_and_log_model(
    model_name="GradientBoosting",
    model_config=xgb_config,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    # experiment_name="Credit_Risk_Task5"
)

xgboost_results


Registered model 'credit_risk_model' already exists. Creating a new version of this model...


✅ Model saved locally at: c:\Users\hp\Desktop\AI projects\bati-bank-credit-scoring\models\GradientBoosting_20251217_110832.pkl


Created version '4' of model 'credit_risk_model'.


{'accuracy': 0.8859562013275493,
 'precision': 0.5950413223140496,
 'recall': 0.0326530612244898,
 'f1': 0.061908856405846945,
 'roc_auc': 0.6527416503778575}

# Compare All Models (Manual Table)

In [24]:
import pandas as pd

results_df = pd.DataFrame.from_dict({
    "LogisticRegression": logistic_results,
    "DecisionTree": decision_tree_results,
    "RandomForest": random_forest_results,
    "XGBoost": xgboost_results
}, orient="index")

results_df.sort_values("roc_auc", ascending=False)


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
XGBoost,0.885956,0.595041,0.032653,0.061909,0.652742
RandomForest,0.884231,0.472826,0.039456,0.072834,0.641155
DecisionTree,0.88559,0.585106,0.024943,0.047847,0.612
LogisticRegression,0.884859,0.525,0.009524,0.018708,0.587951


# Identify Best Model

In [25]:
best_model_name = results_df["roc_auc"].idxmax()
best_model_name



'XGBoost'