In [5]:
# =============================================================
# TASK 4 – Proxy Target Variable (IMPROVED WORKFLOW - Target Creation)
# Objective: Calculate RFM target and prepare new input for Task 3
# =============================================================

import os
import sys
import importlib.util
import pandas as pd

# -------------------------------------------------------------
# 1. Set project root and Load data_loader.py
# -------------------------------------------------------------
# Treat the parent of the current working directory as the project root and
# put it on sys.path so that project-level packages become importable.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)
print("Project Root:", project_root)

# Load src/data_loader.py directly from its file path and expose its
# `load_data` function. If the file is missing, fall back to pd.read_csv.
loader_path = os.path.join(project_root, "src", "data_loader.py")
try:
    spec = importlib.util.spec_from_file_location("data_loader", loader_path)
    data_loader = importlib.util.module_from_spec(spec)
    # Register the module before executing it, as the importlib recipe for
    # importing a source file directly does.
    sys.modules["data_loader"] = data_loader
    spec.loader.exec_module(data_loader)
    load_data = data_loader.load_data
except FileNotFoundError:
    # exec_module failed, so the entry registered above is only an empty
    # stub. Remove it; otherwise a later `import data_loader` would silently
    # return the half-initialised module instead of failing.
    sys.modules.pop("data_loader", None)
    print("Warning: data_loader.py not found. Assuming standard pd.read_csv logic.")
    load_data = pd.read_csv  # Fallback

# -------------------------------------------------------------
# 2. Load Task 4 module
# -------------------------------------------------------------
# Build the module object for src/task4_proxy_target.py from its file path.
task4_path = os.path.join(project_root, "src", "task4_proxy_target.py")
spec = importlib.util.spec_from_file_location("task4_proxy_target", task4_path)
task4_module = importlib.util.module_from_spec(spec)

# Executing the module is the step that actually reads the file, so this is
# where a missing file surfaces. Task 4 cannot continue without it: abort.
try:
    spec.loader.exec_module(task4_module)
except FileNotFoundError:
    print(f"Error: task4_proxy_target.py not found at {task4_path}")
    sys.exit(1)
    
# -------------------------------------------------------------
# 3. Load RAW DATA (The original file)
# -------------------------------------------------------------
# Location of the original, unprocessed file: <project_root>/data/raw/data.csv
raw_data_path = os.path.join(project_root, "data", "raw", "data.csv")

# `load_data` is whichever loader was selected earlier in this script.
df_raw = load_data(raw_data_path)
print("Raw Data loaded for RFM:", df_raw.shape)

# -------------------------------------------------------------
# 4. Create proxy high-risk target using the RAW DATA
# -------------------------------------------------------------
# This function calculates RFM, clusters, assigns 'is_high_risk', and merges it
# back into the transaction-level dataframe (df_raw_with_risk).

# Define parameters for the function
# Column names handed to the target-creation function, plus the name of the
# target column that the rest of this script reads from its result.
CUSTOMER_COL, AMOUNT_COL, TIMESTAMP_COL = (
    "CustomerId",
    "Amount",
    "TransactionStartTime",
)
NEW_TARGET_COL = "is_high_risk"

# Keyword arguments forwarded unchanged to create_high_risk_target.
target_kwargs = {
    "customer_col": CUSTOMER_COL,
    "amount_col": AMOUNT_COL,
    "transaction_col": TIMESTAMP_COL,
    "n_clusters": 3,
    "random_state": 42,
    "verbose": True,
}

# A copy of the raw frame is passed in, so df_raw itself is left untouched.
df_raw_with_risk, rfm_table = task4_module.create_high_risk_target(
    df=df_raw.copy(),
    **target_kwargs,
)

# -------------------------------------------------------------
# 5. Save Augmented Raw Data for Task 3 INPUT
# -------------------------------------------------------------
# The existing 'df_raw' columns plus the new 'is_high_risk' column.
output_path = os.path.join(
    project_root,
    "data",
    "processed",
    "augmented_raw_data_for_task3.csv"
)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Keep only the original raw columns plus the new target column.
# - FraudResult (when present) stays in the file, so Task 3 can use either
#   target.
# - Any extra columns create_high_risk_target may have added to the frame
#   are left out of the saved file.
# - Raw columns that are no longer present in the augmented frame are skipped
#   instead of raising.
cols_to_save = [
    col
    for col in df_raw.columns
    if col in df_raw_with_risk.columns and col != NEW_TARGET_COL
]
cols_to_save.append(NEW_TARGET_COL)

# Selecting the columns raises KeyError if the target column is missing,
# before anything is written to disk.
df_task3_input = df_raw_with_risk[cols_to_save]
df_task3_input.to_csv(output_path, index=False)

# -------------------------------------------------------------
# 6. Validation and Next Step Instruction
# -------------------------------------------------------------
# The summary describes the frame that was actually written to disk.
print("\n--- Task 4 Result Summary ---")
print(f"Augmented Raw Data (Input for next Task 3 run) saved at: {output_path}")
print("Shape of new input data:", df_task3_input.shape)
print(f"New Target ({NEW_TARGET_COL}) distribution:")
print(df_task3_input[NEW_TARGET_COL].value_counts(normalize=True))

print("\n")
print("✅ NEXT STEP: Modify your Task 3 script to load 'augmented_raw_data_for_task3.csv'")
print("             and set the target variable to 'is_high_risk' (or 'FraudResult')")

Project Root: c:\Users\hp\Desktop\AI projects\bati-bank-credit-scoring
Raw Data loaded for RFM: (95662, 16)
RFM Table (first 5 rows):
         CustomerId  Recency  Frequency  Monetary
0     CustomerId_1       84          1  -10000.0
1    CustomerId_10       84          1  -10000.0
2  CustomerId_1001       90          5   20000.0
3  CustomerId_1002       26         11    4225.0
4  CustomerId_1003       12          6   20000.0

Cluster centroids (scaled RFM features):
 [[ 1.10728029e+00 -1.84866213e-01 -3.31527790e-02]
 [-9.07697835e-02  4.19477521e+01 -3.86727978e+01]
 [-6.99193674e-01  9.84468052e-02  3.78011434e-02]]

Cluster summary (mean RFM values):
            Recency    Frequency      Monetary
Cluster                                      
0        61.859846     7.726699  8.172379e+04
1        29.000000  4091.000000 -1.049000e+08
2        12.716076    34.807692  2.726546e+05

High-risk cluster identified: 0
        CustomerId  Cluster  is_high_risk
0     CustomerId_1        0     