In [2]:
##############################################
# 1) Load and Clean BFSI Data from Scratch
##############################################
import warnings
warnings.filterwarnings("ignore", message="overflow encountered")
warnings.filterwarnings("ignore", message="divide by zero encountered")
warnings.filterwarnings("ignore", message="invalid value encountered in")
warnings.filterwarnings("ignore", message="The default of observed=False is deprecated")
warnings.filterwarnings("ignore", message="overflow encountered in reduce")

import pandas as pd
import numpy as np
import random
import math

# Load the sampled training frame from the same local pickle used in
# sections 5.3 & 5.4.
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load "train_df_sample.pkl" when it comes from a trusted source.
df = pd.read_pickle("train_df_sample.pkl")
# Report the raw (rows, columns) shape before any cleaning is applied.
print("Initial shape:", df.shape)

def drop_null_cols(df, threshold=0.8):
    """Return a copy of ``df`` without its mostly-missing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    threshold : float, default 0.8
        A column is removed when its fraction of null values is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the columns that were kept.

    Also prints how many columns were removed.
    """
    missing_share = df.isnull().mean()
    # Strict inequality: a column sitting exactly on the threshold is kept.
    too_sparse = missing_share[missing_share > threshold].index.tolist()
    trimmed = df.drop(columns=too_sparse)
    print(f"Dropped {len(too_sparse)} columns (>{threshold*100}% missing).")
    return trimmed

from sklearn.preprocessing import LabelEncoder

# Discard every column whose share of missing values exceeds 80%.
df = drop_null_cols(df, threshold=0.8)
print("After dropping high-missing columns:", df.shape)

# Known categorical base features (same list as section 5.3). Only their
# "<name>_last" aggregate is treated as categorical, and only when that
# column is still present after the drop above.
cat_features = [
    name
    for name in (
        f"{base}_last"
        for base in (
            "B_30", "B_38", "D_114", "D_116", "D_117",
            "D_120", "D_126", "D_63", "D_64", "D_68",
        )
    )
    if name in df.columns
]

# One encoder object is refit for each column in turn, so after the loop
# `le` only holds the classes of the last encoded column.
le = LabelEncoder()
for cat_col in cat_features:
    # Cast to text so missing values become the literal "nan", normalise that
    # token to "NaN", then map every distinct string to an integer code.
    df[cat_col] = le.fit_transform(
        df[cat_col].astype(str).replace("nan", "NaN")
    )

# Name of the label column used by the later cells.
target_col = "target"

# Mean-impute a random subset of numeric columns (or replicate exactly what
# was done in section 5.3).
from sklearn.impute import SimpleImputer

# Candidate columns: numeric dtype, excluding the label-encoded categoricals
# and the target itself.
numeric_cols = [
    col
    for col in df.select_dtypes(include=['number']).columns
    if (col not in cat_features) and (col != target_col)
]

# Draw the subset from a dedicated, seeded generator so that the same columns
# are imputed on every Restart & Run All. An unseeded `random.sample` picks a
# different subset per run, which makes the downstream results irreproducible.
# The seed matches the `random_state=42` used for the train/test split.
num_cols_sample = random.Random(42).sample(numeric_cols, min(100, len(numeric_cols)))
imputer = SimpleImputer(strategy='mean')
# Skip the fit when there is nothing to impute; the imputer cannot be fitted
# on a frame with zero columns.
if num_cols_sample:
    df[num_cols_sample] = imputer.fit_transform(df[num_cols_sample])

# NOTE(review): only the sampled columns (at most 100) are imputed; any other
# numeric column keeps its missing values, including columns that may be
# selected as model features further down. The column means are also computed
# on the full frame, i.e. before the train/test split performed later.

print("Finished BFSI cleaning. Ready to define subset & do train/test.")
print("Current shape:", df.shape)

##############################################
# 2) Restrict to the columns OptBinning selected
##############################################
# Feature shortlist attributed to the binning process of section 5.4. The
# names below are a hypothetical result: replace them with the real selection
# if it differs. Names that are not present in `df` are silently skipped.
selected_optb_cols = [
    name
    for name in (
        "D_112_min", "S_6_mean", "R_15_mean", "D_56_max", "D_47_mean", "B_6_std",
        "D_51_min", "D_79_last", "B_5_max", "D_65_std", "D_81_std", "D_81_max",
        "D_65_mean", "R_15_std", "D_127_max", "S_5_std", "R_24_std", "B_12_max",
        "D_120_last",
    )
    if name in df.columns
]
print("OptBinning said these columns are top:", selected_optb_cols)

# Keep the surviving features plus the target, with the target as last column.
final_cols = [*selected_optb_cols, target_col]
df = df.loc[:, final_cols]

print("Subsetting to OptBinning columns. New shape:", df.shape)

##############################################
# 3) Split into train/test for subsequent analysis
##############################################
from sklearn.model_selection import train_test_split

# Separate the label from the feature matrix.
y = df[target_col]
X = df.drop(columns=[target_col])

# 70/30 split with a fixed seed; stratifying on `y` keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")



Initial shape: (100000, 919)
Dropped 106 columns (>80.0% missing).
After dropping high-missing columns: (100000, 813)
Finished BFSI cleaning. Ready to define subset & do train/test.
Current shape: (100000, 813)
OptBinning said these columns are top: ['D_112_min', 'S_6_mean', 'R_15_mean', 'D_56_max', 'D_47_mean', 'B_6_std', 'D_51_min', 'D_79_last', 'B_5_max', 'D_65_std', 'D_81_std', 'D_81_max', 'D_65_mean', 'R_15_std', 'D_127_max', 'S_5_std', 'R_24_std', 'B_12_max', 'D_120_last']
Subsetting to OptBinning columns. New shape: (100000, 20)
Train shape: (70000, 19), Test shape: (30000, 19)


In [3]:
# Re-attach the labels to each feature partition so both frames carry a
# 'target' column. The train partition serves as the reference data set and
# the test partition as the current one, which lets target drift be checked
# alongside feature drift.

import pandas as pd

# `.values` assigns the labels positionally; `assign` returns a new frame, so
# X_train / X_test themselves are left untouched.
df_train = X_train.assign(target=y_train.values)   # BFSI target
df_test = X_test.assign(target=y_test.values)      # BFSI target

print("Reference data shape:", df_train.shape)
print("Current data shape:", df_test.shape)


Reference data shape: (70000, 20)
Current data shape: (30000, 20)


In [7]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on both frames; the old labels are discarded rather than kept as a column.
df_train, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_test)
)

In [8]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset

# 1) Define the column mapping.
# The categorical columns were label-encoded to integers earlier in the
# notebook, so a dtype-based split would declare all of them numerical and
# leave the categorical list empty. Use the notebook's `cat_features` list to
# recognise encoded categoricals, and fall back to dtype for everything else.
feature_cols = X_train.columns.tolist()
numeric_dtype_cols = set(X_train.select_dtypes(include='number').columns)
label_encoded_cols = set(cat_features)

categorical_feature_cols = [
    col for col in feature_cols
    if col in label_encoded_cols or col not in numeric_dtype_cols
]
numerical_feature_cols = [
    col for col in feature_cols if col not in categorical_feature_cols
]

col_map = ColumnMapping(
    target='target',
    # If you had 'predictions' or 'scores', you could map them here:
    # prediction='model_score'  # for instance
    numerical_features=numerical_feature_cols,
    categorical_features=categorical_feature_cols
)

# 2) Build a single Evidently Report with both DataDrift and TargetDrift
combined_report = Report(
    metrics=[
        DataDriftPreset(),
        TargetDriftPreset()
    ]
)

# 3) Run the report: the train partition is the reference, the test partition
#    is the current data set.
combined_report.run(
    reference_data=df_train,
    current_data=df_test,
    column_mapping=col_map
)


# 4) Save to HTML (written relative to the notebook's working directory)
combined_report.save_html("combined_drift_report.html")
