- In this Notebook, I am trying to integrate the training data from the backscore and rest of the period from prj-prod-dataplatform.audit_balance.ml_model_run_details table.
- In this I will compare the training period with each month of test period.

**Steps to Follow**:

* Read the specific model data from prj-prod-dataplatform.audit_balance.ml_model_run_details table
* Expand the calcFeature column to extract all the features for the model
* Read the data from specific backscore table for the training data
* Identify the features and create a list
* Use transform_data function to create the same structure as ml_model_run_details table
* Insert the data to a similar training table - prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
* Read the specific model data from prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
* expand the training set from the calcFeature column
* Concatenate both the test and train datasets
* Calculate the PSI using the PSI function comparing it with the train set
* Insert the result to a PSI table prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_psi_v4

# **PSI - CSI Calculation**

## Define Libraries

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
pd.set_option("Display.max_rows", 100)

## Function

In [2]:
# Method 1: Using regex to remove all "Calc_" occurrences
def clean_names_regex(name):
    return re.sub(r'_Calc_', '_', name)

# Method 2: Simple string replacement
def clean_names_replace(name):
    return name.replace('_Calc_', '_')

## expand_calc_features_robust

In [3]:
import pandas as pd
import json

def expand_calc_features_robust(df):
    """
    Expand the calcFeatures JSON column into separate columns and return the complete DataFrame.
    
    Parameters:
    df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data
    
    Returns:
    pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
    """
    
    # Make a copy to avoid modifying the original DataFrame
    df_expanded = df.copy()
    
    # Parse the calcFeatures JSON column
    calc_features_data = []
    
    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            # Handle None or NaN values
            if pd.isna(calc_features_str):
                calc_features_data.append({})
                continue
            
            # Convert to string if not already
            calc_features_str = str(calc_features_str)
            
            # Parse the JSON string
            features_dict = json.loads(calc_features_str.replace("'", '"'))
            
            # Ensure it's a dictionary, not a list
            if isinstance(features_dict, dict):
                calc_features_data.append(features_dict)
            elif isinstance(features_dict, list):
                # If it's a list, convert to dict with index keys or skip
                print(f"Warning: calcFeatures at index {idx} is a list, converting to dict")
                calc_features_data.append({'raw_list': features_dict})
            else:
                print(f"Warning: calcFeatures at index {idx} is neither dict nor list: {type(features_dict)}")
                calc_features_data.append({})
                
        except (json.JSONDecodeError, AttributeError, TypeError) as e:
            # If parsing fails, create an empty dict and print warning
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            print(f"  Value: {calc_features_str}")
            calc_features_data.append({})
    
    # Create DataFrame from the parsed JSON data
    calc_features_df = pd.DataFrame(calc_features_data)
    
    # Add prefix to JSON-derived columns to avoid conflicts
    calc_features_df = calc_features_df.add_prefix('calc_')
    
    # Reset index to ensure proper alignment
    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)
    
    # Combine original DataFrame with expanded calcFeatures
    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    
    return result_df

### dropping_duplicates

In [4]:
def dropping_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop duplicates based on digitalLoanAccountId, Data_selection, and modelVersionid,
    keeping the first occurrence based on appln_submit_datetime.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe

    Returns:
    --------
    pd.DataFrame with duplicates dropped
    """

    df = df.sort_values(
        ['digitalLoanAccountId', 'Data_selection', 'modelVersionId', 'appln_submit_datetime'],
        ascending=[True, True, True, True],
        na_position='last'
    )

    result = df.drop_duplicates(
        subset=['digitalLoanAccountId', 'Data_selection', 'modelVersionId'],
        keep='first'
    ).copy()

    return result

### PSI pipeline Version 1

In [5]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')


def calculate_psi_for_model(dfcombined: pd.DataFrame,
                            configdf: pd.DataFrame,
                            model_display_name: str) -> pd.DataFrame:
    """
    Calculate PSI for a specific model based on configdf combinations.
    
    Parameters:
    -----------
    dfcombined : pd.DataFrame
        Combined dataframe with data for ONE specific model
    configdf : pd.DataFrame
        Reference config dataframe with columns: modelDisplayName, modelVersionId, trenchCategory
    model_display_name : str
        Display name of the model (e.g., 'cic_model_sil')
    
    Returns:
    --------
    pd.DataFrame : PSI results for this model with all combinations
    """
    
    # Filter configdf to only include combinations for this specific modelDisplayName
    model_config = configdf[configdf['modelDisplayName'] == model_display_name].copy()
    
    print(f"\n{'='*80}")
    print(f"Starting PSI Pipeline for Model: {model_display_name}")
    print(f"Total combinations to process: {len(model_config)}")
    print(f"{'='*80}\n")
    
    if len(model_config) == 0:
        print(f"ERROR: No configurations found for modelDisplayName={model_display_name}")
        return pd.DataFrame()
    
    # Process each unique combination
    all_results = []
    
    for idx, config_row in model_config.iterrows():
        model_version_id = config_row['modelVersionId']
        trench_category = config_row['trenchCategory']
        
        print(f"Processing combination {idx + 1}/{len(model_config)}: "
              f"modelVersionId={model_version_id}, trenchCategory={trench_category}")
        
        # Filter data from dfcombined based on modelVersionId and trenchCategory
        combo_df = dfcombined[
            (dfcombined['modelVersionId'] == model_version_id) & 
            (dfcombined['trenchCategory'] == trench_category)
        ].copy()
        
        if len(combo_df) == 0:
            print(f"  Warning: No data found for this combination. Skipping...")
            continue
        
        print(f"  Data points: {len(combo_df)}")
        
        # Expand calcFeatures
        try:
            combo_df = expand_calc_features_robust(combo_df)
            print(f"  Features expanded successfully")
        except Exception as e:
            print(f"  Error expanding features: {e}. Skipping...")
            continue
        
        # Extract feature list (all columns starting with 'calc_')
        feature_list = [col for col in combo_df.columns if col.startswith('calc_')]
        
        if len(feature_list) == 0:
            print(f"  Warning: No features found after expansion. Skipping...")
            continue
        
        print(f"  Features identified: {len(feature_list)}")
        
        # Define segment columns
        segment_columns = ['new_loan_type', 'osType', 'loan_product_type', 'trenchCategory']
        # Filter to only existing columns
        segment_columns = [col for col in segment_columns if col in combo_df.columns]
        
        # Calculate PSI (overall + segments + score)
        try:
            psi_result = calculate_month_on_month_psi(
                combo_df,
                feature_list,
                segment_columns=segment_columns
            )
            
            # Add model metadata
            psi_result['modelDisplayName'] = model_display_name
            psi_result['modelVersionId'] = model_version_id
            psi_result['trenchCategory'] = trench_category
            
            all_results.append(psi_result)
            print(f"  PSI calculated: {len(psi_result)} rows")
            
        except Exception as e:
            print(f"  Error calculating PSI: {e}")
            continue
        
        # Calculate PSI for score (if 'score' column exists)
        if 'score' in combo_df.columns:
            try:
                score_psi = calculate_month_on_month_psi(
                    combo_df,
                    ['score'],
                    segment_columns=segment_columns
                )
                
                # Add model metadata
                score_psi['modelDisplayName'] = model_display_name
                score_psi['modelVersionId'] = model_version_id
                score_psi['trenchCategory'] = trench_category
                
                all_results.append(score_psi)
                print(f"  Score PSI calculated: {len(score_psi)} rows")
                
            except Exception as e:
                print(f"  Error calculating score PSI: {e}")
    
    # Combine all results
    if all_results:
        final_result = pd.concat(all_results, ignore_index=True)
        
        # Reorder columns to match required output
        column_order = ['modelDisplayName', 'modelVersionId', 'trenchCategory',
                       'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                       'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                       'Expected_Percentage', 'Actual_Percentage', 'PSI']
        
        # Keep only columns that exist
        available_cols = [col for col in column_order if col in final_result.columns]
        final_result = final_result[available_cols]
        
        print(f"\n{'='*80}")
        print(f"Pipeline Complete!")
        print(f"Total rows in final output: {len(final_result)}")
        print(f"Unique combinations processed: {len(final_result[['modelVersionId', 'trenchCategory']].drop_duplicates())}")
        print(f"{'='*80}")
        
        return final_result
    else:
        print("No results generated. Check input data and configurations.")
        return pd.DataFrame()


# def expand_calc_features_robust(df):
#     """
#     Expand the calcFeatures JSON column into separate columns.
#     """
#     import json
    
#     df_expanded = df.copy()
#     calc_features_data = []
    
#     for idx, calc_features_str in enumerate(df['calcFeatures']):
#         try:
#             if pd.isna(calc_features_str):
#                 calc_features_data.append({})
#                 continue
            
#             calc_features_str = str(calc_features_str)
#             features_dict = json.loads(calc_features_str.replace("'", '"'))
            
#             if isinstance(features_dict, dict):
#                 calc_features_data.append(features_dict)
#             elif isinstance(features_dict, list):
#                 calc_features_data.append({'raw_list': features_dict})
#             else:
#                 calc_features_data.append({})
                
#         except (json.JSONDecodeError, AttributeError, TypeError):
#             calc_features_data.append({})
    
#     calc_features_df = pd.DataFrame(calc_features_data)
#     calc_features_df = calc_features_df.add_prefix('calc_')
    
#     df_expanded = df_expanded.reset_index(drop=True)
#     calc_features_df = calc_features_df.reset_index(drop=True)
    
#     result_df = pd.concat([df_expanded, calc_features_df], axis=1)
#     return result_df

def expand_calc_features_robust(df):
    """
    Expand the calcFeatures JSON column into separate columns.
    Column names will be prefixed with modelVersionId_Calc_
    """
    import json
    
    df_expanded = df.copy()
    calc_features_data = []
    model_version_ids = []
    
    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            if pd.isna(calc_features_str):
                calc_features_data.append({})
            else:
                calc_features_str = str(calc_features_str)
                features_dict = json.loads(calc_features_str.replace("'", '"'))
                
                if isinstance(features_dict, dict):
                    calc_features_data.append(features_dict)
                elif isinstance(features_dict, list):
                    calc_features_data.append({'raw_list': features_dict})
                else:
                    calc_features_data.append({})
                    
        except (json.JSONDecodeError, AttributeError, TypeError):
            calc_features_data.append({})
        
        # Capture modelVersionId for each row
        model_version_ids.append(df.iloc[idx]['modelVersionId'])
    
    calc_features_df = pd.DataFrame(calc_features_data)
    
    # Create prefix for each row using modelVersionId
    prefixes = [f"{mvid}_Calc_" for mvid in model_version_ids]
    
    # Rename columns with the dynamic prefix
    calc_features_df.columns = [f"{prefixes[i]}{col}" for i, col in enumerate(calc_features_df.columns) 
                                for _ in range(1)]
    
    # Alternative cleaner approach: rename columns after expanding
    calc_features_df = calc_features_df.add_prefix('calc_')
    calc_features_df.columns = [f"{prefixes[i]}{col.replace('calc_', '')}" 
                                for i, col in enumerate(calc_features_df.columns)]
    
    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)
    
    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    return result_df


def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features from the feature list.
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            continue

        if pd.api.types.is_numeric_dtype(df[feature]):
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(lambda x: x == int(x) if isinstance(x, (int, float)) else False).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features and categorical features based on training period.
    """
    binning_info = {}

    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:
                bins = test_bins
                bin_count = 10
        except:
            pass

        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:
                    bins = test_bins
                    bin_count = 5
            except:
                pass

        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:
                    bins = test_bins
                    bin_count = 3
            except:
                pass

        if bins is None:
            min_val = valid_data.min()
            max_val = valid_data.max()
            bins = np.linspace(min_val, max_val, 6)
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2

        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()

        if len(unique_categories) <= 6:
            top_categories = unique_categories
        else:
            top_categories = value_counts.nlargest(6).index.tolist()

        binning_info[feature] = {
            'type': 'categorical',
            'top_categories': top_categories,
            'bin_ranges': {}
        }

    return binning_info


def apply_binning(df: pd.DataFrame, feature: str, binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature based on binning information.
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature], bins=bins, labels=labels, include_lowest=True, duplicates='drop')
        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'
        return binned

    else:
        top_cats = binning_info['top_categories']
        feature_data = df[feature].astype(str).replace('nan', 'Missing')
        top_cats_str = [str(cat) for cat in top_cats]
        binned = feature_data.apply(lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing'))
        return binned


def calculate_psi(expected_pct: pd.Series, actual_pct: pd.Series, epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index.
    """
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))
    return psi_value


def calculate_month_on_month_psi(df: pd.DataFrame,
                                 feature_list: List[str],
                                 segment_columns: List[str] = [],
                                 month_col: str = 'Application_month',
                                 data_selection_col: str = 'Data_selection',
                                 account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
    """
    Calculate PSI for each feature comparing training period vs each month,
    with overall and segment-level breakdowns.
    """
    df = df.copy()

    train_df = df[df[data_selection_col] == 'Train'].copy()
    test_df = df[df[data_selection_col] != 'Train'].copy()

    if len(train_df) == 0:
        raise ValueError("No training data found. Check Data_selection column.")

    feature_types = identify_feature_types(df, feature_list)
    binning_info = create_bins_for_features(df, feature_types['numerical'], feature_types['categorical'], train_df)
    test_months = sorted(test_df[month_col].unique())

    results = []

    # Overall PSI (no segments)
    for feature in feature_list:
        if feature not in df.columns:
            continue

        df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
        train_baseline = df[df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)

        for month in test_months:
            actual_dist = df[df[month_col] == month][f'{feature}_binned'].value_counts(normalize=True)
            psi_value = calculate_psi(train_baseline, actual_dist)

            expected_avg_pct = train_baseline.mean() * 100
            actual_avg_pct = actual_dist.mean() * 100

            base_count = df[df[data_selection_col] == 'Train'][account_id_col].nunique()
            actual_count = df[df[month_col] == month][account_id_col].nunique()

            results.append({
                'Feature': feature,
                'Feature_Type': binning_info[feature]['type'],
                'Segment_Column': 'Overall',
                'Segment_Value': 'All',
                'Month': f"{month}",
                'Base_Month': 'Train',
                'Current_Month': month,
                'Base_Count': base_count,
                'Actual_Count': actual_count,
                'Expected_Percentage': expected_avg_pct,
                'Actual_Percentage': actual_avg_pct,
                'PSI': psi_value
            })

    # Segment-level PSI
    for segment_col in segment_columns:
        if segment_col not in df.columns:
            continue

        segments = df[segment_col].dropna().unique()

        for segment_val in segments:
            segment_df = df[df[segment_col] == segment_val]

            for feature in feature_list:
                if feature not in df.columns:
                    continue

                train_segment = segment_df[segment_df[data_selection_col] == 'Train']
                if len(train_segment) == 0:
                    continue

                train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)

                for month in test_months:
                    actual_segment = segment_df[segment_df[month_col] == month]
                    if len(actual_segment) == 0:
                        continue

                    actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
                    psi_value = calculate_psi(train_baseline, actual_dist)

                    expected_avg_pct = train_baseline.mean() * 100
                    actual_avg_pct = actual_dist.mean() * 100

                    base_segment_count = train_segment[account_id_col].nunique()
                    actual_segment_count = actual_segment[account_id_col].nunique()

                    results.append({
                        'Feature': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Segment_Column': segment_col,
                        'Segment_Value': segment_val,
                        'Month': f"{month}",
                        'Base_Month': 'Train',
                        'Current_Month': month,
                        'Base_Count': base_segment_count,
                        'Actual_Count': actual_segment_count,
                        'Expected_Percentage': expected_avg_pct,
                        'Actual_Percentage': actual_avg_pct,
                        'PSI': psi_value
                    })

    return pd.DataFrame(results)


# Usage Example:
# psi_results = calculate_psi_for_model(
#     dfcombined=dfcombined,
#     configdf=configdf,
#     model_display_name='cic_model_sil'
# )
# 
# psi_results.to_csv('psi_results_cic_model_sil.csv', index=False)
# print(psi_results.head())
# print(f"\nUnique combinations: \n{psi_results[['modelDisplayName', 'modelVersionId', 'trenchCategory']].drop_duplicates()}")

### PSI Pipeline Version 2

In [6]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')


def calculate_psi_for_model(dfcombined: pd.DataFrame,
                            configdf: pd.DataFrame,
                            model_display_name: str) -> pd.DataFrame:
    """
    Calculate PSI for a specific model based on configdf combinations.
    
    Parameters:
    -----------
    dfcombined : pd.DataFrame
        Combined dataframe with data for ONE specific model
    configdf : pd.DataFrame
        Reference config dataframe with columns: modelDisplayName, modelVersionId, trenchCategory
    model_display_name : str
        Display name of the model (e.g., 'cic_model_sil')
    
    Returns:
    --------
    pd.DataFrame : PSI results for this model with all combinations
    """
    
    # Filter configdf to only include combinations for this specific modelDisplayName
    model_config = configdf[configdf['modelDisplayName'] == model_display_name].copy()
    
    print(f"\n{'='*80}")
    print(f"Starting PSI Pipeline for Model: {model_display_name}")
    print(f"Total combinations to process: {len(model_config)}")
    print(f"{'='*80}\n")
    
    if len(model_config) == 0:
        print(f"ERROR: No configurations found for modelDisplayName={model_display_name}")
        return pd.DataFrame()
    
    # Process each unique combination
    all_results = []
    
    for idx, config_row in model_config.iterrows():
        model_version_id = config_row['modelVersionId']
        trench_category = config_row['trenchCategory']
        
        print(f"Processing combination {idx + 1}/{len(model_config)}: "
              f"modelVersionId={model_version_id}, trenchCategory={trench_category}")
        
        # Filter data from dfcombined based on modelVersionId and trenchCategory
        combo_df = dfcombined[
            (dfcombined['modelVersionId'] == model_version_id) & 
            (dfcombined['trenchCategory'] == trench_category)
        ].copy()
        
        # If no data found and trenchCategory is 'ALL', get all data for that modelVersionId
        if len(combo_df) == 0 and trench_category == 'ALL':
            print(f"  No data found for specific combination. Retrieving all data for modelVersionId={model_version_id}...")
            combo_df = dfcombined[
                dfcombined['modelVersionId'] == model_version_id
            ].copy()
        
        if len(combo_df) == 0:
            print(f"  Warning: No data found for this combination. Skipping...")
            continue
        
        print(f"  Data points: {len(combo_df)}")
        
        # Expand calcFeatures
        try:
            combo_df = expand_calc_features_robust(combo_df)
            print(f"  Features expanded successfully")
        except Exception as e:
            print(f"  Error expanding features: {e}. Skipping...")
            continue
        
        # Extract feature list (all columns starting with 'calc_')
        # feature_list = [col for col in combo_df.columns if col.startswith('calc_')]
        feature_list = [col for col in combo_df.columns if '_Calc_' in col]
        
        if len(feature_list) == 0:
            print(f"  Warning: No features found after expansion. Skipping...")
            continue
        
        print(f"  Features identified: {len(feature_list)}")
        
        # Define segment columns
        segment_columns = ['new_loan_type', 'osType', 'loan_product_type', 'trenchCategory']
        # Filter to only existing columns
        segment_columns = [col for col in segment_columns if col in combo_df.columns]
        
        # Calculate PSI (overall + segments + score)
        try:
            psi_result = calculate_month_on_month_psi(
                combo_df,
                feature_list,
                segment_columns=segment_columns
            )
            
            # Add model metadata
            psi_result['modelDisplayName'] = model_display_name
            psi_result['modelVersionId'] = model_version_id
            psi_result['trenchCategory'] = trench_category
            
            all_results.append(psi_result)
            print(f"  PSI calculated: {len(psi_result)} rows")
            
        except Exception as e:
            print(f"  Error calculating PSI: {e}")
            continue
        
        # Calculate PSI for score (if 'score' column exists)
        if 'score' in combo_df.columns:
            try:
                score_psi = calculate_month_on_month_psi(
                    combo_df,
                    ['score'],
                    segment_columns=segment_columns
                )
                
                # Add model metadata
                score_psi['modelDisplayName'] = model_display_name
                score_psi['modelVersionId'] = model_version_id
                score_psi['trenchCategory'] = trench_category
                
                all_results.append(score_psi)
                print(f"  Score PSI calculated: {len(score_psi)} rows")
                
            except Exception as e:
                print(f"  Error calculating score PSI: {e}")
    
    # Combine all results
    if all_results:
        final_result = pd.concat(all_results, ignore_index=True)
        
        # Reorder columns to match required output
        column_order = ['modelDisplayName', 'modelVersionId', 'trenchCategory',
                       'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                       'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                       'Expected_Percentage', 'Actual_Percentage', 'PSI']
        
        # Keep only columns that exist
        available_cols = [col for col in column_order if col in final_result.columns]
        final_result = final_result[available_cols]
        
        print(f"\n{'='*80}")
        print(f"Pipeline Complete!")
        print(f"Total rows in final output: {len(final_result)}")
        print(f"Unique combinations processed: {len(final_result[['modelVersionId', 'trenchCategory']].drop_duplicates())}")
        print(f"{'='*80}")
        
        return final_result
    else:
        print("No results generated. Check input data and configurations.")
        return pd.DataFrame()


# def expand_calc_features_robust(df):
#     """
#     Expand the calcFeatures JSON column into separate columns.
#     """
#     import json
    
#     df_expanded = df.copy()
#     calc_features_data = []
    
#     for idx, calc_features_str in enumerate(df['calcFeatures']):
#         try:
#             if pd.isna(calc_features_str):
#                 calc_features_data.append({})
#                 continue
            
#             calc_features_str = str(calc_features_str)
#             features_dict = json.loads(calc_features_str.replace("'", '"'))
            
#             if isinstance(features_dict, dict):
#                 calc_features_data.append(features_dict)
#             elif isinstance(features_dict, list):
#                 calc_features_data.append({'raw_list': features_dict})
#             else:
#                 calc_features_data.append({})
                
#         except (json.JSONDecodeError, AttributeError, TypeError):
#             calc_features_data.append({})
    
#     calc_features_df = pd.DataFrame(calc_features_data)
#     calc_features_df = calc_features_df.add_prefix('calc_')
    
#     df_expanded = df_expanded.reset_index(drop=True)
#     calc_features_df = calc_features_df.reset_index(drop=True)
    
#     result_df = pd.concat([df_expanded, calc_features_df], axis=1)
#     return result_df

def expand_calc_features_robust(df):
    """
    Expand the calcFeatures JSON column into separate columns.
    Column names will be prefixed with modelVersionId_Calc_
    """
    import json
    
    df_expanded = df.copy()
    calc_features_data = []
    model_version_ids = []
    
    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            if pd.isna(calc_features_str):
                calc_features_data.append({})
            else:
                calc_features_str = str(calc_features_str)
                features_dict = json.loads(calc_features_str.replace("'", '"'))
                
                if isinstance(features_dict, dict):
                    calc_features_data.append(features_dict)
                elif isinstance(features_dict, list):
                    calc_features_data.append({'raw_list': features_dict})
                else:
                    calc_features_data.append({})
                    
        except (json.JSONDecodeError, AttributeError, TypeError):
            calc_features_data.append({})
        
        # Capture modelVersionId for each row
        model_version_ids.append(df.iloc[idx]['modelVersionId'])
    
    calc_features_df = pd.DataFrame(calc_features_data)
    
    # Create prefix for each row using modelVersionId
    prefixes = [f"{mvid}_Calc_" for mvid in model_version_ids]
    
    # # Rename columns with the dynamic prefix
    # calc_features_df.columns = [f"{prefixes[i]}{col}" for i, col in enumerate(calc_features_df.columns) 
    #                             for _ in range(1)]
    
    # Alternative cleaner approach: rename columns after expanding
    calc_features_df = calc_features_df.add_prefix('calc_')
    calc_features_df.columns = [f"{prefixes[i]}{col.replace('calc_', '')}" 
                                for i, col in enumerate(calc_features_df.columns)]
    
    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)
    
    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    return result_df


def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features from the feature list.
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            continue

        if pd.api.types.is_numeric_dtype(df[feature]):
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(lambda x: x == int(x) if isinstance(x, (int, float)) else False).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features and categorical features based on training period.
    """
    binning_info = {}

    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:
                bins = test_bins
                bin_count = 10
        except:
            pass

        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:
                    bins = test_bins
                    bin_count = 5
            except:
                pass

        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:
                    bins = test_bins
                    bin_count = 3
            except:
                pass

        if bins is None:
            min_val = valid_data.min()
            max_val = valid_data.max()
            bins = np.linspace(min_val, max_val, 6)
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2

        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()

        if len(unique_categories) <= 6:
            top_categories = unique_categories
        else:
            top_categories = value_counts.nlargest(6).index.tolist()

        binning_info[feature] = {
            'type': 'categorical',
            'top_categories': top_categories,
            'bin_ranges': {}
        }

    return binning_info


def apply_binning(df: pd.DataFrame, feature: str, binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature based on binning information.
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature], bins=bins, labels=labels, include_lowest=True, duplicates='drop')
        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'
        return binned

    else:
        top_cats = binning_info['top_categories']
        feature_data = df[feature].astype(str).replace('nan', 'Missing')
        top_cats_str = [str(cat) for cat in top_cats]
        binned = feature_data.apply(lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing'))
        return binned


def calculate_psi(expected_pct: pd.Series, actual_pct: pd.Series, epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index.
    """
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))
    return psi_value


def calculate_month_on_month_psi(df: pd.DataFrame,
                                 feature_list: List[str],
                                 segment_columns: List[str] = [],
                                 month_col: str = 'Application_month',
                                 data_selection_col: str = 'Data_selection',
                                 account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
    """
    Calculate PSI for each feature comparing training period vs each month,
    with overall and segment-level breakdowns.
    """
    df = df.copy()

    train_df = df[df[data_selection_col] == 'Train'].copy()
    test_df = df[df[data_selection_col] != 'Train'].copy()

    if len(train_df) == 0:
        raise ValueError("No training data found. Check Data_selection column.")

    feature_types = identify_feature_types(df, feature_list)
    binning_info = create_bins_for_features(df, feature_types['numerical'], feature_types['categorical'], train_df)
    test_months = sorted(test_df[month_col].unique())

    results = []

    # Overall PSI (no segments)
    for feature in feature_list:
        if feature not in df.columns:
            continue

        df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
        train_baseline = df[df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)

        for month in test_months:
            actual_dist = df[df[month_col] == month][f'{feature}_binned'].value_counts(normalize=True)
            psi_value = calculate_psi(train_baseline, actual_dist)

            expected_avg_pct = train_baseline.mean() * 100
            actual_avg_pct = actual_dist.mean() * 100

            base_count = df[df[data_selection_col] == 'Train'][account_id_col].nunique()
            actual_count = df[df[month_col] == month][account_id_col].nunique()

            results.append({
                'Feature': feature,
                'Feature_Type': binning_info[feature]['type'],
                'Segment_Column': 'Overall',
                'Segment_Value': 'All',
                'Month': f"{month}",
                'Base_Month': 'Train',
                'Current_Month': month,
                'Base_Count': base_count,
                'Actual_Count': actual_count,
                'Expected_Percentage': expected_avg_pct,
                'Actual_Percentage': actual_avg_pct,
                'PSI': psi_value
            })

    # Segment-level PSI
    for segment_col in segment_columns:
        if segment_col not in df.columns:
            continue

        segments = df[segment_col].dropna().unique()

        for segment_val in segments:
            segment_df = df[df[segment_col] == segment_val]

            for feature in feature_list:
                if feature not in df.columns:
                    continue

                train_segment = segment_df[segment_df[data_selection_col] == 'Train']
                if len(train_segment) == 0:
                    continue

                train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)

                for month in test_months:
                    actual_segment = segment_df[segment_df[month_col] == month]
                    if len(actual_segment) == 0:
                        continue

                    actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
                    psi_value = calculate_psi(train_baseline, actual_dist)

                    expected_avg_pct = train_baseline.mean() * 100
                    actual_avg_pct = actual_dist.mean() * 100

                    base_segment_count = train_segment[account_id_col].nunique()
                    actual_segment_count = actual_segment[account_id_col].nunique()

                    results.append({
                        'Feature': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Segment_Column': segment_col,
                        'Segment_Value': segment_val,
                        'Month': f"{month}",
                        'Base_Month': 'Train',
                        'Current_Month': month,
                        'Base_Count': base_segment_count,
                        'Actual_Count': actual_segment_count,
                        'Expected_Percentage': expected_avg_pct,
                        'Actual_Percentage': actual_avg_pct,
                        'PSI': psi_value
                    })

    return pd.DataFrame(results)


# Usage Example:
# psi_results = calculate_psi_for_model(
#     dfcombined=dfcombined,
#     configdf=configdf,
#     model_display_name='cic_model_sil'
# )
# 
# psi_results.to_csv('psi_results_cic_model_sil.csv', index=False)
# print(psi_results.head())
# print(f"\nUnique combinations: \n{psi_results[['modelDisplayName', 'modelVersionId', 'trenchCategory']].drop_duplicates()}")

# Config query

In [7]:
sq = """ WITH base AS (
  -- First part with actual trenchCategory
  SELECT 
    modelDisplayName, 
    modelVersionId, 
    CASE 
      WHEN trenchCategory IS NULL THEN 'ALL'
      WHEN trenchCategory = '' THEN 'ALL'
      ELSE trenchCategory 
    END AS trenchCategory,
(CASE 
      WHEN modelDisplayName LIKE '%sil%' THEN 'SIL'
      WHEN modelDisplayName LIKE '%cash%' THEN 'CASH'
      ELSE 'ALL' 
    END) AS product_category,
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`

  
  UNION ALL
  
  -- Second part with 'ALL' trenchCategory
  SELECT 
    modelDisplayName, 
    modelVersionId, 
    'ALL' AS trenchCategory,
    (CASE 
      WHEN modelDisplayName LIKE '%sil%' THEN 'SIL'
      WHEN modelDisplayName LIKE '%cash%' THEN 'CASH'
      ELSE 'ALL' 
    END) AS product_category,
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE trenchCategory IS NOT NULL

)
SELECT distinct
  modelDisplayName, 
  modelVersionId, 
  trenchCategory, 
  product_category,
FROM base
ORDER BY 4, 1, 2, 3;"""

configdf = client.query(sq).to_dataframe()
configdf

Unnamed: 0,modelDisplayName,modelVersionId,trenchCategory,product_category
0,alpha_income_model,v1,ALL,ALL
1,beta_income_model,v1,ALL,ALL
2,alpha_stack_model_cash,v1,ALL,CASH
3,alpha_stack_model_cash,v1,Trench 1,CASH
4,alpha_stack_model_cash,v1,Trench 2,CASH
5,alpha_stack_model_cash,v1,Trench 3,CASH
6,alpha_stack_model_cash,v1.1,ALL,CASH
7,alpha_stack_model_cash,v1.1,Trench 1,CASH
8,alpha_stack_model_cash,v1.1,Trench 2,CASH
9,apps_score_cash,v1,ALL,CASH


In [8]:
# configdf.to_csv('configdf.csv', index = False)

## SIL

### Queries

### cic_model_sil

### Test

In [9]:
## This is for the test period of Alpha - CIC sil model - reading the data from ml_model_run_details

sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
        case when modelDisplayName = 'Alpha - CIC-SIL-Model' then 'cic_model_sil' else modelDisplayName end as modelDisplayName    
    ,modelVersionId,
    case when trenchCategory is null then 'ALL' 
         when trenchCategory='' then 'ALL'    
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
  ),
base as
(SELECT distinct

--Alpha_cic_sil_score
  r.customerId,r.digitalLoanAccountId,prediction score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'cic_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) desc) = 1
)
select * from base
;
"""
dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

## this data is not expanded. We will have to expand and get the features from the calcFeatures column

The shape of the dataframe is: (106289, 19)


Unnamed: 0,customerId,digitalLoanAccountId,score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,1518674,9d48d629-d1f8-431f-82ee-c6d0ac9135c6,0.1616481771248587,2025-08-08 07:36:47.086633,2025-08-08 07:36:47.092838,cic_model_sil,v1,SIL Competitor,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1754611200000,""cic_Personal_Loans_...",Test,2025-08-08 15:36:38,NaT,2025-08
1,2484418,711faf71-7ca1-4444-83a0-65c38b66bc24,0.1342014732137181,2025-12-09 10:37:00.734040,2025-12-09 10:37:00.877577,cic_model_sil,v2,SIL Competitor,M,Appliance,android,cic_model_sil,SIL,Trench 3,"{""digitalLoanAccountId"": ""711faf71-7ca1-4444-8...",Test,2025-12-09 18:36:50,2025-12-09 18:39:54,2025-12
2,2522811,97936ddd-13f2-469b-b894-2bd88bfcaacd,0.0847659165473142,2025-05-07 08:24:52.470000,2025-05-07 08:24:52.477004,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1746576000000,""cic_Personal_Loans_...",Test,2025-05-07 16:24:43,2025-05-07 16:43:25,2025-05
3,2621639,49114eaa-b9bd-4271-9cf3-eaba5b8c869d,0.0969640234257539,2025-06-10 05:18:02.217310,2025-06-10 05:18:02.223374,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1749513600000,""cic_Personal_Loans_...",Test,2025-06-10 13:17:51,2025-06-10 13:21:45,2025-06
4,2721940,0a6dbc7c-d3b6-454f-9497-bcef8ab79d19,0.1053860055362706,2025-06-15 05:50:42.241466,2025-06-15 05:50:42.247389,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1749945600000,""cic_Personal_Loans_...",Test,2025-06-15 13:50:33,2025-06-15 14:15:40,2025-06


In [10]:
df1 = dfd.copy()

### Train

In [11]:
sq = """WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    
    case when modelDisplayName = 'Alpha - CIC-SIL-Model' then 'cic_model_sil' else modelDisplayName end as modelDisplayName
    
    ,modelVersionId,
        case when trenchCategory is null then 'ALL' 
         when trenchCategory = '' then 'ALL'
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
  ),
base as 
(SELECT distinct
  r.customerId,r.digitalLoanAccountId,prediction score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'Alpha - CIC-SIL-Model' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime))) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by   coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) desc) = 1
)
select * from base
;
"""

dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

The shape of the dataframe is: (450861, 19)


Unnamed: 0,customerId,digitalLoanAccountId,score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,1998430,12928659-82e7-46b3-8273-7ceba6af82f9,0.123149,2025-12-13T11:19:18.801792,2025-12-13T11:19:18.801792,cic_model_sil,v1,SIL-Instore,F,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 964.0, ""cic_ze...",Train,2023-04-16 17:33:47,2023-04-16 17:54:37,2023-04
1,2087500,b648d234-75b8-4fef-89e9-68b11340c374,0.063274,2025-12-13T11:19:13.034864,2025-12-13T11:19:13.034864,cic_model_sil,v1,SIL-Instore,F,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_Personal_Loans_granted_contracts_amt_24M...",Train,2023-06-11 11:21:23,2023-06-11 11:51:24,2023-06
2,2199423,97fc071c-2a2d-493d-9f33-c18a7b52af78,0.126759,2025-12-13T11:19:19.618804,2025-12-13T11:19:19.618804,cic_model_sil,v1,SIL-Instore,M,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 1937.0, ""cic_z...",Train,2023-09-10 19:42:56,2023-09-10 19:45:47,2023-09
3,2209337,470aec72-a94c-40a3-897d-54ec3dae6574,0.16645,2025-12-13T11:19:15.984587,2025-12-13T11:19:15.984587,cic_model_sil,v1,SIL-Instore,M,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 176.0, ""cic_ve...",Train,2023-08-27 11:35:19,2023-08-27 11:44:05,2023-08
4,2209785,c9632cb2-8d6d-4679-9606-1beba49df58b,0.163603,2025-12-13T11:19:22.082892,2025-12-13T11:19:22.082892,cic_model_sil,v1,SIL-Instore,M,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 378.0, ""cic_ze...",Train,2024-09-08 13:08:05,NaT,2024-09


In [12]:
df2 = dfd.copy()

In [13]:
df_concat = pd.concat([df1, df2], ignore_index=True)
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")

The shape of the concatenated dataframe is: (557150, 19)


In [14]:
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")
df_combined = dropping_duplicates(df_concat)
print(f"The shape of the dataframe after dropping duplicates is: {df_combined.shape}")

The shape of the concatenated dataframe is: (557150, 19)
The shape of the dataframe after dropping duplicates is: (557150, 19)


In [15]:
df_combined['score'] = pd.to_numeric(df_combined['score'], errors='coerce')

### PSI calculation

In [16]:
# Usage Example:
psi_results = calculate_psi_for_model(
    dfcombined=df_combined,
    configdf=configdf,
    model_display_name='cic_model_sil'
)
psi_results.head()


Starting PSI Pipeline for Model: cic_model_sil
Total combinations to process: 5

Processing combination 56/5: modelVersionId=v1, trenchCategory=ALL
  Data points: 307884
  Features expanded successfully
  Features identified: 10
  PSI calculated: 890 rows
  Score PSI calculated: 89 rows
Processing combination 57/5: modelVersionId=v2, trenchCategory=ALL
  No data found for specific combination. Retrieving all data for modelVersionId=v2...
  Data points: 249266
  Features expanded successfully
  Features identified: 19
  PSI calculated: 418 rows
  Score PSI calculated: 22 rows
Processing combination 58/5: modelVersionId=v2, trenchCategory=Trench 1
  Data points: 227205
  Features expanded successfully
  Features identified: 19
  PSI calculated: 342 rows
  Score PSI calculated: 18 rows
Processing combination 59/5: modelVersionId=v2, trenchCategory=Trench 2
  Data points: 10425
  Features expanded successfully
  Features identified: 19
  PSI calculated: 342 rows
  Score PSI calculated: 18

Unnamed: 0,modelDisplayName,modelVersionId,trenchCategory,Feature,Feature_Type,Segment_Column,Segment_Value,Month,Base_Month,Current_Month,Base_Count,Actual_Count,Expected_Percentage,Actual_Percentage,PSI
0,cic_model_sil,v1,ALL,v1_Calc_cic_days_since_last_inquiry,numerical,Overall,All,2025-03,Train,2025-03,204453,5417,16.666667,16.666667,0.029234
1,cic_model_sil,v1,ALL,v1_Calc_cic_days_since_last_inquiry,numerical,Overall,All,2025-04,Train,2025-04,204453,541,16.666667,20.0,0.291721
2,cic_model_sil,v1,ALL,v1_Calc_cic_days_since_last_inquiry,numerical,Overall,All,2025-05,Train,2025-05,204453,858,16.666667,16.666667,0.207598
3,cic_model_sil,v1,ALL,v1_Calc_cic_days_since_last_inquiry,numerical,Overall,All,2025-06,Train,2025-06,204453,9934,16.666667,16.666667,0.158975
4,cic_model_sil,v1,ALL,v1_Calc_cic_days_since_last_inquiry,numerical,Overall,All,2025-07,Train,2025-07,204453,14043,16.666667,16.666667,0.157705


In [17]:
import re

psi_results['Feature'] = psi_results['Feature'].str.replace('_Calc_', '_', regex=False)

In [18]:
psi_results['Feature'].value_counts(dropna=False)

Feature
score                                              165
v1_cic_zero_granted_ever_flag                       89
v1_cic_cnt_active_contracts                         89
v1_run_date                                         89
v1_cic_vel_contract_nongranted_cnt_12on24           89
v1_cic_Personal_Loans_granted_contracts_amt_24M     89
v1_cic_days_since_last_inquiry                      89
v1_cic_vel_contract_granted_amt_12on24              89
v1_cic_tot_active_contracts_util                    89
v1_cic_zero_non_granted_ever_flag                   89
v1_cic_max_amt_granted_24M                          89
v2_vel_contract_nongranted_cnt_6on12                76
v2_customerId                                       76
v2_digitalLoanAccountId                             76
v2_Personal_Loans_granted_contracts_amt_24M         76
v2_vel_contract_granted_amt_6on12                   76
v2_tot_active_contracts_util                        76
v2_cnt_active_contracts                             76
v2

In [19]:

# List of features to remove
remove_features = ['v2_customerId', 'v2_digitalLoanAccountId','v2_crifApplicationId', 'v1_run_date']

# Drop rows where feature is in the list
psi_results = psi_results[~psi_results['Feature'].isin(remove_features)]

# Replace 'score' with 'Alpha_cic_sil_score' in the Feature column
psi_results['Feature'] = psi_results['Feature'].replace('score', 'Alpha_cic_sil_score')

# Replace values starting with 'calc_' by removing the prefix
psi_results['Feature'] = psi_results['Feature'].apply(
    lambda x: x[5:] if x.startswith('calc_') else x
)



In [20]:
psi_results[['modelVersionId','Feature']].value_counts()

modelVersionId  Feature                                        
v1              Alpha_cic_sil_score                                89
                v1_cic_vel_contract_granted_amt_12on24             89
                v1_cic_Personal_Loans_granted_contracts_amt_24M    89
                v1_cic_zero_granted_ever_flag                      89
                v1_cic_vel_contract_nongranted_cnt_12on24          89
                v1_cic_zero_non_granted_ever_flag                  89
                v1_cic_tot_active_contracts_util                   89
                v1_cic_max_amt_granted_24M                         89
                v1_cic_days_since_last_inquiry                     89
                v1_cic_cnt_active_contracts                        89
v2              v2_granted_contracts_cnt_6M                        76
                v2_vel_contract_granted_amt_6on12                  76
                v2_vel_contract_closed_amt_3on12                   76
                v2_total_o

In [21]:
# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_psi_v5"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=483f14f0-4ad0-4b09-8d01-40d8045ed543>

### Alpha Sil Stack Model

### Test

In [22]:
## This is for the test period of Alpha - CIC sil model - reading the data from ml_model_run_details
# Sil_Alpha_Stack_score
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
        case when modelDisplayName = 'Alpha - StackingModel' then 'alpha_stack_model_sil' else modelDisplayName end as modelDisplayName    
    ,modelVersionId,
    case when trenchCategory is null then 'ALL' 
         when trenchCategory='' then 'ALL'    
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Alpha - StackingModel', 'alpha_stack_model_sil')
  ),
base as
(SELECT distinct
  r.customerId,r.digitalLoanAccountId,prediction score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'alpha_stack_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) desc) = 1
)
select * from base
;
"""
dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

## this data is not expanded. We will have to expand and get the features from the calcFeatures column

The shape of the dataframe is: (106300, 19)


Unnamed: 0,customerId,digitalLoanAccountId,score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,2261106,0bda7051-d30e-4512-b29d-948d72992c05,0.1170764422428511,2025-11-16 02:50:16.875884,2025-11-16 02:50:16.881792,alpha_stack_model_sil,v1,SIL Competitor,M,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.1247255294, ""s_cic_score"":...",Test,2025-11-16 10:50:04,NaT,2025-11
1,2396022,9b7e76ca-305b-406c-b461-e47a2fb504d4,0.3400910674508632,2025-08-30 07:10:16.442287,2025-08-30 07:10:16.448435,alpha_stack_model_sil,v1,SIL-Instore,M,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.0791552022, ""s_cic_score"":...",Test,2025-08-30 15:10:06,NaT,2025-08
2,2666605,1bd7de9c-3ba0-4c85-8ffd-2e0583254e74,0.1594890174794873,2025-12-05 03:53:13.250396,2025-12-05 03:53:13.257068,alpha_stack_model_sil,v1,SIL ZERO,F,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.1373821611, ""s_cic_score"":...",Test,2025-12-05 11:52:47,2025-12-05 12:00:49,2025-12
3,2767429,38c5fff7-94b1-4475-a548-1002b7729164,0.201786610135768,2025-11-17 08:35:47.951855,2025-11-17 08:35:47.957703,alpha_stack_model_sil,v1,SIL-Instore,M,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.2882636452, ""s_cic_score"":...",Test,2025-11-17 16:35:36,NaT,2025-11
4,2809564,2368e920-d37b-461e-811e-4b753fcb2822,0.1099447335224447,2025-08-29 05:25:57.795590,2025-08-29 05:25:57.800682,alpha_stack_model_sil,v1,SIL Competitor,M,Appliance,ios,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.1052552229, ""s_cic_score"":...",Test,2025-08-29 13:25:47,NaT,2025-08


In [23]:
df1 = dfd.copy()

### Train

In [24]:
sq = """WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    
    case when modelDisplayName = 'Alpha - StackingModel' then 'alpha_stack_model_sil' else modelDisplayName end as modelDisplayName 
     ,modelVersionId,
        case when trenchCategory is null then 'ALL' 
         when trenchCategory = '' then 'ALL'
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Alpha - StackingModel', 'alpha_stack_model_sil')
  ),
base as 
(SELECT distinct
  r.customerId,r.digitalLoanAccountId,prediction score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'alpha_stack_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime))) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by   coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) desc) = 1
)
select * from base
;
"""

dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

The shape of the dataframe is: (459026, 19)


Unnamed: 0,customerId,digitalLoanAccountId,score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,1564678,1cc0c2c5-bc70-4c3a-a341-ba004f0e1e08,0.523477,2025-12-13T11:42:36.200377,2025-12-13T11:42:36.200377,alpha_stack_model_sil,v2,SIL-Instore,M,Appliance,android,alpha_stack_model_sil,SIL,Trench 2,"{""sb_demo_score"": 0.4063484569105422, ""apps_sc...",Train,2025-01-16 15:33:47,NaT,2025-01
1,1771165,846af18e-e3f5-4e3f-832d-1ae49ec8258a,0.6419,2025-12-13T11:42:35.448685,2025-12-13T11:42:35.448685,alpha_stack_model_sil,v2,SIL Competitor,F,Mall,ios,alpha_stack_model_sil,SIL,Trench 2,"{""sb_demo_score"": 0.47280119049185254, ""s_cred...",Train,2025-04-23 19:47:08,2025-04-23 19:57:47,2025-04
2,1904792,043ce842-af38-4b77-9087-e5592d34427d,0.07773,2025-12-13T11:25:12.936548,2025-12-13T11:25:12.936548,alpha_stack_model_sil,v1,SIL-Instore,F,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.11030846167149307, ""s_cic_...",Train,2023-02-17 14:49:09,NaT,2023-02
3,1924045,9e43c7af-62b7-4df8-85d4-1f16cb40a060,0.124388,2025-12-13T11:25:30.170465,2025-12-13T11:25:30.170465,alpha_stack_model_sil,v1,SIL-Instore,M,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.2794174222858253, ""s_cic_s...",Train,2023-06-10 14:37:11,NaT,2023-06
4,2056538,7b9abb97-a8fb-4cd9-8abb-0e3e8f9f70bb,0.081715,2025-12-13T11:25:23.914347,2025-12-13T11:25:23.914347,alpha_stack_model_sil,v1,SIL-Instore,F,Appliance,android,alpha_stack_model_sil,SIL,ALL,"{""sb_demo_score"": 0.09083004173894793, ""s_cic_...",Train,2023-05-22 17:31:13,2023-05-22 18:17:38,2023-05


In [25]:
df2 = dfd.copy()

In [26]:
df_concat = pd.concat([df1, df2], ignore_index=True)
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")

The shape of the concatenated dataframe is: (565326, 19)


In [27]:
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")
df_combined = dropping_duplicates(df_concat)
print(f"The shape of the dataframe after dropping duplicates is: {df_combined.shape}")

The shape of the concatenated dataframe is: (565326, 19)
The shape of the dataframe after dropping duplicates is: (565326, 19)


In [28]:
df_combined['score'] = pd.to_numeric(df_combined['score'], errors='coerce')

### PSI calculation

In [29]:
# Usage Example:
psi_results = calculate_psi_for_model(
    dfcombined=df_combined,
    configdf=configdf,
    model_display_name='alpha_stack_model_sil'
)
psi_results.head()


Starting PSI Pipeline for Model: alpha_stack_model_sil
Total combinations to process: 5

Processing combination 36/5: modelVersionId=v1, trenchCategory=ALL
  Data points: 307884
  Features expanded successfully
  Features identified: 4
  PSI calculated: 356 rows
  Score PSI calculated: 89 rows
Processing combination 37/5: modelVersionId=v2, trenchCategory=ALL
  No data found for specific combination. Retrieving all data for modelVersionId=v2...
  Data points: 257442
  Features expanded successfully
  Features identified: 12
  PSI calculated: 264 rows
  Score PSI calculated: 22 rows
Processing combination 38/5: modelVersionId=v2, trenchCategory=Trench 1
  Data points: 234604
  Features expanded successfully
  Features identified: 12
  PSI calculated: 216 rows
  Score PSI calculated: 18 rows
Processing combination 39/5: modelVersionId=v2, trenchCategory=Trench 2
  Data points: 10843
  Features expanded successfully
  Features identified: 12
  PSI calculated: 216 rows
  Score PSI calcula

Unnamed: 0,modelDisplayName,modelVersionId,trenchCategory,Feature,Feature_Type,Segment_Column,Segment_Value,Month,Base_Month,Current_Month,Base_Count,Actual_Count,Expected_Percentage,Actual_Percentage,PSI
0,alpha_stack_model_sil,v1,ALL,v1_Calc_sb_demo_score,numerical,Overall,All,2025-03,Train,2025-03,204453,5417,10.0,9.090909,0.206866
1,alpha_stack_model_sil,v1,ALL,v1_Calc_sb_demo_score,numerical,Overall,All,2025-04,Train,2025-04,204453,541,10.0,9.090909,1.635542
2,alpha_stack_model_sil,v1,ALL,v1_Calc_sb_demo_score,numerical,Overall,All,2025-05,Train,2025-05,204453,858,10.0,10.0,0.158906
3,alpha_stack_model_sil,v1,ALL,v1_Calc_sb_demo_score,numerical,Overall,All,2025-06,Train,2025-06,204453,9934,10.0,10.0,0.190354
4,alpha_stack_model_sil,v1,ALL,v1_Calc_sb_demo_score,numerical,Overall,All,2025-07,Train,2025-07,204453,14043,10.0,10.0,0.199356


In [30]:
psi_results['Feature'] = psi_results['Feature'].str.replace('_Calc_', '_', regex=False)
psi_results['Feature'].value_counts(dropna=False)


Feature
score                    165
v1_sb_demo_score          89
v1_s_apps_score           89
v1_s_credo_score          89
v1_s_cic_score            89
v2_is_android             76
v2_sa_cic_score_norm      76
v2_s_credo_score_norm     76
v2_apps_score_norm        76
v2_sb_demo_score_norm     76
v2_sa_cic_score           76
v2_ln_os_type             76
v2_trench_category        76
v2_s_credo_score          76
v2_apps_score             76
v2_sb_demo_score          76
v2_sa_stack_score         76
Name: count, dtype: int64

In [31]:
psi_results['Feature'].unique()

array(['v1_sb_demo_score', 'v1_s_cic_score', 'v1_s_apps_score',
       'v1_s_credo_score', 'score', 'v2_sb_demo_score', 'v2_apps_score',
       'v2_s_credo_score', 'v2_sa_cic_score', 'v2_trench_category',
       'v2_ln_os_type', 'v2_is_android', 'v2_sb_demo_score_norm',
       'v2_apps_score_norm', 'v2_s_credo_score_norm',
       'v2_sa_cic_score_norm', 'v2_sa_stack_score'], dtype=object)

In [32]:

# List of features to remove, '
remove_features = [ 'v2_trench_category',
       'v2_ln_os_type', 'v2_is_android', 'v2_sb_demo_score_norm',
       'v2_apps_score_norm', 'v2_s_credo_score_norm',
       'v2_sa_cic_score_norm',
                       ]
# Drop rows where feature is in the list
psi_results = psi_results[~psi_results['Feature'].isin(remove_features)]

# Replace 'score' with 'Sil_Alpha_Stack_score' in the Feature column
psi_results['Feature'] = psi_results['Feature'].replace('score', 'Sil_Alpha_Stack_score')

# Replace values starting with 'calc_' by removing the prefix
psi_results['Feature'] = psi_results['Feature'].apply(
    lambda x: x[5:] if x.startswith('calc_') else x
)




In [33]:
psi_results[['modelVersionId','Feature']].value_counts()
# df2.rename(columns={'calc_beta_demo_score':'calc_sb_demo_score',
#                      'calc_cic_score':'calc_s_cic_score',
#                       'calc_apps_score':'calc_s_apps_score',
#                         'calc_credo_gen_score':'calc_s_credo_score'}, inplace = True)

modelVersionId  Feature              
v1              Sil_Alpha_Stack_score    89
                v1_s_apps_score          89
                v1_s_cic_score           89
                v1_s_credo_score         89
                v1_sb_demo_score         89
v2              Sil_Alpha_Stack_score    76
                v2_apps_score            76
                v2_s_credo_score         76
                v2_sa_cic_score          76
                v2_sa_stack_score        76
                v2_sb_demo_score         76
Name: count, dtype: int64

In [34]:
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_sil_stack_model_psi_v5"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=ad48be6b-9124-4c40-b5ea-d052171af227>

### Beta Sil App Score

### Test

In [35]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    case when modelDisplayName = 'Beta - AppsScoreModel' then 'apps_score_model_sil' else modelDisplayName end as modelDisplayName ,   
    modelVersionId,
    case when trenchCategory is null then 'ALL' 
         when trenchCategory='' then 'ALL'    
            else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Beta - AppsScoreModel', 'apps_score_model_sil')
    
  ),
base as 
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'apps_score_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  'Test' Data_selection,
  -- sil_beta_app_score
  safe_cast(JSON_VALUE(prediction_clean, "$.combined_score") AS float64) as score,
 calcFeature calcFeatures,
    coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid,  modelVersionId  order by coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) desc) = 1
  )
select * from base
;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,Data_selection,score,calcFeatures,appln_submit_datetime,disbursementDateTime,Application_month
0,1182567,5de29570-dca7-41cf-8937-ac9c6be445e3,"{'cb_score': 0.48666602482737553, 'dl_score': ...",2025-11-16 05:35:59.061189,2025-11-16 05:35:59.540910,apps_score_model_sil,v1,SIL-Instore,F,Mall,android,apps_score_model_sil,SIL,ALL,Test,0.419795,"{""app_cnt_rated_for_3plus_ever"":26,""app_cnt_ed...",2025-11-16 13:35:55,2025-11-16 13:39:08,2025-11
1,1790153,e18ed625-1281-43c9-b080-0a305d893d1b,"{'cb_score': 0.383732867856588, 'dl_score': 0....",2025-10-08 09:48:35.717839,2025-10-08 09:48:36.186986,apps_score_model_sil,v1,SIL-Instore,M,Appliance,android,apps_score_model_sil,SIL,ALL,Test,0.365955,"{""app_cnt_rated_for_3plus_ever"":20,""app_cnt_ed...",2025-10-08 17:48:32,2025-10-08 17:57:03,2025-10
2,1796256,ff2e92e8-7549-4b67-8628-85dd11c75b92,"{'cb_score': 0.43504328301823997, 'dl_score': ...",2025-06-17 14:52:23.287023,2025-06-17 14:52:23.689920,apps_score_model_sil,v1,SIL-Instore,F,Appliance,android,apps_score_model_sil,SIL,ALL,Test,0.434787,"{""app_cnt_rated_for_3plus_ever"":14,""app_cnt_ed...",2025-06-15 16:56:49,2025-06-15 16:59:45,2025-06
3,2063303,9929b650-13ee-420e-a59b-b285e7e8089d,"{'cb_score': 0.3275786296470868, 'dl_score': 0...",2025-06-29 03:08:48.017231,2025-06-29 03:08:48.513659,apps_score_model_sil,v1,SIL-Instore,M,Mall,android,apps_score_model_sil,SIL,ALL,Test,0.435167,"{""app_cnt_rated_for_3plus_ever"":25,""app_cnt_ed...",2025-06-29 11:08:44,2025-07-01 11:57:27,2025-06
4,2207073,a631e405-d0d9-416d-b9fb-c63c8a996a7b,"{'cb_score': 0.3554982624707621, 'dl_score': 0...",2025-11-07 07:58:23.946087,2025-11-07 07:58:24.381201,apps_score_model_sil,v1,SIL Competitor,M,Mall,android,apps_score_model_sil,SIL,ALL,Test,0.410058,"{""app_cnt_rated_for_3plus_ever"":65,""app_cnt_ed...",2025-11-07 15:58:21,NaT,2025-11


In [36]:
df1 = dfd.copy()

### Train

In [37]:
sq = """ 
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    case when modelDisplayName = 'Beta - AppsScoreModel' then 'apps_score_model_sil' else modelDisplayName end as modelDisplayName 
     ,modelVersionId,
        case when trenchCategory is null then 'ALL' 
         when trenchCategory = '' then 'ALL'
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(cast(prediction as string), "'", '"'), "None", "null") AS prediction_clean
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Beta - AppsScoreModel', 'apps_score_model_sil')
      ),
base as 
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'apps_score_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  'Train' Data_selection,
  coalesce(prediction, safe_cast(JSON_VALUE(prediction_clean, "$.combined_score") AS float64)) as score,
 calcFeature calcFeatures,
    IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
 qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by   coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) desc) = 1
)
select * from base
;
"""

dfd = client.query(sq).to_dataframe()
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,Data_selection,score,calcFeatures,appln_submit_datetime,disbursementDateTime,Application_month
0,1111410,fa114ef3-7bbe-4031-8131-0713f7e74a3d,0.397246,2025-12-13T11:26:05.003406,2025-12-13T11:26:05.003406,apps_score_model_sil,v1,SIL Repeat,F,Appliance,android,apps_score_model_sil,SIL,ALL,Train,0.397246,"{""app_cnt_rated_for_3plus_ever"": 45.0, ""app_cn...",2025-01-07 16:02:44,NaT,2025-01
1,1183310,3ca56675-900c-4fc6-a801-74500cd41189,0.253085,2025-12-13T11:26:21.565043,2025-12-13T11:26:21.565043,apps_score_model_sil,v1,SIL-Instore,M,Appliance,android,apps_score_model_sil,SIL,ALL,Train,0.253085,"{""app_cnt_rated_for_3plus_ever"": 85.0, ""app_cn...",2024-09-09 12:24:14,NaT,2024-09
2,1276833,c5068e92-3dda-4be6-9f25-e9a9e5b339d9,0.549104,2025-12-13T11:26:39.439063,2025-12-13T11:26:39.439063,apps_score_model_sil,v1,SIL-Instore,F,Appliance,android,apps_score_model_sil,SIL,ALL,Train,0.549104,"{""app_cnt_rated_for_3plus_ever"": 26.0, ""app_cn...",2024-05-05 15:53:10,NaT,2024-05
3,1410278,979e190b-2e35-4d45-b566-507f79be3178,0.604922,2025-12-13T11:26:37.795517,2025-12-13T11:26:37.795517,apps_score_model_sil,v1,SIL ZERO,F,Appliance,android,apps_score_model_sil,SIL,ALL,Train,0.604922,"{""app_cnt_rated_for_3plus_ever"": 14.0, ""app_cn...",2024-09-15 14:36:31,NaT,2024-09
4,1416748,78e01765-e029-4f14-a04d-75c576ad66f2,0.501816,2025-12-13T11:26:05.142948,2025-12-13T11:26:05.142948,apps_score_model_sil,v1,SIL-Instore,M,Appliance,android,apps_score_model_sil,SIL,ALL,Train,0.501816,"{""app_cnt_rated_for_3plus_ever"": 23.0, ""app_cn...",2023-04-17 18:23:51,2023-04-17 18:45:49,2023-04


In [38]:
df2 = dfd.copy()

In [39]:
df_concat = pd.concat([df1, df2], ignore_index=True)
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")

The shape of the concatenated dataframe is: (791226, 20)


In [40]:
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")
df_combined = dropping_duplicates(df_concat)
print(f"The shape of the dataframe after dropping duplicates is: {df_combined.shape}")

The shape of the concatenated dataframe is: (791226, 20)
The shape of the dataframe after dropping duplicates is: (791226, 20)


In [41]:
df_combined['score'] = pd.to_numeric(df_combined['score'], errors='coerce')

### PSI calculation

In [42]:
# Usage Example:
psi_results = calculate_psi_for_model(
    dfcombined=df_combined,
    configdf=configdf,
    model_display_name='apps_score_model_sil'
)
psi_results.head()


Starting PSI Pipeline for Model: apps_score_model_sil
Total combinations to process: 5

Processing combination 41/5: modelVersionId=v1, trenchCategory=ALL
  Data points: 458248
  Features expanded successfully
  Features identified: 18
  PSI calculated: 1422 rows
  Score PSI calculated: 79 rows
Processing combination 42/5: modelVersionId=v2, trenchCategory=ALL
  No data found for specific combination. Retrieving all data for modelVersionId=v2...
  Data points: 332978
  Features expanded successfully
  Features identified: 30
  PSI calculated: 630 rows
  Score PSI calculated: 21 rows
Processing combination 43/5: modelVersionId=v2, trenchCategory=Trench 1
  Data points: 300128
  Features expanded successfully
  Features identified: 10
  PSI calculated: 160 rows
  Score PSI calculated: 16 rows
Processing combination 44/5: modelVersionId=v2, trenchCategory=Trench 2
  Data points: 12489
  Features expanded successfully
  Features identified: 10
  PSI calculated: 170 rows
  Score PSI calcul

Unnamed: 0,modelDisplayName,modelVersionId,trenchCategory,Feature,Feature_Type,Segment_Column,Segment_Value,Month,Base_Month,Current_Month,Base_Count,Actual_Count,Expected_Percentage,Actual_Percentage,PSI
0,apps_score_model_sil,v1,ALL,v1_Calc_app_cnt_rated_for_3plus_ever,numerical,Overall,All,2025-03,Train,2025-03,317385,6506,10.0,10.0,0.004194
1,apps_score_model_sil,v1,ALL,v1_Calc_app_cnt_rated_for_3plus_ever,numerical,Overall,All,2025-04,Train,2025-04,317385,914,10.0,10.0,0.004939
2,apps_score_model_sil,v1,ALL,v1_Calc_app_cnt_rated_for_3plus_ever,numerical,Overall,All,2025-05,Train,2025-05,317385,1389,10.0,10.0,0.011002
3,apps_score_model_sil,v1,ALL,v1_Calc_app_cnt_rated_for_3plus_ever,numerical,Overall,All,2025-06,Train,2025-06,317385,14507,10.0,10.0,0.00349
4,apps_score_model_sil,v1,ALL,v1_Calc_app_cnt_rated_for_3plus_ever,numerical,Overall,All,2025-07,Train,2025-07,317385,20089,10.0,10.0,0.00784


In [43]:

psi_results['Feature'] = psi_results['Feature'].str.replace('_Calc_', '_', regex=False)
psi_results[['modelVersionId','Feature']].value_counts()


modelVersionId  Feature                                       
v1              score                                             79
                v1_app_avg_time_bw_installed_mins_30d             79
                v1_app_vel_finance_30_over_365                    79
                v1_app_median_time_bw_installed_mins_30d          79
                v1_app_first_payday_install_to_apply_days         79
                v1_app_first_competitors_install_to_apply_days    79
                v1_app_cnt_travel_and_local_ever                  79
                v1_app_cnt_rated_for_3plus_ever                   79
                v1_app_cnt_payday_90d                             79
                v1_app_cnt_music_and_audio_ever                   79
                v1_app_cnt_finance_90d                            79
                v1_app_cnt_finance_7d                             79
                v1_app_cnt_finance_30d                            79
                v1_app_cnt_education_eve

In [44]:

# List of features to remove, '
remove_features = ['v2_appScoreModel'                       ]
# Drop rows where feature is in the list
psi_results = psi_results[~psi_results['Feature'].isin(remove_features)]

# Replace 'score' with 'Sil_Alpha_Stack_score' in the Feature column
psi_results['Feature'] = psi_results['Feature'].replace('score', 'sil_beta_app_score')

# Replace values starting with 'calc_' by removing the prefix
psi_results['Feature'] = psi_results['Feature'].apply(
    lambda x: x[5:] if x.startswith('calc_') else x
)




In [45]:
psi_results[['modelVersionId','Feature']].value_counts()

modelVersionId  Feature                                       
v1              sil_beta_app_score                                79
                v1_app_cnt_finance_90d                            79
                v1_app_avg_time_bw_installed_mins_30d             79
                v1_app_median_time_bw_installed_mins_30d          79
                v1_app_first_payday_install_to_apply_days         79
                v1_app_first_competitors_install_to_apply_days    79
                v1_app_cnt_travel_and_local_ever                  79
                v1_app_cnt_rated_for_3plus_ever                   79
                v1_app_cnt_payday_90d                             79
                v1_app_cnt_music_and_audio_ever                   79
                v1_app_vel_finance_30_over_365                    79
                v1_app_cnt_finance_7d                             79
                v1_app_cnt_business_ever                          79
                v1_app_cnt_finance_30d  

In [46]:
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_appscore_model_psi_v5"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=dd71ce7a-d72a-4a1f-9584-732909b37af6>

## Beta SIL Demo Score

### Test

In [49]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    case when modelDisplayName = 'Beta - DemoScoreModel' then 'beta_demo_model_sil' else modelDisplayName end as modelDisplayName ,   
    modelVersionId,
    case when trenchCategory is null then 'ALL' 
         when trenchCategory='' then 'ALL'    
            else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in  ('Beta - DemoScoreModel', 'beta_demo_model_sil')
  ),
base as 
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'beta_demo_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  'Test' Data_selection,
  -- sil_beta_demo_score
  prediction as score,
 calcFeature calcFeatures,
    coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid,  modelVersionId  order by coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) desc) = 1
  )
select * from base
;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,Data_selection,score,calcFeatures,appln_submit_datetime,disbursementDateTime,Application_month
0,1000721,3ed28cd0-2b7c-4f6b-9a37-46a5761466b2,0.1097859906,2025-12-14 03:36:52.366620,2025-12-14 03:36:52.434902,beta_demo_model_sil,v1,SIL Competitor,M,Appliance,android,beta_demo_model_sil,SIL,ALL,Test,0.1097859906,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2025-12-14 11:36:46,2025-12-14 11:50:22,2025-12
1,1350234,51a22976-5da1-4097-9f82-5fe008a65a6f,0.0810933792,2025-07-27 04:28:30.970945,2025-07-27 04:28:31.033531,beta_demo_model_sil,v1,SIL-Instore,M,Appliance,android,beta_demo_model_sil,SIL,ALL,Test,0.0810933792,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2025-07-27 12:28:28,2025-07-27 12:50:14,2025-07
2,1732850,9b015464-3467-40a9-a72d-33409e99c2e0,0.1418460966,2025-08-16 08:19:40.443770,2025-08-16 08:19:40.508948,beta_demo_model_sil,v1,SIL Competitor,F,Mall,ios,beta_demo_model_sil,SIL,ALL,Test,0.1418460966,"{""beta_de_ln_vas_opted_flag"": ""0"", ""beta_de_ln...",2025-08-16 16:19:38,NaT,2025-08
3,2138075,902b04e6-22db-41a2-8677-be2429641eda,0.41393396390825,2025-12-16 03:22:00.477045,2025-12-16 03:22:01.124216,beta_demo_model_sil,v2,SIL Competitor,F,Appliance,android,beta_demo_model_sil,SIL,Trench 2,Test,0.41393396390825,"{""demoScoreModel"": {""ln_vas_opted_flag"": ""0"", ...",2025-12-16 11:21:53,2025-12-16 15:25:36,2025-12
4,2237251,bc40948e-9aa8-4480-bf6f-a34ced197f17,0.1185251639,2025-12-07 03:56:51.890091,2025-12-07 03:56:51.958922,beta_demo_model_sil,v1,SIL Competitor,F,Appliance,ios,beta_demo_model_sil,SIL,ALL,Test,0.1185251639,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2025-12-07 11:56:36,2025-12-07 12:01:04,2025-12


In [50]:
df1 = dfd.copy()

### Train

In [51]:
sq = """ 
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    case when modelDisplayName = 'Beta - DemoScoreModel' then 'beta_demo_model_sil' else modelDisplayName end as modelDisplayName    
     ,modelVersionId,
        case when trenchCategory is null then 'ALL' 
         when trenchCategory = '' then 'ALL'
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(cast(prediction as string), "'", '"'), "None", "null") AS prediction_clean
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in  ('Beta - DemoScoreModel', 'beta_demo_model_sil')
      ),
base as 
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'beta_demo_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  'Train' Data_selection,
  prediction as score,
 calcFeature calcFeatures,
    IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
 qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by   coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) desc) = 1
)
select * from base
;
"""

dfd = client.query(sq).to_dataframe()
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,Data_selection,score,calcFeatures,appln_submit_datetime,disbursementDateTime,Application_month
0,1055817,ff1c832f-bfea-4c37-9384-8960636513ed,0.157715,2025-12-13T11:28:21.769592,2025-12-13T11:28:21.769592,beta_demo_model_sil,v1,SIL Repeat,M,Appliance,android,beta_demo_model_sil,SIL,ALL,Train,0.157715,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-12-20 14:41:24,2024-12-20 14:44:10,2024-12
1,1075642,0c0ff939-0d41-4b5a-933e-dc86498f2746,0.536496,2025-12-13T11:46:15.243387,2025-12-13T11:46:15.243387,beta_demo_model_sil,v2,SIL-Instore,M,Appliance,android,beta_demo_model_sil,SIL,Trench 2,Train,0.536496,"{""ln_vas_opted_flag"": ""1"", ""ln_doc_type_rolled...",2025-01-06 15:28:43,2025-01-06 15:33:05,2025-01
2,1100922,d77d6f91-d27c-4035-9caf-5b49d3c9cca5,0.091887,2025-12-13T11:28:35.023087,2025-12-13T11:28:35.023087,beta_demo_model_sil,v1,SIL Repeat,F,Appliance,android,beta_demo_model_sil,SIL,ALL,Train,0.091887,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-12-27 11:19:26,2024-12-27 11:22:02,2024-12
3,1123651,f2895209-f888-404f-8288-569239f41ebf,0.094283,2025-12-13T11:28:23.574565,2025-12-13T11:28:23.574565,beta_demo_model_sil,v1,SIL Repeat,M,Appliance,ios,beta_demo_model_sil,SIL,ALL,Train,0.094283,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-12-21 10:47:52,NaT,2024-12
4,1201657,e45874a5-6f20-4898-b158-e301a7191628,0.236439,2025-12-13T11:28:09.752125,2025-12-13T11:28:09.752125,beta_demo_model_sil,v1,SIL-Instore,F,Appliance,android,beta_demo_model_sil,SIL,ALL,Train,0.236439,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-07-20 18:29:48,NaT,2024-07


In [52]:
df2 = dfd.copy()

In [53]:
df_concat = pd.concat([df1, df2], ignore_index=True)
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")

The shape of the concatenated dataframe is: (885339, 20)


In [54]:
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")
df_combined = dropping_duplicates(df_concat)
print(f"The shape of the dataframe after dropping duplicates is: {df_combined.shape}")

The shape of the concatenated dataframe is: (885339, 20)
The shape of the dataframe after dropping duplicates is: (885339, 20)


In [55]:
df_combined['score'] = pd.to_numeric(df_combined['score'], errors='coerce')

### PSI calculation

In [56]:
# Usage Example:
psi_results = calculate_psi_for_model(
    dfcombined=df_combined,
    configdf=configdf,
    model_display_name='beta_demo_model_sil'
)
psi_results.head()


Starting PSI Pipeline for Model: beta_demo_model_sil
Total combinations to process: 5

Processing combination 46/5: modelVersionId=v1, trenchCategory=ALL
  Data points: 508096
  Features expanded successfully
  Features identified: 17
  PSI calculated: 1513 rows
  Score PSI calculated: 89 rows
Processing combination 47/5: modelVersionId=v2, trenchCategory=ALL
  No data found for specific combination. Retrieving all data for modelVersionId=v2...
  Data points: 377243
  Features expanded successfully
  Features identified: 16
  PSI calculated: 352 rows
  Score PSI calculated: 22 rows
Processing combination 48/5: modelVersionId=v2, trenchCategory=Trench 1
  Data points: 349745
  Features expanded successfully
  Features identified: 16
  PSI calculated: 288 rows
  Score PSI calculated: 18 rows
Processing combination 49/5: modelVersionId=v2, trenchCategory=Trench 2
  Data points: 13501
  Features expanded successfully
  Features identified: 16
  PSI calculated: 288 rows
  Score PSI calcula

Unnamed: 0,modelDisplayName,modelVersionId,trenchCategory,Feature,Feature_Type,Segment_Column,Segment_Value,Month,Base_Month,Current_Month,Base_Count,Actual_Count,Expected_Percentage,Actual_Percentage,PSI
0,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-03,Train,2025-03,349207,7278,50.0,50.0,0.038641
1,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-04,Train,2025-04,349207,1039,50.0,50.0,0.041896
2,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-05,Train,2025-05,349207,1561,50.0,50.0,0.047286
3,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-06,Train,2025-06,349207,16368,50.0,50.0,0.017988
4,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-07,Train,2025-07,349207,22790,50.0,50.0,0.01152


In [57]:

psi_results['Feature'] = psi_results['Feature'].str.replace('_Calc_', '_', regex=False)
psi_results[['modelVersionId','Feature']].value_counts()


modelVersionId  Feature                               
v1              score                                     89
                v1_beta_de_ln_province_bin                89
                v1_beta_de_ln_age_bin                     89
                v1_beta_de_onb_name_email_match_score     89
                v1_beta_de_ln_vas_opted_flag              89
                v1_beta_de_ln_telconame                   89
                v1_beta_de_ln_source_of_funds_new_bin     89
                v1_beta_de_ln_ref2_type                   89
                v1_beta_de_ln_ref1_type                   89
                v1_beta_de_time_bw_onb_loan_appln_mins    89
                v1_beta_de_ln_marital_status              89
                v1_beta_de_ln_employment_type_new_bin     89
                v1_beta_de_ln_email_primary_domain        89
                v1_beta_de_ln_education_level             89
                v1_beta_de_ln_doc_type_rolled             89
                v1_beta_de_ln_

In [58]:

# List of features to remove, '
remove_features = ['v2_appScoreModel'                       ]
# Drop rows where feature is in the list
psi_results = psi_results[~psi_results['Feature'].isin(remove_features)]

# Replace 'score' with 'Sil_Alpha_Stack_score' in the Feature column
psi_results['Feature'] = psi_results['Feature'].replace('score', 'sil_beta_demo_score')

# Replace values starting with 'calc_' by removing the prefix
psi_results['Feature'] = psi_results['Feature'].apply(
    lambda x: x[5:] if x.startswith('calc_') else x
)




In [59]:
psi_results[['modelVersionId','Feature']].value_counts()

modelVersionId  Feature                               
v1              sil_beta_demo_score                       89
                v1_beta_de_ln_province_bin                89
                v1_beta_de_ln_age_bin                     89
                v1_beta_de_onb_name_email_match_score     89
                v1_beta_de_ln_vas_opted_flag              89
                v1_beta_de_ln_telconame                   89
                v1_beta_de_ln_source_of_funds_new_bin     89
                v1_beta_de_ln_ref2_type                   89
                v1_beta_de_ln_ref1_type                   89
                v1_beta_de_time_bw_onb_loan_appln_mins    89
                v1_beta_de_ln_marital_status              89
                v1_beta_de_ln_employment_type_new_bin     89
                v1_beta_de_ln_email_primary_domain        89
                v1_beta_de_ln_education_level             89
                v1_beta_de_ln_doc_type_rolled             89
                v1_beta_de_ln_

In [60]:
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_demo_score_model_psi_v5"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=bcb10df4-1a9a-421a-9a74-1fd185795cc1>

## Beta SIL STACK Score Model

### Test

In [None]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    case when modelDisplayName = ''Beta - StackScoreModel' then 'beta_stack_model_sil' else modelDisplayName end as modelDisplayName ,   
    modelVersionId,
    case when trenchCategory is null then 'ALL' 
         when trenchCategory='' then 'ALL'    
            else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Beta - StackScoreModel', 'beta_stack_model_sil')
  ),
base as 
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'beta_stack_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  'Test' Data_selection,
  -- sil_beta_stack_score
  prediction as score,
 calcFeature calcFeatures,
    coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid,  modelVersionId  order by coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) desc) = 1
  )
select * from base
;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,Data_selection,score,calcFeatures,appln_submit_datetime,disbursementDateTime,Application_month
0,1000721,3ed28cd0-2b7c-4f6b-9a37-46a5761466b2,0.1097859906,2025-12-14 03:36:52.366620,2025-12-14 03:36:52.434902,beta_demo_model_sil,v1,SIL Competitor,M,Appliance,android,beta_demo_model_sil,SIL,ALL,Test,0.1097859906,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2025-12-14 11:36:46,2025-12-14 11:50:22,2025-12
1,1350234,51a22976-5da1-4097-9f82-5fe008a65a6f,0.0810933792,2025-07-27 04:28:30.970945,2025-07-27 04:28:31.033531,beta_demo_model_sil,v1,SIL-Instore,M,Appliance,android,beta_demo_model_sil,SIL,ALL,Test,0.0810933792,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2025-07-27 12:28:28,2025-07-27 12:50:14,2025-07
2,1732850,9b015464-3467-40a9-a72d-33409e99c2e0,0.1418460966,2025-08-16 08:19:40.443770,2025-08-16 08:19:40.508948,beta_demo_model_sil,v1,SIL Competitor,F,Mall,ios,beta_demo_model_sil,SIL,ALL,Test,0.1418460966,"{""beta_de_ln_vas_opted_flag"": ""0"", ""beta_de_ln...",2025-08-16 16:19:38,NaT,2025-08
3,2138075,902b04e6-22db-41a2-8677-be2429641eda,0.41393396390825,2025-12-16 03:22:00.477045,2025-12-16 03:22:01.124216,beta_demo_model_sil,v2,SIL Competitor,F,Appliance,android,beta_demo_model_sil,SIL,Trench 2,Test,0.41393396390825,"{""demoScoreModel"": {""ln_vas_opted_flag"": ""0"", ...",2025-12-16 11:21:53,2025-12-16 15:25:36,2025-12
4,2237251,bc40948e-9aa8-4480-bf6f-a34ced197f17,0.1185251639,2025-12-07 03:56:51.890091,2025-12-07 03:56:51.958922,beta_demo_model_sil,v1,SIL Competitor,F,Appliance,ios,beta_demo_model_sil,SIL,ALL,Test,0.1185251639,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2025-12-07 11:56:36,2025-12-07 12:01:04,2025-12


In [None]:
df1 = dfd.copy()

### Train

In [None]:
sq = """ 
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    case when modelDisplayName = 'Beta - DemoScoreModel' then 'beta_demo_model_sil' else modelDisplayName end as modelDisplayName    
     ,modelVersionId,
        case when trenchCategory is null then 'ALL' 
         when trenchCategory = '' then 'ALL'
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(cast(prediction as string), "'", '"'), "None", "null") AS prediction_clean
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in  ('Beta - DemoScoreModel', 'beta_demo_model_sil')
      ),
base as 
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'beta_demo_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  'Train' Data_selection,
  prediction as score,
 calcFeature calcFeatures,
    IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
 qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by   coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) desc) = 1
)
select * from base
;
"""

dfd = client.query(sq).to_dataframe()
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,Data_selection,score,calcFeatures,appln_submit_datetime,disbursementDateTime,Application_month
0,1055817,ff1c832f-bfea-4c37-9384-8960636513ed,0.157715,2025-12-13T11:28:21.769592,2025-12-13T11:28:21.769592,beta_demo_model_sil,v1,SIL Repeat,M,Appliance,android,beta_demo_model_sil,SIL,ALL,Train,0.157715,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-12-20 14:41:24,2024-12-20 14:44:10,2024-12
1,1075642,0c0ff939-0d41-4b5a-933e-dc86498f2746,0.536496,2025-12-13T11:46:15.243387,2025-12-13T11:46:15.243387,beta_demo_model_sil,v2,SIL-Instore,M,Appliance,android,beta_demo_model_sil,SIL,Trench 2,Train,0.536496,"{""ln_vas_opted_flag"": ""1"", ""ln_doc_type_rolled...",2025-01-06 15:28:43,2025-01-06 15:33:05,2025-01
2,1100922,d77d6f91-d27c-4035-9caf-5b49d3c9cca5,0.091887,2025-12-13T11:28:35.023087,2025-12-13T11:28:35.023087,beta_demo_model_sil,v1,SIL Repeat,F,Appliance,android,beta_demo_model_sil,SIL,ALL,Train,0.091887,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-12-27 11:19:26,2024-12-27 11:22:02,2024-12
3,1123651,f2895209-f888-404f-8288-569239f41ebf,0.094283,2025-12-13T11:28:23.574565,2025-12-13T11:28:23.574565,beta_demo_model_sil,v1,SIL Repeat,M,Appliance,ios,beta_demo_model_sil,SIL,ALL,Train,0.094283,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-12-21 10:47:52,NaT,2024-12
4,1201657,e45874a5-6f20-4898-b158-e301a7191628,0.236439,2025-12-13T11:28:09.752125,2025-12-13T11:28:09.752125,beta_demo_model_sil,v1,SIL-Instore,F,Appliance,android,beta_demo_model_sil,SIL,ALL,Train,0.236439,"{""beta_de_ln_vas_opted_flag"": ""1"", ""beta_de_ln...",2024-07-20 18:29:48,NaT,2024-07


In [None]:
df2 = dfd.copy()

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")

The shape of the concatenated dataframe is: (885339, 20)


In [None]:
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")
df_combined = dropping_duplicates(df_concat)
print(f"The shape of the dataframe after dropping duplicates is: {df_combined.shape}")

The shape of the concatenated dataframe is: (885339, 20)
The shape of the dataframe after dropping duplicates is: (885339, 20)


In [None]:
df_combined['score'] = pd.to_numeric(df_combined['score'], errors='coerce')

### PSI calculation

In [None]:
# Usage Example:
psi_results = calculate_psi_for_model(
    dfcombined=df_combined,
    configdf=configdf,
    model_display_name='beta_demo_model_sil'
)
psi_results.head()


Starting PSI Pipeline for Model: beta_demo_model_sil
Total combinations to process: 5

Processing combination 46/5: modelVersionId=v1, trenchCategory=ALL
  Data points: 508096
  Features expanded successfully
  Features identified: 17
  PSI calculated: 1513 rows
  Score PSI calculated: 89 rows
Processing combination 47/5: modelVersionId=v2, trenchCategory=ALL
  No data found for specific combination. Retrieving all data for modelVersionId=v2...
  Data points: 377243
  Features expanded successfully
  Features identified: 16
  PSI calculated: 352 rows
  Score PSI calculated: 22 rows
Processing combination 48/5: modelVersionId=v2, trenchCategory=Trench 1
  Data points: 349745
  Features expanded successfully
  Features identified: 16
  PSI calculated: 288 rows
  Score PSI calculated: 18 rows
Processing combination 49/5: modelVersionId=v2, trenchCategory=Trench 2
  Data points: 13501
  Features expanded successfully
  Features identified: 16
  PSI calculated: 288 rows
  Score PSI calcula

Unnamed: 0,modelDisplayName,modelVersionId,trenchCategory,Feature,Feature_Type,Segment_Column,Segment_Value,Month,Base_Month,Current_Month,Base_Count,Actual_Count,Expected_Percentage,Actual_Percentage,PSI
0,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-03,Train,2025-03,349207,7278,50.0,50.0,0.038641
1,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-04,Train,2025-04,349207,1039,50.0,50.0,0.041896
2,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-05,Train,2025-05,349207,1561,50.0,50.0,0.047286
3,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-06,Train,2025-06,349207,16368,50.0,50.0,0.017988
4,beta_demo_model_sil,v1,ALL,v1_Calc_beta_de_ln_vas_opted_flag,categorical,Overall,All,2025-07,Train,2025-07,349207,22790,50.0,50.0,0.01152


In [None]:

psi_results['Feature'] = psi_results['Feature'].str.replace('_Calc_', '_', regex=False)
psi_results[['modelVersionId','Feature']].value_counts()


modelVersionId  Feature                               
v1              score                                     89
                v1_beta_de_ln_province_bin                89
                v1_beta_de_ln_age_bin                     89
                v1_beta_de_onb_name_email_match_score     89
                v1_beta_de_ln_vas_opted_flag              89
                v1_beta_de_ln_telconame                   89
                v1_beta_de_ln_source_of_funds_new_bin     89
                v1_beta_de_ln_ref2_type                   89
                v1_beta_de_ln_ref1_type                   89
                v1_beta_de_time_bw_onb_loan_appln_mins    89
                v1_beta_de_ln_marital_status              89
                v1_beta_de_ln_employment_type_new_bin     89
                v1_beta_de_ln_email_primary_domain        89
                v1_beta_de_ln_education_level             89
                v1_beta_de_ln_doc_type_rolled             89
                v1_beta_de_ln_

In [None]:

# List of features to remove, '
remove_features = ['v2_appScoreModel'                       ]
# Drop rows where feature is in the list
psi_results = psi_results[~psi_results['Feature'].isin(remove_features)]

# Replace 'score' with 'Sil_Alpha_Stack_score' in the Feature column
psi_results['Feature'] = psi_results['Feature'].replace('score', 'sil_beta_demo_score')

# Replace values starting with 'calc_' by removing the prefix
psi_results['Feature'] = psi_results['Feature'].apply(
    lambda x: x[5:] if x.startswith('calc_') else x
)




In [None]:
psi_results[['modelVersionId','Feature']].value_counts()

modelVersionId  Feature                               
v1              sil_beta_demo_score                       89
                v1_beta_de_ln_province_bin                89
                v1_beta_de_ln_age_bin                     89
                v1_beta_de_onb_name_email_match_score     89
                v1_beta_de_ln_vas_opted_flag              89
                v1_beta_de_ln_telconame                   89
                v1_beta_de_ln_source_of_funds_new_bin     89
                v1_beta_de_ln_ref2_type                   89
                v1_beta_de_ln_ref1_type                   89
                v1_beta_de_time_bw_onb_loan_appln_mins    89
                v1_beta_de_ln_marital_status              89
                v1_beta_de_ln_employment_type_new_bin     89
                v1_beta_de_ln_email_primary_domain        89
                v1_beta_de_ln_education_level             89
                v1_beta_de_ln_doc_type_rolled             89
                v1_beta_de_ln_

In [None]:
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_demo_score_model_psi_v5"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=bcb10df4-1a9a-421a-9a74-1fd185795cc1>

# End