# Define Library

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

# Local Application Default Credentials file. It is only used as a fallback:
# credentials already configured in the environment are respected, and a missing
# file is not forced onto the client so google-auth can use its own default lookup.
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ and os.path.exists(path):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
# Set the project before the client is created so both agree on the project.
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
client = bigquery.Client(project='prj-prod-dataplatform')
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows", 100)

# Function

In [2]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Union

def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, str]:
    """
    Label every requested feature as 'categorical', 'numerical' or 'unknown'.

    Rules are tried in this order and the first match wins:
      1. the feature is not a column of ``df``                     -> 'unknown'
      2. the name is explicitly listed or contains a categorical
         keyword (flag, flg, type, category, range)                -> 'categorical'
      3. the name contains a numerical keyword
         (ratio, amt, cnt, age, limit, max)                        -> 'numerical'
      4. at most 20 distinct values, or object dtype               -> 'categorical'
      5. anything else                                             -> 'numerical'

    Returns a dict mapping feature name to its label, in ``feature_list`` order.
    """
    known_categoricals = ('cic_ScoreRange', 'cic_ln_loan_level_user_type')
    categorical_hints = ('flag', 'flg', 'type', 'category', 'range')
    numerical_hints = ('ratio', 'amt', 'cnt', 'age', 'limit', 'max')

    def classify(name):
        if name not in df.columns:
            return 'unknown'

        distinct = df[name].nunique()
        lowered = name.lower()

        # Name-based hints take precedence over the data-driven fallback
        if name in known_categoricals or any(hint in lowered for hint in categorical_hints):
            return 'categorical'
        if any(hint in lowered for hint in numerical_hints):
            return 'numerical'

        # Fallback: low cardinality or string-like columns are treated as categorical
        if distinct <= 20 or df[name].dtype == 'object':
            return 'categorical'
        return 'numerical'

    return {name: classify(name) for name in feature_list}

def create_bins_numerical(df: pd.DataFrame, feature: str, num_bins: int = 10) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a ``'<feature>_bin'`` column of string bin labels.

    Parameters
    ----------
    df : input frame; it is not modified.
    feature : name of the numerical column to bin.
    num_bins : maximum number of quantile bins (capped at the number of
        non-missing values).

    Labelling rules
    ---------------
    * missing values                       -> 'Missing'
    * fewer than 2 non-missing values, or
      a single distinct value              -> 'All'
    * otherwise quantile bins (``pd.qcut``), falling back to equal-width bins
      (``pd.cut``); labels are the integer bin codes as strings
    * if both binning attempts fail: the raw value as a string when there are
      at most 10 distinct values, else 'All'

    If ``df`` has no rows the bin column is not created.
    """
    df_copy = df.copy()
    bin_col = f'{feature}_bin'

    # Build the mask from the column itself so it always carries df's own index.
    # A fresh pd.Series([True] * n) has a RangeIndex and makes .loc raise
    # "Unalignable boolean Series" on filtered frames (e.g. per-segment slices).
    non_missing = df_copy[feature].notna()
    if not non_missing.all():
        df_copy[bin_col] = 'Missing'

    if not non_missing.any():
        return df_copy

    feature_data = df_copy.loc[non_missing, feature]

    # Not enough data, or no variation, to form more than one bin
    if len(feature_data) < 2 or feature_data.nunique() == 1:
        df_copy.loc[non_missing, bin_col] = 'All'
        return df_copy

    n_bins = min(num_bins, len(feature_data))
    try:
        # Preferred: quantile-based bins; duplicate edges are merged
        bins = pd.qcut(feature_data, q=n_bins, duplicates='drop', labels=False)
        df_copy.loc[non_missing, bin_col] = bins.astype(str)
    except (ValueError, TypeError):
        try:
            # Fallback: equal-width bins
            bins = pd.cut(feature_data, bins=n_bins, labels=False)
            df_copy.loc[non_missing, bin_col] = bins.astype(str)
        except Exception:
            # Final fallback for data that cannot be cut at all (e.g. strings).
            # `Exception` rather than a bare except so KeyboardInterrupt and
            # SystemExit are not swallowed.
            if len(feature_data.unique()) <= 10:
                df_copy.loc[non_missing, bin_col] = feature_data.astype(str)
            else:
                df_copy.loc[non_missing, bin_col] = 'All'

    return df_copy

def create_bins_categorical(df: pd.DataFrame, feature: str, top_n: int = 6) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a ``'<feature>_bin'`` column of category labels.

    Parameters
    ----------
    df : input frame; it is not modified.
    feature : name of the categorical column to bin.
    top_n : number of most frequent categories kept as their own bin.

    Labelling rules
    ---------------
    * missing values                              -> 'Missing'
    * at most ``top_n`` distinct categories       -> the value as a string
    * more than ``top_n`` distinct categories     -> the value as a string for the
      ``top_n`` most frequent categories, 'Others' for the rest

    If ``df`` has no rows the bin column is not created.
    """
    df_copy = df.copy()
    bin_col = f'{feature}_bin'

    # Build the mask from the column itself so it always carries df's own index.
    # A fresh pd.Series([True] * n) has a RangeIndex and makes .loc raise
    # "Unalignable boolean Series" on filtered frames (e.g. per-segment slices).
    non_missing = df_copy[feature].notna()
    if not non_missing.all():
        df_copy[bin_col] = 'Missing'

    if not non_missing.any():
        return df_copy

    # From here on there is at least one non-null value, so the value counts
    # are never empty and no 'No Data' label is needed.
    feature_data = df_copy.loc[non_missing, feature]
    value_counts = feature_data.value_counts()

    labels = feature_data.astype(str)
    if len(value_counts) > top_n:
        # value_counts is sorted by frequency, so head() yields the top categories
        top_categories = value_counts.head(top_n).index
        labels = labels.where(feature_data.isin(top_categories), 'Others')

    df_copy.loc[non_missing, bin_col] = labels
    return df_copy

def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
    """
    Population Stability Index between two series of bin labels.

    For every label present in either series the share of rows carrying that
    label is computed in both series, a small epsilon (1e-6) is added to each
    share to avoid log(0) / division by zero, and
    ``(actual - expected) * ln(actual / expected)`` is accumulated.

    Returns NaN when either series is empty.
    """
    n_expected = len(expected)
    n_actual = len(actual)
    if n_expected == 0 or n_actual == 0:
        return np.nan

    eps = 1e-6
    labels = set(expected.unique()).union(actual.unique())

    total = 0
    for label in labels:
        # Epsilon-adjusted shares are strictly positive, so the log is always defined
        share_expected = (expected == label).sum() / n_expected + eps
        share_actual = (actual == label).sum() / n_actual + eps
        total += (share_actual - share_expected) * np.log(share_actual / share_expected)

    return total

def calculate_feature_psi(df: pd.DataFrame, feature: str, feature_type: str, 
                         month_column: str, base_month: str = None) -> pd.DataFrame:
    """
    PSI of one feature for every month in ``df`` against a base month.

    The feature is binned over the whole of ``df`` (numerical binning when
    ``feature_type == 'numerical'``, categorical binning otherwise); each
    month's bin distribution is then compared with the base month's.

    Returns one row per month with columns
    ``month, feature, psi, base_month, sample_size``. The base month itself is
    reported with PSI 0.0. An empty DataFrame is returned when the base month
    has no rows or when any error occurs (the error is printed).
    """
    if base_month is None:
        base_month = df[month_column].min()

    # Without base-month rows there is no reference distribution
    if not (df[month_column] == base_month).any():
        return pd.DataFrame()

    try:
        binner = create_bins_numerical if feature_type == 'numerical' else create_bins_categorical
        df_binned = binner(df, feature)
        bin_col = f'{feature}_bin'

        reference = df_binned.loc[df_binned[month_column] == base_month, bin_col]

        rows = []
        for month in sorted(df_binned[month_column].unique()):
            month_rows = df_binned[df_binned[month_column] == month]

            if month == base_month:
                psi = 0.0  # PSI with itself is 0
            elif len(month_rows) == 0:
                psi = np.nan
            else:
                psi = calculate_psi(reference, month_rows[bin_col])

            rows.append({
                'month': month,
                'feature': feature,
                'psi': psi,
                'base_month': base_month,
                'sample_size': len(month_rows)
            })

        return pd.DataFrame(rows)

    except Exception as e:
        print(f"Error calculating PSI for feature {feature}: {str(e)}")
        return pd.DataFrame()

def calculate_segment_psi(df: pd.DataFrame, feature: str, feature_type: str,
                        segment_columns: List[str], month_column: str, 
                        base_month: str = None) -> pd.DataFrame:
    """
    Feature PSI for the whole population and for every value of every segment column.

    The overall result is tagged ``segment_name == segment_value == 'overall'``;
    segment results are tagged with the segment column name and the segment
    value as a string. Segment columns missing from ``df`` are skipped, as are
    slices for which ``calculate_feature_psi`` returns nothing.

    Returns the concatenated results, or an empty DataFrame when there are none.
    """
    if base_month is None:
        base_month = df[month_column].min()

    frames = []

    def collect(subset, segment_name, segment_value):
        # Compute PSI on one slice and keep it only when something came back
        psi_frame = calculate_feature_psi(subset, feature, feature_type, month_column, base_month)
        if not psi_frame.empty:
            psi_frame['segment_name'] = segment_name
            psi_frame['segment_value'] = segment_value
            frames.append(psi_frame)

    # Overall PSI (no segmentation)
    collect(df, 'overall', 'overall')

    # PSI per segment value
    for segment_col in segment_columns:
        if segment_col not in df.columns:
            continue

        for segment_value in df[segment_col].dropna().unique():
            subset = df[df[segment_col] == segment_value]
            if len(subset) == 0:
                continue
            collect(subset, segment_col, str(segment_value))

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def calculate_bin_level_psi(df: pd.DataFrame, feature: str, feature_type: str,
                          segment_columns: List[str], month_column: str,
                          base_month: str = None) -> pd.DataFrame:
    """
    Per-bin PSI contributions of one feature, overall and per segment value.

    The feature is binned once over the whole of ``df``. For every month other
    than the base month, the share of each bin is compared with its share in
    the base month, first on all rows and then within every value of every
    column in ``segment_columns`` (columns missing from the frame, segment
    values without base-month rows, and segment/month slices without rows are
    skipped).

    Returns one row per (month, segment, bin) with columns
    ``month, segment_name, segment_value, feature, bin, expected_percentage,
    actual_percentage, psi_contribution, base_month``. An empty DataFrame is
    returned when the base month has no rows or when any error occurs (the
    error is printed).

    A bin absent from one side is reported with a share of 0.0 and receives a
    single epsilon (1e-6) in the PSI term, matching ``calculate_psi``; the
    contributions of the 'overall' rows of a month therefore sum to the PSI
    that ``calculate_psi`` computes for that month.
    """
    if base_month is None:
        base_month = df[month_column].min()

    # Nothing to compare against when the base month has no rows
    if len(df[df[month_column] == base_month]) == 0:
        return pd.DataFrame()

    eps = 1e-6
    bin_col = f'{feature}_bin'

    def get_bin_distribution(data):
        """Return {bin label as str: share of rows}; {} for an empty slice."""
        if len(data) == 0:
            return {}
        dist = data[bin_col].value_counts(normalize=True).to_dict()
        return {str(k): v for k, v in dist.items()}

    def bin_contributions(base_dist, current_dist, month, segment_name, segment_value):
        """Build one result row per bin present in either distribution."""
        rows = []
        # Sorted so the row order is deterministic across runs
        for bin_name in sorted(set(base_dist) | set(current_dist)):
            # A bin missing on one side has a true share of 0; epsilon is added
            # exactly once below to keep the log finite.
            exp_perc = base_dist.get(bin_name, 0.0)
            act_perc = current_dist.get(bin_name, 0.0)

            exp_perc_adj = exp_perc + eps
            act_perc_adj = act_perc + eps
            psi_contribution = (act_perc_adj - exp_perc_adj) * np.log(act_perc_adj / exp_perc_adj)

            rows.append({
                'month': month,
                'segment_name': segment_name,
                'segment_value': segment_value,
                'feature': feature,
                'bin': bin_name,
                'expected_percentage': exp_perc,
                'actual_percentage': act_perc,
                'psi_contribution': psi_contribution,
                'base_month': base_month
            })
        return rows

    try:
        # Create bins
        if feature_type == 'numerical':
            df_binned = create_bins_numerical(df, feature)
        else:
            df_binned = create_bins_categorical(df, feature)

        bin_results = []

        base_data = df_binned[df_binned[month_column] == base_month]
        base_dist_overall = get_bin_distribution(base_data)

        months = sorted(df_binned[month_column].unique())
        comparison_months = [month for month in months if month != base_month]

        # Overall bin-level PSI
        for month in comparison_months:
            current_data = df_binned[df_binned[month_column] == month]
            current_dist = get_bin_distribution(current_data)
            bin_results.extend(
                bin_contributions(base_dist_overall, current_dist, month, 'overall', 'overall')
            )

        # Segment-level bin PSI
        for segment_col in segment_columns:
            if segment_col not in df_binned.columns:
                continue

            for segment_value in df_binned[segment_col].dropna().unique():
                segment_base_data = base_data[base_data[segment_col] == segment_value]
                if len(segment_base_data) == 0:
                    continue

                segment_base_dist = get_bin_distribution(segment_base_data)

                for month in comparison_months:
                    segment_current_data = df_binned[
                        (df_binned[month_column] == month) &
                        (df_binned[segment_col] == segment_value)
                    ]
                    if len(segment_current_data) == 0:
                        continue

                    segment_current_dist = get_bin_distribution(segment_current_data)
                    bin_results.extend(
                        bin_contributions(segment_base_dist, segment_current_dist,
                                          month, segment_col, str(segment_value))
                    )

        return pd.DataFrame(bin_results)

    except Exception as e:
        print(f"Error calculating bin-level PSI for feature {feature}: {str(e)}")
        return pd.DataFrame()

def compute_population_stability_index(df: pd.DataFrame, 
                                    feature_list: List[str],
                                    segment_columns: List[str],
                                    month_column: str = 'Application_month') -> Dict[str, pd.DataFrame]:
    """
    Entry point: compute feature-level and bin-level PSI for a list of features.

    Feature types are inferred with ``identify_feature_types``; features that
    are not columns of ``df`` are dropped. The smallest value of
    ``month_column`` is used as the base month. Progress and per-feature errors
    are printed; a failing feature is skipped.

    Returns a dict with two DataFrames:
      * ``'feature_psi'`` - output of ``calculate_segment_psi`` for all features
      * ``'bin_psi'``     - output of ``calculate_bin_level_psi`` for all features
    Either is an empty DataFrame when nothing was produced.
    """
    feature_types = identify_feature_types(df, feature_list)

    # Keep only features that exist in the frame and received a usable type
    valid_features = []
    for name in feature_list:
        if name in df.columns and feature_types[name] != 'unknown':
            valid_features.append(name)
    print(f"Processing {len(valid_features)} valid features: {valid_features}")

    # The earliest month is the reference for every comparison
    base_month = df[month_column].min()
    print(f"Using base month: {base_month}")

    feature_frames = []
    bin_frames = []

    for feature in valid_features:
        print(f"Processing feature: {feature} ({feature_types[feature]})")

        try:
            feature_level = calculate_segment_psi(
                df, feature, feature_types[feature], segment_columns, month_column, base_month
            )
            if not feature_level.empty:
                feature_frames.append(feature_level)

            bin_level = calculate_bin_level_psi(
                df, feature, feature_types[feature], segment_columns, month_column, base_month
            )
            if not bin_level.empty:
                bin_frames.append(bin_level)

        except Exception as e:
            print(f"Error processing feature {feature}: {str(e)}")
            continue

    def combine(frames):
        # pd.concat rejects an empty list, hence the explicit empty frame
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    return {
        'feature_psi': combine(feature_frames),
        'bin_psi': combine(bin_frames)
    }



## After removing the error

In [None]:
# sq = """drop table if exists prj-prod-dataplatform.dap_ds_poweruser_playground.temp_csi_new_monitoring_data;"""

# job = client.query(sq)
# job.result()  # Wait for job to complete
# print(f"Table  prj-prod-dataplatform.dap_ds_poweruser_playground.temp_csi_new_monitoring_data dropped successfully.")

# Queries

## Alpha-Cash-CIC-Model

In [3]:
sq = r"""  
WITH parsed as (
  select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
--REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName = 'Alpha-Cash-CIC-Model'
),

latest_request as (
select * from parsed
QUALIFY ROW_NUMBER() OVER (PARTITION BY customerId, digitalLoanAccountId,modelDisplayName ORDER BY start_time DESC ) = 1),

model_run as (
select customerId,digitalLoanAccountId,modelName, publish_time,requestPayload as requestPayload_clean
--REPLACE(REPLACE(requestPayload, "'", '"'), "None", "null") AS requestPayload_clean
from `prj-prod-dataplatform.audit_balance.ml_request_details` 
WHERE modelName = 'Alpha-Cash-Model-response'
QUALIFY ROW_NUMBER() OVER (PARTITION BY customerId, digitalLoanAccountId,modelName ORDER BY publish_time DESC ) = 1),
base as (
select * from (
  select 
 r.customerId,
 r.digitalLoanAccountId,
 r.prediction,
 r.start_time,
 r.end_time,
 r.modelDisplayName,
 r.modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
  REGEXP_EXTRACT(m.requestPayload_clean, r"osType[:=]['\"]?([^'\"]+)['\"]?") AS osType,
  REGEXP_EXTRACT(m.requestPayload_clean, r"loanType[:=]['\"]?([^'\"]+)['\"]?") AS loanType,
  REGEXP_EXTRACT(m.requestPayload_clean, r"trenchCategory[:=]['\"]?([^'\"]+)['\"]?") AS trenchCategory,
   SAFE_CAST(REGEXP_EXTRACT(m.requestPayload_clean, r"aStackScore[:= ]([0-9\.]+)") AS FLOAT64) AS aStackScore,
  SAFE_CAST(REGEXP_EXTRACT(m.requestPayload_clean, r"aCicScore[:= ]([0-9\.]+)") AS FLOAT64) AS aCicScore,
  --  Alpha CIC Score Model Features for Trench 1
  SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.cic_max_age_all_contracts_snapshot") AS INT64) AS cic_max_age_all_contracts_snapshot,
  SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.cic_ratio_overdue_contracts_to_granted_contracts") AS FLOAT64) AS cic_ratio_overdue_contracts_to_granted_contracts,
  JSON_VALUE(r.calcFeatures, "$.cic_ScoreRange") AS cic_ScoreRange,
  JSON_VALUE(r.calcFeatures, "$.cic_ln_loan_level_user_type") AS cic_ln_loan_level_user_type,
  JSON_VALUE(r.calcFeatures, "$.cic_has_ever_been_overdue") AS cic_has_ever_been_overdue,
  JSON_VALUE(r.calcFeatures, "$.cic_latest_granted_contract_overdue_flag") AS cic_latest_granted_contract_overdue_flag,
  JSON_VALUE(r.calcFeatures, "$.cic_ratio_closed_over_new_granted_cnt_24M") AS cic_ratio_closed_over_new_granted_cnt_24M,
  JSON_VALUE(r.calcFeatures, "$.cic_ratio_risky_contracts_to_granted_contracts") AS cic_ratio_risky_contracts_to_granted_contracts,
  JSON_VALUE(r.calcFeatures, "$.cic_Short_and_Term_Loans_granted_contracts_cnt_24M") AS cic_Short_and_Term_Loans_granted_contracts_cnt_24M,
  JSON_VALUE(r.calcFeatures, "$.cic_flg_zero_non_granted_ever") AS cic_flg_zero_non_granted_ever,
  JSON_VALUE(r.calcFeatures, "$.cic_Personal_Loans_granted_contracts_amt_24M") AS cic_Personal_Loans_granted_contracts_amt_24M,
  JSON_VALUE(r.calcFeatures, "$.cic_CreditAvgCreditLimit") AS cic_CreditAvgCreditLimit,
  JSON_VALUE(r.calcFeatures, "$.cic_flg_zero_granted_ever") AS cic_flg_zero_granted_ever,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
FROM latest_request r
left join model_run m
on r.digitalLoanAccountId = m.digitalLoanAccountId 
left join risk_credit_mis.loan_master_table loanmaster 
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 )
where trenchCategory = 'Trench 1'
)
select *, case when appln_submit_datetime <= '2025-09-30' then 'Train' else 'Test' end dataselection from base
"""

# Execute the query above on BigQuery and load the full result into a pandas DataFrame.
# NOTE(review): the query labels rows with appln_submit_datetime <= '2025-09-30' as 'Train'.
# If that column is a DATETIME/TIMESTAMP the literal presumably means midnight, so
# applications submitted later on 2025-09-30 would land in 'Test' - confirm the intended cutoff.
df = client.query(sq).to_dataframe()
# Preview the first rows
df.head()



Unnamed: 0,customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,osType,loanType,trenchCategory,aStackScore,aCicScore,cic_max_age_all_contracts_snapshot,cic_ratio_overdue_contracts_to_granted_contracts,cic_ScoreRange,cic_ln_loan_level_user_type,cic_has_ever_been_overdue,cic_latest_granted_contract_overdue_flag,cic_ratio_closed_over_new_granted_cnt_24M,cic_ratio_risky_contracts_to_granted_contracts,cic_Short_and_Term_Loans_granted_contracts_cnt_24M,cic_flg_zero_non_granted_ever,cic_Personal_Loans_granted_contracts_amt_24M,cic_CreditAvgCreditLimit,cic_flg_zero_granted_ever,appln_submit_datetime,disbursementDateTime,Application_month,dataselection
0,3711327,e3b782e3-0ddd-487b-99ac-877479e0b9f4,0.7408754443271659,2025-10-01 07:32:57.111020,2025-10-01 07:32:57.246689,Alpha-Cash-CIC-Model,v1,Quick,M,android,Quick,Trench 1,0.718002,0.740875,,0.666667,Ai,2_New Applicant,1.0,1.0,,0.3333333333,,1,,40000,0,2025-10-01 15:32:47,NaT,2025-10,Test
1,3734296,b91a7ba7-352a-4965-8527-87a21ecefcf7,0.7506778180294458,2025-10-09 15:29:42.308355,2025-10-09 15:29:42.434107,Alpha-Cash-CIC-Model,v1,Quick,F,android,Quick,Trench 1,0.665141,0.750678,7.0,1.0,Ai,2_New Applicant,1.0,1.0,1.0,0.0,,0,2000.0,0,0,2025-10-09 23:29:32,NaT,2025-10,Test
2,3731772,3d50eb87-0075-4e6c-968a-3fc303b0238c,0.3228720506435742,2025-10-08 08:39:21.660847,2025-10-08 08:39:21.860095,Alpha-Cash-CIC-Model,v1,Quick,F,android,Quick,Trench 1,0.198943,0.322872,,0.035714,Hi,2_New Applicant,1.0,0.0,1.0,0.0,4.0,0,18610.0,92500,0,2025-10-08 16:39:05,2025-10-09 08:25:52,2025-10,Test
3,3703708,c1d96cd9-0410-45bd-9c2d-b3ca7f4be45d,0.6040735881116797,2025-09-24 23:45:56.812265,2025-09-24 23:45:56.890714,Alpha-Cash-CIC-Model,v1,Quick,F,android,Quick,Trench 1,0.642562,0.604074,,,NH_Ii,2_New Applicant,Unknown,Unknown,,,,0,,0,1,2025-09-25 07:45:47,NaT,2025-09,Train
4,3730522,9e86edd0-0393-4b66-a6af-89cfb9c270f1,0.4556741334480477,2025-10-07 17:15:42.679182,2025-10-07 17:15:42.823178,Alpha-Cash-CIC-Model,v1,Quick,M,android,Quick,Trench 1,0.408507,0.455674,,0.0,Gi,2_New Applicant,0.0,0.0,0.3333333333,0.0,,0,85100.0,300000,0,2025-10-08 01:15:33,2025-10-08 13:23:55,2025-10,Test


In [None]:
df.head(100).to_json(r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Notebook\CSI_testing_data.json', orient='records', lines=True)

In [None]:
df['cic_flg_zero_granted_ever'].value_counts(dropna=False)

In [None]:
# These features are extracted from JSON as strings; cast them to numbers so they
# are binned as numerical features. Values that cannot be parsed become NaN.
columns_to_convert = [
    'cic_ratio_closed_over_new_granted_cnt_24M',
    'cic_ratio_risky_contracts_to_granted_contracts',
    'cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
    'cic_Personal_Loans_granted_contracts_amt_24M',
    'cic_CreditAvgCreditLimit'
]

df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [None]:
df.info()

In [None]:
df.to_csv(r"sample.csv", index = False)

In [None]:
# Example usage with your data:
def analyze_sample_data(df):
    """
    Run the PSI computation for the Alpha-Cash-CIC model features.

    Uses 'Application_month' as the month column and returns the dict produced
    by ``compute_population_stability_index`` ('feature_psi' and 'bin_psi').
    """
    # Model score plus the CIC input features to monitor
    cic_features = [
        'aCicScore',
        'cic_max_age_all_contracts_snapshot',
        'cic_ratio_overdue_contracts_to_granted_contracts',
        'cic_ScoreRange',
        'cic_ln_loan_level_user_type',
        'cic_has_ever_been_overdue',
        'cic_latest_granted_contract_overdue_flag',
        'cic_ratio_closed_over_new_granted_cnt_24M',
        'cic_ratio_risky_contracts_to_granted_contracts',
        'cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
        'cic_flg_zero_non_granted_ever',
        'cic_Personal_Loans_granted_contracts_amt_24M',
        'cic_CreditAvgCreditLimit',
        'cic_flg_zero_granted_ever',
    ]

    # Columns whose values define the sub-populations
    segments = ['new_loan_type', 'gender', 'osType', 'loanType', 'trenchCategory']

    return compute_population_stability_index(df, cic_features, segments, 'Application_month')

# Run the analysis, then stamp both result tables with the model metadata
# taken from the first row of the input data
results = analyze_sample_data(df)

psi_results = results['feature_psi'].copy()
for _meta_col in ('modelDisplayName', 'modelVersionId', 'trenchCategory'):
    psi_results[_meta_col] = df[_meta_col].iloc[0]

bin_psi_results = results['bin_psi'].copy()
for _meta_col in ('modelDisplayName', 'modelVersionId', 'trenchCategory'):
    bin_psi_results[_meta_col] = df[_meta_col].iloc[0]




In [None]:
psi_results[psi_results['segment_name'] == 'gender'].head(20)

In [None]:
# Display results
print("\n=== FEATURE-LEVEL PSI RESULTS ===")
results['feature_psi'].head(10)



In [None]:
print("\n=== BIN-LEVEL PSI RESULTS ===")
results['bin_psi'].head(100).sort_values(by=['feature', 'month', 'segment_name', 'segment_value', 'bin'])



In [None]:
# Summary statistics over the feature-level PSI table
_summary_df = results['feature_psi']
if not _summary_df.empty:
    print(f"\n=== SUMMARY ===")
    print(f"Total feature-month-segment combinations: {len(_summary_df)}")
    print(f"Features analyzed: {_summary_df['feature'].nunique()}")
    print(f"Segments analyzed: {_summary_df['segment_name'].nunique()}")
    print(f"PSI range: [{_summary_df['psi'].min():.4f}, {_summary_df['psi'].max():.4f}]")

    def _psi_band(value):
        # Thresholds used in this notebook: < 0.1 stable, < 0.25 moderate, else significant
        if value < 0.1:
            return 'Stable (PSI < 0.1)'
        if value < 0.25:
            return 'Moderate change (0.1 ≤ PSI < 0.25)'
        return 'Significant change (PSI ≥ 0.25)'

    # Count by PSI interpretation (rows without a PSI value are ignored)
    psi_interpretation = [_psi_band(value) for value in _summary_df['psi'].dropna()]

    if psi_interpretation:
        interpretation_counts = pd.Series(psi_interpretation).value_counts()
        print("\nPSI Interpretation Distribution:")
        for interpretation, count in interpretation_counts.items():
            print(f"  {interpretation}: {count}")

In [None]:
# Example usage with your data:
def analyze_sample_data(df):
    """
    Run the PSI computation for the Alpha-Cash-CIC model features on a copy of ``df``.

    Uses 'Application_month' as the month column and returns the dict produced
    by ``compute_population_stability_index`` ('feature_psi' and 'bin_psi').
    """
    # Model score plus the CIC input features to monitor
    cic_features = [
        'aCicScore',
        'cic_max_age_all_contracts_snapshot',
        'cic_ratio_overdue_contracts_to_granted_contracts',
        'cic_ScoreRange',
        'cic_ln_loan_level_user_type',
        'cic_has_ever_been_overdue',
        'cic_latest_granted_contract_overdue_flag',
        'cic_ratio_closed_over_new_granted_cnt_24M',
        'cic_ratio_risky_contracts_to_granted_contracts',
        'cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
        'cic_flg_zero_non_granted_ever',
        'cic_Personal_Loans_granted_contracts_amt_24M',
        'cic_CreditAvgCreditLimit',
        'cic_flg_zero_granted_ever',
    ]

    # Columns whose values define the sub-populations
    segments = ['new_loan_type', 'gender', 'osType', 'loanType', 'trenchCategory']

    # Work on a copy so the caller's frame is never touched
    working = df.copy()
    return compute_population_stability_index(working, cic_features, segments, 'Application_month')

# Run the analysis
if __name__ == "__main__":
    results = analyze_sample_data(df)
    
    # Display results
    print("\n=== FEATURE-LEVEL PSI RESULTS ===")
    results['feature_psi'].head(10)
    


In [None]:
    print("\n=== BIN-LEVEL PSI RESULTS ===")
    print(results['bin_psi'].head(10))
    
    # Save results to files
    results['feature_psi'].to_csv('feature_psi_results.csv', index=False)
    results['bin_psi'].to_csv('bin_psi_results.csv', index=False)
    
    print("\nResults saved to feature_psi_results.csv and bin_psi_results.csv")

In [None]:
# Example usage
def usage(df):
    """
    Compute feature-level and bin-level PSI tables for the CIC model features,
    tag them with model metadata from the first row of ``df``, and return them
    as ``(psi_results, bin_psi_results)`` restricted to a fixed column order.

    NOTE(review): this function does not match the PSI functions defined earlier
    in the visible notebook:
      * ``calculate_population_stability_index``, ``analyze_psi_results`` and
        ``analyze_bin_level_results`` are not defined in the visible cells
        (the visible entry point is ``compute_population_stability_index``);
      * ``calculate_bin_level_psi`` is called with three arguments, while the
        visible definition requires ``feature, feature_type, segment_columns,
        month_column``;
      * the selected columns (e.g. 'feature_type', 'segment_column',
        'baseline_month', 'psi_interpretation') are not produced by the visible
        functions.
    Presumably it targets a different version of the helpers - confirm where
    they are defined before running this on a fresh kernel.
    """
    # Work on a copy so the caller's frame is not modified
    df = df.copy()
    
    # Model score plus the CIC input features to monitor
    feature_list = ['aCicScore',
        'cic_max_age_all_contracts_snapshot',
        'cic_ratio_overdue_contracts_to_granted_contracts', 
        'cic_ScoreRange',
        'cic_ln_loan_level_user_type', 
        'cic_has_ever_been_overdue',
        'cic_latest_granted_contract_overdue_flag',
        'cic_ratio_closed_over_new_granted_cnt_24M',
        'cic_ratio_risky_contracts_to_granted_contracts',
        'cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
        'cic_flg_zero_non_granted_ever',
        'cic_Personal_Loans_granted_contracts_amt_24M',
        'cic_CreditAvgCreditLimit', 
        'cic_flg_zero_granted_ever',
    ]
    
    # Columns whose values define the sub-populations
    segment_columns = ['new_loan_type', 'gender', 'osType', 'loanType', 'trenchCategory']
    
    # Calculate overall PSI
    print("Calculating overall PSI...")
    # NOTE(review): name not defined in the visible notebook - verify
    psi_results = calculate_population_stability_index(df, feature_list, segment_columns)
    analyze_psi_results(psi_results)
    # Tag every row with the model metadata of the first input row
    psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
    psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
    psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
    
    
    # Calculate bin-level PSI
    print("\nCalculating bin-level PSI...")
    # NOTE(review): argument list differs from the calculate_bin_level_psi defined above - verify
    bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
    analyze_bin_level_results(bin_psi_results)
    # Tag every row with the model metadata of the first input row
    bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
    bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
    bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
    
    # Fix the column order of the feature-level table (raises KeyError if a column is absent)
    psi_results = psi_results[[ 'modelDisplayName',
       'modelVersionId', 'trenchCategory', 'feature', 'feature_type', 'segment_column', 'segment_value',
       'baseline_month', 'current_month', 'psi', 'num_baseline_records',
       'num_current_records', 'psi_interpretation']].copy()
    
    # Fix the column order of the bin-level table (raises KeyError if a column is absent)
    bin_psi_results = bin_psi_results[[ 'modelDisplayName', 'modelVersionId',
       'trenchCategory','feature', 'feature_type', 'segment_column', 'segment_value',
       'baseline_month', 'current_month', 'bin', 'baseline_percentage',
       'current_percentage', 'psi_component', 'baseline_count',
       'current_count', 'percentage_change', 'change_interpretation',
       'abs_percentage_change',]].copy()
    
    

    return psi_results, bin_psi_results

In [None]:
# Run the analysis
psi_results, bin_psi_results = usage(df)

In [None]:
psi_results.head()

In [None]:
bin_psi_results

In [None]:
# Upload the feature-level PSI table to BigQuery.
# WRITE_TRUNCATE replaces the existing contents of the destination table on every run.
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.temp_csi_new_monitoring_data"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# Upload the bin-level PSI table to its own table, again replacing existing contents.
# Note that table_id, job_config and job are rebound here, so after this cell they
# refer to the bin-level upload only.
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.temp_csi_new_monitoring_data_feature_bin_level"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete

## Alpha-Cash-Stack-Model

In [None]:
# Pull the Alpha-Cash-Stack-Model scoring data used for PSI monitoring.
#
# Query outline (all of this lives inside the SQL string and runs in BigQuery):
#   parsed         - rows of ml_model_run_details for modelDisplayName
#                    'Alpha-Cash-Stack-Model'; calcFeature has single quotes
#                    replaced by double quotes and "None" replaced by "null",
#                    and is exposed as calcFeatures (read with JSON_VALUE below).
#   latest_request - newest row (by start_time) per
#                    (customerId, digitalLoanAccountId, modelDisplayName).
#   model_run      - newest request payload (by publish_time) per
#                    (customerId, digitalLoanAccountId, modelName) for
#                    modelName 'Alpha-Cash-Model-response'.
#   final select   - left-joins model_run and loan_master_table on
#                    digitalLoanAccountId, regex-extracts osType / loanType /
#                    trenchCategory from the payload, casts the five sub-scores
#                    to FLOAT64, derives appln_submit_datetime and
#                    Application_month, and keeps only trenchCategory 'Trench 1'.
#
# NOTE(review): model_run filters on 'Alpha-Cash-Model-response' while the run
#   details filter on 'Alpha-Cash-Stack-Model' - confirm this is the intended
#   request stream for the stack model.
# NOTE(review): model_run is de-duplicated per (customerId, digitalLoanAccountId,
#   modelName) but joined on digitalLoanAccountId only; rows could fan out if one
#   loan account appears under several customerIds - verify.
sq = r"""WITH parsed as (
  select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName = 'Alpha-Cash-Stack-Model'),
latest_request as (
select * from parsed
QUALIFY ROW_NUMBER() OVER (PARTITION BY customerId, digitalLoanAccountId,modelDisplayName ORDER BY start_time DESC ) = 1),
model_run as (
select customerId,digitalLoanAccountId,modelName, publish_time,requestPayload as requestPayload_clean
from `prj-prod-dataplatform.audit_balance.ml_request_details` 
WHERE modelName = 'Alpha-Cash-Model-response'
QUALIFY ROW_NUMBER() OVER (PARTITION BY customerId, digitalLoanAccountId,modelName ORDER BY publish_time DESC ) = 1)
select * from (
  select 
 r.customerId,
 r.digitalLoanAccountId,
 r.prediction Alpha_Cash_Stack_Score,
 r.start_time,
 r.end_time,
 r.modelDisplayName,
 r.modelVersionId,
  loanmaster.new_loan_type,
 loanmaster.gender,

  REGEXP_EXTRACT(m.requestPayload_clean, r"osType[:=]['\"]?([^'\"]+)['\"]?") AS osType,
  REGEXP_EXTRACT(m.requestPayload_clean, r"loanType[:=]['\"]?([^'\"]+)['\"]?") AS loanType,
  REGEXP_EXTRACT(m.requestPayload_clean, r"trenchCategory[:=]['\"]?([^'\"]+)['\"]?") AS trenchCategory,

 SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.apps_score") AS FLOAT64) AS  apps_score,
 SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.c_demo_score") AS FLOAT64) AS  c_demo_score,
 SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.c_credo_score") AS FLOAT64) AS  c_credo_score,
 SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.c_tx_score") AS FLOAT64) AS  c_tx_score,
 SAFE_CAST(JSON_VALUE(r.calcFeatures, "$.ca_cic_score") AS FLOAT64) AS  ca_cic_score,
coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
FROM latest_request r
left join model_run m
on r.digitalLoanAccountId = m.digitalLoanAccountId 
left join risk_credit_mis.loan_master_table loanmaster 
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
) where trenchCategory = 'Trench 1'
;
"""

# Execute the query and materialise the result as a pandas DataFrame.
# This rebinds the notebook-level name `df`, which earlier cells also use,
# so cells above that reference `df` will see this model's data if re-run afterwards.
df = client.query(sq).to_dataframe()
df.head()

In [None]:
# List the column names returned by the query above.
df.columns

In [None]:
# Print pandas' per-column summary (dtypes and non-null counts) for the query result.
df.info()

In [None]:
# Example usage
def usage(df, feature_list=None, segment_columns=None):
    """
    Run the overall and bin-level PSI analyses and tag the results with model metadata.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows. Must contain ``modelDisplayName``, ``modelVersionId`` and
        ``trenchCategory`` in addition to the feature and segment columns.
    feature_list : list of str, optional
        Features to analyse. Defaults to the Alpha-Cash-Stack score and its
        five sub-scores (the list this function originally hard-coded).
    segment_columns : list of str, optional
        Columns to segment by. Defaults to ``new_loan_type``, ``gender``,
        ``osType``, ``loanType`` and ``trenchCategory``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(psi_results, bin_psi_results)``, each reduced to a fixed column
        order with the three metadata columns first.

    Raises
    ------
    ValueError
        If ``df`` is empty. Without this check the metadata lookup via
        ``.iloc[0]`` fails with an opaque ``IndexError``.

    Notes
    -----
    Only the first row's ``modelDisplayName`` / ``modelVersionId`` /
    ``trenchCategory`` are stamped onto the outputs.
    NOTE(review): if ``df`` mixes several model versions, every output row still
    receives the first row's value - confirm that is intended.
    """
    if df.empty:
        raise ValueError("usage() requires a non-empty DataFrame")

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    if feature_list is None:
        feature_list = ['Alpha_Cash_Stack_Score',
                        'apps_score',
                        'c_demo_score',
                        'c_credo_score',
                        'c_tx_score',
                        'ca_cic_score']
    else:
        feature_list = list(feature_list)

    if segment_columns is None:
        segment_columns = ['new_loan_type', 'gender', 'osType', 'loanType', 'trenchCategory']
    else:
        segment_columns = list(segment_columns)

    # Columns copied from the input onto every output row.
    metadata_columns = ['modelDisplayName', 'modelVersionId', 'trenchCategory']

    # Calculate overall PSI
    print("Calculating overall PSI...")
    psi_results = calculate_population_stability_index(df, feature_list, segment_columns)
    analyze_psi_results(psi_results)
    for column in metadata_columns:
        psi_results[column] = df[column].iloc[0]

    # Calculate bin-level PSI
    print("\nCalculating bin-level PSI...")
    bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
    analyze_bin_level_results(bin_psi_results)
    for column in metadata_columns:
        bin_psi_results[column] = df[column].iloc[0]

    # Keep only the columns expected downstream, in a fixed order.
    psi_results = psi_results[metadata_columns + [
        'feature', 'feature_type', 'segment_column', 'segment_value',
        'baseline_month', 'current_month', 'psi', 'num_baseline_records',
        'num_current_records', 'psi_interpretation']].copy()

    bin_psi_results = bin_psi_results[metadata_columns + [
        'feature', 'feature_type', 'segment_column', 'segment_value',
        'baseline_month', 'current_month', 'bin', 'baseline_percentage',
        'current_percentage', 'psi_component', 'baseline_count',
        'current_count', 'percentage_change', 'change_interpretation',
        'abs_percentage_change']].copy()

    return psi_results, bin_psi_results

In [None]:
# Run the PSI analysis on the Alpha-Cash-Stack `df` loaded by the query cell above.
# This rebinds `psi_results` and `bin_psi_results`, replacing the values from the earlier run.
psi_results, bin_psi_results = usage(df)

In [None]:
# Preview the first rows of the feature-level PSI results.
psi_results.head()

In [None]:
# Display the bin-level PSI results returned by `usage`.
bin_psi_results

In [None]:
# Add both PSI result frames to the monitoring tables in BigQuery.
# WRITE_APPEND adds rows to whatever each table already holds, so running
# this cell more than once uploads the same rows again.
for results_frame, table_id in (
    (psi_results,
     "prj-prod-dataplatform.dap_ds_poweruser_playground.temp_csi_new_monitoring_data"),
    (bin_psi_results,
     "prj-prod-dataplatform.dap_ds_poweruser_playground.temp_csi_new_monitoring_data_feature_bin_level"),
):
    job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")
    job = client.load_table_from_dataframe(results_frame, table_id, job_config=job_config)
    job.result()  # block until this load job has finished

# Testing

In [None]:
# import pandas as pd
# import numpy as np
# from typing import List, Dict, Tuple, Optional

# def calculate_population_stability_index(
#     df: pd.DataFrame, 
#     feature_list: List[str], 
#     segment_columns: List[str], 
#     month_column: str = 'Application_month',
#     baseline_month: Optional[str] = None
# ) -> pd.DataFrame:
#     """
#     Calculate Population Stability Index (PSI) for features overall and by segments.
    
#     Parameters:
#     -----------
#     df : pd.DataFrame
#         Input dataframe with features and segments
#     feature_list : List[str]
#         List of feature names to calculate PSI for
#     segment_columns : List[str]
#         List of segment column names
#     month_column : str
#         Name of the month column (default: 'Application_month')
#     baseline_month : str, optional
#         Specific baseline month to use. If None, uses minimum month
    
#     Returns:
#     --------
#     pd.DataFrame
#         DataFrame with PSI values for each feature, segment, and month
#     """
    
#     # Identify feature types
#     def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, str]:
#         """Identify categorical vs numerical features"""
#         feature_types = {}
        
#         for feature in feature_list:
#             # Check if feature exists in dataframe
#             if feature not in df.columns:
#                 print(f"Warning: Feature '{feature}' not found in dataframe")
#                 continue
                
#             # Check data type and unique values
#             unique_vals = df[feature].nunique()
#             dtype = df[feature].dtype
            
#             # Rules for categorical features
#             if (dtype == 'object' or 
#                 unique_vals <= 10 or 
#                 feature in ['cic_ScoreRange', 'cic_ln_loan_level_user_type', 
#                           'cic_has_ever_been_overdue', 'cic_latest_granted_contract_overdue_flag',
#                           'cic_flg_zero_non_granted_ever', 'cic_flg_zero_granted_ever']):
#                 feature_types[feature] = 'categorical'
#             else:
#                 feature_types[feature] = 'numerical'
                
#         return feature_types
    
#     # Create bins for numerical features
#     def create_numerical_bins(baseline_data: pd.Series, n_bins: int = 10) -> pd.IntervalIndex:
#         """Create bins for numerical features using deciles"""
#         # Remove null values
#         clean_data = baseline_data.dropna()
        
#         if len(clean_data) == 0:
#             return None
            
#         # Create decile bins
#         try:
#             bins = pd.qcut(clean_data, n_bins, duplicates='drop', retbins=True)[1]
#             # Ensure bins are unique and in order
#             bins = sorted(set(bins))
#             return pd.IntervalIndex.from_breaks(bins)
#         except:
#             # Fallback to equal width bins if deciles fail
#             min_val = clean_data.min()
#             max_val = clean_data.max()
#             bins = np.linspace(min_val, max_val, n_bins + 1)
#             return pd.IntervalIndex.from_breaks(bins)
    
#     # Process categorical features
#     def process_categorical_feature(baseline_data: pd.Series, top_n: int = 6) -> List:
#         """Get top N categories and group rest as 'Other'"""
#         value_counts = baseline_data.value_counts()
#         top_categories = value_counts.head(top_n).index.tolist()
#         return top_categories
    
#     # Calculate distribution
#     def calculate_distribution(data: pd.Series, feature_type: str, 
#                             bins: pd.IntervalIndex = None, 
#                             top_categories: List = None) -> Dict:
#         """Calculate distribution of data"""
#         if feature_type == 'numerical':
#             return calculate_numerical_distribution(data, bins)
#         else:
#             return calculate_categorical_distribution(data, top_categories)
    
#     def calculate_numerical_distribution(data: pd.Series, bins: pd.IntervalIndex) -> Dict:
#         """Calculate distribution for numerical data"""
#         if bins is None:
#             return {}
            
#         clean_data = data.dropna()
#         if len(clean_data) == 0:
#             return {str(bin_): 0 for bin_ in bins}
        
#         binned = pd.cut(clean_data, bins, include_lowest=True)
#         distribution = binned.value_counts().sort_index()
#         total = len(clean_data)
        
#         return {str(interval): count/total if total > 0 else 0 
#                 for interval, count in distribution.items()}
    
#     def calculate_categorical_distribution(data: pd.Series, top_categories: List) -> Dict:
#         """Calculate distribution for categorical data"""
#         if top_categories is None:
#             return {}
            
#         value_counts = data.value_counts()
#         total = len(data.dropna())
        
#         distribution = {}
#         for category in top_categories:
#             if category in value_counts:
#                 distribution[category] = value_counts[category] / total if total > 0 else 0
#             else:
#                 distribution[category] = 0
                
#         # Calculate "Other" category
#         other_categories = [cat for cat in value_counts.index if cat not in top_categories]
#         other_count = value_counts[other_categories].sum() if other_categories else 0
#         distribution['Other'] = other_count / total if total > 0 else 0
        
#         return distribution
    
#     # Calculate PSI
#     def calculate_psi(expected_dist: Dict, actual_dist: Dict) -> float:
#         """Calculate Population Stability Index"""
#         psi = 0
#         all_categories = set(expected_dist.keys()) | set(actual_dist.keys())
        
#         for category in all_categories:
#             expected_pct = expected_dist.get(category, 1e-6)  # Avoid division by zero
#             actual_pct = actual_dist.get(category, 1e-6)
            
#             # Avoid log(0) by using small epsilon
#             if expected_pct == 0:
#                 expected_pct = 1e-6
#             if actual_pct == 0:
#                 actual_pct = 1e-6
                
#             psi_component = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
#             psi += psi_component
            
#         return psi
    
#     # Main PSI calculation logic
#     def calculate_psi_for_feature(df: pd.DataFrame, feature: str, feature_type: str, 
#                                 month_column: str, segment_info: Tuple[str, str] = None) -> pd.DataFrame:
#         """Calculate PSI for a specific feature"""
        
#         # Filter data if segment is specified
#         if segment_info:
#             segment_col, segment_val = segment_info
#             segment_data = df[df[segment_col] == segment_val].copy()
#         else:
#             segment_data = df.copy()
#             segment_info = ('Overall', 'Overall')
        
#         # Get months and baseline
#         months = sorted(segment_data[month_column].unique())
#         if len(months) < 2:
#             return pd.DataFrame()  # Need at least 2 months for comparison
            
#         if baseline_month:
#             if baseline_month not in months:
#                 print(f"Warning: Baseline month {baseline_month} not found in segment {segment_info}")
#                 return pd.DataFrame()
#             baseline_month_used = baseline_month
#         else:
#             baseline_month_used = months[0]
        
#         # Get baseline data
#         baseline_data = segment_data[segment_data[month_column] == baseline_month_used][feature]
        
#         # Skip if baseline data is empty
#         if len(baseline_data.dropna()) == 0:
#             return pd.DataFrame()
        
#         # Prepare bins or categories based on baseline
#         if feature_type == 'numerical':
#             bins = create_numerical_bins(baseline_data)
#             top_categories = None
#         else:
#             bins = None
#             top_categories = process_categorical_feature(baseline_data)
        
#         # Calculate baseline distribution
#         baseline_dist = calculate_distribution(baseline_data, feature_type, bins, top_categories)
        
#         # Calculate PSI for each month compared to baseline
#         results = []
#         for current_month in months:
#             if current_month == baseline_month_used:
#                 # For baseline month, PSI = 0 (comparison with itself)
#                 psi_value = 0.0
#                 current_data = baseline_data
#                 current_dist = baseline_dist
#             else:
#                 current_data = segment_data[segment_data[month_column] == current_month][feature]
                
#                 if len(current_data.dropna()) == 0:
#                     continue
                    
#                 current_dist = calculate_distribution(current_data, feature_type, bins, top_categories)
#                 psi_value = calculate_psi(baseline_dist, current_dist)
            
#             results.append({
#                 'feature': feature,
#                 'feature_type': feature_type,
#                 'segment_column': segment_info[0],
#                 'segment_value': segment_info[1],
#                 'baseline_month': baseline_month_used,
#                 'current_month': current_month,
#                 'psi': psi_value,
#                 'num_baseline_records': len(baseline_data),
#                 'num_current_records': len(current_data)
#             })
        
#         return pd.DataFrame(results)
    
#     # Main execution
#     print("Identifying feature types...")
#     feature_types = identify_feature_types(df, feature_list)
    
#     # Prepare results dataframe
#     all_results = []
    
#     # Calculate PSI overall (no segmentation)
#     print("Calculating overall PSI...")
#     for feature, ftype in feature_types.items():
#         print(f"  Processing {feature} ({ftype})")
#         result_df = calculate_psi_for_feature(df, feature, ftype, month_column)
#         if not result_df.empty:
#             all_results.append(result_df)
    
#     # Calculate PSI for each segment
#     print("Calculating segment-wise PSI...")
#     for segment_col in segment_columns:
#         if segment_col not in df.columns:
#             print(f"Warning: Segment column '{segment_col}' not found in dataframe")
#             continue
            
#         segment_values = df[segment_col].dropna().unique()
#         print(f"  Processing segment: {segment_col} ({len(segment_values)} values)")
        
#         for segment_val in segment_values:
#             for feature, ftype in feature_types.items():
#                 result_df = calculate_psi_for_feature(
#                     df, feature, ftype, month_column, (segment_col, segment_val)
#                 )
#                 if not result_df.empty:
#                     all_results.append(result_df)
    
#     # Combine all results
#     if all_results:
#         final_results = pd.concat(all_results, ignore_index=True)
        
#         # Add PSI interpretation
#         def interpret_psi(psi_value):
#             if psi_value < 0.1:
#                 return 'No significant change'
#             elif psi_value < 0.2:
#                 return 'Minor change'
#             elif psi_value < 0.5:
#                 return 'Moderate change'
#             else:
#                 return 'Significant change'
        
#         final_results['psi_interpretation'] = final_results['psi'].apply(interpret_psi)
        
#         # Create combined month column for plotting
#         # For baseline month, use baseline_month, for others use current_month
#         final_results['plot_month'] = final_results['current_month']
        
#         # Add month sequence for proper ordering in plots
#         all_months = sorted(final_results['plot_month'].unique())
#         month_sequence = {month: i for i, month in enumerate(all_months)}
#         final_results['month_sequence'] = final_results['plot_month'].map(month_sequence)
        
#         return final_results.sort_values(['feature', 'segment_column', 'segment_value', 'month_sequence'])
#     else:
#         return pd.DataFrame()

# def calculate_bin_level_psi(
#     df: pd.DataFrame,
#     feature_list: List[str],
#     segment_columns: List[str],
#     month_column: str = 'Application_month',
#     baseline_month: Optional[str] = None,
#     top_n_categories: int = 6
# ) -> pd.DataFrame:
#     """
#     Calculate bin-level PSI details showing distribution changes for each bin/category.
    
#     Parameters:
#     -----------
#     df : pd.DataFrame
#         Input dataframe with features and segments
#     feature_list : List[str]
#         List of feature names to calculate PSI for
#     segment_columns : List[str]
#         List of segment column names
#     month_column : str
#         Name of the month column
#     baseline_month : str, optional
#         Specific baseline month to use
#     top_n_categories : int
#         Number of top categories to keep for categorical features
    
#     Returns:
#     --------
#     pd.DataFrame
#         DataFrame with bin-level PSI details
#     """
    
#     def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, str]:
#         """Identify categorical vs numerical features"""
#         feature_types = {}
#         for feature in feature_list:
#             if feature not in df.columns:
#                 continue
#             unique_vals = df[feature].nunique()
#             dtype = df[feature].dtype
            
#             if (dtype == 'object' or unique_vals <= 10 or 
#                 feature in ['cic_ScoreRange', 'cic_ln_loan_level_user_type', 
#                           'cic_has_ever_been_overdue', 'cic_latest_granted_contract_overdue_flag',
#                           'cic_flg_zero_non_granted_ever', 'cic_flg_zero_granted_ever']):
#                 feature_types[feature] = 'categorical'
#             else:
#                 feature_types[feature] = 'numerical'
#         return feature_types
    
#     def create_numerical_bins(baseline_data: pd.Series, n_bins: int = 10) -> pd.IntervalIndex:
#         """Create bins for numerical features"""
#         clean_data = baseline_data.dropna()
#         if len(clean_data) == 0:
#             return None
#         try:
#             bins = pd.qcut(clean_data, n_bins, duplicates='drop', retbins=True)[1]
#             bins = sorted(set(bins))
#             return pd.IntervalIndex.from_breaks(bins)
#         except:
#             min_val = clean_data.min()
#             max_val = clean_data.max()
#             bins = np.linspace(min_val, max_val, n_bins + 1)
#             return pd.IntervalIndex.from_breaks(bins)
    
#     def get_bin_level_distribution(data: pd.Series, feature_type: str, 
#                                  bins: pd.IntervalIndex = None, 
#                                  top_categories: List = None) -> pd.DataFrame:
#         """Get distribution at bin/category level"""
#         if feature_type == 'numerical':
#             return get_numerical_bin_distribution(data, bins)
#         else:
#             return get_categorical_bin_distribution(data, top_categories)
    
#     def get_numerical_bin_distribution(data: pd.Series, bins: pd.IntervalIndex) -> pd.DataFrame:
#         """Get numerical distribution by bin"""
#         if bins is None:
#             return pd.DataFrame()
            
#         clean_data = data.dropna()
#         if len(clean_data) == 0:
#             return pd.DataFrame({'bin': [str(bin_) for bin_ in bins], 'count': 0, 'percentage': 0})
        
#         binned = pd.cut(clean_data, bins, include_lowest=True)
#         distribution = binned.value_counts().sort_index()
#         total = len(clean_data)
        
#         results = []
#         for interval, count in distribution.items():
#             results.append({
#                 'bin': str(interval),
#                 'count': count,
#                 'percentage': count / total if total > 0 else 0
#             })
        
#         # Add missing bins with zero count
#         existing_bins = {str(interval) for interval in distribution.index}
#         for bin_interval in bins:
#             bin_str = str(bin_interval)
#             if bin_str not in existing_bins:
#                 results.append({
#                     'bin': bin_str,
#                     'count': 0,
#                     'percentage': 0
#                 })
        
#         return pd.DataFrame(results)
    
#     def get_categorical_bin_distribution(data: pd.Series, top_categories: List) -> pd.DataFrame:
#         """Get categorical distribution by category"""
#         if top_categories is None:
#             return pd.DataFrame()
            
#         value_counts = data.value_counts()
#         total = len(data.dropna())
        
#         results = []
#         # Top categories
#         for category in top_categories:
#             count = value_counts.get(category, 0)
#             results.append({
#                 'bin': str(category),
#                 'count': count,
#                 'percentage': count / total if total > 0 else 0
#             })
        
#         # Other categories
#         other_categories = [cat for cat in value_counts.index if cat not in top_categories]
#         other_count = value_counts[other_categories].sum() if other_categories else 0
#         results.append({
#             'bin': 'Other',
#             'count': other_count,
#             'percentage': other_count / total if total > 0 else 0
#         })
        
#         return pd.DataFrame(results)
    
#     # Main execution for bin-level analysis
#     print("Calculating bin-level PSI details...")
#     feature_types = identify_feature_types(df, feature_list)
#     all_bin_results = []
    
#     # Process overall and segments
#     segments_to_process = [('Overall', 'Overall')]  # Overall first
#     for segment_col in segment_columns:
#         if segment_col in df.columns:
#             for segment_val in df[segment_col].dropna().unique():
#                 segments_to_process.append((segment_col, segment_val))
    
#     for segment_info in segments_to_process:
#         segment_col, segment_val = segment_info
        
#         # Filter data for segment
#         if segment_col == 'Overall':
#             segment_data = df.copy()
#         else:
#             segment_data = df[df[segment_col] == segment_val].copy()
        
#         # Get months and baseline
#         months = sorted(segment_data[month_column].unique())
#         if len(months) < 2:
#             continue
            
#         baseline_month_used = baseline_month if baseline_month else months[0]
        
#         if baseline_month_used not in months:
#             continue
        
#         print(f"Processing segment: {segment_col}={segment_val}")
        
#         for feature, ftype in feature_types.items():
#             print(f"  Feature: {feature}")
            
#             # Get baseline data and setup
#             baseline_data = segment_data[segment_data[month_column] == baseline_month_used][feature]
            
#             if len(baseline_data.dropna()) == 0:
#                 continue
            
#             # Prepare bins/categories
#             if ftype == 'numerical':
#                 bins = create_numerical_bins(baseline_data)
#                 top_categories = None
#             else:
#                 bins = None
#                 top_categories = segment_data[segment_data[month_column] == baseline_month_used][feature].value_counts().head(top_n_categories).index.tolist()
            
#             # Get baseline distribution
#             baseline_dist_df = get_bin_level_distribution(baseline_data, ftype, bins, top_categories)
#             baseline_dist_dict = dict(zip(baseline_dist_df['bin'], baseline_dist_df['percentage']))
            
#             # Process each current month
#             for current_month in months:
#                 if current_month == baseline_month_used:
#                     # For baseline month, PSI components are 0
#                     current_data = baseline_data
#                     current_dist_df = baseline_dist_df
#                     current_dist_dict = baseline_dist_dict
#                 else:
#                     current_data = segment_data[segment_data[month_column] == current_month][feature]
                    
#                     if len(current_data.dropna()) == 0:
#                         continue
                    
#                     current_dist_df = get_bin_level_distribution(current_data, ftype, bins, top_categories)
#                     current_dist_dict = dict(zip(current_dist_df['bin'], current_dist_df['percentage']))
                
#                 # Calculate PSI components for each bin
#                 all_bins = set(baseline_dist_dict.keys()) | set(current_dist_dict.keys())
                
#                 for bin_name in all_bins:
#                     expected_pct = baseline_dist_dict.get(bin_name, 1e-6)
#                     actual_pct = current_dist_dict.get(bin_name, 1e-6)
                    
#                     # Calculate PSI component
#                     if current_month == baseline_month_used:
#                         psi_component = 0.0  # Baseline month has zero PSI component
#                     else:
#                         if expected_pct == 0:
#                             expected_pct = 1e-6
#                         if actual_pct == 0:
#                             actual_pct = 1e-6
#                         psi_component = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
                    
#                     # Get counts
#                     baseline_count = baseline_dist_df[baseline_dist_df['bin'] == bin_name]['count'].iloc[0] if bin_name in baseline_dist_df['bin'].values else 0
#                     current_count = current_dist_df[current_dist_df['bin'] == bin_name]['count'].iloc[0] if bin_name in current_dist_df['bin'].values else 0
                    
#                     all_bin_results.append({
#                         'feature': feature,
#                         'feature_type': ftype,
#                         'segment_column': segment_col,
#                         'segment_value': segment_val,
#                         'baseline_month': baseline_month_used,
#                         'current_month': current_month,
#                         'bin': bin_name,
#                         'baseline_percentage': expected_pct,
#                         'current_percentage': actual_pct,
#                         'psi_component': psi_component,
#                         'baseline_count': baseline_count,
#                         'current_count': current_count,
#                         'percentage_change': actual_pct - expected_pct
#                     })
    
#     if all_bin_results:
#         bin_results_df = pd.DataFrame(all_bin_results)
        
#         # Add interpretation for percentage changes
#         def interpret_percentage_change(change):
#             abs_change = abs(change)
#             if abs_change < 0.01:
#                 return 'Very Small'
#             elif abs_change < 0.05:
#                 return 'Small'
#             elif abs_change < 0.1:
#                 return 'Moderate'
#             else:
#                 return 'Large'
        
#         bin_results_df['change_interpretation'] = bin_results_df['percentage_change'].apply(interpret_percentage_change)
        
#         # Create combined month column for plotting
#         bin_results_df['plot_month'] = bin_results_df['current_month']
        
#         # Add month sequence for proper ordering in plots
#         all_months = sorted(bin_results_df['plot_month'].unique())
#         month_sequence = {month: i for i, month in enumerate(all_months)}
#         bin_results_df['month_sequence'] = bin_results_df['plot_month'].map(month_sequence)
        
#         return bin_results_df.sort_values(['feature', 'segment_column', 'segment_value', 'month_sequence', 'bin'])
#     else:
#         return pd.DataFrame()

# # Enhanced plotting functions
# import matplotlib.pyplot as plt
# import seaborn as sns

# def plot_psi_trends(psi_df: pd.DataFrame, features: List[str] = None, segments: List[Tuple[str, str]] = None):
#     """
#     Plot PSI trends over time for specified features and segments.
    
#     Parameters:
#     -----------
#     psi_df : pd.DataFrame
#         PSI results from calculate_population_stability_index
#     features : List[str], optional
#         List of features to plot. If None, plots all features
#     segments : List[Tuple[str, str]], optional
#         List of segments to plot in format [(segment_column, segment_value), ...]
#         If None, plots overall only
#     """
#     if psi_df.empty:
#         print("No PSI data to plot")
#         return
    
#     # Filter features if specified
#     if features is None:
#         features = psi_df['feature'].unique()
    
#     # Filter segments if specified
#     if segments is None:
#         plot_data = psi_df[psi_df['segment_column'] == 'Overall']
#     else:
#         segment_conditions = []
#         for seg_col, seg_val in segments:
#             condition = (psi_df['segment_column'] == seg_col) & (psi_df['segment_value'] == seg_val)
#             segment_conditions.append(condition)
        
#         plot_data = psi_df[pd.concat(segment_conditions, axis=1).any(axis=1)]
    
#     # Create subplots
#     n_features = len(features)
#     n_cols = min(3, n_features)
#     n_rows = (n_features + n_cols - 1) // n_cols
    
#     fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
#     if n_features == 1:
#         axes = [axes]
#     elif n_rows > 1 and n_cols > 1:
#         axes = axes.flatten()
    
#     for idx, feature in enumerate(features):
#         if idx >= len(axes):
#             break
            
#         ax = axes[idx]
#         feature_data = plot_data[plot_data['feature'] == feature]
        
#         # Plot each segment
#         segments_in_data = feature_data[['segment_column', 'segment_value']].drop_duplicates()
        
#         for _, seg_row in segments_in_data.iterrows():
#             seg_col, seg_val = seg_row['segment_column'], seg_row['segment_value']
#             seg_data = feature_data[
#                 (feature_data['segment_column'] == seg_col) & 
#                 (feature_data['segment_value'] == seg_val)
#             ].sort_values('month_sequence')
            
#             label = f"{seg_col}={seg_val}" if seg_col != 'Overall' else 'Overall'
#             ax.plot(seg_data['month_sequence'], seg_data['psi'], marker='o', label=label, linewidth=2)
        
#         ax.set_title(f'PSI Trend: {feature}', fontsize=14, fontweight='bold')
#         ax.set_xlabel('Month Sequence')
#         ax.set_ylabel('PSI Value')
#         ax.grid(True, alpha=0.3)
        
#         # Add PSI interpretation guidelines
#         ax.axhline(y=0.1, color='orange', linestyle='--', alpha=0.7, label='Minor Change Threshold')
#         ax.axhline(y=0.2, color='red', linestyle='--', alpha=0.7, label='Moderate Change Threshold')
        
#         if idx == 0:  # Only show legend on first subplot
#             ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
#     # Hide empty subplots
#     for idx in range(len(features), len(axes)):
#         axes[idx].set_visible(False)
    
#     plt.tight_layout()
#     plt.show()


# def plot_feature_psi_heatmap(psi_df: pd.DataFrame, month: str = None):
#     """
#     Create a heatmap of PSI values across features and segments for a specific month.
    
#     Rows of the heatmap are (segment_column, segment_value) pairs and columns are
#     features. Prints a message and returns without plotting when psi_df is empty
#     or has no rows for the requested month.
    
#     Parameters:
#     -----------
#     psi_df : pd.DataFrame
#         PSI results from calculate_population_stability_index.
#         Columns read here: 'current_month', 'segment_column', 'segment_value',
#         'feature', 'psi'.
#     month : str, optional
#         Specific month to plot. If None, uses latest month
#         (taken as psi_df['current_month'].max()).
#     """
#     if psi_df.empty:
#         print("No PSI data to plot")
#         return
    
#     # Use latest month if not specified
#     # NOTE(review): if 'current_month' holds strings, max() is a lexicographic
#     # maximum - assumes a sortable label format such as YYYY-MM; confirm.
#     if month is None:
#         month = psi_df['current_month'].max()
    
#     # Filter data for the specific month
#     heatmap_data = psi_df[psi_df['current_month'] == month].copy()
    
#     if heatmap_data.empty:
#         print(f"No data for month {month}")
#         return
    
#     # Create pivot table for heatmap
#     # NOTE(review): fillna(0) makes a (segment, feature) pair with no PSI row
#     # show up as an annotated 0.000, which reads the same as a computed PSI of 0.
#     pivot_data = heatmap_data.pivot_table(
#         index=['segment_column', 'segment_value'],
#         columns='feature',
#         values='psi',
#         aggfunc='first'
#     ).fillna(0)
    
#     # Create the heatmap
#     # Figure grows with the number of features (width) and segments (height).
#     plt.figure(figsize=(max(12, len(pivot_data.columns) * 0.8), max(8, len(pivot_data) * 0.6)))
    
#     # Create custom colormap
#     # NOTE(review): diverging_palette(h_neg=10, h_pos=130) puts the red hue below
#     # the center and the green hue above it, so with center=0.1 a high PSI is drawn
#     # green and a low PSI red - presumably the reverse was intended; confirm.
#     cmap = sns.diverging_palette(10, 130, as_cmap=True)
    
#     sns.heatmap(
#         pivot_data,
#         annot=True,
#         fmt='.3f',
#         cmap=cmap,
#         center=0.1,
#         cbar_kws={'label': 'PSI Value'}
#     )
    
#     plt.title(f'PSI Values - {month}\n(PSI < 0.1: No Change, 0.1-0.2: Minor, 0.2-0.5: Moderate, >0.5: Significant)',
#               fontsize=14, fontweight='bold', pad=20)
#     plt.xlabel('Features')
#     plt.ylabel('Segments')
#     plt.xticks(rotation=45, ha='right')
#     plt.tight_layout()
#     plt.show()


# # Enhanced analysis functions
# def analyze_psi_results(psi_df: pd.DataFrame):
#     """
#     Analyze and summarize PSI results
    
#     Prints a text report (counts, top features by mean PSI, mean PSI per segment
#     column, interpretation distribution, mean PSI per month) and returns None.
#     Prints a message and returns early when psi_df is empty.
    
#     Columns read from psi_df: 'feature', 'segment_column', 'current_month',
#     'baseline_month', 'psi', 'psi_interpretation'.
#     """
#     if psi_df.empty:
#         print("No PSI results to analyze")
#         return
    
#     print("PSI Results Summary:")
#     print("=" * 80)
    
#     # Overall summary
#     # NOTE(review): "Segments analyzed" counts distinct segment_column values, not
#     # distinct (segment_column, segment_value) pairs.
#     print(f"\nTotal PSI calculations: {len(psi_df)}")
#     print(f"Features analyzed: {psi_df['feature'].nunique()}")
#     print(f"Segments analyzed: {psi_df['segment_column'].nunique()}")
#     print(f"Time periods analyzed: {psi_df['current_month'].nunique()}")
    
#     # Features with highest average PSI (excluding baseline)
#     # Rows where the current month equals the baseline month are dropped here only.
#     non_baseline_psi = psi_df[psi_df['current_month'] != psi_df['baseline_month']]
#     if not non_baseline_psi.empty:
#         feature_psi_avg = non_baseline_psi.groupby('feature')['psi'].mean().sort_values(ascending=False)
#         print(f"\nTop 5 features with highest average PSI (excluding baseline):")
#         for feature, avg_psi in feature_psi_avg.head().items():
#             print(f"  {feature}: {avg_psi:.4f}")
    
#     # Segments with highest average PSI
#     # NOTE(review): this and the two summaries below use the full psi_df, so
#     # baseline-month rows are included, unlike the per-feature average above -
#     # confirm whether that asymmetry is intended.
#     segment_psi_avg = psi_df.groupby('segment_column')['psi'].mean().sort_values(ascending=False)
#     print(f"\nAverage PSI by segment:")
#     for segment, avg_psi in segment_psi_avg.items():
#         print(f"  {segment}: {avg_psi:.4f}")
    
#     # PSI interpretation distribution
#     interpretation_counts = psi_df['psi_interpretation'].value_counts()
#     print(f"\nPSI Interpretation Distribution:")
#     for interpretation, count in interpretation_counts.items():
#         percentage = (count / len(psi_df)) * 100
#         print(f"  {interpretation}: {count} ({percentage:.1f}%)")
    
#     # Monthly trend
#     monthly_psi = psi_df.groupby('current_month')['psi'].mean()
#     print(f"\nAverage PSI by month:")
#     for month, avg_psi in monthly_psi.items():
#         print(f"  {month}: {avg_psi:.4f}")


# def analyze_bin_level_results(bin_psi_df: pd.DataFrame, top_n: int = 10):
#     """
#     Analyze bin-level PSI results
    
#     Prints the top_n bins by 'psi_component' and the top_n bins by absolute
#     'percentage_change', both restricted to rows whose current month differs from
#     the baseline month. Returns None. Prints a message and returns early when
#     bin_psi_df is empty.
    
#     Columns read from bin_psi_df: 'current_month', 'baseline_month', 'feature',
#     'segment_value', 'bin', 'psi_component', 'baseline_percentage',
#     'current_percentage', 'percentage_change'.
#     """
#     if bin_psi_df.empty:
#         print("No bin-level results to analyze")
#         return
    
#     print("Bin-Level PSI Analysis:")
#     print("=" * 80)
    
#     # Exclude baseline month for change analysis
#     non_baseline_bins = bin_psi_df[bin_psi_df['current_month'] != bin_psi_df['baseline_month']]
    
#     # NOTE(review): when every row is a baseline row, nothing is printed after the
#     # header above.
#     if not non_baseline_bins.empty:
#         # Bins with highest PSI components
#         top_psi_components = non_baseline_bins.nlargest(top_n, 'psi_component')[[
#             'feature', 'segment_value', 'current_month', 'bin', 
#             'psi_component', 'baseline_percentage', 'current_percentage'
#         ]]
        
#         # NOTE(review): the ':.1%' format multiplies by 100, so this assumes the
#         # percentage columns hold fractions in [0, 1] - confirm against the producer.
#         print(f"\nTop {top_n} bins with highest PSI components:")
#         for _, row in top_psi_components.iterrows():
#             print(f"  {row['feature']} | {row['segment_value']} | {row['current_month']}")
#             print(f"    Bin: {row['bin']}")
#             print(f"    PSI Component: {row['psi_component']:.4f}")
#             print(f"    Baseline: {row['baseline_percentage']:.1%} -> Current: {row['current_percentage']:.1%}")
#             print()
        
#         # Largest percentage changes
#         # NOTE(review): non_baseline_bins is the result of a boolean-mask selection;
#         # adding a column to it may emit SettingWithCopyWarning on pandas versions
#         # without copy-on-write. Taking .copy() at the selection would avoid that.
#         non_baseline_bins['abs_percentage_change'] = non_baseline_bins['percentage_change'].abs()
#         top_changes = non_baseline_bins.nlargest(top_n, 'abs_percentage_change')[[
#             'feature', 'segment_value', 'current_month', 'bin',
#             'baseline_percentage', 'current_percentage', 'percentage_change'
#         ]]
        
#         print(f"\nTop {top_n} largest percentage changes:")
#         for _, row in top_changes.iterrows():
#             print(f"  {row['feature']} | {row['segment_value']} | {row['current_month']}")
#             print(f"    Bin: {row['bin']}")
#             print(f"    Change: {row['baseline_percentage']:.1%} -> {row['current_percentage']:.1%}")
#             print(f"    Δ: {row['percentage_change']:+.1%}")
#             print()



In [None]:
# # Example usage
# def example_usage():
#     """
#     Example of how to use both functions
    
#     Runs calculate_population_stability_index and calculate_bin_level_psi on the
#     notebook-level frame `df`, prints both analyses, stamps model metadata onto
#     the results, draws the trend and heatmap plots, and returns
#     (psi_results, bin_psi_results).
#     """
#     # Load your data
#     # NOTE(review): df is neither a parameter nor assigned in this function (the
#     # load line below is itself commented out), so it must already exist as a
#     # notebook-level variable when this runs.
#     # df = pd.read_csv('sample.csv')
    
#     feature_list = ['aCicScore',
#         'cic_max_age_all_contracts_snapshot',
#         'cic_ratio_overdue_contracts_to_granted_contracts', 
#         'cic_ScoreRange',
#         'cic_ln_loan_level_user_type', 
#         'cic_has_ever_been_overdue',
#         'cic_latest_granted_contract_overdue_flag',
#         'cic_ratio_closed_over_new_granted_cnt_24M',
#         'cic_ratio_risky_contracts_to_granted_contracts',
#         'cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
#         'cic_flg_zero_non_granted_ever',
#         'cic_Personal_Loans_granted_contracts_amt_24M',
#         'cic_CreditAvgCreditLimit', 
#         'cic_flg_zero_granted_ever',
#     ]
    
#     segment_columns = ['new_loan_type', 'gender', 'osType', 'loanType', 'trenchCategory']
    
#     # Calculate overall PSI
#     print("Calculating overall PSI...")
#     psi_results = calculate_population_stability_index(df, feature_list, segment_columns)
#     analyze_psi_results(psi_results)
#     # Metadata columns are filled with the value from the first row of df only.
#     # NOTE(review): 'trenchCategory' is also used as a segment column above, which
#     # suggests df can hold several trench categories; stamping the first row's
#     # value on every result row would then be misleading - confirm df is
#     # single-model / single-trench here.
#     psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
#     psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
#     psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
    
#     # Calculate bin-level PSI
#     print("\nCalculating bin-level PSI...")
#     bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
#     analyze_bin_level_results(bin_psi_results)
#     bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
#     bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
#     bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]

#     # psi_results = psi_results[[ 'modelDisplayName',
#     # 'modelVersionId', 'trenchCategory', 'feature', 'feature_type', 'segment_column', 'segment_value',
#     # 'baseline_month', 'current_month', 'psi', 'num_baseline_records',
#     # 'num_current_records', 'psi_interpretation']].copy()
    
#     # bin_psi_results = bin_psi_results[[ 'modelDisplayName', 'modelVersionId',
#     #    'trenchCategory','feature', 'feature_type', 'segment_column', 'segment_value',
#     #    'baseline_month', 'current_month', 'bin', 'baseline_percentage',
#     #    'current_percentage', 'psi_component', 'baseline_count',
#     #    'current_count', 'percentage_change', 'change_interpretation',
#     #    ]].copy()
    
#     # Create plots
#     print("\nCreating plots...")
#     plot_psi_trends(psi_results, features=feature_list[:6])  # Plot first 6 features
#     plot_feature_psi_heatmap(psi_results)
    
#     return psi_results, bin_psi_results



In [None]:
# # Run the analysis
# # Unpacks the two frames returned by example_usage() into notebook-level names.
# # NOTE(review): the trailing `pass` is redundant after the assignment.
# if __name__ == "__main__":
#     psi_results, bin_psi_results = example_usage()
#     pass

In [None]:
# Preview the first rows of the PSI results.
# NOTE(review): in this part of the notebook, psi_results is only assigned by the
# commented-out "Run the analysis" cell above - confirm that a live earlier cell
# defines it, otherwise this raises NameError on Restart Kernel -> Run All.
psi_results.head()