- In this Notebook, I am trying to integrate the training data from the backscore and rest of the period from prj-prod-dataplatform.audit_balance.ml_model_run_details table.
- In this I will compare the training period with each month of test period.

**Steps to Follow**:

* Read the specific model data from prj-prod-dataplatform.audit_balance.ml_model_run_details table
* Expand the calcFeature column to extract all the features for the model
* Read the data from specific backscore table for the training data
* Identify the features and create a list
* Use transform_data function to create the same structure as ml_model_run_details table
* Insert the data to a similar training table - prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
* Read the specific model data from prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
* expand the training set from the calcFeature column
* Concatenate both the test and train datasets
* Calculate the PSI using the PSI function comparing it with the train set
* Insert the result to a PSI table prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_psi_v4

# **PSI - CSI Calculation**

## Define Libraries

In [48]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
pd.set_option("Display.max_rows", 100)

### Function

#### expand_calc_features

In [49]:
import pandas as pd
import json

def expand_calc_features(df):
    """
    Expand the calcFeatures JSON column into separate columns and return the complete DataFrame.

    Parameters:
    df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data

    Returns:
    pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
    """

    # Make a copy to avoid modifying the original DataFrame
    df_expanded = df.copy()

    # Parse the calcFeatures JSON column
    calc_features_list = []

    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            # Parse the JSON string
            features_dict = json.loads(calc_features_str.replace("'", '"'))  # Replace single quotes with double quotes for valid JSON
            calc_features_list.append(features_dict)
        except (json.JSONDecodeError, AttributeError) as e:
            # If parsing fails, create an empty dict and print warning
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            calc_features_list.append({})

    # Create DataFrame from the parsed JSON data
    calc_features_df = pd.DataFrame(calc_features_list)

    # Add prefix to JSON-derived columns to avoid conflicts
    calc_features_df = calc_features_df.add_prefix('calc_')

    # Reset index to ensure proper alignment
    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)

    # Combine original DataFrame with expanded calcFeatures
    result_df = pd.concat([df_expanded, calc_features_df], axis=1)

    return result_df


#### expand_calc_features_robust

In [50]:
# import pandas as pd
# import json

# def expand_calc_features_robust(df):
#     """
#     Expand the calcFeatures JSON column into separate columns with better error handling.

#     Parameters:
#     df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data

#     Returns:
#     pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
#     """

#     # Make a copy to avoid modifying the original DataFrame
#     df_expanded = df.copy()

#     # Parse the calcFeatures JSON column
#     calc_features_data = []

#     for idx, row in df.iterrows():
#         calc_features_str = row['calcFeatures']

#         if pd.isna(calc_features_str) or calc_features_str == '':
#             calc_features_data.append({})
#             continue

#         try:
#             # Clean the string and parse JSON
#             cleaned_str = calc_features_str.replace("'", '"').replace('None', 'null').replace('True', 'true').replace('False', 'false')
#             features_dict = json.loads(cleaned_str)
#             calc_features_data.append(features_dict)
#         except Exception as e:
#             print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
#             print(f"Problematic string: {calc_features_str[:100]}...")  # Print first 100 chars
#             calc_features_data.append({})

#     # Create DataFrame from the parsed JSON data
#     calc_features_df = pd.DataFrame(calc_features_data)

#     # Add prefix to JSON-derived columns to avoid conflicts with existing columns
#     calc_features_df = calc_features_df.add_prefix('feat_')

#     # Combine DataFrames
#     result_df = pd.concat([df_expanded, calc_features_df], axis=1)

#     print(f"Original DataFrame shape: {df.shape}")
#     print(f"Expanded DataFrame shape: {result_df.shape}")
#     print(f"Added {len(calc_features_df.columns)} new columns from calcFeatures")

#     return result_df

import pandas as pd
import json

def expand_calc_features_robust(df):
    """
    Expand the calcFeatures JSON column into separate columns and return the complete DataFrame.
    
    Parameters:
    df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data
    
    Returns:
    pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
    """
    
    # Make a copy to avoid modifying the original DataFrame
    df_expanded = df.copy()
    
    # Parse the calcFeatures JSON column
    calc_features_data = []
    
    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            # Handle None or NaN values
            if pd.isna(calc_features_str):
                calc_features_data.append({})
                continue
            
            # Convert to string if not already
            calc_features_str = str(calc_features_str)
            
            # Parse the JSON string
            features_dict = json.loads(calc_features_str.replace("'", '"'))
            
            # Ensure it's a dictionary, not a list
            if isinstance(features_dict, dict):
                calc_features_data.append(features_dict)
            elif isinstance(features_dict, list):
                # If it's a list, convert to dict with index keys or skip
                print(f"Warning: calcFeatures at index {idx} is a list, converting to dict")
                calc_features_data.append({'raw_list': features_dict})
            else:
                print(f"Warning: calcFeatures at index {idx} is neither dict nor list: {type(features_dict)}")
                calc_features_data.append({})
                
        except (json.JSONDecodeError, AttributeError, TypeError) as e:
            # If parsing fails, create an empty dict and print warning
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            print(f"  Value: {calc_features_str}")
            calc_features_data.append({})
    
    # Create DataFrame from the parsed JSON data
    calc_features_df = pd.DataFrame(calc_features_data)
    
    # Add prefix to JSON-derived columns to avoid conflicts
    calc_features_df = calc_features_df.add_prefix('calc_')
    
    # Reset index to ensure proper alignment
    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)
    
    # Combine original DataFrame with expanded calcFeatures
    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    
    return result_df




# expand_calc_features_fixed

In [51]:
import pandas as pd
import json

def expand_calc_features_fixed(df):
    """
    Expand the calcFeatures JSON column into separate columns and return the complete DataFrame.

    Parameters:
    df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data

    Returns:
    pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
    """

    # Make a copy to avoid modifying the original DataFrame
    df_expanded = df.copy()

    # Parse the calcFeatures JSON column
    calc_features_list = []

    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            # Handle None/null values
            if pd.isna(calc_features_str) or calc_features_str is None:
                calc_features_list.append({})
                continue
                
            # Parse the JSON string
            features_data = json.loads(calc_features_str.replace("'", '"'))  # Replace single quotes with double quotes for valid JSON
            
            # Handle both list and dictionary formats
            if isinstance(features_data, list):
                # If it's a list, take the first element (assuming list of one dictionary)
                if len(features_data) > 0:
                    calc_features_list.append(features_data[0])
                else:
                    calc_features_list.append({})
            elif isinstance(features_data, dict):
                calc_features_list.append(features_data)
            else:
                print(f"Warning: Unexpected data type at index {idx}: {type(features_data)}")
                calc_features_list.append({})
                
        except (json.JSONDecodeError, AttributeError, TypeError) as e:
            # If parsing fails, create an empty dict and print warning
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            calc_features_list.append({})

    # Create DataFrame from the parsed JSON data
    calc_features_df = pd.DataFrame(calc_features_list)

    # Add prefix to JSON-derived columns to avoid conflicts
    calc_features_df = calc_features_df.add_prefix('calc_')

    # Reset index to ensure proper alignment
    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)

    # Combine original DataFrame with expanded calcFeatures
    result_df = pd.concat([df_expanded, calc_features_df], axis=1)

    return result_df

#### transform_data

In [52]:
# import pandas as pd
# import json
# import uuid
# from datetime import datetime
# from typing import List

# def transform_data(d1: pd.DataFrame, feature_column: List[str], a='demo_score', modelDisplayName = 'Cash_beta_trench1_Demo_backscore', subscription_name = 'sil_march 25 models'):
#     # Read the input CSV file
#     df = d1.copy()
    
#     # Create the output DataFrame with the required structure
#     output_data = []
    
#     for _, row in df.iterrows():
#         # Create the calcFeature JSON with all the feature columns
#         feature_columns = feature_column
        
#         calc_feature = {}
#         for col in feature_columns:
#             if col in row and pd.notna(row[col]):
#                 # Convert Timestamp objects to string
#                 if isinstance(row[col], pd.Timestamp):
#                     calc_feature[col] = row[col].isoformat()
#                 else:
#                     calc_feature[col] = row[col]
        
       
#         # Get current timestamp
#         current_time = datetime.now().isoformat()
        
#         # Create the output row
#         output_row = {
#             "customerId": row['customer_id'],
#             "digitalLoanAccountId": row['digitalLoanAccountId'],
#             "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
#             "prediction": row.get(a, 0),
#             "start_time": current_time,
#             "end_time": current_time,
#             "modelDisplayName":modelDisplayName,
#             "modelVersionId":"v1",
#             "subscription_name": subscription_name,
#             "message_id": str(uuid.uuid4()),  # Generate random UUID
#             "publish_time": current_time,
#             "attributes": "{}",  # Empty JSON object
#             "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
#         }
        
#         output_data.append(output_row)
    
#     # Create DataFrame from the output data
#     output_df = pd.DataFrame(output_data)
    
#     return output_df

# # Example usage:
# # transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# # transform_data(f'{LOCALPATH}/{transformeddata}.csv')

import pandas as pd
import json
import uuid
from datetime import datetime
from typing import List

def transform_data(
    d1: pd.DataFrame, 
    feature_column: List[str], 
    a: str = 'demo_score', 
    modelDisplayName: str = 'Cash_beta_trench1_Demo_backscore', 
    tc: str = "", 
    subscription_name: str = 'sil_march 25 models'
) -> pd.DataFrame:
    """
    Transforms input data into a structured format suitable for model scoring output.

    Parameters:
    - d1 (pd.DataFrame): Input DataFrame containing raw data.
    - feature_column (List[str]): List of column names to include in the 'calcFeature' JSON.
    - a (str): Column name containing the prediction score. Default is 'demo_score'.
    - modelDisplayName (str): Name of the model used for scoring.
    - tc (str): Trench category (optional).
    - do (str): Device operating system. Default is 'android'.
    - subscription_name (str): Name of the subscription or model group.

    Returns:
    - pd.DataFrame: Transformed DataFrame with structured output.
    """

    # Make a copy of the input DataFrame to avoid modifying the original
    df = d1.copy()
    
    # Initialize an empty list to store transformed rows
    output_data = []
    
    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        # Initialize dictionary to hold feature values
        calc_feature = {}
        
        # Loop through each feature column and extract its value from the row
        for col in feature_column:
            if col in row and pd.notna(row[col]):
                # Convert datetime values to ISO format strings
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Get the current timestamp for start_time, end_time, and publish_time
        current_time = datetime.now().isoformat()
        
        # Construct the output row dictionary with required fields
        output_row = {
            "customerId": row['customer_id'],  # Unique customer identifier
            "digitalLoanAccountId": row['digitalLoanAccountId'],  # Loan account ID
            "crifApplicationId": str(uuid.uuid4()),  # Random UUID for application ID
            "prediction": row.get(a, 0),  # Prediction score from specified column
            "start_time": current_time,  # Timestamp when processing starts
            "end_time": current_time,    # Timestamp when processing ends
            "modelDisplayName": modelDisplayName,  # Name of the model used
            "modelVersionId": "v1",  # Static model version
            "calcFeature": json.dumps(calc_feature, default=str),  # Features as JSON string
            "subscription_name": subscription_name,  # Subscription name
            "message_id": str(uuid.uuid4()),  # Random UUID for message ID
            "publish_time": current_time,  # Timestamp when message is published
            "attributes": "{}",  # Placeholder for additional attributes
            "trenchCategory": tc,  # Optional trench category
            "deviceOs": row['osType'],  # Device operating system
        }
        
        # Append the transformed row to the output list
        output_data.append(output_row)
    
    # Convert the list of dictionaries to a DataFrame
    output_df = pd.DataFrame(output_data)
    
    # Return the transformed DataFrame
    return output_df



#### PSI Functions new

In [53]:
## Updated on 27-10-2025 - Modified for Training Period Baseline
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features from the feature list.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    feature_list : List[str]
        List of features to classify

    Returns:
    --------
    Dict with 'categorical' and 'numerical' keys containing respective feature lists
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            print(f"Warning: Feature '{feature}' not found in dataframe")
            continue

        # Check if feature is numeric
        if pd.api.types.is_numeric_dtype(df[feature]):
            # If unique values are less than 15 and all integers, treat as categorical
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(lambda x: x == int(x) if isinstance(x, (int, float)) else False).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features (deciles with fallback) and categorical features (top 6 + others)
    based on the entire training period data.

    Parameters:
    -----------
    df : pd.DataFrame
        Full input dataframe
    numerical_features : List[str]
        List of numerical features
    categorical_features : List[str]
        List of categorical features
    train_period_df : pd.DataFrame
        Training period dataframe (June 2024 to March 2025)

    Returns:
    --------
    Dictionary containing binning information for each feature
    """
    binning_info = {}

    # Create bins for numerical features with fallback strategy
    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        # Try 10 bins (deciles)
        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:  # 11 edges = 10 bins
                bins = test_bins
                bin_count = 10
        except Exception as e:
            pass

        # If 10 bins not possible, try 5 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:  # 6 edges = 5 bins
                    bins = test_bins
                    bin_count = 5
            except Exception as e:
                pass

        # If 5 bins not possible, try 3 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:  # 4 edges = 3 bins
                    bins = test_bins
                    bin_count = 3
            except Exception as e:
                pass

        # If still no bins possible, use equal distance bins of 5
        if bins is None:
            print(f"Warning: Feature '{feature}' has insufficient variance - cannot create standard bins")
            print(f"Feature '{feature}': Using equal distance bins of 5")

            min_val = valid_data.min()
            max_val = valid_data.max()

            # Create 5 equal distance bins
            bins = np.linspace(min_val, max_val, 6)  # 6 edges = 5 bins
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            # If all values are the same, add slight buffer
            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2
                print(f"Feature '{feature}': Constant value ({min_val}). Created 2 equal distance bins with buffer")

        # Add infinity edges to capture all values
        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        print(f"Feature '{feature}': Created {bin_count} bins")

        # Create bin ranges dictionary
        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    # Create bins for categorical features (top 6 + others) using training period
    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()
        print(f"Unique categories: {unique_categories}")

        if len(unique_categories) <= 6:
            # Treat each category as a separate bin
            top_categories = unique_categories
        else:
            # Use top 6 categories only
            top_categories = value_counts.nlargest(6).index.tolist()

        print(f"Top categories for feature '{feature}': {top_categories}")

        binning_info[feature] = {
                'type': 'categorical',
                'top_categories': top_categories,
                'bin_ranges': {}  # No ranges for categorical
            }

    return binning_info


def apply_binning(df: pd.DataFrame,
                  feature: str,
                  binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature based on binning information.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    feature : str
        Feature name
    binning_info : Dict
        Binning information for the feature

    Returns:
    --------
    pd.Series with binned values
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature],
                       bins=bins,
                       labels=labels,
                       include_lowest=True,
                       duplicates='drop')

        # Handle nulls - convert to string and then replace
        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'

        return binned

    else:  # categorical
        top_cats = binning_info['top_categories']

        # Convert to string for consistent comparison
        if pd.api.types.is_categorical_dtype(df[feature]):
            feature_data = df[feature].astype(str)
        else:
            feature_data = df[feature].astype(str)

        # Replace NaN string representation with 'Missing'
        feature_data = feature_data.replace('nan', 'Missing')

        # Convert top_cats to strings for comparison
        top_cats_str = [str(cat) for cat in top_cats]

        # Apply binning logic: use category name if in top_cats, else 'Others' (except for Missing)
        binned = feature_data.apply(lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing'))

        return binned


def calculate_psi(expected_pct: pd.Series,
                  actual_pct: pd.Series,
                  epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index with proper epsilon handling and renormalization.

    Parameters:
    -----------
    expected_pct : pd.Series
        Expected (baseline) percentages
    actual_pct : pd.Series
        Actual percentages
    epsilon : float
        Small value to avoid log(0)

    Returns:
    --------
    PSI value
    """
    # Align indices
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    # Only add epsilon where values are zero
    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    # Renormalize to ensure they sum to 1 after adding epsilon
    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    # Calculate PSI
    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

    return psi_value


def calculate_month_on_month_psi(df: pd.DataFrame,
                                 feature_list: List[str],
                                 segment_columns: List[str],
                                 month_col: str = 'Application_month',
                                 data_selection_col: str = 'Data_selection',
                                 account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
    """
    Calculate PSI for each feature comparing training period (June 2024 to March 2025)
    vs each month after March 2025, overall and by segments.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    feature_list : List[str]
        List of features to calculate PSI for
    segment_columns : List[str]
        List of segment columns
    month_col : str
        Name of month column
    data_selection_col : str
        Name of data selection column (identifies train period)
    account_id_col : str
        Name of account ID column for counting distinct accounts

    Returns:
    --------
    pd.DataFrame with PSI values with one row per feature-month-segment combination
    """
    # Create a copy to avoid modifying original
    df = df.copy()

    # Identify training and test periods
    train_df = df[df[data_selection_col] == 'Train'].copy()
    test_df = df[df[data_selection_col] != 'Train'].copy()

    if len(train_df) == 0:
        raise ValueError("No training data found. Check Data_selection column.")

    print(f"Training period: {train_df[month_col].min()} to {train_df[month_col].max()}")
    print(f"Test period: {test_df[month_col].min()} to {test_df[month_col].max()}")

    # Identify feature types
    feature_types = identify_feature_types(df, feature_list)

    # Create binning strategy based on training period
    binning_info = create_bins_for_features(
        df,
        feature_types['numerical'],
        feature_types['categorical'],
        train_df
    )

    # Get sorted test months
    test_months = sorted(test_df[month_col].unique())

    results = []

    # Calculate overall PSI
    for feature in feature_list:
        if feature not in df.columns:
            continue

        # Apply binning to entire dataset
        df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
        # print(f"Feature binned {df[f'{feature}_binned']}")
        # Get training period distribution (baseline)
        train_baseline = df[df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)

        # Calculate PSI for each test month
        for month in test_months:
            actual_dist = df[df[month_col] == month][f'{feature}_binned'].value_counts(normalize=True)
            psi_value = calculate_psi(train_baseline, actual_dist)

            # Calculate average percentages across all bins
            expected_avg_pct = train_baseline.mean() * 100
            actual_avg_pct = actual_dist.mean() * 100

            # # Count distinct accounts for segment
            # base_segment_count = train_segment[account_id_col].nunique()
            # actual_segment_count = actual_segment[account_id_col].nunique()


            results.append({
                'Feature': feature,
                'Feature_Type': binning_info[feature]['type'],
                'Segment_Column': 'Overall',
                'Segment_Value': 'All',
                'Month': f"{month}",
                'Base_Month': 'Train (Jun 2024 - Mar 2025)',
                'Current_Month': month,
                'Expected_Percentage': expected_avg_pct,
                'Actual_Percentage': actual_avg_pct,
                'PSI': psi_value
            })

    # Calculate PSI by segments
    for segment_col in segment_columns:
        if segment_col not in df.columns:
            continue

        segments = df[segment_col].dropna().unique()

        for segment_val in segments:
            segment_df = df[df[segment_col] == segment_val]

            for feature in feature_list:
                if feature not in df.columns:
                    continue

                # Get training period distribution for segment
                train_segment = segment_df[segment_df[data_selection_col] == 'Train']
                if len(train_segment) == 0:
                    continue

                train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)

                # Calculate PSI for each test month
                for month in test_months:
                    actual_segment = segment_df[segment_df[month_col] == month]
                    if len(actual_segment) == 0:
                        continue

                    actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
                    psi_value = calculate_psi(train_baseline, actual_dist)

                    # Calculate average percentages across all bins
                    expected_avg_pct = train_baseline.mean() * 100
                    actual_avg_pct = actual_dist.mean() * 100

                    # Count distinct accounts for segment
                    base_segment_count = train_segment[account_id_col].nunique()
                    actual_segment_count = actual_segment[account_id_col].nunique()

                    results.append({
                        'Feature': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Segment_Column': segment_col,
                        'Segment_Value': segment_val,
                        'Month': f"{month}",
                        'Base_Month': 'Train (Jun 2024 - Mar 2025)',
                        'Current_Month': month,
                        'Base_Count': base_segment_count,
                        'Actual_Count': actual_segment_count,
                        'Expected_Percentage': expected_avg_pct,
                        'Actual_Percentage': actual_avg_pct,
                        'PSI': psi_value
                    })

    return pd.DataFrame(results)


def calculate_bin_level_psi(df: pd.DataFrame,
                            feature_list: List[str],
                            segment_columns: List[str],
                            month_col: str = 'Application_month',
                            data_selection_col: str = 'Data_selection',
                            account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
    """
    Calculate bin-level PSI for each feature comparing training period
    vs each month after March 2025, overall and by segments.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    feature_list : List[str]
        List of features to calculate PSI for
    segment_columns : List[str]
        List of segment columns
    month_col : str
        Name of month column
    data_selection_col : str
        Name of data selection column
    account_id_col : str
        Name of account ID column for counting distinct accounts

    Returns:
    --------
    pd.DataFrame with bin-level PSI details including bin ranges
    """
    # Create a copy to avoid modifying original
    df = df.copy()

    # Identify training and test periods
    train_df = df[df[data_selection_col] == 'Train'].copy()
    test_df = df[df[data_selection_col] != 'Train'].copy()

    if len(train_df) == 0:
        raise ValueError("No training data found. Check Data_selection column.")

    print(f"Training period: {train_df[month_col].min()} to {train_df[month_col].max()}")
    print(f"Test period: {test_df[month_col].min()} to {test_df[month_col].max()}")

    # Identify feature types
    feature_types = identify_feature_types(df, feature_list)

    # Create binning strategy based on training period
    binning_info = create_bins_for_features(
        df,
        feature_types['numerical'],
        feature_types['categorical'],
        train_df
    )

    # Get sorted test months
    test_months = sorted(test_df[month_col].unique())

    results = []
    epsilon = 0.0001

    # Calculate overall bin-level PSI
    for feature in feature_list:
        if feature not in df.columns:
            continue

        # Apply binning to entire dataset
        df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
        # print(df[f'{feature}_binned'])

        # Get training period distribution (baseline)
        train_baseline = df[df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)

        # Calculate bin-level PSI for each test month
        for month in test_months:
            month_data = df[df[month_col] == month]
            actual_dist = month_data[f'{feature}_binned'].value_counts(normalize=True)

            # Count distinct accounts
            base_count = df[df[data_selection_col] == 'Train'][account_id_col].nunique()
            actual_count = month_data[account_id_col].nunique()

            # Get all bins
            all_bins = train_baseline.index.union(actual_dist.index)

            for bin_name in all_bins:
                # Simplified epsilon logic - no redundancy
                expected_pct = train_baseline.get(bin_name, 0)
                actual_pct = actual_dist.get(bin_name, 0)

                # Add epsilon only if zero
                expected_pct = epsilon if expected_pct == 0 else expected_pct
                actual_pct = epsilon if actual_pct == 0 else actual_pct

                # Calculate bin-level PSI
                bin_psi = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)

                # Get bin range information
                bin_ranges = binning_info[feature]['bin_ranges']
                if bin_name in bin_ranges:
                    bin_min = bin_ranges[bin_name]['min']
                    bin_max = bin_ranges[bin_name]['max']
                    bin_range = bin_ranges[bin_name]['range_str']
                else:
                    # For categorical or special bins (Missing, Others)
                    bin_min = None
                    bin_max = None
                    bin_range = bin_name

                results.append({
                    'Feature': feature,
                    'Feature_Type': binning_info[feature]['type'],
                    'Segment_Column': 'Overall',
                    'Segment_Value': 'All',
                    'Month': f"{month}",
                    'Base_Month': 'Train (Jun 2024 - Mar 2025)',
                    'Current_Month': month,
                    'Base_Count': base_count,
                    'Actual_Count': actual_count,
                    'Bin': bin_name,
                    'Bin_Range': bin_range,
                    'Bin_Min': bin_min,
                    'Bin_Max': bin_max,
                    'Base_Percentage': (train_baseline.get(bin_name, 0) * 100),
                    'Actual_Percentage': (actual_dist.get(bin_name, 0) * 100),
                    'Bin_PSI': bin_psi
                })

    # Calculate bin-level PSI by segments
    for segment_col in segment_columns:
        if segment_col not in df.columns:
            continue

        segments = df[segment_col].dropna().unique()

        for segment_val in segments:
            segment_df = df[df[segment_col] == segment_val]

            for feature in feature_list:
                if feature not in df.columns:
                    continue

                # Get training period distribution for segment
                train_segment = segment_df[segment_df[data_selection_col] == 'Train']
                if len(train_segment) == 0:
                    continue

                train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)

                # Calculate bin-level PSI for each test month
                for month in test_months:
                    actual_segment = segment_df[segment_df[month_col] == month]
                    if len(actual_segment) == 0:
                        continue

                    actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)

                    # Count distinct accounts for segment
                    base_segment_count = train_segment[account_id_col].nunique()
                    actual_segment_count = actual_segment[account_id_col].nunique()

                    # Get all bins
                    all_bins = train_baseline.index.union(actual_dist.index)

                    for bin_name in all_bins:
                        # Simplified epsilon logic - no redundancy
                        expected_pct = train_baseline.get(bin_name, 0)
                        actual_pct = actual_dist.get(bin_name, 0)

                        # Add epsilon only if zero
                        expected_pct = epsilon if expected_pct == 0 else expected_pct
                        actual_pct = epsilon if actual_pct == 0 else actual_pct

                        # Calculate bin-level PSI
                        bin_psi = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)

                        # Get bin range information
                        bin_ranges = binning_info[feature]['bin_ranges']
                        if bin_name in bin_ranges:
                            bin_min = bin_ranges[bin_name]['min']
                            bin_max = bin_ranges[bin_name]['max']
                            bin_range = bin_ranges[bin_name]['range_str']
                        else:
                            # For categorical or special bins (Missing, Others)
                            bin_min = None
                            bin_max = None
                            bin_range = bin_name

                        results.append({
                            'Feature': feature,
                            'Feature_Type': binning_info[feature]['type'],
                            'Segment_Column': segment_col,
                            'Segment_Value': segment_val,
                            'Month': f"{month}",
                            'Base_Month': 'Train (Jun 2024 - Mar 2025)',
                            'Current_Month': month,
                            'Base_Count': base_segment_count,
                            'Actual_Count': actual_segment_count,
                            'Bin': bin_name,
                            'Bin_Range': bin_range,
                            'Bin_Min': bin_min,
                            'Bin_Max': bin_max,
                            'Base_Percentage': (train_baseline.get(bin_name, 0) * 100),
                            'Actual_Percentage': (actual_dist.get(bin_name, 0) * 100),
                            'Bin_PSI': bin_psi
                        })

    return pd.DataFrame(results)

# dropping_duplicates

In [54]:
def dropping_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop duplicates based on digitalLoanAccountId, Data_selection, and modelVersionid,
    keeping the first occurrence based on appln_submit_datetime.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe

    Returns:
    --------
    pd.DataFrame with duplicates dropped
    """

    df = df.sort_values(
        ['digitalLoanAccountId', 'Data_selection', 'modelVersionId', 'appln_submit_datetime'],
        ascending=[True, True, True, True],
        na_position='last'
    )

    result = df.drop_duplicates(
        subset=['digitalLoanAccountId', 'Data_selection', 'modelVersionId'],
        keep='first'
    ).copy()

    return result

# New PSI Function

In [55]:
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')


def expand_calc_features(df, exclude_columns: List[str] = None):
    """
    Expand the calcFeatures JSON column into separate columns and return the complete DataFrame.

    Parameters:
    df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data
    exclude_columns (List[str]): List of column names to exclude from expansion
                                (e.g., ['digitalLoanAccountId', 'customerId', 'crifApplicationId', 'run_date'])

    Returns:
    pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
    """
    if exclude_columns is None:
        exclude_columns = []
    
    df_expanded = df.copy()
    calc_features_list = []

    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            features_dict = json.loads(calc_features_str.replace("'", '"'))
            # Remove excluded columns from the features dict
            for col in exclude_columns:
                features_dict.pop(col, None)
            calc_features_list.append(features_dict)
        except (json.JSONDecodeError, AttributeError) as e:
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            calc_features_list.append({})

    calc_features_df = pd.DataFrame(calc_features_list)
    calc_features_df = calc_features_df.add_prefix('calc_')

    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)

    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    return result_df


def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features from the feature list.
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            continue

        if pd.api.types.is_numeric_dtype(df[feature]):
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(lambda x: x == int(x) if isinstance(x, (int, float)) else False).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features (deciles with fallback) and categorical features (top 6 + others)
    based on the training period data.
    """
    binning_info = {}

    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        # Try 10 bins (deciles)
        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:
                bins = test_bins
                bin_count = 10
        except Exception as e:
            pass

        # If 10 bins not possible, try 5 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:
                    bins = test_bins
                    bin_count = 5
            except Exception as e:
                pass

        # If 5 bins not possible, try 3 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:
                    bins = test_bins
                    bin_count = 3
            except Exception as e:
                pass

        # If still no bins possible, use equal distance bins of 5
        if bins is None:
            print(f"Warning: Feature '{feature}' has insufficient variance - cannot create standard bins")
            min_val = valid_data.min()
            max_val = valid_data.max()
            bins = np.linspace(min_val, max_val, 6)
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2

        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        print(f"Feature '{feature}': Created {bin_count} bins")

        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    # Categorical features
    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()

        if len(unique_categories) <= 6:
            top_categories = unique_categories
        else:
            top_categories = value_counts.nlargest(6).index.tolist()

        print(f"Top categories for feature '{feature}': {top_categories}")

        binning_info[feature] = {
            'type': 'categorical',
            'top_categories': top_categories,
            'bin_ranges': {}
        }

    return binning_info


def apply_binning(df: pd.DataFrame,
                  feature: str,
                  binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature based on binning information.
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature],
                       bins=bins,
                       labels=labels,
                       include_lowest=True,
                       duplicates='drop')

        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'

        return binned

    else:
        top_cats = binning_info['top_categories']

        if pd.api.types.is_categorical_dtype(df[feature]):
            feature_data = df[feature].astype(str)
        else:
            feature_data = df[feature].astype(str)

        feature_data = feature_data.replace('nan', 'Missing')
        top_cats_str = [str(cat) for cat in top_cats]

        binned = feature_data.apply(lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing'))

        return binned


def calculate_psi(expected_pct: pd.Series,
                  actual_pct: pd.Series,
                  epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index with proper epsilon handling and renormalization.
    """
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

    return psi_value


def calculate_psi_by_model_version(df: pd.DataFrame,
                                   score_column: str,
                                   segment_columns: List[str] = None,
                                   month_col: str = 'Application_month',
                                   data_selection_col: str = 'Data_selection',
                                   model_version_col: str = 'modelVersionId',
                                   trench_category_col: str = 'trenchCategory',
                                   model_display_name_col: str = 'modelDisplayName',
                                   account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
    """
    Calculate PSI for each model version by comparing Train vs Test periods.
    Expands calcFeatures for each model version and calculates PSI for:
    1. Overall score
    2. Each expanded feature
    3. By segments (optional)

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with concatenated Train and Test data
    score_column : str
        Name of the score column (e.g., 'Alpha_cic_sil_score')
    segment_columns : List[str]
        List of segment columns (e.g., ['new_loan_type', 'loan_product_type', 'osType'])
    month_col : str
        Name of month column
    data_selection_col : str
        Name of data selection column (Train/Test)
    model_version_col : str
        Name of model version column
    trench_category_col : str
        Name of trench category column
    model_display_name_col : str
        Name of model display name column
    account_id_col : str
        Name of account ID column

    Returns:
    --------
    pd.DataFrame with PSI values for overall and by features
    """
    
    if segment_columns is None:
        segment_columns = []
    
    # Columns to exclude when expanding calcFeatures
    exclude_cols = ['digitalLoanAccountId', 'customerId', 'crifApplicationId', 'run_date']
    
    # Expand calcFeatures for entire dataset
    print("Expanding calcFeatures (excluding: digitalLoanAccountId, customerId, crifApplicationId, run_date)...")
    df_expanded = expand_calc_features(df, exclude_columns=exclude_cols)
    
    # Get unique model versions
    model_versions = sorted(df_expanded[model_version_col].unique())
    print(f"Found model versions: {model_versions}\n")
    
    all_results = []
    
    # Process each model version
    for model_version in model_versions:
        print(f"{'='*100}")
        print(f"Processing Model Version: {model_version}")
        print(f"{'='*100}")
        
        # Filter data for current model version
        mv_df = df_expanded[df_expanded[model_version_col] == model_version].copy()
        
        # Extract trenchCategory and modelDisplayName for this model version
        trench_category = mv_df[trench_category_col].iloc[0] if trench_category_col in mv_df.columns else None
        model_display_name = mv_df[model_display_name_col].iloc[0] if model_display_name_col in mv_df.columns else None
        
        # Split into train and test
        train_df = mv_df[mv_df[data_selection_col] == 'Train'].copy()
        test_df = mv_df[mv_df[data_selection_col] == 'Test'].copy()
        
        if len(train_df) == 0:
            print(f"Warning: No training data for model version {model_version}\n")
            continue
        
        if len(test_df) == 0:
            print(f"Warning: No test data for model version {model_version}\n")
            continue
        
        print(f"Train records: {len(train_df)}, Test records: {len(test_df)}")
        print(f"Training period: {train_df[month_col].min()} to {train_df[month_col].max()}")
        print(f"Test period: {test_df[month_col].min()} to {test_df[month_col].max()}")
        
        # Get all expanded features (calc_ prefixed columns)
        calc_features = [col for col in df_expanded.columns if col.startswith('calc_')]
        
        # Add score column to feature list
        feature_list = [score_column] + calc_features
        
        print(f"Score column: {score_column}")
        print(f"Number of expanded features: {len(calc_features)}\n")
        
        # Identify feature types
        feature_types = identify_feature_types(mv_df, feature_list)
        
        # Create binning strategy based on training period
        binning_info = create_bins_for_features(
            mv_df,
            feature_types['numerical'],
            feature_types['categorical'],
            train_df
        )
        
        # Get sorted test months
        test_months = sorted(test_df[month_col].unique())
        
        # ========== OVERALL PSI ==========
        print(f"\nCalculating Overall PSI...")
        for feature in feature_list:
            if feature not in mv_df.columns:
                continue
            
            # Apply binning to entire dataset
            mv_df[f'{feature}_binned'] = apply_binning(mv_df, feature, binning_info[feature])
            
            # Get training period distribution (baseline)
            train_baseline = mv_df[mv_df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)
            
            # Calculate PSI for each test month
            for month in test_months:
                month_data = mv_df[mv_df[month_col] == month]
                
                if len(month_data) == 0:
                    continue
                
                actual_dist = month_data[f'{feature}_binned'].value_counts(normalize=True)
                psi_value = calculate_psi(train_baseline, actual_dist)
                
                # Count distinct accounts
                base_count = mv_df[mv_df[data_selection_col] == 'Train'][account_id_col].nunique()
                actual_count = month_data[account_id_col].nunique()
                
                # Calculate average percentages
                expected_avg_pct = train_baseline.mean() * 100
                actual_avg_pct = actual_dist.mean() * 100
                
                # Determine feature category
                feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                
                results_row = {
                    'Model_Version': model_version,
                    'Trench_Category': trench_category,
                    'Model_Display_Name': model_display_name,
                    'Feature': feature,
                    'Feature_Type': binning_info[feature]['type'],
                    'Feature_Category': feature_category,
                    'Segment_Column': 'Overall',
                    'Segment_Value': 'All',
                    'Month': f"{month}",
                    'Base_Month': 'Train',
                    'Current_Month': month,
                    'Base_Count': base_count,
                    'Actual_Count': actual_count,
                    'Expected_Percentage': expected_avg_pct,
                    'Actual_Percentage': actual_avg_pct,
                    'PSI': psi_value
                }
                
                all_results.append(results_row)
        
        # ========== SEGMENT-WISE PSI ==========
        if segment_columns:
            print(f"\nCalculating Segment-wise PSI...")
            for segment_col in segment_columns:
                if segment_col not in mv_df.columns:
                    print(f"Warning: Segment column '{segment_col}' not found")
                    continue
                
                segments = mv_df[segment_col].dropna().unique()
                print(f"\nSegment Column: {segment_col} | Values: {list(segments)}")
                
                for segment_val in segments:
                    segment_df = mv_df[mv_df[segment_col] == segment_val].copy()
                    
                    # Get train and test data for this segment
                    train_segment = segment_df[segment_df[data_selection_col] == 'Train']
                    test_segment = segment_df[segment_df[data_selection_col] == 'Test']
                    
                    if len(train_segment) == 0 or len(test_segment) == 0:
                        continue
                    
                    for feature in feature_list:
                        if feature not in segment_df.columns:
                            continue
                        
                        # Get baseline from train segment
                        train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
                        
                        # Calculate PSI for each test month in this segment
                        for month in test_months:
                            actual_segment = segment_df[segment_df[month_col] == month]
                            
                            if len(actual_segment) == 0:
                                continue
                            
                            actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
                            psi_value = calculate_psi(train_baseline, actual_dist)
                            
                            # Count distinct accounts for segment
                            base_segment_count = train_segment[account_id_col].nunique()
                            actual_segment_count = actual_segment[account_id_col].nunique()
                            
                            # Calculate average percentages
                            expected_avg_pct = train_baseline.mean() * 100
                            actual_avg_pct = actual_dist.mean() * 100
                            
                            feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                            
                            results_row = {
                                'Model_Version': model_version,
                                'Trench_Category': trench_category,
                                'Model_Display_Name': model_display_name,
                                'Feature': feature,
                                'Feature_Type': binning_info[feature]['type'],
                                'Feature_Category': feature_category,
                                'Segment_Column': segment_col,
                                'Segment_Value': segment_val,
                                'Month': f"{month}",
                                'Base_Month': 'Train',
                                'Current_Month': month,
                                'Base_Count': base_segment_count,
                                'Actual_Count': actual_segment_count,
                                'Expected_Percentage': expected_avg_pct,
                                'Actual_Percentage': actual_avg_pct,
                                'PSI': psi_value
                            }
                            
                            all_results.append(results_row)
        
        print(f"Completed processing for Model Version: {model_version}\n")
    
    results_df = pd.DataFrame(all_results)
    return results_df


def calculate_bin_level_psi_by_model_version(df: pd.DataFrame,
                                             score_column: str,
                                             segment_columns: List[str] = None,
                                             month_col: str = 'Application_month',
                                             data_selection_col: str = 'Data_selection',
                                             model_version_col: str = 'modelVersionId',
                                             trench_category_col: str = 'trenchCategory',
                                             model_display_name_col: str = 'modelDisplayName',
                                             account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
    """
    Calculate bin-level PSI for each model version.
    Provides detailed breakdown by bins/categories.
    """
    
    if segment_columns is None:
        segment_columns = []
    
    print("Expanding calcFeatures for bin-level analysis (excluding: digitalLoanAccountId, customerId, crifApplicationId, run_date)...")
    df_expanded = expand_calc_features(df, exclude_columns=['digitalLoanAccountId', 'customerId', 'crifApplicationId', 'run_date'])
    
    model_versions = sorted(df_expanded[model_version_col].unique())
    
    all_results = []
    epsilon = 0.0001
    
    for model_version in model_versions:
        print(f"\nProcessing bin-level PSI for Model Version: {model_version}")
        
        mv_df = df_expanded[df_expanded[model_version_col] == model_version].copy()
        
        train_df = mv_df[mv_df[data_selection_col] == 'Train'].copy()
        test_df = mv_df[mv_df[data_selection_col] == 'Test'].copy()
        
        calc_features = [col for col in df_expanded.columns if col.startswith('calc_')]
        feature_list = [score_column] + calc_features
        
        feature_types = identify_feature_types(mv_df, feature_list)
        binning_info = create_bins_for_features(
            mv_df,
            feature_types['numerical'],
            feature_types['categorical'],
            train_df
        )
        
        test_months = sorted(test_df[month_col].unique())
        
        # ========== OVERALL BIN-LEVEL PSI ==========
        for feature in feature_list:
            if feature not in mv_df.columns:
                continue
            
            mv_df[f'{feature}_binned'] = apply_binning(mv_df, feature, binning_info[feature])
            train_baseline = mv_df[mv_df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)
            
            for month in test_months:
                month_data = mv_df[mv_df[month_col] == month]
                if len(month_data) == 0:
                    continue
                
                actual_dist = month_data[f'{feature}_binned'].value_counts(normalize=True)
                
                base_count = mv_df[mv_df[data_selection_col] == 'Train'][account_id_col].nunique()
                actual_count = month_data[account_id_col].nunique()
                
                all_bins = train_baseline.index.union(actual_dist.index)
                
                for bin_name in all_bins:
                    expected_pct = train_baseline.get(bin_name, 0)
                    actual_pct = actual_dist.get(bin_name, 0)
                    
                    expected_pct = epsilon if expected_pct == 0 else expected_pct
                    actual_pct = epsilon if actual_pct == 0 else actual_pct
                    
                    bin_psi = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
                    
                    bin_ranges = binning_info[feature]['bin_ranges']
                    if bin_name in bin_ranges:
                        bin_min = bin_ranges[bin_name]['min']
                        bin_max = bin_ranges[bin_name]['max']
                        bin_range = bin_ranges[bin_name]['range_str']
                    else:
                        bin_min = None
                        bin_max = None
                        bin_range = bin_name
                    
                    feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                    
                    all_results.append({
                        'Model_Version': model_version,
                        'Trench_Category': trench_category,
                        'Model_Display_Name': model_display_name,
                        'Feature': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Feature_Category': feature_category,
                        'Segment_Column': 'Overall',
                        'Segment_Value': 'All',
                        'Month': f"{month}",
                        'Base_Month': 'Train',
                        'Current_Month': month,
                        'Base_Count': base_count,
                        'Actual_Count': actual_count,
                        'Bin': bin_name,
                        'Bin_Range': bin_range,
                        'Bin_Min': bin_min,
                        'Bin_Max': bin_max,
                        'Base_Percentage': train_baseline.get(bin_name, 0) * 100,
                        'Actual_Percentage': actual_dist.get(bin_name, 0) * 100,
                        'Bin_PSI': bin_psi
                    })
        
        # ========== SEGMENT-WISE BIN-LEVEL PSI ==========
        if segment_columns:
            for segment_col in segment_columns:
                if segment_col not in mv_df.columns:
                    continue
                
                segments = mv_df[segment_col].dropna().unique()
                
                for segment_val in segments:
                    segment_df = mv_df[mv_df[segment_col] == segment_val].copy()
                    
                    train_segment = segment_df[segment_df[data_selection_col] == 'Train']
                    test_segment = segment_df[segment_df[data_selection_col] == 'Test']
                    
                    if len(train_segment) == 0 or len(test_segment) == 0:
                        continue
                    
                    for feature in feature_list:
                        if feature not in segment_df.columns:
                            continue
                        
                        train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
                        
                        for month in test_months:
                            actual_segment = segment_df[segment_df[month_col] == month]
                            
                            if len(actual_segment) == 0:
                                continue
                            
                            actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
                            
                            base_segment_count = train_segment[account_id_col].nunique()
                            actual_segment_count = actual_segment[account_id_col].nunique()
                            
                            all_bins = train_baseline.index.union(actual_dist.index)
                            
                            for bin_name in all_bins:
                                expected_pct = train_baseline.get(bin_name, 0)
                                actual_pct = actual_dist.get(bin_name, 0)
                                
                                expected_pct = epsilon if expected_pct == 0 else expected_pct
                                actual_pct = epsilon if actual_pct == 0 else actual_pct
                                
                                bin_psi = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
                                
                                bin_ranges = binning_info[feature]['bin_ranges']
                                if bin_name in bin_ranges:
                                    bin_min = bin_ranges[bin_name]['min']
                                    bin_max = bin_ranges[bin_name]['max']
                                    bin_range = bin_ranges[bin_name]['range_str']
                                else:
                                    bin_min = None
                                    bin_max = None
                                    bin_range = bin_name
                                
                                feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                                
                                all_results.append({
                                    'Model_Version': model_version,
                                    'Trench_Category': trench_category,
                                    'Model_Display_Name': model_display_name,
                                    'Feature': feature,
                                    'Feature_Type': binning_info[feature]['type'],
                                    'Feature_Category': feature_category,
                                    'Segment_Column': segment_col,
                                    'Segment_Value': segment_val,
                                    'Month': f"{month}",
                                    'Base_Month': 'Train',
                                    'Current_Month': month,
                                    'Base_Count': base_segment_count,
                                    'Actual_Count': actual_segment_count,
                                    'Bin': bin_name,
                                    'Bin_Range': bin_range,
                                    'Bin_Min': bin_min,
                                    'Bin_Max': bin_max,
                                    'Base_Percentage': train_baseline.get(bin_name, 0) * 100,
                                    'Actual_Percentage': actual_dist.get(bin_name, 0) * 100,
                                    'Bin_PSI': bin_psi
                                })
    
    return pd.DataFrame(all_results)


# ============================================================================
# USAGE EXAMPLE
# ============================================================================
"""
# Assuming you have df with concatenated Train and Test data

# Calculate Overall PSI (Overall + By Segments)
psi_results = calculate_psi_by_model_version(
    df=your_concatenated_df,
    score_column='Alpha_cic_sil_score',
    segment_columns=['new_loan_type', 'loan_product_type', 'osType'],
    month_col='Application_month',
    data_selection_col='Data_selection',
    model_version_col='modelVersionId',
    account_id_col='digitalLoanAccountId'
)

print(psi_results.head(20))
print(f"\nTotal rows: {len(psi_results)}")

# View Overall results only
overall_results = psi_results[psi_results['Segment_Column'] == 'Overall']
print(overall_results)

# View specific model version
v1_results = psi_results[psi_results['Model_Version'] == 'v1']
print(v1_results)

# View specific segment
loan_type_results = psi_results[psi_results['Segment_Column'] == 'new_loan_type']
print(loan_type_results)

# Save results
psi_results.to_csv('psi_results_overall_and_segments.csv', index=False)

# ---- For Bin-Level Details ----
bin_psi_results = calculate_bin_level_psi_by_model_version(
    df=your_concatenated_df,
    score_column='Alpha_cic_sil_score',
    segment_columns=['new_loan_type', 'loan_product_type', 'osType']
)

bin_psi_results.to_csv('bin_level_psi_results_overall_and_segments.csv', index=False)
print(bin_psi_results.head(30))
"""

'\n# Assuming you have df with concatenated Train and Test data\n\n# Calculate Overall PSI (Overall + By Segments)\npsi_results = calculate_psi_by_model_version(\n    df=your_concatenated_df,\n    score_column=\'Alpha_cic_sil_score\',\n    segment_columns=[\'new_loan_type\', \'loan_product_type\', \'osType\'],\n    month_col=\'Application_month\',\n    data_selection_col=\'Data_selection\',\n    model_version_col=\'modelVersionId\',\n    account_id_col=\'digitalLoanAccountId\'\n)\n\nprint(psi_results.head(20))\nprint(f"\nTotal rows: {len(psi_results)}")\n\n# View Overall results only\noverall_results = psi_results[psi_results[\'Segment_Column\'] == \'Overall\']\nprint(overall_results)\n\n# View specific model version\nv1_results = psi_results[psi_results[\'Model_Version\'] == \'v1\']\nprint(v1_results)\n\n# View specific segment\nloan_type_results = psi_results[psi_results[\'Segment_Column\'] == \'new_loan_type\']\nprint(loan_type_results)\n\n# Save results\npsi_results.to_csv(\'psi_re

In [56]:
# import pandas as pd
# import numpy as np
# import json
# from typing import List, Dict, Tuple
# import warnings
# warnings.filterwarnings('ignore')


# def expand_calc_features(df):
#     """
#     Expand the calcFeatures JSON column into separate columns and return the complete DataFrame.

#     Parameters:
#     df (pd.DataFrame): Input DataFrame with calcFeatures column containing JSON data

#     Returns:
#     pd.DataFrame: Expanded DataFrame with all original columns plus JSON features as separate columns
#     """
#     df_expanded = df.copy()
#     calc_features_list = []

#     for idx, calc_features_str in enumerate(df['calcFeatures']):
#         try:
#             features_dict = json.loads(calc_features_str.replace("'", '"'))
#             calc_features_list.append(features_dict)
#         except (json.JSONDecodeError, AttributeError) as e:
#             print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
#             calc_features_list.append({})

#     calc_features_df = pd.DataFrame(calc_features_list)
#     calc_features_df = calc_features_df.add_prefix('calc_')

#     df_expanded = df_expanded.reset_index(drop=True)
#     calc_features_df = calc_features_df.reset_index(drop=True)

#     result_df = pd.concat([df_expanded, calc_features_df], axis=1)
#     return result_df


# def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
#     """
#     Identify categorical and numerical features from the feature list.
#     """
#     categorical_features = []
#     numerical_features = []

#     for feature in feature_list:
#         if feature not in df.columns:
#             continue

#         if pd.api.types.is_numeric_dtype(df[feature]):
#             unique_vals = df[feature].nunique()
#             if unique_vals < 15 and df[feature].dropna().apply(lambda x: x == int(x) if isinstance(x, (int, float)) else False).all():
#                 categorical_features.append(feature)
#             else:
#                 numerical_features.append(feature)
#         else:
#             categorical_features.append(feature)

#     return {
#         'categorical': categorical_features,
#         'numerical': numerical_features
#     }


# def create_bins_for_features(df: pd.DataFrame,
#                              numerical_features: List[str],
#                              categorical_features: List[str],
#                              train_period_df: pd.DataFrame) -> Dict:
#     """
#     Create bins for numerical features (deciles with fallback) and categorical features (top 6 + others)
#     based on the training period data.
#     """
#     binning_info = {}

#     for feature in numerical_features:
#         valid_data = train_period_df[feature].dropna()

#         if len(valid_data) == 0:
#             binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
#             continue

#         bins = None
#         bin_count = None

#         # Try 10 bins (deciles)
#         try:
#             test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
#             test_bins = np.unique(test_bins)
#             if len(test_bins) >= 11:
#                 bins = test_bins
#                 bin_count = 10
#         except Exception as e:
#             pass

#         # If 10 bins not possible, try 5 bins
#         if bins is None:
#             try:
#                 test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
#                 test_bins = np.unique(test_bins)
#                 if len(test_bins) >= 6:
#                     bins = test_bins
#                     bin_count = 5
#             except Exception as e:
#                 pass

#         # If 5 bins not possible, try 3 bins
#         if bins is None:
#             try:
#                 test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
#                 test_bins = np.unique(test_bins)
#                 if len(test_bins) >= 4:
#                     bins = test_bins
#                     bin_count = 3
#             except Exception as e:
#                 pass

#         # If still no bins possible, use equal distance bins of 5
#         if bins is None:
#             print(f"Warning: Feature '{feature}' has insufficient variance - cannot create standard bins")
#             min_val = valid_data.min()
#             max_val = valid_data.max()
#             bins = np.linspace(min_val, max_val, 6)
#             bins = np.unique(bins)
#             bin_count = len(bins) - 1

#             if bin_count == 1:
#                 bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
#                 bin_count = 2

#         bins = bins.copy()
#         bins[0] = -np.inf
#         bins[-1] = np.inf

#         print(f"Feature '{feature}': Created {bin_count} bins")

#         bin_ranges = {}
#         for i in range(len(bins)-1):
#             bin_name = f"Bin_{i+1}"
#             bin_ranges[bin_name] = {
#                 'min': bins[i],
#                 'max': bins[i+1],
#                 'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
#             }

#         binning_info[feature] = {
#             'type': 'numerical',
#             'bins': bins,
#             'bin_ranges': bin_ranges,
#             'bin_count': bin_count
#         }

#     # Categorical features
#     for feature in categorical_features:
#         value_counts = train_period_df[feature].value_counts()
#         unique_categories = value_counts.index.tolist()

#         if len(unique_categories) <= 6:
#             top_categories = unique_categories
#         else:
#             top_categories = value_counts.nlargest(6).index.tolist()

#         print(f"Top categories for feature '{feature}': {top_categories}")

#         binning_info[feature] = {
#             'type': 'categorical',
#             'top_categories': top_categories,
#             'bin_ranges': {}
#         }

#     return binning_info


# def apply_binning(df: pd.DataFrame,
#                   feature: str,
#                   binning_info: Dict) -> pd.Series:
#     """
#     Apply binning to a feature based on binning information.
#     """
#     if binning_info['type'] == 'numerical':
#         if binning_info['bins'] is None:
#             return pd.Series(['Missing'] * len(df), index=df.index)

#         bins = binning_info['bins']
#         labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

#         binned = pd.cut(df[feature],
#                        bins=bins,
#                        labels=labels,
#                        include_lowest=True,
#                        duplicates='drop')

#         binned = binned.astype(str)
#         binned[df[feature].isna()] = 'Missing'

#         return binned

#     else:
#         top_cats = binning_info['top_categories']

#         if pd.api.types.is_categorical_dtype(df[feature]):
#             feature_data = df[feature].astype(str)
#         else:
#             feature_data = df[feature].astype(str)

#         feature_data = feature_data.replace('nan', 'Missing')
#         top_cats_str = [str(cat) for cat in top_cats]

#         binned = feature_data.apply(lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing'))

#         return binned


# def calculate_psi(expected_pct: pd.Series,
#                   actual_pct: pd.Series,
#                   epsilon: float = 0.0001) -> float:
#     """
#     Calculate Population Stability Index with proper epsilon handling and renormalization.
#     """
#     all_bins = expected_pct.index.union(actual_pct.index)
#     expected_pct = expected_pct.reindex(all_bins, fill_value=0)
#     actual_pct = actual_pct.reindex(all_bins, fill_value=0)

#     expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
#     actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

#     expected_pct = expected_pct / expected_pct.sum()
#     actual_pct = actual_pct / actual_pct.sum()

#     psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

#     return psi_value


# def calculate_psi_by_model_version(df: pd.DataFrame,
#                                    score_column: str,
#                                    segment_columns: List[str] = None,
#                                    month_col: str = 'Application_month',
#                                    data_selection_col: str = 'Data_selection',
#                                    model_version_col: str = 'modelVersionId',
#                                    account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
#     """
#     Calculate PSI for each model version by comparing Train vs Test periods.
#     Expands calcFeatures for each model version and calculates PSI for:
#     1. Overall score
#     2. Each expanded feature
#     3. By segments (optional)

#     Parameters:
#     -----------
#     df : pd.DataFrame
#         Input dataframe with concatenated Train and Test data
#     score_column : str
#         Name of the score column (e.g., 'Alpha_cic_sil_score')
#     segment_columns : List[str]
#         List of segment columns (e.g., ['new_loan_type', 'loan_product_type', 'osType'])
#     month_col : str
#         Name of month column
#     data_selection_col : str
#         Name of data selection column (Train/Test)
#     model_version_col : str
#         Name of model version column
#     account_id_col : str
#         Name of account ID column

#     Returns:
#     --------
#     pd.DataFrame with PSI values for overall and by features
#     """
    
#     if segment_columns is None:
#         segment_columns = []
    
#     # Expand calcFeatures for entire dataset
#     print("Expanding calcFeatures...")
#     df_expanded = expand_calc_features(df)
    
#     # Get unique model versions
#     model_versions = sorted(df_expanded[model_version_col].unique())
#     print(f"Found model versions: {model_versions}\n")
    
#     all_results = []
    
#     # Process each model version
#     for model_version in model_versions:
#         print(f"{'='*100}")
#         print(f"Processing Model Version: {model_version}")
#         print(f"{'='*100}")
        
#         # Filter data for current model version
#         mv_df = df_expanded[df_expanded[model_version_col] == model_version].copy()
        
#         # Split into train and test
#         train_df = mv_df[mv_df[data_selection_col] == 'Train'].copy()
#         test_df = mv_df[mv_df[data_selection_col] == 'Test'].copy()
        
#         if len(train_df) == 0:
#             print(f"Warning: No training data for model version {model_version}\n")
#             continue
        
#         if len(test_df) == 0:
#             print(f"Warning: No test data for model version {model_version}\n")
#             continue
        
#         print(f"Train records: {len(train_df)}, Test records: {len(test_df)}")
#         print(f"Training period: {train_df[month_col].min()} to {train_df[month_col].max()}")
#         print(f"Test period: {test_df[month_col].min()} to {test_df[month_col].max()}")
        
#         # Get all expanded features (calc_ prefixed columns)
#         calc_features = [col for col in df_expanded.columns if col.startswith('calc_')]
        
#         # Add score column to feature list
#         feature_list = [score_column] + calc_features
        
#         print(f"Score column: {score_column}")
#         print(f"Number of expanded features: {len(calc_features)}\n")
        
#         # Identify feature types
#         feature_types = identify_feature_types(mv_df, feature_list)
        
#         # Create binning strategy based on training period
#         binning_info = create_bins_for_features(
#             mv_df,
#             feature_types['numerical'],
#             feature_types['categorical'],
#             train_df
#         )
        
#         # Get sorted test months
#         test_months = sorted(test_df[month_col].unique())
        
#         # ========== OVERALL PSI ==========
#         print(f"\nCalculating Overall PSI...")
#         for feature in feature_list:
#             if feature not in mv_df.columns:
#                 continue
            
#             # Apply binning to entire dataset
#             mv_df[f'{feature}_binned'] = apply_binning(mv_df, feature, binning_info[feature])
            
#             # Get training period distribution (baseline)
#             train_baseline = mv_df[mv_df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)
            
#             # Calculate PSI for each test month
#             for month in test_months:
#                 month_data = mv_df[mv_df[month_col] == month]
                
#                 if len(month_data) == 0:
#                     continue
                
#                 actual_dist = month_data[f'{feature}_binned'].value_counts(normalize=True)
#                 psi_value = calculate_psi(train_baseline, actual_dist)
                
#                 # Count distinct accounts
#                 base_count = mv_df[mv_df[data_selection_col] == 'Train'][account_id_col].nunique()
#                 actual_count = month_data[account_id_col].nunique()
                
#                 # Calculate average percentages
#                 expected_avg_pct = train_baseline.mean() * 100
#                 actual_avg_pct = actual_dist.mean() * 100
                
#                 # Determine feature category
#                 feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                
#                 results_row = {
#                     'Model_Version': model_version,
#                     'Feature': feature,
#                     'Feature_Type': binning_info[feature]['type'],
#                     'Feature_Category': feature_category,
#                     'Segment_Column': 'Overall',
#                     'Segment_Value': 'All',
#                     'Month': f"{month}",
#                     'Base_Month': 'Train',
#                     'Current_Month': month,
#                     'Base_Count': base_count,
#                     'Actual_Count': actual_count,
#                     'Expected_Percentage': expected_avg_pct,
#                     'Actual_Percentage': actual_avg_pct,
#                     'PSI': psi_value
#                 }
                
#                 all_results.append(results_row)
        
#         # ========== SEGMENT-WISE PSI ==========
#         if segment_columns:
#             print(f"\nCalculating Segment-wise PSI...")
#             for segment_col in segment_columns:
#                 if segment_col not in mv_df.columns:
#                     print(f"Warning: Segment column '{segment_col}' not found")
#                     continue
                
#                 segments = mv_df[segment_col].dropna().unique()
#                 print(f"\nSegment Column: {segment_col} | Values: {list(segments)}")
                
#                 for segment_val in segments:
#                     segment_df = mv_df[mv_df[segment_col] == segment_val].copy()
                    
#                     # Get train and test data for this segment
#                     train_segment = segment_df[segment_df[data_selection_col] == 'Train']
#                     test_segment = segment_df[segment_df[data_selection_col] == 'Test']
                    
#                     if len(train_segment) == 0 or len(test_segment) == 0:
#                         continue
                    
#                     for feature in feature_list:
#                         if feature not in segment_df.columns:
#                             continue
                        
#                         # Get baseline from train segment
#                         train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
                        
#                         # Calculate PSI for each test month in this segment
#                         for month in test_months:
#                             actual_segment = segment_df[segment_df[month_col] == month]
                            
#                             if len(actual_segment) == 0:
#                                 continue
                            
#                             actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
#                             psi_value = calculate_psi(train_baseline, actual_dist)
                            
#                             # Count distinct accounts for segment
#                             base_segment_count = train_segment[account_id_col].nunique()
#                             actual_segment_count = actual_segment[account_id_col].nunique()
                            
#                             # Calculate average percentages
#                             expected_avg_pct = train_baseline.mean() * 100
#                             actual_avg_pct = actual_dist.mean() * 100
                            
#                             feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                            
#                             results_row = {
#                                 'Model_Version': model_version,
#                                 'Feature': feature,
#                                 'Feature_Type': binning_info[feature]['type'],
#                                 'Feature_Category': feature_category,
#                                 'Segment_Column': segment_col,
#                                 'Segment_Value': segment_val,
#                                 'Month': f"{month}",
#                                 'Base_Month': 'Train',
#                                 'Current_Month': month,
#                                 'Base_Count': base_segment_count,
#                                 'Actual_Count': actual_segment_count,
#                                 'Expected_Percentage': expected_avg_pct,
#                                 'Actual_Percentage': actual_avg_pct,
#                                 'PSI': psi_value
#                             }
                            
#                             all_results.append(results_row)
        
#         print(f"Completed processing for Model Version: {model_version}\n")
    
#     results_df = pd.DataFrame(all_results)
#     return results_df


# def calculate_bin_level_psi_by_model_version(df: pd.DataFrame,
#                                              score_column: str,
#                                              segment_columns: List[str] = None,
#                                              month_col: str = 'Application_month',
#                                              data_selection_col: str = 'Data_selection',
#                                              model_version_col: str = 'modelVersionId',
#                                              account_id_col: str = 'digitalLoanAccountId') -> pd.DataFrame:
#     """
#     Calculate bin-level PSI for each model version.
#     Provides detailed breakdown by bins/categories.
#     """
    
#     if segment_columns is None:
#         segment_columns = []
    
#     print("Expanding calcFeatures for bin-level analysis...")
#     df_expanded = expand_calc_features(df)
    
#     model_versions = sorted(df_expanded[model_version_col].unique())
    
#     all_results = []
#     epsilon = 0.0001
    
#     for model_version in model_versions:
#         print(f"\nProcessing bin-level PSI for Model Version: {model_version}")
        
#         mv_df = df_expanded[df_expanded[model_version_col] == model_version].copy()
        
#         train_df = mv_df[mv_df[data_selection_col] == 'Train'].copy()
#         test_df = mv_df[mv_df[data_selection_col] == 'Test'].copy()
        
#         if len(train_df) == 0 or len(test_df) == 0:
#             continue
        
#         calc_features = [col for col in df_expanded.columns if col.startswith('calc_')]
#         feature_list = [score_column] + calc_features
        
#         feature_types = identify_feature_types(mv_df, feature_list)
#         binning_info = create_bins_for_features(
#             mv_df,
#             feature_types['numerical'],
#             feature_types['categorical'],
#             train_df
#         )
        
#         test_months = sorted(test_df[month_col].unique())
        
#         # ========== OVERALL BIN-LEVEL PSI ==========
#         for feature in feature_list:
#             if feature not in mv_df.columns:
#                 continue
            
#             mv_df[f'{feature}_binned'] = apply_binning(mv_df, feature, binning_info[feature])
#             train_baseline = mv_df[mv_df[data_selection_col] == 'Train'][f'{feature}_binned'].value_counts(normalize=True)
            
#             for month in test_months:
#                 month_data = mv_df[mv_df[month_col] == month]
#                 if len(month_data) == 0:
#                     continue
                
#                 actual_dist = month_data[f'{feature}_binned'].value_counts(normalize=True)
                
#                 base_count = mv_df[mv_df[data_selection_col] == 'Train'][account_id_col].nunique()
#                 actual_count = month_data[account_id_col].nunique()
                
#                 all_bins = train_baseline.index.union(actual_dist.index)
                
#                 for bin_name in all_bins:
#                     expected_pct = train_baseline.get(bin_name, 0)
#                     actual_pct = actual_dist.get(bin_name, 0)
                    
#                     expected_pct = epsilon if expected_pct == 0 else expected_pct
#                     actual_pct = epsilon if actual_pct == 0 else actual_pct
                    
#                     bin_psi = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
                    
#                     bin_ranges = binning_info[feature]['bin_ranges']
#                     if bin_name in bin_ranges:
#                         bin_min = bin_ranges[bin_name]['min']
#                         bin_max = bin_ranges[bin_name]['max']
#                         bin_range = bin_ranges[bin_name]['range_str']
#                     else:
#                         bin_min = None
#                         bin_max = None
#                         bin_range = bin_name
                    
#                     feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                    
#                     all_results.append({
#                         'Model_Version': model_version,
#                         'Feature': feature,
#                         'Feature_Type': binning_info[feature]['type'],
#                         'Feature_Category': feature_category,
#                         'Segment_Column': 'Overall',
#                         'Segment_Value': 'All',
#                         'Month': f"{month}",
#                         'Base_Month': 'Train',
#                         'Current_Month': month,
#                         'Base_Count': base_count,
#                         'Actual_Count': actual_count,
#                         'Bin': bin_name,
#                         'Bin_Range': bin_range,
#                         'Bin_Min': bin_min,
#                         'Bin_Max': bin_max,
#                         'Base_Percentage': train_baseline.get(bin_name, 0) * 100,
#                         'Actual_Percentage': actual_dist.get(bin_name, 0) * 100,
#                         'Bin_PSI': bin_psi
#                     })
        
#         # ========== SEGMENT-WISE BIN-LEVEL PSI ==========
#         if segment_columns:
#             for segment_col in segment_columns:
#                 if segment_col not in mv_df.columns:
#                     continue
                
#                 segments = mv_df[segment_col].dropna().unique()
                
#                 for segment_val in segments:
#                     segment_df = mv_df[mv_df[segment_col] == segment_val].copy()
                    
#                     train_segment = segment_df[segment_df[data_selection_col] == 'Train']
#                     test_segment = segment_df[segment_df[data_selection_col] == 'Test']
                    
#                     if len(train_segment) == 0 or len(test_segment) == 0:
#                         continue
                    
#                     for feature in feature_list:
#                         if feature not in segment_df.columns:
#                             continue
                        
#                         train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
                        
#                         for month in test_months:
#                             actual_segment = segment_df[segment_df[month_col] == month]
                            
#                             if len(actual_segment) == 0:
#                                 continue
                            
#                             actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
                            
#                             base_segment_count = train_segment[account_id_col].nunique()
#                             actual_segment_count = actual_segment[account_id_col].nunique()
                            
#                             all_bins = train_baseline.index.union(actual_dist.index)
                            
#                             for bin_name in all_bins:
#                                 expected_pct = train_baseline.get(bin_name, 0)
#                                 actual_pct = actual_dist.get(bin_name, 0)
                                
#                                 expected_pct = epsilon if expected_pct == 0 else expected_pct
#                                 actual_pct = epsilon if actual_pct == 0 else actual_pct
                                
#                                 bin_psi = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
                                
#                                 bin_ranges = binning_info[feature]['bin_ranges']
#                                 if bin_name in bin_ranges:
#                                     bin_min = bin_ranges[bin_name]['min']
#                                     bin_max = bin_ranges[bin_name]['max']
#                                     bin_range = bin_ranges[bin_name]['range_str']
#                                 else:
#                                     bin_min = None
#                                     bin_max = None
#                                     bin_range = bin_name
                                
#                                 feature_category = 'Score' if feature == score_column else 'Expanded_Feature'
                                
#                                 all_results.append({
#                                     'Model_Version': model_version,
#                                     'Feature': feature,
#                                     'Feature_Type': binning_info[feature]['type'],
#                                     'Feature_Category': feature_category,
#                                     'Segment_Column': segment_col,
#                                     'Segment_Value': segment_val,
#                                     'Month': f"{month}",
#                                     'Base_Month': 'Train',
#                                     'Current_Month': month,
#                                     'Base_Count': base_segment_count,
#                                     'Actual_Count': actual_segment_count,
#                                     'Bin': bin_name,
#                                     'Bin_Range': bin_range,
#                                     'Bin_Min': bin_min,
#                                     'Bin_Max': bin_max,
#                                     'Base_Percentage': train_baseline.get(bin_name, 0) * 100,
#                                     'Actual_Percentage': actual_dist.get(bin_name, 0) * 100,
#                                     'Bin_PSI': bin_psi
#                                 })
    
#     return pd.DataFrame(all_results)


# # ============================================================================
# # USAGE EXAMPLE
# # ============================================================================
# """
# # Assuming you have df with concatenated Train and Test data

# # Calculate Overall PSI (Overall + By Segments)
# psi_results = calculate_psi_by_model_version(
#     df=your_concatenated_df,
#     score_column='Alpha_cic_sil_score',
#     segment_columns=['new_loan_type', 'loan_product_type', 'osType'],
#     month_col='Application_month',
#     data_selection_col='Data_selection',
#     model_version_col='modelVersionId',
#     account_id_col='digitalLoanAccountId'
# )

# print(psi_results.head(20))
# print(f"\nTotal rows: {len(psi_results)}")

# # View Overall results only
# overall_results = psi_results[psi_results['Segment_Column'] == 'Overall']
# print(overall_results)

# # View specific model version
# v1_results = psi_results[psi_results['Model_Version'] == 'v1']
# print(v1_results)

# # View specific segment
# loan_type_results = psi_results[psi_results['Segment_Column'] == 'new_loan_type']
# print(loan_type_results)

# # Save results
# psi_results.to_csv('psi_results_overall_and_segments.csv', index=False)

# # ---- For Bin-Level Details ----
# bin_psi_results = calculate_bin_level_psi_by_model_version(
#     df=your_concatenated_df,
#     score_column='Alpha_cic_sil_score',
#     segment_columns=['new_loan_type', 'loan_product_type', 'osType']
# )

# bin_psi_results.to_csv('bin_level_psi_results_overall_and_segments.csv', index=False)
# print(bin_psi_results.head(30))
# """

# create_comprehensive_psi_report

In [57]:
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Tuple, Set
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')


def expand_calc_features(df):
    """
    Expand the calcFeatures JSON column into separate columns.
    """
    df_expanded = df.copy()
    calc_features_list = []

    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            features_dict = json.loads(calc_features_str.replace("'", '"'))
            calc_features_list.append(features_dict)
        except (json.JSONDecodeError, AttributeError) as e:
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            calc_features_list.append({})

    calc_features_df = pd.DataFrame(calc_features_list)
    calc_features_df = calc_features_df.add_prefix('calc_')

    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)

    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    return result_df


def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features.
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            continue

        if pd.api.types.is_numeric_dtype(df[feature]):
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(
                lambda x: x == int(x) if isinstance(x, (int, float)) else False
            ).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features (deciles) and categorical features (top 9 + others).
    """
    binning_info = {}

    # Numerical features - Deciles
    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        # Try 10 bins (deciles)
        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:
                bins = test_bins
                bin_count = 10
        except Exception as e:
            pass

        # Fallback to 5 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:
                    bins = test_bins
                    bin_count = 5
            except Exception as e:
                pass

        # Fallback to 3 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:
                    bins = test_bins
                    bin_count = 3
            except Exception as e:
                pass

        # Equal distance bins
        if bins is None:
            print(f"Warning: Feature '{feature}' has insufficient variance")
            min_val = valid_data.min()
            max_val = valid_data.max()
            bins = np.linspace(min_val, max_val, 6)
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2

        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        print(f"Feature '{feature}': Created {bin_count} bins")

        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    # Categorical features - Top 9 + Others
    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()

        if len(unique_categories) <= 9:
            top_categories = unique_categories
        else:
            top_categories = value_counts.nlargest(9).index.tolist()

        print(f"Feature '{feature}': Using {len(top_categories)} top categories (total unique: {len(unique_categories)})")

        binning_info[feature] = {
            'type': 'categorical',
            'top_categories': top_categories,
            'bin_ranges': {}
        }

    return binning_info


def apply_binning(df: pd.DataFrame,
                  feature: str,
                  binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature.
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature],
                       bins=bins,
                       labels=labels,
                       include_lowest=True,
                       duplicates='drop')

        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'

        return binned

    else:  # categorical
        top_cats = binning_info['top_categories']
        feature_data = df[feature].astype(str)
        feature_data = feature_data.replace('nan', 'Missing')
        top_cats_str = [str(cat) for cat in top_cats]

        binned = feature_data.apply(
            lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing')
        )

        return binned


def calculate_psi(expected_pct: pd.Series,
                  actual_pct: pd.Series,
                  epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index.
    """
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

    return psi_value


def get_valid_model_trench_combinations(df: pd.DataFrame) -> List[Tuple[str, List[str]]]:
    """
    Get all valid combinations of modelVersionId and trenchCategory.
    Returns list of tuples: (modelVersionId, [trenchCategory_list])
    """
    combinations_list = []
    
    for model_id in df['modelVersionId'].unique():
        model_df = df[df['modelVersionId'] == model_id]
        trenches = model_df['trenchCategory'].dropna().unique().tolist()
        
        if trenches:
            combinations_list.append((model_id, trenches))
    
    return combinations_list


def generate_segment_combinations(model_trenches_list: List[Tuple]) -> List[Tuple[str, List[str]]]:
    """
    Generate all combinations of models and trenches.
    Returns (modelVersionId, [trench_combinations])
    """
    all_combinations = []
    
    for model_id, trenches in model_trenches_list:
        # Single trenches
        for trench in trenches:
            all_combinations.append((model_id, [trench]))
        
        # Multiple trench combinations
        for r in range(2, len(trenches) + 1):
            for combo in combinations(trenches, r):
                all_combinations.append((model_id, list(combo)))
    
    return all_combinations


def calculate_psi_for_segments(df: pd.DataFrame,
                               feature: str,
                               binning_info: Dict,
                               segment_def: Dict,
                               account_id_col: str = 'digitalLoanAccountId',
                               data_selection_col: str = 'Data_selection',
                               month_col: str = 'Application_month') -> Dict:
    """
    Calculate PSI for a specific feature and segment definition.
    """
    results = []
    
    # Apply binning
    df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
    
    # Get train baseline
    train_mask = df[data_selection_col] == 'Train'
    
    # Apply segment filters
    segment_mask = train_mask.copy()
    for col, values in segment_def.items():
        if col in df.columns:
            if isinstance(values, list):
                segment_mask = segment_mask & (df[col].isin(values))
            else:
                segment_mask = segment_mask & (df[col] == values)
    
    train_segment = df[segment_mask]
    
    if len(train_segment) == 0:
        return []
    
    train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
    
    # Get test months
    test_df = df[~train_mask]
    test_months = sorted(test_df[month_col].unique())
    
    for month in test_months:
        # Apply segment filters to test data
        test_mask = (df[month_col] == month)
        for col, values in segment_def.items():
            if col in df.columns:
                if isinstance(values, list):
                    test_mask = test_mask & (df[col].isin(values))
                else:
                    test_mask = test_mask & (df[col] == values)
        
        actual_segment = df[test_mask]
        
        if len(actual_segment) == 0:
            continue
        
        actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
        psi_value = calculate_psi(train_baseline, actual_dist)
        
        expected_avg_pct = train_baseline.mean() * 100
        actual_avg_pct = actual_dist.mean() * 100
        
        base_count = train_segment[account_id_col].nunique()
        actual_count = actual_segment[account_id_col].nunique()
        
        results.append({
            'Feature': feature,
            'Feature_Type': binning_info[feature]['type'],
            'Month': month,
            'Base_Month': 'Train',
            'Base_Count': base_count,
            'Actual_Count': actual_count,
            'Expected_Percentage': expected_avg_pct,
            'Actual_Percentage': actual_avg_pct,
            'PSI': psi_value
        })
    
    return results


def create_comprehensive_psi_report(df: pd.DataFrame,
                                    excluded_features: List[str] = None,
                                    account_id_col: str = 'digitalLoanAccountId',
                                    data_selection_col: str = 'Data_selection',
                                    month_col: str = 'Application_month') -> pd.DataFrame:
    """
    Create comprehensive PSI report with intelligent segmentation.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Concatenated train+test dataframe (already expanded with calcFeatures)
    excluded_features : List[str]
        Features to exclude (e.g., ['digitalLoanAccountId', 'customerId', 'crifapplicationid'])
    account_id_col : str
        Column name for counting distinct accounts
    data_selection_col : str
        Column identifying Train vs Test
    month_col : str
        Column with month information
    
    Returns:
    --------
    pd.DataFrame with comprehensive PSI results and segment definitions
    """
    
    if excluded_features is None:
        excluded_features = ['digitalLoanAccountId', 'customerId', 'crifapplicationid', 'rundate']
    
    # Expand calcFeatures
    print("Expanding calcFeatures...")
    df = expand_calc_features(df)
    
    # Get feature list (exclude specific columns)
    exclude_cols = {
        'customerId', 'digitalLoanAccountId', 'Alpha_cic_sil_score',
        'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'new_loan_type', 'gender', 'loan_product_type', 'osType',
        'Model_Name', 'product', 'trenchCategory', 'calcFeatures',
        'Data_selection', 'appln_submit_datetime', 'disbursementDateTime',
        'Application_month'
    }
    exclude_cols.update(excluded_features)
    
    feature_list = [col for col in df.columns if col not in exclude_cols]
    
    print(f"Found {len(feature_list)} features to analyze")
    
    # Identify feature types from training data
    train_df = df[df[data_selection_col] == 'Train']
    feature_types = identify_feature_types(df, feature_list)
    
    # Create binning strategy
    print("Creating binning strategy...")
    binning_info = create_bins_for_features(
        df,
        feature_types['numerical'],
        feature_types['categorical'],
        train_df
    )
    
    all_results = []
    segment_id_counter = 0
    
    # Get valid model-trench combinations
    model_trenches = get_valid_model_trench_combinations(df)
    segment_combos = generate_segment_combinations(model_trenches)
    
    print(f"Found {len(segment_combos)} model-trench combinations")
    
    # Level 1: Model Score PSI (modelVersionId + trenchCategory)
    print("\n=== Level 1: Model Score PSI ===")
    for model_id, trenches in segment_combos:
        segment_def = {
            'modelVersionId': model_id,
            'trenchCategory': trenches
        }
        
        # Model score is Alpha_cic_sil_score
        feature = 'Alpha_cic_sil_score'
        if feature not in binning_info:
            feature_types_score = identify_feature_types(df, [feature])
            binning_info[feature] = create_bins_for_features(
                df, feature_types_score['numerical'], 
                feature_types_score['categorical'], train_df
            )[feature]
        
        results = calculate_psi_for_segments(df, feature, binning_info, segment_def)
        
        for res in results:
            res['Segment_ID'] = segment_id_counter
            res['Segment_Name'] = f"Model_{model_id}_Trench_{'_'.join(trenches)}"
            res['Segment_Column_1'] = 'modelVersionId'
            res['Segment_Value_1'] = model_id
            res['Segment_Column_2'] = 'trenchCategory'
            res['Segment_Value_2'] = ','.join(trenches)
            all_results.append(res)
        
        segment_id_counter += 1
    
    # Level 2-4: Additional segmentations
    additional_segments = [
        ('loan_product_type', 'Level 2: By Loan Product Type'),
        ('new_loan_type', 'Level 3: By Loan Type'),
        ('osType', 'Level 4: By OS Type')
    ]
    
    for segment_col, level_name in additional_segments:
        print(f"\n=== {level_name} ===")
        
        for model_id, trenches in segment_combos:
            segment_values = df[
                (df['modelVersionId'] == model_id) & 
                (df['trenchCategory'].isin(trenches))
            ][segment_col].dropna().unique().tolist()
            
            for seg_val in segment_values:
                segment_def = {
                    'modelVersionId': model_id,
                    'trenchCategory': trenches,
                    segment_col: seg_val
                }
                
                # Calculate for all features
                for feature in feature_list:
                    if feature not in binning_info:
                        continue
                    
                    results = calculate_psi_for_segments(df, feature, binning_info, segment_def)
                    
                    for res in results:
                        res['Segment_ID'] = segment_id_counter
                        res['Segment_Name'] = (
                            f"Model_{model_id}_Trench_{'_'.join(trenches)}_"
                            f"{segment_col}_{seg_val}"
                        )
                        res['Segment_Column_1'] = 'modelVersionId'
                        res['Segment_Value_1'] = model_id
                        res['Segment_Column_2'] = 'trenchCategory'
                        res['Segment_Value_2'] = ','.join(trenches)
                        res['Segment_Column_3'] = segment_col
                        res['Segment_Value_3'] = seg_val
                        all_results.append(res)
                
                segment_id_counter += 1
    
    # Create results dataframe
    results_df = pd.DataFrame(all_results)
    
    # Reorder columns
    core_cols = [
        'Segment_ID', 'Segment_Name',
        'Segment_Column_1', 'Segment_Value_1',
        'Segment_Column_2', 'Segment_Value_2',
        'Segment_Column_3', 'Segment_Value_3',
        'Feature', 'Feature_Type', 'Month',
        'Base_Count', 'Actual_Count',
        'Expected_Percentage', 'Actual_Percentage', 'PSI'
    ]
    
    available_cols = [col for col in core_cols if col in results_df.columns]
    results_df = results_df[available_cols]
    
    print(f"\nTotal PSI calculations: {len(results_df)}")
    
    return results_df


# Example Usage:
# df = pd.read_csv('your_data.csv')
# psi_report = create_comprehensive_psi_report(df)
# psi_report.to_csv('psi_report.csv', index=False)

# create_comprehensive_psi_report Version 2

In [58]:
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Tuple, Set
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')


def expand_calc_features(df):
    """
    Expand the calcFeatures JSON column into separate columns.
    """
    df_expanded = df.copy()
    calc_features_list = []

    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            features_dict = json.loads(calc_features_str.replace("'", '"'))
            calc_features_list.append(features_dict)
        except (json.JSONDecodeError, AttributeError) as e:
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            calc_features_list.append({})

    calc_features_df = pd.DataFrame(calc_features_list)
    calc_features_df = calc_features_df.add_prefix('calc_')

    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)

    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    return result_df


def get_model_version_specific_features(df: pd.DataFrame,
                                        model_id: str,
                                        trenches: List[str]) -> List[str]:
    """
    Get features that are specific to a modelVersionId + trenchCategory combination.
    Only includes calc_ features that have actual data in this segment.
    """
    # Filter to specific model and trenches
    segment_df = df[
        (df['modelVersionId'] == model_id) & 
        (df['trenchCategory'].isin(trenches))
    ]
    
    if len(segment_df) == 0:
        return []
    
    # Get all calc_ columns
    calc_cols = [col for col in segment_df.columns if col.startswith('calc_')]
    
    # Filter to only those with actual data (not all NaN)
    valid_features = []
    for col in calc_cols:
        if segment_df[col].notna().sum() > 0:
            valid_features.append(col)
    
    return valid_features


def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features.
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            continue

        if pd.api.types.is_numeric_dtype(df[feature]):
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(
                lambda x: x == int(x) if isinstance(x, (int, float)) else False
            ).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features (deciles) and categorical features (top 9 + others).
    """
    binning_info = {}

    # Numerical features - Deciles
    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        # Try 10 bins (deciles)
        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:
                bins = test_bins
                bin_count = 10
        except Exception as e:
            pass

        # Fallback to 5 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:
                    bins = test_bins
                    bin_count = 5
            except Exception as e:
                pass

        # Fallback to 3 bins
        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:
                    bins = test_bins
                    bin_count = 3
            except Exception as e:
                pass

        # Equal distance bins
        if bins is None:
            print(f"Warning: Feature '{feature}' has insufficient variance")
            min_val = valid_data.min()
            max_val = valid_data.max()
            bins = np.linspace(min_val, max_val, 6)
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2

        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        print(f"  Feature '{feature}': Created {bin_count} bins")

        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    # Categorical features - Top 9 + Others
    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()

        if len(unique_categories) <= 9:
            top_categories = unique_categories
        else:
            top_categories = value_counts.nlargest(9).index.tolist()

        print(f"  Feature '{feature}': Using {len(top_categories)} top categories (total: {len(unique_categories)})")

        binning_info[feature] = {
            'type': 'categorical',
            'top_categories': top_categories,
            'bin_ranges': {}
        }

    return binning_info


def apply_binning(df: pd.DataFrame,
                  feature: str,
                  binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature.
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature],
                       bins=bins,
                       labels=labels,
                       include_lowest=True,
                       duplicates='drop')

        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'

        return binned

    else:  # categorical
        top_cats = binning_info['top_categories']
        feature_data = df[feature].astype(str)
        feature_data = feature_data.replace('nan', 'Missing')
        top_cats_str = [str(cat) for cat in top_cats]

        binned = feature_data.apply(
            lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing')
        )

        return binned


def calculate_psi(expected_pct: pd.Series,
                  actual_pct: pd.Series,
                  epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index.
    """
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

    return psi_value


def get_valid_model_trench_combinations(df: pd.DataFrame) -> List[Tuple[str, str, List[str]]]:
    """
    Get all valid combinations of modelVersionId, modelDisplayName, and trenchCategory.
    Returns list of tuples: (modelVersionId, modelDisplayName, [trenchCategory_list])
    """
    combinations_list = []
    
    for model_id in df['modelVersionId'].unique():
        model_df = df[df['modelVersionId'] == model_id]
        model_display = model_df['modelDisplayName'].iloc[0]
        trenches = sorted(model_df['trenchCategory'].dropna().unique().tolist())
        
        if trenches:
            combinations_list.append((model_id, model_display, trenches))
    
    return combinations_list


def generate_segment_combinations(model_trenches_list: List[Tuple]) -> List[Tuple[str, str, List[str]]]:
    """
    Generate all combinations of models and trenches with modelDisplayName.
    Returns (modelVersionId, modelDisplayName, [trench_combinations])
    """
    all_combinations = []
    
    for model_id, model_display, trenches in model_trenches_list:
        # Single trenches
        for trench in trenches:
            all_combinations.append((model_id, model_display, [trench]))
        
        # Multiple trench combinations
        for r in range(2, len(trenches) + 1):
            for combo in combinations(trenches, r):
                all_combinations.append((model_id, model_display, sorted(list(combo))))
    
    return all_combinations


def calculate_psi_for_segments(df: pd.DataFrame,
                               feature: str,
                               binning_info: Dict,
                               segment_def: Dict,
                               account_id_col: str = 'digitalLoanAccountId',
                               data_selection_col: str = 'Data_selection',
                               month_col: str = 'Application_month') -> List[Dict]:
    """
    Calculate PSI for a specific feature and segment definition.
    """
    results = []
    
    # Apply binning
    df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
    
    # Get train baseline
    train_mask = df[data_selection_col] == 'Train'
    
    # Apply segment filters
    segment_mask = train_mask.copy()
    for col, values in segment_def.items():
        if col in df.columns:
            if isinstance(values, list):
                segment_mask = segment_mask & (df[col].isin(values))
            else:
                segment_mask = segment_mask & (df[col] == values)
    
    train_segment = df[segment_mask]
    
    if len(train_segment) == 0:
        return []
    
    train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
    
    # Get test months
    test_df = df[~train_mask]
    test_months = sorted(test_df[month_col].unique())
    
    for month in test_months:
        # Apply segment filters to test data
        test_mask = (df[month_col] == month)
        for col, values in segment_def.items():
            if col in df.columns:
                if isinstance(values, list):
                    test_mask = test_mask & (df[col].isin(values))
                else:
                    test_mask = test_mask & (df[col] == values)
        
        actual_segment = df[test_mask]
        
        if len(actual_segment) == 0:
            continue
        
        actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
        psi_value = calculate_psi(train_baseline, actual_dist)
        
        expected_avg_pct = train_baseline.mean() * 100
        actual_avg_pct = actual_dist.mean() * 100
        
        base_count = train_segment[account_id_col].nunique()
        actual_count = actual_segment[account_id_col].nunique()
        
        results.append({
            'Feature': feature,
            'Feature_Type': binning_info[feature]['type'],
            'Month': month,
            'Base_Month': 'Train',
            'Base_Count': base_count,
            'Actual_Count': actual_count,
            'Expected_Percentage': expected_avg_pct,
            'Actual_Percentage': actual_avg_pct,
            'PSI': psi_value
        })
    
    return results


def create_comprehensive_psi_report(df: pd.DataFrame,
                                    excluded_features: List[str] = None,
                                    account_id_col: str = 'digitalLoanAccountId',
                                    data_selection_col: str = 'Data_selection',
                                    month_col: str = 'Application_month') -> pd.DataFrame:
    """
    Create comprehensive PSI report with intelligent segmentation.
    Features are calculated per modelVersionId + trenchCategory combination ONLY.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Concatenated train+test dataframe (already expanded with calcFeatures)
    excluded_features : List[str]
        Features to exclude (e.g., ['digitalLoanAccountId', 'customerId', 'crifapplicationid'])
    account_id_col : str
        Column name for counting distinct accounts
    data_selection_col : str
        Column identifying Train vs Test
    month_col : str
        Column with month information
    
    Returns:
    --------
    pd.DataFrame with comprehensive PSI results and segment definitions
    """
    
    if excluded_features is None:
        excluded_features = ['digitalLoanAccountId', 'customerId', 'crifapplicationid', 'rundate']
    
    # Expand calcFeatures
    print("Expanding calcFeatures...")
    df = expand_calc_features(df)
    
    # Get valid model-trench combinations with modelDisplayName
    model_trenches = get_valid_model_trench_combinations(df)
    segment_combos = generate_segment_combinations(model_trenches)
    
    print(f"Found {len(segment_combos)} model-trench combinations\n")
    
    all_results = []
    segment_id_counter = 0
    
    # Get columns to exclude from analysis
    exclude_cols = {
        'customerId', 'digitalLoanAccountId', 'Alpha_cic_sil_score',
        'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'new_loan_type', 'gender', 'loan_product_type', 'osType',
        'Model_Name', 'product', 'trenchCategory', 'calcFeatures',
        'Data_selection', 'appln_submit_datetime', 'disbursementDateTime',
        'Application_month'
    }
    exclude_cols.update(excluded_features)
    
    # Process each model-trench combination
    for model_id, model_display, trenches in segment_combos:
        print(f"\n{'='*80}")
        print(f"Processing Model: {model_display} (ID: {model_id}), Trenches: {trenches}")
        print(f"{'='*80}")
        
        # Filter data for this model-trench combo
        segment_data = df[
            (df['modelVersionId'] == model_id) & 
            (df['trenchCategory'].isin(trenches))
        ].copy()
        
        if len(segment_data) == 0:
            continue
        
        # Get features specific to this model version + trenches
        model_specific_features = get_model_version_specific_features(df, model_id, trenches)
        feature_list = [f for f in model_specific_features if f not in exclude_cols]
        
        print(f"Found {len(feature_list)} features for this combination")
        
        # Identify feature types from training data WITHIN this segment
        train_segment_data = segment_data[segment_data[data_selection_col] == 'Train']
        feature_types = identify_feature_types(segment_data, feature_list)
        
        # Create binning strategy based on training data WITHIN this segment
        print("\nCreating binning strategy...")
        binning_info = create_bins_for_features(
            segment_data,
            feature_types['numerical'],
            feature_types['categorical'],
            train_segment_data
        )
        
        trench_str = '_'.join(trenches)
        
        # Level 1: Model Score PSI (modelVersionId + trenchCategory)
        print(f"\n--- Level 1: Model Score PSI ---")
        segment_def = {
            'modelVersionId': model_id,
            'trenchCategory': trenches
        }
        
        feature = 'Alpha_cic_sil_score'
        if feature in segment_data.columns:
            if feature not in binning_info:
                feature_types_score = identify_feature_types(segment_data, [feature])
                binning_info[feature] = create_bins_for_features(
                    segment_data, feature_types_score['numerical'], 
                    feature_types_score['categorical'], train_segment_data
                )[feature]
            
            results = calculate_psi_for_segments(segment_data, feature, binning_info, segment_def)
            
            for res in results:
                res['Segment_ID'] = segment_id_counter
                res['Model_Version_ID'] = model_id
                res['Model_Display_Name'] = model_display
                res['Segment_Name'] = f"{model_display}_Trench_{trench_str}"
                res['Segment_Column_1'] = 'modelVersionId'
                res['Segment_Value_1'] = model_id
                res['Segment_Column_2'] = 'trenchCategory'
                res['Segment_Value_2'] = ','.join(trenches)
                all_results.append(res)
        
        segment_id_counter += 1
        
        # Level 2: By Loan Product Type
        print(f"--- Level 2: By Loan Product Type ---")
        loan_products = segment_data[segment_data[data_selection_col] == 'Train']['loan_product_type'].dropna().unique()
        
        for loan_product in loan_products:
            segment_def = {
                'modelVersionId': model_id,
                'trenchCategory': trenches,
                'loan_product_type': loan_product
            }
            
            for feature in feature_list:
                if feature not in binning_info:
                    continue
                
                results = calculate_psi_for_segments(segment_data, feature, binning_info, segment_def)
                
                for res in results:
                    res['Segment_ID'] = segment_id_counter
                    res['Model_Version_ID'] = model_id
                    res['Model_Display_Name'] = model_display
                    res['Segment_Name'] = f"{model_display}_Trench_{trench_str}_LoanProduct_{loan_product}"
                    res['Segment_Column_1'] = 'modelVersionId'
                    res['Segment_Value_1'] = model_id
                    res['Segment_Column_2'] = 'trenchCategory'
                    res['Segment_Value_2'] = ','.join(trenches)
                    res['Segment_Column_3'] = 'loan_product_type'
                    res['Segment_Value_3'] = loan_product
                    all_results.append(res)
            
            segment_id_counter += 1
        
        # Level 3: By Loan Type
        print(f"--- Level 3: By Loan Type ---")
        loan_types = segment_data[segment_data[data_selection_col] == 'Train']['new_loan_type'].dropna().unique()
        
        for loan_type in loan_types:
            segment_def = {
                'modelVersionId': model_id,
                'trenchCategory': trenches,
                'new_loan_type': loan_type
            }
            
            for feature in feature_list:
                if feature not in binning_info:
                    continue
                
                results = calculate_psi_for_segments(segment_data, feature, binning_info, segment_def)
                
                for res in results:
                    res['Segment_ID'] = segment_id_counter
                    res['Model_Version_ID'] = model_id
                    res['Model_Display_Name'] = model_display
                    res['Segment_Name'] = f"{model_display}_Trench_{trench_str}_LoanType_{loan_type}"
                    res['Segment_Column_1'] = 'modelVersionId'
                    res['Segment_Value_1'] = model_id
                    res['Segment_Column_2'] = 'trenchCategory'
                    res['Segment_Value_2'] = ','.join(trenches)
                    res['Segment_Column_3'] = 'new_loan_type'
                    res['Segment_Value_3'] = loan_type
                    all_results.append(res)
            
            segment_id_counter += 1
        
        # Level 4: By OS Type
        print(f"--- Level 4: By OS Type ---")
        os_types = segment_data[segment_data[data_selection_col] == 'Train']['osType'].dropna().unique()
        
        for os_type in os_types:
            segment_def = {
                'modelVersionId': model_id,
                'trenchCategory': trenches,
                'osType': os_type
            }
            
            for feature in feature_list:
                if feature not in binning_info:
                    continue
                
                results = calculate_psi_for_segments(segment_data, feature, binning_info, segment_def)
                
                for res in results:
                    res['Segment_ID'] = segment_id_counter
                    res['Model_Version_ID'] = model_id
                    res['Model_Display_Name'] = model_display
                    res['Segment_Name'] = f"{model_display}_Trench_{trench_str}_OSType_{os_type}"
                    res['Segment_Column_1'] = 'modelVersionId'
                    res['Segment_Value_1'] = model_id
                    res['Segment_Column_2'] = 'trenchCategory'
                    res['Segment_Value_2'] = ','.join(trenches)
                    res['Segment_Column_3'] = 'osType'
                    res['Segment_Value_3'] = os_type
                    all_results.append(res)
            
            segment_id_counter += 1
    
    # Create results dataframe
    results_df = pd.DataFrame(all_results)
    
    # Reorder columns intelligently
    core_cols = [
        'Segment_ID', 'Segment_Name', 'Model_Display_Name', 'Model_Version_ID',
        'Segment_Column_1', 'Segment_Value_1',
        'Segment_Column_2', 'Segment_Value_2',
        'Segment_Column_3', 'Segment_Value_3',
        'Feature', 'Feature_Type', 'Month',
        'Base_Count', 'Actual_Count',
        'Expected_Percentage', 'Actual_Percentage', 'PSI'
    ]
    
    available_cols = [col for col in core_cols if col in results_df.columns]
    results_df = results_df[available_cols]
    
    # Sort by Segment_ID and Feature
    results_df = results_df.sort_values(['Segment_ID', 'Feature', 'Month']).reset_index(drop=True)
    
    print(f"\n{'='*80}")
    print(f"Total PSI calculations: {len(results_df)}")
    print(f"Unique segments: {results_df['Segment_ID'].nunique()}")
    print(f"Unique features analyzed: {results_df['Feature'].nunique()}")
    print(f"{'='*80}\n")
    
    return results_df


# Example Usage:
# df = pd.read_csv('your_data.csv')
# psi_report = create_comprehensive_psi_report(df)
# psi_report.to_csv('psi_report.csv', index=False)
# psi_report.to_excel('psi_report.xlsx', index=False)

# Comprehensive PSI report function version 3

In [78]:
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Tuple, Set
from itertools import combinations
import warnings
warnings.filterwarnings('ignore')


def expand_calc_features(df):
    """
    Expand the calcFeatures JSON column into separate columns.
    """
    df_expanded = df.copy()
    calc_features_list = []

    for idx, calc_features_str in enumerate(df['calcFeatures']):
        try:
            features_dict = json.loads(calc_features_str.replace("'", '"'))
            calc_features_list.append(features_dict)
        except (json.JSONDecodeError, AttributeError) as e:
            print(f"Warning: Could not parse calcFeatures at index {idx}: {e}")
            calc_features_list.append({})

    calc_features_df = pd.DataFrame(calc_features_list)
    calc_features_df = calc_features_df.add_prefix('calc_')

    df_expanded = df_expanded.reset_index(drop=True)
    calc_features_df = calc_features_df.reset_index(drop=True)

    result_df = pd.concat([df_expanded, calc_features_df], axis=1)
    return result_df


def get_model_version_specific_features(df: pd.DataFrame,
                                        model_id: str,
                                        trenches: List[str]) -> List[str]:
    """
    Get features that are specific to a modelVersionId + trenchCategory combination.
    Only includes calc_ features that have actual data in this segment.
    """
    segment_df = df[
        (df['modelVersionId'] == model_id) & 
        (df['trenchCategory'].isin(trenches))
    ]
    
    if len(segment_df) == 0:
        return []
    
    calc_cols = [col for col in segment_df.columns if col.startswith('calc_')]
    
    valid_features = []
    for col in calc_cols:
        if segment_df[col].notna().sum() > 0:
            valid_features.append(col)
    
    return valid_features


def identify_feature_types(df: pd.DataFrame, feature_list: List[str]) -> Dict[str, List[str]]:
    """
    Identify categorical and numerical features.
    """
    categorical_features = []
    numerical_features = []

    for feature in feature_list:
        if feature not in df.columns:
            continue

        if pd.api.types.is_numeric_dtype(df[feature]):
            unique_vals = df[feature].nunique()
            if unique_vals < 15 and df[feature].dropna().apply(
                lambda x: x == int(x) if isinstance(x, (int, float)) else False
            ).all():
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        else:
            categorical_features.append(feature)

    return {
        'categorical': categorical_features,
        'numerical': numerical_features
    }


def create_bins_for_features(df: pd.DataFrame,
                             numerical_features: List[str],
                             categorical_features: List[str],
                             train_period_df: pd.DataFrame) -> Dict:
    """
    Create bins for numerical features (deciles) and categorical features (top 9 + others).
    """
    binning_info = {}

    for feature in numerical_features:
        valid_data = train_period_df[feature].dropna()

        if len(valid_data) == 0:
            binning_info[feature] = {'type': 'numerical', 'bins': None, 'bin_ranges': {}}
            continue

        bins = None
        bin_count = None

        try:
            test_bins = np.percentile(valid_data, np.arange(0, 101, 10))
            test_bins = np.unique(test_bins)
            if len(test_bins) >= 11:
                bins = test_bins
                bin_count = 10
        except Exception as e:
            pass

        if bins is None:
            try:
                test_bins = np.percentile(valid_data, np.arange(0, 101, 20))
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 6:
                    bins = test_bins
                    bin_count = 5
            except Exception as e:
                pass

        if bins is None:
            try:
                test_bins = np.percentile(valid_data, [0, 33.33, 66.67, 100])
                test_bins = np.unique(test_bins)
                if len(test_bins) >= 4:
                    bins = test_bins
                    bin_count = 3
            except Exception as e:
                pass

        if bins is None:
            print(f"Warning: Feature '{feature}' has insufficient variance")
            min_val = valid_data.min()
            max_val = valid_data.max()
            bins = np.linspace(min_val, max_val, 6)
            bins = np.unique(bins)
            bin_count = len(bins) - 1

            if bin_count == 1:
                bins = np.array([min_val - 0.1, min_val, min_val + 0.1])
                bin_count = 2

        bins = bins.copy()
        bins[0] = -np.inf
        bins[-1] = np.inf

        print(f"  Feature '{feature}': Created {bin_count} bins")

        bin_ranges = {}
        for i in range(len(bins)-1):
            bin_name = f"Bin_{i+1}"
            bin_ranges[bin_name] = {
                'min': bins[i],
                'max': bins[i+1],
                'range_str': f"[{bins[i]:.2f}, {bins[i+1]:.2f}]" if not np.isinf(bins[i]) and not np.isinf(bins[i+1]) else f"({bins[i]}, {bins[i+1]})"
            }

        binning_info[feature] = {
            'type': 'numerical',
            'bins': bins,
            'bin_ranges': bin_ranges,
            'bin_count': bin_count
        }

    for feature in categorical_features:
        value_counts = train_period_df[feature].value_counts()
        unique_categories = value_counts.index.tolist()

        if len(unique_categories) <= 9:
            top_categories = unique_categories
        else:
            top_categories = value_counts.nlargest(9).index.tolist()

        print(f"  Feature '{feature}': Using {len(top_categories)} top categories (total: {len(unique_categories)})")

        binning_info[feature] = {
            'type': 'categorical',
            'top_categories': top_categories,
            'bin_ranges': {}
        }

    return binning_info


def apply_binning(df: pd.DataFrame,
                  feature: str,
                  binning_info: Dict) -> pd.Series:
    """
    Apply binning to a feature.
    """
    if binning_info['type'] == 'numerical':
        if binning_info['bins'] is None:
            return pd.Series(['Missing'] * len(df), index=df.index)

        bins = binning_info['bins']
        labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]

        binned = pd.cut(df[feature],
                       bins=bins,
                       labels=labels,
                       include_lowest=True,
                       duplicates='drop')

        binned = binned.astype(str)
        binned[df[feature].isna()] = 'Missing'

        return binned

    else:
        top_cats = binning_info['top_categories']
        feature_data = df[feature].astype(str)
        feature_data = feature_data.replace('nan', 'Missing')
        top_cats_str = [str(cat) for cat in top_cats]

        binned = feature_data.apply(
            lambda x: x if x in top_cats_str else ('Others' if x != 'Missing' else 'Missing')
        )

        return binned


def calculate_psi(expected_pct: pd.Series,
                  actual_pct: pd.Series,
                  epsilon: float = 0.0001) -> float:
    """
    Calculate Population Stability Index.
    """
    all_bins = expected_pct.index.union(actual_pct.index)
    expected_pct = expected_pct.reindex(all_bins, fill_value=0)
    actual_pct = actual_pct.reindex(all_bins, fill_value=0)

    expected_pct = expected_pct.apply(lambda x: epsilon if x == 0 else x)
    actual_pct = actual_pct.apply(lambda x: epsilon if x == 0 else x)

    expected_pct = expected_pct / expected_pct.sum()
    actual_pct = actual_pct / actual_pct.sum()

    psi_value = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))

    return psi_value


def get_valid_model_trench_combinations(df: pd.DataFrame) -> List[Tuple[str, str, List[str]]]:
    """
    Get all valid combinations of modelVersionId, modelDisplayName, and trenchCategory.
    """
    combinations_list = []
    
    for model_id in df['modelVersionId'].unique():
        model_df = df[df['modelVersionId'] == model_id]
        model_display = model_df['modelDisplayName'].iloc[0]
        trenches = sorted(model_df['trenchCategory'].dropna().unique().tolist())
        
        if trenches:
            combinations_list.append((model_id, model_display, trenches))
    
    return combinations_list


def generate_segment_combinations(model_trenches_list: List[Tuple]) -> List[Tuple[str, str, List[str]]]:
    """
    Generate all combinations of models and trenches with modelDisplayName.
    """
    all_combinations = []
    
    for model_id, model_display, trenches in model_trenches_list:
        for trench in trenches:
            all_combinations.append((model_id, model_display, [trench]))
        
        for r in range(2, len(trenches) + 1):
            for combo in combinations(trenches, r):
                all_combinations.append((model_id, model_display, sorted(list(combo))))
    
    return all_combinations


def calculate_psi_for_segments(df: pd.DataFrame,
                               feature: str,
                               binning_info: Dict,
                               segment_def: Dict,
                               account_id_col: str = 'digitalLoanAccountId',
                               data_selection_col: str = 'Data_selection',
                               month_col: str = 'Application_month') -> List[Dict]:
    """
    Calculate PSI for a specific feature and segment definition.
    """
    results = []
    
    df[f'{feature}_binned'] = apply_binning(df, feature, binning_info[feature])
    
    train_mask = df[data_selection_col] == 'Train'
    
    segment_mask = train_mask.copy()
    for col, values in segment_def.items():
        if col in df.columns:
            if isinstance(values, list):
                segment_mask = segment_mask & (df[col].isin(values))
            else:
                segment_mask = segment_mask & (df[col] == values)
    
    train_segment = df[segment_mask]
    
    if len(train_segment) == 0:
        return []
    
    train_baseline = train_segment[f'{feature}_binned'].value_counts(normalize=True)
    
    test_df = df[~train_mask]
    test_months = sorted(test_df[month_col].unique())
    
    for month in test_months:
        test_mask = (df[month_col] == month)
        for col, values in segment_def.items():
            if col in df.columns:
                if isinstance(values, list):
                    test_mask = test_mask & (df[col].isin(values))
                else:
                    test_mask = test_mask & (df[col] == values)
        
        actual_segment = df[test_mask]
        
        if len(actual_segment) == 0:
            continue
        
        actual_dist = actual_segment[f'{feature}_binned'].value_counts(normalize=True)
        psi_value = calculate_psi(train_baseline, actual_dist)
        
        expected_avg_pct = train_baseline.mean() * 100
        actual_avg_pct = actual_dist.mean() * 100
        
        base_count = train_segment[account_id_col].nunique()
        actual_count = actual_segment[account_id_col].nunique()
        
        results.append({
            'Feature': feature,
            'Feature_Type': binning_info[feature]['type'],
            'Month': month,
            'Base_Month': 'Train',
            'Base_Count': base_count,
            'Actual_Count': actual_count,
            'Expected_Percentage': expected_avg_pct,
            'Actual_Percentage': actual_avg_pct,
            'PSI': psi_value
        })
    
    return results


def create_comprehensive_psi_report(df: pd.DataFrame,
                                    excluded_features: List[str] = None,
                                    account_id_col: str = 'digitalLoanAccountId',
                                    data_selection_col: str = 'Data_selection',
                                    month_col: str = 'Application_month') -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Create comprehensive PSI report split into dimension and fact tables for Power BI.
    
    Returns:
    --------
    Tuple of 4 DataFrames:
    - dim_segment: Segment dimension table
    - dim_feature: Feature dimension table
    - dim_month: Month dimension table
    - fact_psi: PSI fact table with foreign keys
    """
    
    if excluded_features is None:
        excluded_features = ['digitalLoanAccountId', 'customerId', 'crifapplicationid', 'rundate']
    
    print("Expanding calcFeatures...")
    df = expand_calc_features(df)
    
    model_trenches = get_valid_model_trench_combinations(df)
    segment_combos = generate_segment_combinations(model_trenches)
    
    print(f"Found {len(segment_combos)} model-trench combinations\n")
    
    exclude_cols = {
        'customerId', 'digitalLoanAccountId', 'Alpha_cic_sil_score',
        'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'new_loan_type', 'gender', 'loan_product_type', 'osType',
        'Model_Name', 'product', 'trenchCategory', 'calcFeatures',
        'Data_selection', 'appln_submit_datetime', 'disbursementDateTime',
        'Application_month'
    }
    exclude_cols.update(excluded_features)
    
    # Storage for dimension data
    segment_data = []
    feature_data = []
    month_data = []
    fact_data = []
    
    feature_id_map = {}  # Map feature names to IDs
    month_id_map = {}    # Map months to IDs
    segment_id_counter = 0
    feature_id_counter = 1
    month_id_counter = 1
    
    # Process each model-trench combination
    for model_id, model_display, trenches in segment_combos:
        print(f"\nProcessing Model: {model_display} (ID: {model_id}), Trenches: {trenches}")
        
        segment_data_subset = df[
            (df['modelVersionId'] == model_id) & 
            (df['trenchCategory'].isin(trenches))
        ].copy()
        
        if len(segment_data_subset) == 0:
            continue
        
        model_specific_features = get_model_version_specific_features(df, model_id, trenches)
        feature_list = [f for f in model_specific_features if f not in exclude_cols]
        
        print(f"Found {len(feature_list)} features for this combination")
        
        train_segment_data = segment_data_subset[segment_data_subset[data_selection_col] == 'Train']
        feature_types = identify_feature_types(segment_data_subset, feature_list)
        
        print("Creating binning strategy...")
        binning_info = create_bins_for_features(
            segment_data_subset,
            feature_types['numerical'],
            feature_types['categorical'],
            train_segment_data
        )
        
        trench_str = '_'.join(trenches)
        
        # Level 1: Model Score
        print(f"Level 1: Model Score PSI")
        segment_def = {
            'modelVersionId': model_id,
            'trenchCategory': trenches
        }
        
        segment_key = f"Model_{model_id}_Trench_{trench_str}"
        
        segment_data.append({
            'Segment_ID': segment_id_counter,
            'Segment_Key': segment_key,
            'Segment_Name': f"{model_display}_Trench_{trench_str}",
            'Model_Version_ID': model_id,
            'Model_Display_Name': model_display,
            'Segment_Type': 'Model Score',
            'Trench_Category': ','.join(trenches),
            'Loan_Product_Type': None,
            'New_Loan_Type': None,
            'OS_Type': None,
            'Segment_Level': 1
        })
        
        feature = 'Alpha_cic_sil_score'
        if feature in segment_data_subset.columns:
            if feature not in feature_id_map:
                feature_id_map[feature] = feature_id_counter
                feature_data.append({
                    'Feature_ID': feature_id_counter,
                    'Feature_Name': feature,
                    'Feature_Type': 'Numerical',
                    'Display_Name': 'Model Score'
                })
                feature_id_counter += 1
            
            results = calculate_psi_for_segments(segment_data_subset, feature, binning_info, segment_def)
            
            for res in results:
                month = res['Month']
                if month not in month_id_map:
                    month_id_map[month] = month_id_counter
                    month_data.append({
                        'Month_ID': month_id_counter,
                        'Month': month,
                        'Month_Sort_Order': pd.to_datetime(month).strftime('%Y%m%d')
                    })
                    month_id_counter += 1
                
                fact_data.append({
                    'Segment_ID': segment_id_counter,
                    'Feature_ID': feature_id_map[feature],
                    'Month_ID': month_id_map[month],
                    'Base_Count': res['Base_Count'],
                    'Actual_Count': res['Actual_Count'],
                    'Expected_Percentage': res['Expected_Percentage'],
                    'Actual_Percentage': res['Actual_Percentage'],
                    'PSI': res['PSI']
                })
        
        segment_id_counter += 1
        
        # Level 2: By Loan Product Type
        print(f"Level 2: By Loan Product Type")
        loan_products = segment_data_subset[segment_data_subset[data_selection_col] == 'Train']['loan_product_type'].dropna().unique()
        
        for loan_product in loan_products:
            segment_def = {
                'modelVersionId': model_id,
                'trenchCategory': trenches,
                'loan_product_type': loan_product
            }
            
            segment_key = f"Model_{model_id}_Trench_{trench_str}_LoanProduct_{loan_product}"
            
            segment_data.append({
                'Segment_ID': segment_id_counter,
                'Segment_Key': segment_key,
                'Segment_Name': f"{model_display}_Trench_{trench_str}_LoanProduct_{loan_product}",
                'Model_Version_ID': model_id,
                'Model_Display_Name': model_display,
                'Segment_Type': 'By Loan Product Type',
                'Trench_Category': ','.join(trenches),
                'Loan_Product_Type': loan_product,
                'New_Loan_Type': None,
                'OS_Type': None,
                'Segment_Level': 2
            })
            
            for feature in feature_list:
                if feature not in binning_info:
                    continue
                
                if feature not in feature_id_map:
                    feature_id_map[feature] = feature_id_counter
                    feature_data.append({
                        'Feature_ID': feature_id_counter,
                        'Feature_Name': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Display_Name': feature.replace('calc_', '')
                    })
                    feature_id_counter += 1
                
                results = calculate_psi_for_segments(segment_data_subset, feature, binning_info, segment_def)
                
                for res in results:
                    month = res['Month']
                    if month not in month_id_map:
                        month_id_map[month] = month_id_counter
                        month_data.append({
                            'Month_ID': month_id_counter,
                            'Month': month,
                            'Month_Sort_Order': pd.to_datetime(month).strftime('%Y%m%d')
                        })
                        month_id_counter += 1
                    
                    fact_data.append({
                        'Segment_ID': segment_id_counter,
                        'Feature_ID': feature_id_map[feature],
                        'Month_ID': month_id_map[month],
                        'Base_Count': res['Base_Count'],
                        'Actual_Count': res['Actual_Count'],
                        'Expected_Percentage': res['Expected_Percentage'],
                        'Actual_Percentage': res['Actual_Percentage'],
                        'PSI': res['PSI']
                    })
            
            segment_id_counter += 1
        
        # Level 3: By Loan Type
        print(f"Level 3: By Loan Type")
        loan_types = segment_data_subset[segment_data_subset[data_selection_col] == 'Train']['new_loan_type'].dropna().unique()
        
        for loan_type in loan_types:
            segment_def = {
                'modelVersionId': model_id,
                'trenchCategory': trenches,
                'new_loan_type': loan_type
            }
            
            segment_key = f"Model_{model_id}_Trench_{trench_str}_LoanType_{loan_type}"
            
            segment_data.append({
                'Segment_ID': segment_id_counter,
                'Segment_Key': segment_key,
                'Segment_Name': f"{model_display}_Trench_{trench_str}_LoanType_{loan_type}",
                'Model_Version_ID': model_id,
                'Model_Display_Name': model_display,
                'Segment_Type': 'By Loan Type',
                'Trench_Category': ','.join(trenches),
                'Loan_Product_Type': None,
                'New_Loan_Type': loan_type,
                'OS_Type': None,
                'Segment_Level': 3
            })
            
            for feature in feature_list:
                if feature not in binning_info:
                    continue
                
                if feature not in feature_id_map:
                    feature_id_map[feature] = feature_id_counter
                    feature_data.append({
                        'Feature_ID': feature_id_counter,
                        'Feature_Name': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Display_Name': feature.replace('calc_', '')
                    })
                    feature_id_counter += 1
                
                results = calculate_psi_for_segments(segment_data_subset, feature, binning_info, segment_def)
                
                for res in results:
                    month = res['Month']
                    if month not in month_id_map:
                        month_id_map[month] = month_id_counter
                        month_data.append({
                            'Month_ID': month_id_counter,
                            'Month': month,
                            'Month_Sort_Order': pd.to_datetime(month).strftime('%Y%m%d')
                        })
                        month_id_counter += 1
                    
                    fact_data.append({
                        'Segment_ID': segment_id_counter,
                        'Feature_ID': feature_id_map[feature],
                        'Month_ID': month_id_map[month],
                        'Base_Count': res['Base_Count'],
                        'Actual_Count': res['Actual_Count'],
                        'Expected_Percentage': res['Expected_Percentage'],
                        'Actual_Percentage': res['Actual_Percentage'],
                        'PSI': res['PSI']
                    })
            
            segment_id_counter += 1
        
        # Level 4: By OS Type
        print(f"Level 4: By OS Type")
        os_types = segment_data_subset[segment_data_subset[data_selection_col] == 'Train']['osType'].dropna().unique()
        
        for os_type in os_types:
            segment_def = {
                'modelVersionId': model_id,
                'trenchCategory': trenches,
                'osType': os_type
            }
            
            segment_key = f"Model_{model_id}_Trench_{trench_str}_OSType_{os_type}"
            
            segment_data.append({
                'Segment_ID': segment_id_counter,
                'Segment_Key': segment_key,
                'Segment_Name': f"{model_display}_Trench_{trench_str}_OSType_{os_type}",
                'Model_Version_ID': model_id,
                'Model_Display_Name': model_display,
                'Segment_Type': 'By OS Type',
                'Trench_Category': ','.join(trenches),
                'Loan_Product_Type': None,
                'New_Loan_Type': None,
                'OS_Type': os_type,
                'Segment_Level': 4
            })
            
            for feature in feature_list:
                if feature not in binning_info:
                    continue
                
                if feature not in feature_id_map:
                    feature_id_map[feature] = feature_id_counter
                    feature_data.append({
                        'Feature_ID': feature_id_counter,
                        'Feature_Name': feature,
                        'Feature_Type': binning_info[feature]['type'],
                        'Display_Name': feature.replace('calc_', '')
                    })
                    feature_id_counter += 1
                
                results = calculate_psi_for_segments(segment_data_subset, feature, binning_info, segment_def)
                
                for res in results:
                    month = res['Month']
                    if month not in month_id_map:
                        month_id_map[month] = month_id_counter
                        month_data.append({
                            'Month_ID': month_id_counter,
                            'Month': month,
                            'Month_Sort_Order': pd.to_datetime(month).strftime('%Y%m%d')
                        })
                        month_id_counter += 1
                    
                    fact_data.append({
                        'Segment_ID': segment_id_counter,
                        'Feature_ID': feature_id_map[feature],
                        'Month_ID': month_id_map[month],
                        'Base_Count': res['Base_Count'],
                        'Actual_Count': res['Actual_Count'],
                        'Expected_Percentage': res['Expected_Percentage'],
                        'Actual_Percentage': res['Actual_Percentage'],
                        'PSI': res['PSI']
                    })
            
            segment_id_counter += 1
    
    # Convert to DataFrames
    dim_segment = pd.DataFrame(segment_data)
    dim_feature = pd.DataFrame(feature_data)
    dim_month = pd.DataFrame(month_data).sort_values('Month_Sort_Order').reset_index(drop=True)
    fact_psi = pd.DataFrame(fact_data)
    
    print(f"\n{'='*80}")
    print(f"DIMENSION TABLES CREATED:")
    print(f"  - dim_segment: {len(dim_segment)} records")
    print(f"  - dim_feature: {len(dim_feature)} records")
    print(f"  - dim_month: {len(dim_month)} records")
    print(f"FACT TABLE:")
    print(f"  - fact_psi: {len(fact_psi)} records")
    print(f"{'='*80}\n")
    
    return dim_segment, dim_feature, dim_month, fact_psi


# Example Usage:
# df = pd.read_csv('your_data.csv')
# dim_segment, dim_feature, dim_month, fact_psi = create_comprehensive_psi_report(df)
#
# # Save to CSV
# dim_segment.to_csv('dim_segment.csv', index=False)
# dim_feature.to_csv('dim_feature.csv', index=False)
# dim_month.to_csv('dim_month.csv', index=False)
# fact_psi.to_csv('fact_psi.csv', index=False)
#
# # Or save to Excel
# with pd.ExcelWriter('psi_model.xlsx') as writer:
#     dim_segment.to_excel(writer, sheet_name='dim_segment', index=False)
#     dim_feature.to_excel(writer, sheet_name='dim_feature', index=False)
#     dim_month.to_excel(writer, sheet_name='dim_month', index=False)
#     fact_psi.to_excel(writer, sheet_name='fact_psi', index=False)

# Function Ends

# SIL

## Queries

#### Alpha - CIC-SIL-Model

##### Test

In [60]:
## This is for the test period of Alpha - CIC sil model - reading the data from ml_model_run_details

sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
        case when modelDisplayName = 'Alpha - CIC-SIL-Model' then 'cic_model_sil' else modelDisplayName end as modelDisplayName    
    ,modelVersionId,
    case when trenchCategory is null then 'ALL' 
         when trenchCategory='' then 'ALL'    
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
  ),
base as
(SELECT distinct
  r.customerId,r.digitalLoanAccountId,prediction Alpha_cic_sil_score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'cic_model_sil' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) desc) = 1
)
select * from base
;
"""
dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

## this data is not expanded. We will have to expand and get the features from the calcFeatures column

The shape of the dataframe is: (100984, 19)


Unnamed: 0,customerId,digitalLoanAccountId,Alpha_cic_sil_score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,2147044,aef56954-6c97-4520-aa14-2e163676f889,0.2013274896821575,2025-08-23 05:04:52.388799,2025-08-23 05:04:52.395256,cic_model_sil,v1,SIL Competitor,F,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1755907200000,""cic_Personal_Loans_...",Test,2025-08-23 13:04:42,NaT,2025-08
1,2551996,81cf347d-8d6c-41ee-a67c-043c0a91e9de,0.1214890420101941,2025-11-04 10:54:49.330879,2025-11-04 10:54:49.337092,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1762214400000,""cic_Personal_Loans_...",Test,2025-11-04 18:53:56,NaT,2025-11
2,2700374,73de3961-c48b-40a0-b961-84c9adb10e4a,0.1958455324463609,2025-06-15 07:14:26.730021,2025-06-15 07:14:26.736034,cic_model_sil,v1,SIL-Instore,F,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1749945600000,""cic_Personal_Loans_...",Test,2025-06-15 15:14:18,NaT,2025-06
3,2741007,28222315-a058-4b2e-858e-689a3837a08d,0.1235028393775057,2025-06-12 04:54:47.541738,2025-06-12 04:54:47.547863,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1749686400000,""cic_Personal_Loans_...",Test,2025-06-12 12:54:37,NaT,2025-06
4,2907116,b22a1f11-eac7-4a15-96dd-db53f75e452b,0.1621598483948442,2025-11-12 03:32:13.445054,2025-11-12 03:32:13.451666,cic_model_sil,v1,SIL Competitor,F,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1762905600000,""cic_Personal_Loans_...",Test,2025-11-12 11:32:03,NaT,2025-11


In [61]:
d1 = dfd.copy()

### Train 

In [62]:
sq = """WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,
    
    case when modelDisplayName = 'Alpha - CIC-SIL-Model' then 'cic_model_sil' else modelDisplayName end as modelDisplayName
    
    ,modelVersionId,
        case when trenchCategory is null then 'ALL' 
         when trenchCategory = '' then 'ALL'
    else trenchCategory end trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
  ),
base as 
(SELECT distinct
  r.customerId,r.digitalLoanAccountId,prediction Alpha_cic_sil_score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'Alpha - CIC-SIL-Model' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime))) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId, modelVersionId 
order by   coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) desc) = 1
)
select * from base
;
"""

dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

The shape of the dataframe is: (450861, 19)


Unnamed: 0,customerId,digitalLoanAccountId,Alpha_cic_sil_score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,1878354,20559e3f-43ed-4383-8d61-77821b466934,0.080499,2025-12-01T08:59:43.458980,2025-12-01T08:59:43.458980,cic_model_sil,v1,SIL-Instore,M,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_Personal_Loans_granted_contracts_amt_24M...",Train,2023-01-29 18:32:36,2023-01-29 18:40:02,2023-01
1,2033393,897bccad-ac68-4c02-aff6-53628030bfa6,0.12674,2025-12-01T09:00:34.513600,2025-12-01T09:00:34.513600,cic_model_sil,v1,SIL-Instore,F,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 1967.0, ""cic_z...",Train,2023-05-07 16:51:20,NaT,2023-05
2,2134557,a5d436ea-9086-4c5a-945c-0cd17589ef66,0.128572,2025-12-01T09:00:49.389048,2025-12-01T09:00:49.389048,cic_model_sil,v1,SIL-Instore,F,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_Personal_Loans_granted_contracts_amt_24M...",Train,2024-09-20 17:33:41,NaT,2024-09
3,2134961,215caf75-9b47-49fb-bed1-30c6e153bdf5,0.189113,2025-12-01T09:00:22.123586,2025-12-01T09:00:22.123586,cic_model_sil,v1,SIL-Instore,M,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 0.0, ""cic_vel_...",Train,2025-02-09 12:17:30,NaT,2025-02
4,2203660,f7e4ec78-2085-4bf7-86a7-6a5d0516957e,0.123928,2025-12-01T09:00:25.037224,2025-12-01T09:00:25.037224,cic_model_sil,v1,SIL-Instore,M,Appliance,android,Alpha - CIC-SIL-Model,SIL,ALL,"{""cic_days_since_last_inquiry"": 2124.0, ""cic_z...",Train,2023-08-23 11:42:41,NaT,2023-08


In [63]:
d2 = dfd.copy()

In [64]:
df_concat = pd.concat([d1, d2], ignore_index=True)
df_concat.head()

Unnamed: 0,customerId,digitalLoanAccountId,Alpha_cic_sil_score,start_time,end_time,modelDisplayName,modelVersionId,new_loan_type,gender,loan_product_type,osType,Model_Name,product,trenchCategory,calcFeatures,Data_selection,appln_submit_datetime,disbursementDateTime,Application_month
0,2147044,aef56954-6c97-4520-aa14-2e163676f889,0.2013274896821575,2025-08-23 05:04:52.388799,2025-08-23 05:04:52.395256,cic_model_sil,v1,SIL Competitor,F,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1755907200000,""cic_Personal_Loans_...",Test,2025-08-23 13:04:42,NaT,2025-08
1,2551996,81cf347d-8d6c-41ee-a67c-043c0a91e9de,0.1214890420101941,2025-11-04 10:54:49.330879,2025-11-04 10:54:49.337092,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1762214400000,""cic_Personal_Loans_...",Test,2025-11-04 18:53:56,NaT,2025-11
2,2700374,73de3961-c48b-40a0-b961-84c9adb10e4a,0.1958455324463609,2025-06-15 07:14:26.730021,2025-06-15 07:14:26.736034,cic_model_sil,v1,SIL-Instore,F,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1749945600000,""cic_Personal_Loans_...",Test,2025-06-15 15:14:18,NaT,2025-06
3,2741007,28222315-a058-4b2e-858e-689a3837a08d,0.1235028393775057,2025-06-12 04:54:47.541738,2025-06-12 04:54:47.547863,cic_model_sil,v1,SIL-Instore,M,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1749686400000,""cic_Personal_Loans_...",Test,2025-06-12 12:54:37,NaT,2025-06
4,2907116,b22a1f11-eac7-4a15-96dd-db53f75e452b,0.1621598483948442,2025-11-12 03:32:13.445054,2025-11-12 03:32:13.451666,cic_model_sil,v1,SIL Competitor,F,Appliance,android,cic_model_sil,SIL,ALL,"{""run_date"":1762905600000,""cic_Personal_Loans_...",Test,2025-11-12 11:32:03,NaT,2025-11


In [65]:
print(f"The shape of the concatenated dataframe is: {df_concat.shape}")
df = dropping_duplicates(df_concat)
print(f"The shape of the dataframe after dropping duplicates is: {df.shape}")

The shape of the concatenated dataframe is: (551845, 19)
The shape of the dataframe after dropping duplicates is: (551845, 19)


In [66]:
df.columns.values

array(['customerId', 'digitalLoanAccountId', 'Alpha_cic_sil_score',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'new_loan_type', 'gender', 'loan_product_type', 'osType',
       'Model_Name', 'product', 'trenchCategory', 'calcFeatures',
       'Data_selection', 'appln_submit_datetime', 'disbursementDateTime',
       'Application_month'], dtype=object)

In [67]:
df[['modelVersionId', 'trenchCategory', 'modelDisplayName']].value_counts()

modelVersionId  trenchCategory  modelDisplayName
v1              ALL             cic_model_sil       303511
v2              Trench 1        cic_model_sil       226411
                Trench 3        cic_model_sil        11585
                Trench 2        cic_model_sil        10338
Name: count, dtype: int64

In [79]:
dim_segment, dim_feature, dim_month, fact_psi = create_comprehensive_psi_report(df)

Expanding calcFeatures...
Found 8 model-trench combinations


Processing Model: cic_model_sil (ID: v2), Trenches: ['Trench 1']
Found 19 features for this combination
Creating binning strategy...
  Feature 'calc_total_overdue_granted_contracts': Created 5 bins
  Feature 'calc_cnt_nongranted_contracts_3M': Created 5 bins
  Feature 'calc_max_amt_granted_24M': Created 10 bins
  Feature 'calc_days_since_last_closed': Created 10 bins
  Feature 'calc_vel_contract_nongranted_cnt_6on12': Created 5 bins
  Feature 'calc_vel_contract_closed_amt_3on12': Created 3 bins
  Feature 'calc_granted_contracts_cnt_6M': Created 5 bins
  Feature 'calc_cnt_active_contracts': Created 5 bins
  Feature 'calc_tot_active_contracts_util': Created 10 bins
  Feature 'calc_vel_contract_granted_amt_6on12': Created 5 bins
  Feature 'calc_Personal_Loans_granted_contracts_amt_24M': Created 10 bins
  Feature 'calc_ScoreRange': Using 9 top categories (total: 19)
  Feature 'calc_ln_loan_level_user_type': Using 1 top categorie

KeyError: 'Alpha_cic_sil_score'

In [None]:
# psi_report = create_comprehensive_psi_report(
#     df,  # Your concatenated train+test dataframe
#     excluded_features=['crifapplicationid', 'run_date', 'customerId', 'digitalLoanAccountId']
# )
# psi_report.to_csv('comprehensive_psi_report.csv', index=False)

Expanding calcFeatures...
Found 8 model-trench combinations


Processing Model: cic_model_sil (ID: v2), Trenches: ['Trench 1']
Found 19 features for this combination

Creating binning strategy...
  Feature 'calc_total_overdue_granted_contracts': Created 5 bins
  Feature 'calc_cnt_nongranted_contracts_3M': Created 5 bins
  Feature 'calc_max_amt_granted_24M': Created 10 bins
  Feature 'calc_days_since_last_closed': Created 10 bins
  Feature 'calc_vel_contract_nongranted_cnt_6on12': Created 5 bins
  Feature 'calc_vel_contract_closed_amt_3on12': Created 3 bins
  Feature 'calc_granted_contracts_cnt_6M': Created 5 bins
  Feature 'calc_cnt_active_contracts': Created 5 bins
  Feature 'calc_tot_active_contracts_util': Created 10 bins
  Feature 'calc_vel_contract_granted_amt_6on12': Created 5 bins
  Feature 'calc_Personal_Loans_granted_contracts_amt_24M': Created 10 bins
  Feature 'calc_ScoreRange': Using 9 top categories (total: 19)
  Feature 'calc_ln_loan_level_user_type': Using 1 top categori

In [69]:
psi_report.head(10)

Unnamed: 0,Segment_ID,Segment_Name,Model_Display_Name,Model_Version_ID,Segment_Column_1,Segment_Value_1,Segment_Column_2,Segment_Value_2,Segment_Column_3,Segment_Value_3,Feature,Feature_Type,Month,Base_Count,Actual_Count,Expected_Percentage,Actual_Percentage,PSI
0,0,cic_model_sil_Trench_Trench 1,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,,,Alpha_cic_sil_score,categorical,2025-11,224718,836,10.0,25.0,0.596118
1,0,cic_model_sil_Trench_Trench 1,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,,,Alpha_cic_sil_score,categorical,2025-12,224718,858,10.0,16.666667,0.547015
2,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_Personal_Loans_granted_contracts_amt_24M,numerical,2025-11,180245,658,9.090909,9.090909,0.218863
3,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_Personal_Loans_granted_contracts_amt_24M,numerical,2025-12,180245,635,9.090909,9.090909,0.214828
4,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_ScoreRange,categorical,2025-11,180245,658,10.0,10.0,0.185771
5,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_ScoreRange,categorical,2025-12,180245,635,10.0,10.0,0.223081
6,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_cnt_active_contracts,numerical,2025-11,180245,658,16.666667,33.333333,0.06004
7,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_cnt_active_contracts,numerical,2025-12,180245,635,16.666667,33.333333,0.084973
8,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_cnt_nongranted_contracts_3M,numerical,2025-11,180245,658,20.0,50.0,0.033295
9,1,cic_model_sil_Trench_Trench 1_LoanProduct_Appl...,cic_model_sil,v2,modelVersionId,v2,trenchCategory,Trench 1,loan_product_type,Appliance,calc_cnt_nongranted_contracts_3M,numerical,2025-12,180245,635,20.0,50.0,0.023122


In [77]:
psi_report['Feature'][psi_report['Segment_Value_3'] == 'Appliance'].unique()

array(['calc_Personal_Loans_granted_contracts_amt_24M', 'calc_ScoreRange',
       'calc_cnt_active_contracts', 'calc_cnt_nongranted_contracts_3M',
       'calc_crifApplicationId', 'calc_customerId',
       'calc_days_since_last_closed', 'calc_digitalLoanAccountId',
       'calc_flg_zero_granted_ever', 'calc_flg_zero_non_granted_ever',
       'calc_granted_contracts_cnt_6M', 'calc_has_ever_been_overdue',
       'calc_ln_loan_level_user_type', 'calc_max_amt_granted_24M',
       'calc_tot_active_contracts_util',
       'calc_total_overdue_granted_contracts',
       'calc_vel_contract_closed_amt_3on12',
       'calc_vel_contract_granted_amt_6on12',
       'calc_vel_contract_nongranted_cnt_6on12',
       'calc_cic_Personal_Loans_granted_contracts_amt_24M',
       'calc_cic_cnt_active_contracts',
       'calc_cic_days_since_last_inquiry', 'calc_cic_max_amt_granted_24M',
       'calc_cic_tot_active_contracts_util',
       'calc_cic_vel_contract_granted_amt_12on24',
       'calc_cic_vel_contra

In [None]:
# # ============================================================================
# # USAGE EXAMPLE
# # ============================================================================

# # Assuming you have df with concatenated Train and Test data

# # Calculate Overall PSI (Overall + By Segments)
# psi_results = calculate_psi_by_model_version(
#     df=df.copy(),
#     score_column='Alpha_cic_sil_score',
#     segment_columns=['new_loan_type', 'loan_product_type', 'osType', 'trenchCategory'],
#     month_col='Application_month',
#     data_selection_col='Data_selection',
#     model_version_col='modelVersionId',
#     # trench_category_col = 'trenchCategory',
#     model_display_name_col = 'modelDisplayName',
#     account_id_col='digitalLoanAccountId'
# )

# print(psi_results.head(20))
# print(f"\nTotal rows: {len(psi_results)}")

# # View Overall results only
# # overall_results = psi_results[psi_results['Segment_Column'] == 'Overall']
# # print(overall_results.head())

# # # View specific model version
# # v1_results = psi_results[psi_results['Model_Version'] == 'v1']
# # print(v1_results)

# # # View specific segment
# # loan_type_results = psi_results[psi_results['Segment_Column'] == 'new_loan_type']
# # print(loan_type_results)

# # Save results
# psi_results.to_csv(r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\PSI Monitoring\NEW_MONITORING_DASHBOARD_20251125\Future\psi_results_overall_and_segments.csv', index=False)

# # # ---- For Bin-Level Details ----
# # bin_psi_results = calculate_bin_level_psi_by_model_version(
# #     df=df.copy(),
# #     score_column='Alpha_cic_sil_score',
# #     segment_columns=['new_loan_type', 'loan_product_type', 'osType']
# # )

# # bin_psi_results.to_csv('bin_level_psi_results_overall_and_segments.csv', index=False)
# # print(bin_psi_results.head(30))


In [None]:
psi_results[['Model_Version', 'Trench_Category', 'Model_Display_Name', 'Feature']].value_counts()

In [None]:
# psi_results['modelDisplayName'] = psi_results['Model_Version'].apply(lambda x: 'cic_model_sil' if x in ['v1', 'v2'] else 'Alpha - CIC-SIL-Model')

In [None]:
# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_psi_v5"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()
df1.sample(5)


##### Train

In [None]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
  and modelVersionId='v1'
  )
SELECT distinct
  r.customerId,r.digitalLoanAccountId,prediction Alpha_cic_sil_score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'Alpha - CIC-SIL-Model' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime))) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId order by cast(r.start_time as datetime) desc) = 1
;
"""
dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()
df2.head()


In [None]:
df1.columns

In [None]:
df2.columns

##### concatenate

In [None]:
# df_concat = pd.concat([df1, df2], ignore_index=True)
# df_concat.head()

# 1) Get all IDs present in Train
train_ids = set(df2['digitalLoanAccountId'])

# 2) Keep only Test rows whose ID is NOT in Train
df1_no_dupes = df1[~df1['digitalLoanAccountId'].isin(train_ids)]

# 3) Concatenate
df_concat = pd.concat([df1_no_dupes, df2], ignore_index=True)

print(f"The shape of the concatenated dataframe is:\t {df_concat.shape}")
df_concat.head()

In [None]:
df_concat.groupby(['Data_selection','Application_month'])['digitalLoanAccountId'].nunique().reset_index().sort_values(by=['Application_month','Data_selection'])

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['Alpha_cic_sil_score'] = pd.to_numeric(df['Alpha_cic_sil_score'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Alpha_cic_sil_score',  'calc_cic_Personal_Loans_granted_contracts_amt_24M',
       'calc_cic_days_since_last_inquiry', 'calc_cic_cnt_active_contracts',
       'calc_cic_vel_contract_nongranted_cnt_12on24',
       'calc_cic_max_amt_granted_24M', 'calc_cic_zero_non_granted_ever_flag',
       'calc_cic_tot_active_contracts_util',
       'calc_cic_vel_contract_granted_amt_12on24',
       'calc_cic_zero_granted_ever_flag']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

#### Alpha - IncomeEstimationModel

##### Test

In [None]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName = 'Alpha  - IncomeEstimationModel'
    and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction Alpha_Income_Estimated_score,start_time,end_time,modelDisplayName,modelVersionId,
  loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
    case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
  'Alpha  - IncomeEstimationModel' Model_Name,
  'SIL' as product,
  'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  JSON_VALUE(calcFeature, "$.inc_alpha_cic_credit_avg_credit_limit") AS inc_alpha_cic_credit_avg_credit_limit,
  JSON_VALUE(calcFeature, "$.inc_alpha_cic_max_active_contracts_amt") AS inc_alpha_cic_max_active_contracts_amt,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_age") AS inc_alpha_ln_age,
  JSON_VALUE(calcFeature, "$.inc_alpha_doc_type_rolled") AS inc_alpha_doc_type_rolled,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_brand") AS inc_alpha_ln_brand,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_city") AS inc_alpha_ln_city,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_cnt_dependents") AS inc_alpha_ln_cnt_dependents,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_education_level") AS inc_alpha_ln_education_level,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_employment_type_new") AS inc_alpha_ln_employment_type_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_gender") AS inc_alpha_ln_gender,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_industry_new") AS inc_alpha_ln_industry_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_loan_prod_type") AS inc_alpha_ln_loan_prod_type,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_marital_status_new") AS inc_alpha_ln_marital_status_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_nature_of_work_new") AS inc_alpha_ln_nature_of_work_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_osversion_bin") AS inc_alpha_ln_osversion_bin,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_purpose") AS inc_alpha_ln_purpose,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_source_of_funds_new") AS inc_alpha_ln_source_of_funds_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_encoded_company_name_grouped") AS inc_alpha_encoded_company_name_grouped,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId order by r.start_time desc) = 1
  ;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
df1 = dfd.copy()

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {df1.shape[1]}")
df1.head()

##### Train

In [None]:
sq = """ 
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName = 'Alpha  - IncomeEstimationModel'
  and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction Alpha_Income_Estimated_score,start_time,end_time,modelDisplayName,modelVersionId,
  loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
    case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
  'Alpha  - IncomeEstimationModel' Model_Name,
  'SIL' as product,
  'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  JSON_VALUE(calcFeature, "$.inc_alpha_cic_credit_avg_credit_limit") AS inc_alpha_cic_credit_avg_credit_limit,
  JSON_VALUE(calcFeature, "$.inc_alpha_cic_max_active_contracts_amt") AS inc_alpha_cic_max_active_contracts_amt,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_age") AS inc_alpha_ln_age,
  JSON_VALUE(calcFeature, "$.inc_alpha_doc_type_rolled") AS inc_alpha_doc_type_rolled,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_brand") AS inc_alpha_ln_brand,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_city") AS inc_alpha_ln_city,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_cnt_dependents") AS inc_alpha_ln_cnt_dependents,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_education_level") AS inc_alpha_ln_education_level,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_employment_type_new") AS inc_alpha_ln_employment_type_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_gender") AS inc_alpha_ln_gender,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_industry_new") AS inc_alpha_ln_industry_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_loan_prod_type") AS inc_alpha_ln_loan_prod_type,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_marital_status_new") AS inc_alpha_ln_marital_status_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_nature_of_work_new") AS inc_alpha_ln_nature_of_work_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_osversion_bin") AS inc_alpha_ln_osversion_bin,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_purpose") AS inc_alpha_ln_purpose,
  JSON_VALUE(calcFeature, "$.inc_alpha_ln_source_of_funds_new") AS inc_alpha_ln_source_of_funds_new,
  JSON_VALUE(calcFeature, "$.inc_alpha_encoded_company_name_grouped") AS inc_alpha_encoded_company_name_grouped,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime))) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId order by cast(r.start_time as datetime) desc) = 1
  ;
"""
dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()


In [None]:
df2 = dfd.copy()

In [None]:
df1.columns

In [None]:
df2.columns

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.head()

In [None]:
df_concat.groupby(['Data_selection','Application_month'])['digitalLoanAccountId'].nunique().reset_index().sort_values(by=['Application_month','Data_selection'])

##### PSI calculation

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df.info()

In [None]:
df['Alpha_Income_Estimated_score'] = pd.to_numeric(df['Alpha_Income_Estimated_score'], errors='coerce')

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Alpha_Income_Estimated_score', 'inc_alpha_cic_credit_avg_credit_limit',
       'inc_alpha_cic_max_active_contracts_amt', 'inc_alpha_ln_age',
       'inc_alpha_doc_type_rolled', 'inc_alpha_ln_brand', 'inc_alpha_ln_city',
       'inc_alpha_ln_cnt_dependents', 'inc_alpha_ln_education_level',
       'inc_alpha_ln_employment_type_new', 'inc_alpha_ln_gender',
       'inc_alpha_ln_industry_new', 'inc_alpha_ln_loan_prod_type',
       'inc_alpha_ln_marital_status_new', 'inc_alpha_ln_nature_of_work_new',
       'inc_alpha_ln_osversion_bin', 'inc_alpha_ln_purpose',
       'inc_alpha_ln_source_of_funds_new',
       'inc_alpha_encoded_company_name_grouped',]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_income_estimation_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cic_sil_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

#### Alpha Sil Stack Model

##### Test

In [None]:
sq = """ WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Alpha - StackingModel', 'alpha_stack_model_sil')
    and modelVersionId='v1'
  )
SELECT distinct
 r.customerId,r.digitalLoanAccountId,prediction Sil_Alpha_Stack_score,start_time,end_time,modelDisplayName,modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
    case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'SIL Alpha - StackingModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over(partition by r.digitalLoanAccountId order by r.start_time desc)=1
  ;"""
dfd = client.query(sq).to_dataframe()
dfd = dfd.drop_duplicates(keep='first')
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Alpha - StackingModel', 'alpha_stack_model_sil')
    and modelVersionId='v1'
  )
SELECT distinct
 r.customerId,r.digitalLoanAccountId,prediction Sil_Alpha_Stack_score,start_time,end_time,modelDisplayName,modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
    case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'SIL Alpha - StackingModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  cast(r.start_time as datetime)) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),   cast(r.start_time as datetime))) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over(partition by r.digitalLoanAccountId order by  cast(r.start_time as datetime) desc)=1
  ;"""
dfd = client.query(sq).to_dataframe()
dfd = dfd.drop_duplicates(keep='first')
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()


In [None]:
df1.columns

In [None]:
df2.columns

In [None]:
df2.rename(columns={'calc_beta_demo_score':'calc_sb_demo_score', 'calc_cic_score':'calc_s_cic_score', 'calc_apps_score':'calc_s_apps_score', 'calc_credo_gen_score':'calc_s_credo_score'}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['Sil_Alpha_Stack_score'] = pd.to_numeric(df['Sil_Alpha_Stack_score'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Sil_Alpha_Stack_score',   'calc_sb_demo_score',
       'calc_s_cic_score', 'calc_s_credo_score', 'calc_s_apps_score']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# Calculate bin-level PSI
bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                                   'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
                                    'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                                    'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_sil_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete



#### Beta Sil App Score

##### Test

In [None]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,

    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Beta - AppsScoreModel', 'apps_score_model_sil')
    and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'SIL Beta - AppsScoreModel' Model_Name,
 'SIL' as product,
  'NA' trenchCategory,
  'Test' Data_selection,
  safe_cast(JSON_VALUE(prediction_clean, "$.combined_score") AS float64) as sil_beta_app_score,
 calcFeature calcFeatures,
    coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by start_time desc) = 1
;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,

    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature,
    REPLACE(REPLACE(cast(prediction as string), "'", '"'), "None", "null") AS prediction_clean
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Beta - AppsScoreModel', 'apps_score_model_sil')
    and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'SIL Beta - AppsScoreModel' Model_Name,
 'SIL' as product,
  'NA' trenchCategory,
  'Train' Data_selection,
  coalesce(prediction, safe_cast(JSON_VALUE(prediction_clean, "$.combined_score") AS float64)) as sil_beta_app_score,
 calcFeature calcFeatures,
    IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
    loanmaster.disbursementDateTime,
    format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
 FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
;
"""

dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.columns

In [None]:
df2.columns

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['sil_beta_app_score'] = pd.to_numeric(df['sil_beta_app_score'], errors='coerce')

In [None]:
df = df.copy()

# Define feature list
feature_list = ['sil_beta_app_score',
       'calc_app_cnt_rated_for_3plus_ever', 'calc_app_cnt_education_ever',
       'calc_app_cnt_business_ever', 'calc_app_cnt_music_and_audio_ever',
       'calc_app_cnt_travel_and_local_ever', 'calc_app_cnt_finance_7d',
       'calc_app_cnt_absence_tag_30d', 'calc_app_cnt_competitors_30d',
       'calc_app_cnt_absence_tag_90d',    'calc_app_cnt_finance_90d', 'calc_app_cnt_competitors_90d',
       'calc_app_cnt_payday_90d',      'calc_app_median_time_bw_installed_mins_30d',
       'calc_app_first_competitors_install_to_apply_days',
       'calc_app_first_payday_install_to_apply_days',
       'calc_app_vel_finance_30_over_365']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_appscore_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_appscore_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

#### Beta SIL Demo Score

##### Test

In [None]:
sq = """ WITH cleaned AS (
  SELECT
  customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature_cleaned
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in  ('Beta - DemoScoreModel', 'beta_demo_model_sil')
  and modelVersionId='v1'
  )
SELECT
  r.customerId,  r.digitalLoanAccountId,start_time, prediction sil_beta_demo_score, modelDisplayName,modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
      case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
  case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
 'SIL Beta - DemoScoreModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature_cleaned calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code

qualify row_number() over(partition by r.customerId, r.digitalLoanAccountId order by start_time desc) = 1;"""

dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ WITH cleaned AS (
  SELECT
  customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature_cleaned
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in  ('Beta - DemoScoreModel', 'beta_demo_model_sil')
  and modelVersionId='v1'
  )
SELECT
  r.customerId,  r.digitalLoanAccountId,start_time, prediction sil_beta_demo_score, modelDisplayName,modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
      case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
  case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
 'SIL Beta - DemoScoreModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature_cleaned calcFeatures,
  'Train' Data_selection,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code

qualify row_number() over(partition by r.customerId, r.digitalLoanAccountId order by cast(start_time as datetime) desc) = 1;"""

dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()


In [None]:
df1.columns

In [None]:
df2.columns

##### Concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['sil_beta_demo_score'] = pd.to_numeric(df['sil_beta_demo_score'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['sil_beta_demo_score',
       'calc_beta_de_ln_vas_opted_flag',
       'calc_beta_de_ln_doc_type_rolled', 'calc_beta_de_ln_marital_status',
       'calc_beta_de_ln_age_bin', 'calc_beta_de_ln_province_bin',
       'calc_beta_de_ln_ref2_type', 'calc_beta_de_ln_education_level',
       'calc_beta_de_ln_ref1_type', 'calc_beta_de_ln_industry_new_bin',
       'calc_beta_de_ln_appln_day_of_week',
       'calc_beta_de_onb_name_email_match_score',
       'calc_beta_de_ln_employment_type_new_bin', 'calc_beta_de_ln_telconame',
       'calc_beta_de_time_bw_onb_loan_appln_mins',
       'calc_beta_de_ln_source_of_funds_new_bin', 'calc_beta_de_ln_brand_bin',
       'calc_beta_de_ln_email_primary_domain']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_demo_score_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_demo_score_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

#### Beta SIL STACK Score Model

##### Test

In [None]:
sq = """ WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Beta - StackScoreModel', 'beta_stack_model_sil')
  and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
 'SIL_Beta - StackScoreModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  prediction sil_beta_stack_score,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId, r.digitalLoanAccountId order by start_time desc) = 1
;
"""
dfd= client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM  prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in ('Beta - StackScoreModel', 'beta_stack_model_sil')
  and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,
 loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
 'SIL_Beta - StackScoreModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  prediction sil_beta_stack_score,
  'Train' Data_selection,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId, r.digitalLoanAccountId order by cast(start_time as datetime) desc) = 1
;
"""
dfd= client.query(sq).to_dataframe()
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()


In [None]:
df1.columns

In [None]:
df2.columns

In [None]:
df2.rename(columns={'calc_apps_score':'calc_s_apps_score', 'calc_credo_gen_score':'calc_s_credo_score', 'calc_beta_demo_score':'calc_sb_demo_score'}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['sil_beta_stack_score'] = pd.to_numeric(df['sil_beta_stack_score'], errors='coerce')

In [None]:
df = df.copy()

# Define feature list
feature_list = ['sil_beta_stack_score',
        'calc_s_apps_score', 'calc_s_credo_score', 'calc_sb_demo_score']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_stack_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

#### Beta - IncomeEstimationModel

##### Test

In [None]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,

    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in  ('Beta - IncomeEstimationModel', 'beta_income_model')
  and modelVersionId='v1'
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction sil_beta_income_estimation_score,start_time,end_time,modelDisplayName,modelVersionId,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_loan_type") AS inc_beta_ln_loan_type,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_education_level") AS inc_beta_ln_education_level,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_employment_type_new") AS inc_beta_ln_employment_type_new,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_industry_new") AS inc_beta_ln_industry_new,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_age") AS inc_beta_ln_age,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_brand") AS inc_beta_ln_brand,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_city") AS inc_beta_ln_city,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_purpose") AS inc_beta_ln_purpose,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_osversion_bin") AS inc_beta_ln_osversion_bin,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_postal_code") AS inc_beta_ln_postal_code,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_gender") AS inc_beta_ln_gender,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_doc_type_rolled") AS inc_beta_ln_doc_type_rolled,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_cnt_dependents") AS inc_beta_ln_cnt_dependents,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_source_of_funds_new") AS inc_beta_ln_source_of_funds_new,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_marital_status_new") AS inc_beta_ln_marital_status_new,
  JSON_VALUE(calcFeature, "$.inc_beta_encoded_company_name_grouped") AS inc_beta_encoded_company_name_grouped,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
 'SIL Beta - IncomeEstimationModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId, r.digitalLoanAccountId order by start_time desc) = 1
  ;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
df1 = dfd.copy()

##### Train

In [None]:
sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId,

    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
  WHERE modelDisplayName in  ('Beta - IncomeEstimationModel', 'beta_income_model')
  )
SELECT
  r.customerId,r.digitalLoanAccountId,prediction sil_beta_income_estimation_score,start_time,end_time,modelDisplayName,modelVersionId,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_loan_type") AS inc_beta_ln_loan_type,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_education_level") AS inc_beta_ln_education_level,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_employment_type_new") AS inc_beta_ln_employment_type_new,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_industry_new") AS inc_beta_ln_industry_new,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_age") AS inc_beta_ln_age,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_brand") AS inc_beta_ln_brand,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_city") AS inc_beta_ln_city,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_purpose") AS inc_beta_ln_purpose,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_osversion_bin") AS inc_beta_ln_osversion_bin,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_postal_code") AS inc_beta_ln_postal_code,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_gender") AS inc_beta_ln_gender,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_doc_type_rolled") AS inc_beta_ln_doc_type_rolled,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_cnt_dependents") AS inc_beta_ln_cnt_dependents,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_source_of_funds_new") AS inc_beta_ln_source_of_funds_new,
  JSON_VALUE(calcFeature, "$.inc_beta_ln_marital_status_new") AS inc_beta_ln_marital_status_new,
  JSON_VALUE(calcFeature, "$.inc_beta_encoded_company_name_grouped") AS inc_beta_encoded_company_name_grouped,
   loanmaster.new_loan_type,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
 'SIL Beta - IncomeEstimationModel' Model_Name,
 'SIL' as product,
 'NA' trenchCategory,
  r.calcFeature calcFeatures,
  'Train' Data_selection,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId, r.digitalLoanAccountId order by cast(start_time as datetime) desc) = 1
  ;
"""
dfd = client.query(sq).to_dataframe()
dfd.head()

In [None]:
df2 = dfd.copy()

In [None]:
df1.columns

In [None]:
df2.columns

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['sil_beta_income_estimation_score'] = pd.to_numeric(df['sil_beta_income_estimation_score'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['sil_beta_income_estimation_score',  'inc_beta_ln_loan_type',
       'inc_beta_ln_education_level', 'inc_beta_ln_employment_type_new',
       'inc_beta_ln_industry_new', 'inc_beta_ln_age', 'inc_beta_ln_brand',
       'inc_beta_ln_city', 'inc_beta_ln_purpose', 'inc_beta_ln_osversion_bin',
       'inc_beta_ln_postal_code', 'inc_beta_ln_gender',
       'inc_beta_ln_doc_type_rolled', 'inc_beta_ln_cnt_dependents',
       'inc_beta_ln_source_of_funds_new', 'inc_beta_ln_marital_status_new',
       'inc_beta_encoded_company_name_grouped',]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

# Cash

#### Alpha-Cash-CIC-Model (All Trench)

##### Trench 1

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Alpha-Cash-CIC-Model','Alpha Cash CIC Model','cic_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-CIC-Model_All_Trench' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
""" 

dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Alpha-Cash-CIC-Model','Alpha Cash CIC Model','cic_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-CIC-Model_All_Trench' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.columns

In [None]:
df2.columns

In [None]:
df2.rename(columns = {
    'calc_max_age_all_contracts_snapshot':'calc_cic_max_age_all_contracts_snapshot',
    'calc_ratio_overdue_contracts_to_granted_contracts':'calc_cic_ratio_overdue_contracts_to_granted_contracts',
    'calc_ScoreRange':'calc_cic_ScoreRange',
    'calc_ln_loan_level_user_type':'calc_cic_ln_loan_level_user_type', 
    'calc_has_ever_been_overdue':'calc_cic_has_ever_been_overdue',
    'calc_latest_granted_contract_overdue_flag':'calc_cic_latest_granted_contract_overdue_flag',
    'calc_ratio_closed_over_new_granted_cnt_24M':'calc_cic_ratio_closed_over_new_granted_cnt_24M',
    'calc_ratio_risky_contracts_to_granted_contracts':'calc_cic_ratio_risky_contracts_to_granted_contracts',
    'calc_Short_and_Term_Loans_granted_contracts_cnt_24M':'calc_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
    'calc_flg_zero_non_granted_ever':'calc_cic_flg_zero_non_granted_ever', 
    'calc_CreditAvgCreditLimit':'calc_cic_CreditAvgCreditLimit',
    'calc_flg_zero_granted_ever':'calc_cic_flg_zero_granted_ever', 
    'calc_ca_cic_score':'calc_cic_ca_cic_score',
    'calc_Personal_Loans_granted_contracts_amt_24M':'calc_cic_Personal_Loans_granted_contracts_amt_24M'
    
}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()

In [None]:
df['aCicScore'] = pd.to_numeric(df['aCicScore'], errors='coerce')

In [None]:
df = df.copy()

# Define feature list
feature_list = ['aCicScore',
     'calc_cic_max_age_all_contracts_snapshot',
       'calc_cic_ratio_overdue_contracts_to_granted_contracts',
       'calc_cic_ScoreRange', 'calc_cic_ln_loan_level_user_type',
       'calc_cic_has_ever_been_overdue',
       'calc_cic_latest_granted_contract_overdue_flag',
       'calc_cic_ratio_closed_over_new_granted_cnt_24M',
       'calc_cic_ratio_risky_contracts_to_granted_contracts',
       'calc_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'calc_cic_flg_zero_non_granted_ever',
       'calc_cic_Personal_Loans_granted_contracts_amt_24M',
       'calc_cic_CreditAvgCreditLimit', 'calc_cic_flg_zero_granted_ever'
]
# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_cic_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

##### Trench 2

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Alpha-Cash-CIC-Model','Alpha Cash CIC Model','cic_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-CIC-Model_All_Trench' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
""" 

dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Alpha-Cash-CIC-Model','Alpha Cash CIC Model','cic_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-CIC-Model_All_Trench' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;
"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.columns

In [None]:
df2.columns

In [None]:
df2.rename(columns = {
    'calc_max_age_all_contracts_snapshot':'calc_cic_max_age_all_contracts_snapshot',
    'calc_ratio_overdue_contracts_to_granted_contracts':'calc_cic_ratio_overdue_contracts_to_granted_contracts',
    'calc_ScoreRange':'calc_cic_ScoreRange',
    'calc_ln_loan_level_user_type':'calc_cic_ln_loan_level_user_type', 
    'calc_has_ever_been_overdue':'calc_cic_has_ever_been_overdue',
    'calc_latest_granted_contract_overdue_flag':'calc_cic_latest_granted_contract_overdue_flag',
    'calc_ratio_closed_over_new_granted_cnt_24M':'calc_cic_ratio_closed_over_new_granted_cnt_24M',
    'calc_ratio_risky_contracts_to_granted_contracts':'calc_cic_ratio_risky_contracts_to_granted_contracts',
    'calc_Short_and_Term_Loans_granted_contracts_cnt_24M':'calc_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
    'calc_flg_zero_non_granted_ever':'calc_cic_flg_zero_non_granted_ever', 
    'calc_CreditAvgCreditLimit':'calc_cic_CreditAvgCreditLimit',
    'calc_flg_zero_granted_ever':'calc_cic_flg_zero_granted_ever', 
    'calc_ca_cic_score':'calc_cic_ca_cic_score',
    'calc_Personal_Loans_granted_contracts_amt_24M':'calc_cic_Personal_Loans_granted_contracts_amt_24M'
    
}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['aCicScore'] = pd.to_numeric(df['aCicScore'], errors='coerce')

In [None]:
df = df.copy()

# Define feature list
feature_list = ['aCicScore',
     'calc_cic_max_age_all_contracts_snapshot',
       'calc_cic_ratio_overdue_contracts_to_granted_contracts',
       'calc_cic_ScoreRange', 'calc_cic_ln_loan_level_user_type',
       'calc_cic_has_ever_been_overdue',
       'calc_cic_latest_granted_contract_overdue_flag',
       'calc_cic_ratio_closed_over_new_granted_cnt_24M',
       'calc_cic_ratio_risky_contracts_to_granted_contracts',
       'calc_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'calc_cic_flg_zero_non_granted_ever',
       'calc_cic_Personal_Loans_granted_contracts_amt_24M',
       'calc_cic_CreditAvgCreditLimit', 'calc_cic_flg_zero_granted_ever'
]
# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_cic_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

##### Trench 3

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Alpha-Cash-CIC-Model','Alpha Cash CIC Model','cic_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-CIC-Model_All_Trench' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
""" 

dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Alpha-Cash-CIC-Model','Alpha Cash CIC Model','cic_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-CIC-Model_All_Trench' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;
"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()


In [None]:
df1.columns

In [None]:
df2.columns 

In [None]:
df2.rename(columns = {
    'calc_max_age_all_contracts_snapshot':'calc_cic_max_age_all_contracts_snapshot',
    'calc_ratio_overdue_contracts_to_granted_contracts':'calc_cic_ratio_overdue_contracts_to_granted_contracts',
    'calc_ScoreRange':'calc_cic_ScoreRange',
    'calc_ln_loan_level_user_type':'calc_cic_ln_loan_level_user_type', 
    'calc_has_ever_been_overdue':'calc_cic_has_ever_been_overdue',
    'calc_latest_granted_contract_overdue_flag':'calc_cic_latest_granted_contract_overdue_flag',
    'calc_ratio_closed_over_new_granted_cnt_24M':'calc_cic_ratio_closed_over_new_granted_cnt_24M',
    'calc_ratio_risky_contracts_to_granted_contracts':'calc_cic_ratio_risky_contracts_to_granted_contracts',
    'calc_Short_and_Term_Loans_granted_contracts_cnt_24M':'calc_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
    'calc_flg_zero_non_granted_ever':'calc_cic_flg_zero_non_granted_ever', 
    'calc_CreditAvgCreditLimit':'calc_cic_CreditAvgCreditLimit',
    'calc_flg_zero_granted_ever':'calc_cic_flg_zero_granted_ever', 
    'calc_ca_cic_score':'calc_cic_ca_cic_score',
    'calc_Personal_Loans_granted_contracts_amt_24M':'calc_cic_Personal_Loans_granted_contracts_amt_24M'
    
}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['aCicScore'] = pd.to_numeric(df['aCicScore'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['aCicScore',
     'calc_cic_max_age_all_contracts_snapshot',
       'calc_cic_ratio_overdue_contracts_to_granted_contracts',
       'calc_cic_ScoreRange', 'calc_cic_ln_loan_level_user_type',
       'calc_cic_has_ever_been_overdue',
       'calc_cic_latest_granted_contract_overdue_flag',
       'calc_cic_ratio_closed_over_new_granted_cnt_24M',
       'calc_cic_ratio_risky_contracts_to_granted_contracts',
       'calc_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'calc_cic_flg_zero_non_granted_ever',
       'calc_cic_Personal_Loans_granted_contracts_amt_24M',
       'calc_cic_CreditAvgCreditLimit', 'calc_cic_flg_zero_granted_ever'
]
# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_cic_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

#### Alpha-Cash-Stack-Model

###### Trench 1

##### Test

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Alpha-Cash-Stack-Model', 'alpha_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-Stack-Model' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  prediction aStackScore,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""

dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Alpha-Cash-Stack-Model', 'alpha_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aStackScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-Stack-Model' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'

"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.columns

In [None]:
df1.info()

In [None]:
df2.columns

In [None]:
df2.info()

In [None]:
df2.rename(columns={'calc_demo_score':'calc_c_demo_score',
                     'calc_credo_score':'calc_c_credo_score' ,
                    'calc_cic_score':'calc_ca_cic_score',
                    'calc_apps_score':'calc_apps_score'
                   }, inplace = True)

##### Concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
dd.query("""select Data_selection, count(distinct digitalLoanAccountId) cnt from df_concat where calc_apps_score is not null group by 1""").to_df()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['aStackScore'] = pd.to_numeric(df['aStackScore'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['aStackScore',  'calc_apps_score', 'calc_c_demo_score',   'calc_c_credo_score', 'calc_ca_cic_score']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

##### Trench 2

##### Test

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Alpha-Cash-Stack-Model', 'alpha_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-Stack-Model' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  prediction aStackScore,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;
"""

dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train 

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Alpha-Cash-Stack-Model', 'alpha_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aStackScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-Stack-Model' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'

"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
df2.rename(columns={'calc_demo_score':'calc_c_demo_score',
                     'calc_credo_score':'calc_c_credo_score' ,
                    'calc_cic_score':'calc_ca_cic_score',
                    'calc_apps_score':'calc_apps_score'
                   }, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['aStackScore'] = pd.to_numeric(df['aStackScore'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['aStackScore',  'calc_apps_score', 'calc_c_demo_score',   'calc_c_credo_score', 'calc_ca_cic_score']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
# sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_stack_model_psi_v4"""
# dfd = client.query(sq).to_dataframe()

##### Trench 3

##### Test

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Alpha-Cash-Stack-Model', 'alpha_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-Stack-Model' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  prediction aStackScore,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;
"""

dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train 

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Alpha-Cash-Stack-Model', 'alpha_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aStackScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Alpha-Cash-Stack-Model' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'

"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
df2.rename(columns={'calc_demo_score':'calc_c_demo_score',
                     'calc_credo_score':'calc_c_credo_score' ,
                    'calc_cic_score':'calc_ca_cic_score',
                    'calc_apps_score':'calc_apps_score'
                   }, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['aStackScore'] = pd.to_numeric(df['aStackScore'], errors='coerce')

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['aStackScore',  'calc_apps_score', 'calc_c_demo_score',   'calc_c_credo_score', 'calc_ca_cic_score']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
# sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.alpha_cash_stack_model_psi_v4"""
# dfd = client.query(sq).to_dataframe()
# dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

#### Beta-Cash-Demo-Model

##### Trench 1

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Beta-Cash-Demo-Model', 'beta_demo_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Demo-Model_All_Trench' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  prediction Beta_Cash_Demo_Score,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Beta-Cash-Demo-Model', 'beta_demo_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Demo-Model_All_Trench' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  prediction Beta_Cash_Demo_Score,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['calc_Beta_Cash_Demo_Score', 'calc_ln_fspd30_flag', 'calc_ln_os_type', 'calc_ln_mature_fspd30_flag', 'calc_ln_appln_submit_datetime']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['Beta_Cash_Demo_Score'] = pd.to_numeric(df['Beta_Cash_Demo_Score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Beta_Cash_Demo_Score', 'ln_vas_opted_flag', 'ln_self_dec_income', 'ln_age',
       'ln_source_funds_new_bin', 'ln_loan_level_user_type',
       'ln_industry_new_cat_bin', 'ln_marital_status', 'ln_doc_type_rolled',
       'ln_education_level', 'ln_ref2_type', 'ln_email_primary_domain',
       'ln_province_bin']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_demo_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
# sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_demo_model_psi_v4"""
# dfd = client.query(sq).to_dataframe()
# dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

##### Trench 2

#### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Beta-Cash-Demo-Model', 'beta_demo_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Demo-Model_All_Trench' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  prediction Beta_Cash_Demo_Score,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;
"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Beta-Cash-Demo-Model', 'beta_demo_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Demo-Model_All_Trench' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  prediction Beta_Cash_Demo_Score,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['calc_Beta_Cash_Demo_Score', 'calc_ln_fspd30_flag', 'calc_ln_os_type', 'calc_ln_mature_fspd30_flag', 'calc_ln_appln_submit_datetime']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['Beta_Cash_Demo_Score'] = pd.to_numeric(df['Beta_Cash_Demo_Score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Beta_Cash_Demo_Score', 'ln_vas_opted_flag', 'ln_self_dec_income', 'ln_age',
       'ln_source_funds_new_bin', 'ln_loan_level_user_type',
       'ln_industry_new_cat_bin', 'ln_marital_status', 'ln_doc_type_rolled',
       'ln_education_level', 'ln_ref2_type', 'ln_email_primary_domain',
        'ln_province_bin']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_demo_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_demo_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

##### Trench 3

#### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Beta-Cash-Demo-Model', 'beta_demo_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Demo-Model_All_Trench' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  prediction Beta_Cash_Demo_Score,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;
"""
dfd= client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Beta-Cash-Demo-Model', 'beta_demo_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Demo-Model_All_Trench' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  prediction Beta_Cash_Demo_Score,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['calc_Beta_Cash_Demo_Score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df['Beta_Cash_Demo_Score'] = pd.to_numeric(df['Beta_Cash_Demo_Score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Beta_Cash_Demo_Score', 'ln_vas_opted_flag', 'ln_self_dec_income', 'ln_age',
       'ln_source_funds_new_bin', 'ln_loan_level_user_type',
       'ln_industry_new_cat_bin', 'ln_marital_status', 'ln_doc_type_rolled',
       'ln_education_level', 'ln_ref2_type', 'ln_email_primary_domain',
        'ln_province_bin']

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_demo_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_demo_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

#### Beta-Cash-AppScore-Model

##### Trench 1

##### Test

In [None]:
sq = """WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in  ('Beta-Cash-AppScore-Model', 'apps_score_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-AppScore-Model_Trench1' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  SAFE_CAST(JSON_VALUE(r.prediction_clean, "$.combined_score") AS Float64) AS beta_cash_app_score,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features_fixed(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in  ('Beta-Cash-AppScore-Model', 'apps_score_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-AppScore-Model_Trench1' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  prediction beta_cash_app_score,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['prediction', 'calc_app_median_time_bw_installed_mins_ever', 'calc_app_avg_time_bw_installed_mins_ever', 'calc_beta_cash_app_score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df.info()

In [None]:
df['beta_cash_app_score'] = pd.to_numeric(df['beta_cash_app_score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['beta_cash_app_score',
       'app_cnt_health_and_fitness_ever', 'app_cnt_shopping_ever',
       'app_cnt_crypto_ever', 'app_cnt_driver_ever', 'app_cnt_payday_180d',
       'app_cnt_gambling_180d', 'app_avg_time_bw_installed_mins_3d',
       'app_median_time_bw_installed_mins_3d'
]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_appscore_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_appscore_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

##### Trench 2

##### Test

In [None]:
sq = """WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in  ('Beta-Cash-AppScore-Model', 'apps_score_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-AppScore-Model_Trench2' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  SAFE_CAST(JSON_VALUE(r.prediction_clean, "$.combined_score") AS Float64) AS beta_cash_app_score,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in  ('Beta-Cash-AppScore-Model', 'apps_score_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-AppScore-Model_Trench2' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  prediction beta_cash_app_score,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['prediction', 'calc_app_median_time_bw_installed_mins_ever', 'calc_app_avg_time_bw_installed_mins_ever', 'calc_beta_cash_app_score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df.info()

In [None]:
df['beta_cash_app_score'] = pd.to_numeric(df['beta_cash_app_score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['beta_cash_app_score',
       'app_cnt_health_and_fitness_ever', 'app_cnt_shopping_ever',
       'app_cnt_crypto_ever', 'app_cnt_driver_ever', 'app_cnt_payday_180d',
       'app_cnt_gambling_180d', 'app_avg_time_bw_installed_mins_3d',
       'app_median_time_bw_installed_mins_3d'
]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_appscore_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_appscore_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

##### Trench 3

##### Test

In [None]:
sq = """WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in  ('Beta-Cash-AppScore-Model', 'apps_score_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction aCicScore 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-AppScore-Model_Trench3' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  SAFE_CAST(JSON_VALUE(r.prediction_clean, "$.combined_score") AS Float64) AS beta_cash_app_score,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in  ('Beta-Cash-AppScore-Model', 'apps_score_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction 
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-AppScore-Model_Trench3' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  prediction beta_cash_app_score,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['prediction', 'calc_beta_cash_app_score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df.info()

In [None]:
df['beta_cash_app_score'] = pd.to_numeric(df['beta_cash_app_score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['beta_cash_app_score', 'app_cnt_health_and_fitness_ever', 'app_cnt_productivity_ever',
       'app_cnt_rated_for_18plus_ever', 'app_cnt_books_and_reference_ever',
       'app_cnt_gaming_180d', 'app_cnt_absence_tag_365d',
       'app_last_payday_install_to_apply_days',
       'app_cnt_absence_tag_365d_binned', 'app_cnt_gaming_180d_binned',
       'app_cnt_productivity_ever_binned',
       'app_cnt_rated_for_18plus_ever_binned',
       'app_cnt_health_and_fitness_ever_binned',
       'app_cnt_books_and_reference_ever_binned',
       'app_last_payday_install_to_apply_days_binned', 'calc_ln_user_type'
]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_appscore_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_appscore_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

#### Beta-Cash-Stack-Model Trench1

##### Trench 1

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Beta-Cash-Stack-Model', 'beta_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Beta_cash_stack_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench1' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Beta-Cash-Stack-Model', 'beta_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Beta_cash_stack_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench1' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
df2.rename(columns={'calc_apps_score':'calc_app_score'}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
# df_concat = df_concat.drop(columns=['prediction', 'calc_app_median_time_bw_installed_mins_ever', 'calc_app_avg_time_bw_installed_mins_ever', 'calc_beta_cash_app_score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df.info()

In [None]:
df['Beta_cash_stack_score'] = pd.to_numeric(df['Beta_cash_stack_score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Beta_cash_stack_score',
       'demo_score', 'credo_score', 'app_score'
]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_stack_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

##### Trench 2

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Beta-Cash-Stack-Model', 'beta_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Beta_cash_stack_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench2' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Beta-Cash-Stack-Model', 'beta_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Beta_cash_stack_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench2' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 2'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
df2.rename(columns={'calc_apps_score':'calc_app_score'}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
# df_concat = df_concat.drop(columns=['prediction', 'calc_app_median_time_bw_installed_mins_ever', 'calc_app_avg_time_bw_installed_mins_ever', 'calc_beta_cash_app_score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df.info()

In [None]:
df['Beta_cash_stack_score'] = pd.to_numeric(df['Beta_cash_stack_score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Beta_cash_stack_score',
       'demo_score', 'credo_score', 'app_score'
]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_stack_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

##### Trench 3

##### Test

In [None]:
sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
REPLACE(REPLACE(prediction, "'", '"'), "None", "null") AS prediction_clean
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
where modelDisplayName in ('Beta-Cash-Stack-Model', 'beta_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Beta_cash_stack_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench3' Model_Name,
  trenchCategory,
   'Test' Data_selection,
  calcFeatures,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


##### Train

In [None]:
sq = """ 
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
where modelDisplayName in ('Beta-Cash-Stack-Model', 'beta_stack_model_cash')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Beta_cash_stack_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench3' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 3'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()

In [None]:
# dfd.groupby(['Application_month', 'trenchCategory'])['digitalLoanAccountId'].nunique()

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df1.info()

In [None]:
df2.info()

In [None]:
df2.rename(columns={'calc_apps_score':'calc_app_score'}, inplace = True)

##### concatenate

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
# df_concat = df_concat.drop(columns=['prediction', 'calc_app_median_time_bw_installed_mins_ever', 'calc_app_avg_time_bw_installed_mins_ever', 'calc_beta_cash_app_score']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns

In [None]:
df = df_concat.drop(columns=['calcFeatures']).copy()
df.info()

In [None]:
df['Beta_cash_stack_score'] = pd.to_numeric(df['Beta_cash_stack_score'], errors='coerce')
df.columns

##### PSI calculation

In [None]:
df = df.copy()

# Define feature list
feature_list = ['Beta_cash_stack_score',
       'demo_score', 'credo_score', 'app_score'
]

# Define segment columns
segment_columns = ['new_loan_type','osType', 'loan_product_type']
# Calculate month-on-month PSI
psi_results = calculate_month_on_month_psi(df, feature_list, segment_columns)
psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
psi_results['Model_Name'] = df['Model_Name'].iloc[0]
psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
psi_results = psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
                           'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value', 'Month',
                           'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
                           'Expected_Percentage', 'Actual_Percentage', 'PSI']].copy()

# # Calculate bin-level PSI
# bin_psi_results = calculate_bin_level_psi(df, feature_list, segment_columns)
# bin_psi_results['modelDisplayName'] = df['modelDisplayName'].iloc[0]
# bin_psi_results['Model_Name'] = df['Model_Name'].iloc[0]
# bin_psi_results['modelVersionId'] = df['modelVersionId'].iloc[0]
# bin_psi_results['trenchCategory'] = df['trenchCategory'].iloc[0]
# bin_psi_results = bin_psi_results[['modelDisplayName', 'Model_Name', 'modelVersionId', 'trenchCategory',
#                                    'Feature', 'Feature_Type', 'Segment_Column', 'Segment_Value',
#                                     'Month', 'Base_Month', 'Current_Month', 'Base_Count', 'Actual_Count',
#                                     'Bin', 'Bin_Range', 'Base_Percentage', 'Actual_Percentage', 'Bin_PSI']].copy()

# Upload to BigQuery
table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_stack_model_psi_v4"
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(psi_results, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


# # Upload to BigQuery
# table_id = "prj-prod-dataplatform.dap_ds_poweruser_playground.beta_sil_income_estimation_model_csi_v4"
# job_config = bigquery.LoadJobConfig(
#     write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
# )
# job = client.load_table_from_dataframe(bin_psi_results, table_id, job_config=job_config)
# job.result()  # Wait for the job to complete

In [None]:
sq = """select * from prj-prod-dataplatform.dap_ds_poweruser_playground.beta_cash_stack_model_psi_v4"""
dfd = client.query(sq).to_dataframe()
dfd.groupby(['trenchCategory','Month'])['modelDisplayName'].count()

# End

# SIL V2

#### Alpha - CIC-SIL-Model

##### Trench 1

##### Test

In [None]:
## This is for the test period of Alpha - CIC sil model - reading the data from ml_model_run_details

sq = """
WITH cleaned AS (
  SELECT
    customerId,digitalLoanAccountId,prediction,start_time,end_time,modelDisplayName,modelVersionId, trenchCategory,
    REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeature
  FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details`
  WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
  and modelVersionId='v2'
  ),
base as
(SELECT
  r.customerId,r.digitalLoanAccountId,prediction Alpha_cic_sil_score
    ,start_time,end_time,modelDisplayName,modelVersionId,
   loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
        when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
        when lower(loanmaster.deviceType) like '%andro%' then 'android'
        else 'ios' end osType,
 'Alpha - CIC-SIL-Model' Model_Name,
 'SIL' as product,
  trenchCategory,
  r.calcFeature calcFeatures,
  'Test' Data_selection,
  coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m', coalesce(IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime),  r.start_time)) as Application_month,
FROM cleaned r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
qualify row_number() over (partition by r.customerId,r.digitalLoanAccountId order by r.start_time desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""
dfd = client.query(sq).to_dataframe()
print(f"The shape of the dataframe is: {dfd.shape}")
dfd.head()

## this data is not expanded. We will have to expand and get the features from the calcFeatures column

In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df1 = expanded_df.copy()


In [None]:
df1.columns

##### Train

In [None]:
## This is for the test period of Alpha - CIC sil model - reading the data from ml_model_run_details

sq = """
WITH parsed as (
select customerId, digitalLoanAccountId,modelDisplayName,modelVersionId,start_time,end_time,prediction,trenchCategory,
REPLACE(REPLACE(calcFeature, "'", '"'), "None", "null") AS calcFeatures,
FROM prj-prod-dataplatform.dap_ds_poweruser_playground.ml_training_model_run_details
WHERE modelDisplayName in ('Alpha - CIC-SIL-Model', 'cic_model_sil')
),
base as
(select 
 r.customerId,r.digitalLoanAccountId,prediction Alpha_cic_sil_score
 ,start_time,end_time,
  modelDisplayName,modelVersionId,
     loanmaster.new_loan_type loanType,
 loanmaster.gender,
    case when loanmaster.loantype='BNPL' and sil_category.store_type =1 then 'Appliance'
    when loanmaster.loantype='BNPL' and sil_category.store_type =2 then 'Mobile'
    when loanmaster.loantype='BNPL' and sil_category.store_type =3 then 'Mall'
    when loanmaster.loantype='BNPL' and sil_category.store_type not in (1,2,3) then store_tagging
    else 'not applicable' end as loan_product_type,
     case when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%andro%' then 'android'
    when lower(coalesce(loanmaster.osversion_v2, loanmaster.osVersion)) like '%os%' then 'ios'
    when lower(loanmaster.deviceType) like '%andro%' then 'android'
    else 'ios' end osType,
 'Cash' as product,
 'Beta-Cash-Stack-Model_Trench1' Model_Name,
  trenchCategory,
   'Train' Data_selection,
  calcFeatures,
  IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime) AS appln_submit_datetime,
  loanmaster.disbursementDateTime,
  format_date('%Y-%m',IF(loanmaster.new_loan_type = 'Flex-up', loanmaster.startApplyDateTime, loanmaster.termsAndConditionsSubmitDateTime)) as Application_month, 
from parsed r
left join risk_credit_mis.loan_master_table loanmaster
  ON loanmaster.digitalLoanAccountId = r.digitalLoanAccountId
 left join(SELECT DISTINCT mer_refferal_code, mer_name mer_name,store_type,store_tagging FROM `dl_loans_db_raw.tdbk_merchant_refferal_mtb`
  left join worktable_datachampions.TARGET_SPLIT P on P.STORE_NAME = mer_name
 qualify row_number() over(partition by mer_refferal_code order by  created_dt desc)=1) sil_category on loanmaster.purpleKey=sil_category.mer_refferal_code
  qualify row_number() over(partition by r.customerId, r.digitalLoanAccountid order by cast(start_time as datetime) desc) = 1
)
select * from base where trenchCategory = 'Trench 1'
;
"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd.head()


In [None]:
# Expand the calcFeatures column
expanded_df = expand_calc_features(dfd)

# Display the result
print(f"Original columns: {dfd.shape[1]}")
print(f"Expanded columns: {expanded_df.shape[1]}")
df2 = expanded_df.copy()

In [None]:
df2.rename(columns={'loan_type':'new_loan_type'}, inplace = True)

In [None]:
df1.info()

In [None]:
df2.info()

##### concatenation

In [None]:
df_concat = pd.concat([df1, df2], ignore_index=True)
df_concat.info()

In [None]:
df_concat = df_concat.drop(columns=['calc_digitalLoanAccountId', 'calc_customerId', 'calc_crifApplicationId']).copy()
df_concat.columns = df_concat.columns.str.replace('calc_', '', regex=False)
df_concat.columns