# <div align="center" style="color:rgb(51, 255, 228);">PSI and CSI Calculations</div>

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
# NOTE(review): machine-specific fallback location of the gcloud ADC file. It is only
# used when GOOGLE_APPLICATION_CREDENTIALS is not already configured, so the notebook
# also runs on machines/CI where credentials are supplied through the environment.
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', path)
# BigQuery client used by every query in this notebook
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

In [2]:
sq = """drop table if exists  `dap_ds_poweruser_playground.F_CSI_MODEL_FEATURES_BIN_TAB`;"""

# client.query() only submits the job. Wait for it to finish so a failed DROP raises
# here and later cells cannot race with a still-running job.
drop_job = client.query(sq)
drop_job.result()
drop_job

QueryJob<project=prj-prod-dataplatform, location=asia-southeast1, id=3ae527d9-ce4d-4d20-badb-444b02dd16c1>

**Logic :-** <br>
`1. Calculate the CSI the same way we calculated the PSI, but for the top 5 features of a model.`<br>
`2. Calculate it overall first, then for each category of user_type, prod_type and os_type.`<br>

# CIC Score CSI

In [3]:
# Pull every CIC-called application since 2024-06-01, tag it as Train (2024-06..2024-09)
# or Test (2024-10 onwards) and bin the five monitored CIC features.
# Bins after the first are written as contiguous (lower, upper] ranges: with
# BETWEEN on the printed edges, a fractional value lying between two edges
# (e.g. 5650.65 or 1.9945) matched no branch and silently got a NULL bin.
# Bin labels are unchanged. Values below the first lower edge still get NULL.
sq = """
with cicscorebase as 
(select 
digitalLoanAccountId,
FORMAT_DATE('%Y-%m', ln_appln_submit_datetime) Application_month,
FORMAT_DATE('%F', DATE_TRUNC(ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
EXTRACT(WEEK(MONDAY) FROM ln_appln_submit_datetime) as Appl_week_number,
ln_user_type,
ln_loan_type,
ln_prod_type, 
ln_os_type,
cic_hit_flag,
cic_score,
case when date_trunc(ln_appln_submit_datetime, day) between '2024-06-01' and '2024-09-30' then 'Train'
       when date_trunc(ln_appln_submit_datetime, day) >= '2024-10-01' then 'Test'
       Else 'Other' end dataselection,
case when cic_Personal_Loans_granted_contracts_amt_24M is null then 'g. missing'
     when cic_Personal_Loans_granted_contracts_amt_24M between 0.999 and 5650.6 then 'a. 0.999-5650.6'
     when cic_Personal_Loans_granted_contracts_amt_24M > 5650.6 and cic_Personal_Loans_granted_contracts_amt_24M <= 12447.8 then 'b. 5650.7-12447.8'
     when cic_Personal_Loans_granted_contracts_amt_24M > 12447.8 and cic_Personal_Loans_granted_contracts_amt_24M <= 24000.0 then 'c. 12447.9-24000.0'
     when cic_Personal_Loans_granted_contracts_amt_24M > 24000.0 and cic_Personal_Loans_granted_contracts_amt_24M <= 50500.0 then 'd. 24000.1-50500'
     when cic_Personal_Loans_granted_contracts_amt_24M > 50500.0 and cic_Personal_Loans_granted_contracts_amt_24M <= 2738545.0 then 'e. 50500.1-2738545.0'
     when cic_Personal_Loans_granted_contracts_amt_24M > 2738545.0 then 'f. >2738545.0' end cic_Personal_Loans_granted_contracts_amt_24M_bin,
case when cic_cnt_active_contracts is null then 'd. missing'
     when cic_cnt_active_contracts between 0.999 and 2.0 then 'a. 0.999-2.0'
     when cic_cnt_active_contracts > 2.0 and cic_cnt_active_contracts <= 88.0 then 'b. 2.1-88.0'
     when cic_cnt_active_contracts > 88.0 then 'c. >88.0' end cic_cnt_active_contracts_bin,
case when cic_vel_contract_nongranted_cnt_12on24 is null then 'd. missing'
     when cic_vel_contract_nongranted_cnt_12on24 between 0.285 and 1.994 then 'a. 0.285-1.994'
     when cic_vel_contract_nongranted_cnt_12on24 > 1.994 and cic_vel_contract_nongranted_cnt_12on24 <= 2.012 then 'b. 1.995-2.012'
     when cic_vel_contract_nongranted_cnt_12on24 > 2.012 then 'c. >2.012' end cic_vel_contract_nongranted_cnt_12on24_bin,
case when cic_days_since_last_inquiry is null then 'g. missing'
     when cic_days_since_last_inquiry between -0.001 and 10.0 then 'a. -0.001-10.0'
     when cic_days_since_last_inquiry > 10.0 and cic_days_since_last_inquiry <= 117.0 then 'b. 10.1-117.0'
     when cic_days_since_last_inquiry > 117.0 and cic_days_since_last_inquiry <= 281.0 then 'c. 117.1-281.0'
     when cic_days_since_last_inquiry > 281.0 and cic_days_since_last_inquiry <= 832.0 then 'd. 281.1-832.0'
     when cic_days_since_last_inquiry > 832.0 and cic_days_since_last_inquiry <= 10844.0 then 'e. 832.1-10844.0'
     when cic_days_since_last_inquiry > 10844.0 then 'f. >10844.0' end cic_days_since_last_inquiry_bin,
case when cic_max_amt_granted_24M is null then 'g. missing'
     when cic_max_amt_granted_24M between -0.001 and 5394.2 then 'a. -0.001-5394.2'
     when cic_max_amt_granted_24M > 5394.2 and cic_max_amt_granted_24M <= 10502.4 then 'b. 5394.3-10502.4'
     when cic_max_amt_granted_24M > 10502.4 and cic_max_amt_granted_24M <= 20000.0 then 'c. 10502.5-20000.0'
     when cic_max_amt_granted_24M > 20000.0 and cic_max_amt_granted_24M <= 40000.0 then 'd. 20000.1-40000.0'
     when cic_max_amt_granted_24M > 40000.0 and cic_max_amt_granted_24M <= 8000000.0 then 'e. 40000.1-8000000.0'
     when cic_max_amt_granted_24M > 8000000.0 then 'f. >8000000.0' end cic_max_amt_granted_24M_bin
from risk_mart.sil_risk_ds_master_20230101_20250223 
where cic_called_flag = 1
and date_trunc(ln_appln_submit_datetime, day) >= '2024-06-01'
)
select * from cicscorebase
"""

cicscoredf = client.query(sq).to_dataframe(progress_bar_type='tqdm')

Job ID 1b735fb6-ca13-4623-8d8b-fb91ec6baa14 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [4]:
# Sanity check: report (rows, columns) of the extract before any CSI is computed.
print(f"The shape of the df for cic score csi is {cicscoredf.shape}")

The shape of the df for cic score csi is (224361, 16)


In [5]:
# Preview the first rows to eyeball the split flag and the derived *_bin columns.
cicscoredf.head()

Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_user_type,ln_loan_type,ln_prod_type,ln_os_type,cic_hit_flag,cic_score,dataselection,cic_Personal_Loans_granted_contracts_amt_24M_bin,cic_cnt_active_contracts_bin,cic_vel_contract_nongranted_cnt_12on24_bin,cic_days_since_last_inquiry_bin,cic_max_amt_granted_24M_bin
0,f84f2822-3e4a-422b-b9fe-20292a3ded90,2025-01,2024-12-30,0,3_Applied_Not_Disbursed,SIL ZERO,Mall,Android,,,Test,g. missing,d. missing,d. missing,g. missing,g. missing
1,969c698e-5eac-4eb8-abe8-65e4333f5481,2025-01,2024-12-30,0,2_New Applicant,SIL-Instore,Mall,Android,1.0,0.080285,Test,e. 50500.1-2738545.0,a. 0.999-2.0,d. missing,c. 117.1-281.0,e. 40000.1-8000000.0
2,1380e3c9-41aa-4f94-acd0-5e1b39037674,2025-01,2024-12-30,0,3_Applied_Not_Disbursed,SIL-Instore,Mall,iOS,1.0,0.067089,Test,d. 24000.1-50500,a. 0.999-2.0,d. missing,c. 117.1-281.0,d. 20000.1-40000.0
3,c5972464-141e-4e10-931a-1e4189eaf530,2025-01,2024-12-30,0,2_New Applicant,SIL-Instore,Mall,Android,1.0,0.115715,Test,g. missing,a. 0.999-2.0,d. missing,c. 117.1-281.0,g. missing
4,a9092e75-441c-4b01-bc13-9776a0becc5c,2025-01,2024-12-30,0,2_New Applicant,SIL-Instore,Mall,Android,0.0,,Test,g. missing,d. missing,d. missing,g. missing,g. missing


In [6]:
# List the column names; the *_bin columns are the ones passed to the CSI functions below.
cicscoredf.columns.values

array(['digitalLoanAccountId', 'Application_month',
       'Appl_week_start_date', 'Appl_week_number', 'ln_user_type',
       'ln_loan_type', 'ln_prod_type', 'ln_os_type', 'cic_hit_flag',
       'cic_score', 'dataselection',
       'cic_Personal_Loans_granted_contracts_amt_24M_bin',
       'cic_cnt_active_contracts_bin',
       'cic_vel_contract_nongranted_cnt_12on24_bin',
       'cic_days_since_last_inquiry_bin', 'cic_max_amt_granted_24M_bin'],
      dtype=object)

# Overall and Segment-Level CSI Calculation

In [7]:
import pandas as pd
import numpy as np
from datetime import datetime

def calculate_categorical_csi(train_dist, test_dist, epsilon=0.0001):
    """
    Calculate csi for categorical features.

    csi = sum over categories of (test - train) * ln(test / train)

    Args:
        train_dist: Series of category shares in the training set (index = category)
        test_dist: Series of category shares in the test set (index = category)
        epsilon: Floor applied to every share. Categories missing on one side
            are filled with it, and shares below it (including explicit zeros,
            e.g. from categorical dtypes) are raised to it so the log ratio
            stays finite. Defaults to the 0.0001 floor used by the bin-level code.

    Returns:
        float: csi value (0.0 when both distributions are empty)
    """
    # Order-preserving union of categories (a plain set has no deterministic order)
    all_categories = pd.Index(list(dict.fromkeys([*train_dist.index, *test_dist.index])))

    # Align both sides on the same categories and floor every share
    train_dist_aligned = train_dist.reindex(all_categories, fill_value=epsilon).clip(lower=epsilon)
    test_dist_aligned = test_dist.reindex(all_categories, fill_value=epsilon).clip(lower=epsilon)

    # Per-category contribution, summed into the feature-level csi
    csi_values = (test_dist_aligned - train_dist_aligned) * np.log(test_dist_aligned / train_dist_aligned)
    return csi_values.sum()

def calculate_bin_csi(train_df, test_df, feature):
    """
    Build a per-bin csi table for one feature, comparing test against train.

    Args:
        train_df: Training DataFrame
        test_df: Test DataFrame
        feature: Name of the binned feature column to evaluate

    Returns:
        DataFrame: one row per bin value with its train/test share, its csi
        term and the feature-level csi repeated on every row
    """
    min_share = 0.0001  # floor that keeps the log ratio finite

    def share_of(frame, bin_value):
        # Share of ALL rows of `frame` (null bins stay in the denominator) in this bin
        total_rows = frame.shape[0]
        if total_rows > 0:
            share = frame[frame[feature] == bin_value].shape[0] / total_rows
        else:
            share = min_share
        return min_share if share < min_share else share

    # Union of the non-null bin labels seen on either side
    all_bins = set(train_df[feature].dropna().unique()) | set(test_df[feature].dropna().unique())

    # Feature-level csi, computed from the null-excluded category distributions
    train_counts = train_df[feature].value_counts(dropna=True)
    test_counts = test_df[feature].value_counts(dropna=True)
    overall_csi = calculate_categorical_csi(
        train_counts / train_counts.sum(),
        test_counts / test_counts.sum(),
    )

    bin_csi_results = []
    for bin_value in all_bins:
        train_bin_pct = share_of(train_df, bin_value)
        test_bin_pct = share_of(test_df, bin_value)
        bin_csi_results.append({
            'feature': feature,
            'bin_value': bin_value,
            'train_pct': train_bin_pct,
            'test_pct': test_bin_pct,
            'bin_csi': (test_bin_pct - train_bin_pct) * np.log(test_bin_pct / train_bin_pct),
            'feature_csi': overall_csi
        })

    return pd.DataFrame(bin_csi_results)

def calculate_segmented_bin_csi(df, feature_list, segment_columns=None, min_segment_rows=50):
    """
    Calculate csi for each bin value within multiple features, overall and by segments.

    Args:
        df: DataFrame containing the data
        feature_list: List of feature names to calculate csi for
        segment_columns: List of columns to segment by (e.g., ['ln_user_type', 'ln_os_type']).
            None means "overall only".
        min_segment_rows: Segments with fewer rows than this are skipped
            (default 50, the threshold previously hard-coded here).

    Returns:
        DataFrame: csi results for each bin value by month and segment. Rows
        for the whole population carry segment_type='Overall' and
        segment_value='All'; segment rows carry the segment column name and value.
    """
    if segment_columns is None:
        segment_columns = []

    # Overall results always come first in the output
    overall_results = calculate_feature_bin_csi(df, feature_list)
    overall_results['segment_type'] = 'Overall'
    overall_results['segment_value'] = 'All'
    all_results = [overall_results]

    for segment_col in segment_columns:
        if segment_col not in df.columns:
            print(f"Warning: {segment_col} not found in DataFrame. Skipping.")
            continue

        # Null segment values are ignored
        for segment_val in df[segment_col].dropna().unique():
            segment_df = df[df[segment_col] == segment_val]

            if len(segment_df) < min_segment_rows:
                print(f"Skipping {segment_col}={segment_val} due to insufficient data ({len(segment_df)} rows).")
                continue

            # Best effort: one failing segment must not abort the whole run
            try:
                segment_results = calculate_feature_bin_csi(segment_df, feature_list)
            except Exception as e:
                print(f"Error calculating csi for {segment_col}={segment_val}: {e}")
                continue

            # An empty frame means the segment had no Train or no Test rows;
            # concatenating it would add nothing to the result.
            if segment_results.empty:
                continue

            segment_results['segment_type'] = segment_col
            segment_results['segment_value'] = segment_val
            all_results.append(segment_results)

    # all_results always holds at least the overall frame
    return pd.concat(all_results, ignore_index=True)

def _month_label(month):
    """Return the 'YYYY-MM' prefix of a string month; other types are stringified as-is."""
    label = str(month)
    return label[:7] if isinstance(month, str) else label


def _last_train_month_label(df, train_df):
    """Return a 'YYYY-MM' style label for the latest Application_month in the training rows."""
    last_month = train_df['Application_month'].max()
    if isinstance(df['Application_month'].iloc[0], str):
        # String months may be 'YYYY-MM' or 'YYYY-MM-DD...'; keep the month part only
        return str(last_month)[:7]
    try:
        return pd.to_datetime(last_month).strftime('%Y-%m')
    except Exception:
        # Fall back to the raw value when it cannot be read as a date
        return str(last_month)


def calculate_feature_bin_csi(df, feature_list):
    """
    Calculate csi for each bin value within multiple features.

    Rows with dataselection == 'Train' form the reference distribution. Rows
    with dataselection == 'Test' are compared against it one Application_month
    at a time.

    Args:
        df: DataFrame containing the data. Must provide 'dataselection',
            'Application_month', 'digitalLoanAccountId' and the feature columns.
        feature_list: List of feature names to calculate csi for

    Returns:
        DataFrame: csi results for each bin value by month, with columns Month,
        feature, bin_value, DateCategory, train_pct, test_pct, bin_csi,
        feature_csi, account_count and MonthSortKey. Training reference rows
        (DateCategory 'a_Training', csi 0.0) carry no MonthSortKey. An empty
        DataFrame is returned when either the Train or the Test side is empty.
    """
    floor = 0.0001  # minimum share used in the log ratio

    # Boolean indexing already returns new frames, so no defensive copy of df is needed
    train_df = df[df['dataselection'] == 'Train']
    test_df = df[df['dataselection'] == 'Test']

    if train_df.empty or test_df.empty:
        print("Warning: Either train or test dataset is empty. Skipping csi calculation.")
        return pd.DataFrame()

    last_train_month_str = _last_train_month_label(df, train_df)
    train_accounts = train_df['digitalLoanAccountId'].nunique()

    all_bin_results = []

    # Training shares per feature and bin, computed once and reused for every month
    train_bin_shares = {}
    # Null-excluded training distributions for the feature-level csi, filled lazily
    train_distributions = {}

    # Reference rows: the training set against itself (csi is 0 by definition)
    for feature in feature_list:
        if feature not in train_df.columns:
            print(f"Warning: Feature {feature} not found in training data. Skipping.")
            continue

        train_column = train_df[feature]
        shares = {}
        for bin_value in train_column.dropna().unique():
            # Share of all training rows (null bins stay in the denominator)
            share = (train_column == bin_value).mean()
            shares[bin_value] = share
            all_bin_results.append({
                'Month': last_train_month_str,
                'feature': feature,
                'bin_value': bin_value,
                'DateCategory': 'a_Training',
                'train_pct': share,
                'test_pct': share,  # Same as train for training data
                'bin_csi': 0.0,
                'feature_csi': 0.0,
                'account_count': train_accounts
            })
        train_bin_shares[feature] = shares

    # Test months in chronological order, prefixed b_, c_, d_, ...
    # ('a' is reserved for Training) so that string sorting keeps the order.
    test_months = sorted(test_df['Application_month'].unique())
    prefix_map = {
        month: f"{chr(98 + i)}_{_month_label(month)}"
        for i, month in enumerate(test_months)
    }

    for month in test_months:
        month_df = test_df[test_df['Application_month'] == month]
        if month_df.empty:
            continue

        original_month_str = _month_label(month)
        month_str = prefix_map[month]
        month_accounts = month_df['digitalLoanAccountId'].nunique()

        for feature in feature_list:
            if feature not in month_df.columns:
                continue

            # Best effort per feature/month: report the failure and carry on
            try:
                shares = train_bin_shares[feature]
                month_column = month_df[feature]

                # Bins seen in training or in this month (nulls excluded)
                all_bins = set(shares) | set(month_column.dropna().unique())

                # Feature-level csi from the null-excluded distributions
                if feature not in train_distributions:
                    train_distributions[feature] = train_df[feature].value_counts(dropna=True, normalize=True)
                test_counts = month_column.value_counts(dropna=True, normalize=True)
                overall_csi = calculate_categorical_csi(train_distributions[feature], test_counts)

                for bin_value in all_bins:
                    train_pct = shares.get(bin_value)
                    if train_pct is None:
                        # Bin seen only in the test month
                        train_pct = (train_df[feature] == bin_value).mean()
                    test_pct = (month_column == bin_value).mean()

                    # Floor both shares to avoid division by zero / log(0)
                    if train_pct < floor:
                        train_pct = floor
                    if test_pct < floor:
                        test_pct = floor

                    bin_csi = (test_pct - train_pct) * np.log(test_pct / train_pct)

                    all_bin_results.append({
                        'Month': original_month_str,
                        'MonthSortKey': month_str,
                        'feature': feature,
                        'bin_value': bin_value,
                        'DateCategory': 'b_Monthly',
                        'train_pct': train_pct,
                        'test_pct': test_pct,
                        'bin_csi': bin_csi,
                        'feature_csi': overall_csi,
                        'account_count': month_accounts
                    })
            except Exception as e:
                print(f"Error calculating bin csi for {feature} in {month}: {e}")

    return pd.DataFrame(all_bin_results)

# Binned columns of the five monitored CIC features (raw feature name + '_bin')
feature_list = [
    f"{raw_feature}_bin"
    for raw_feature in (
        'cic_Personal_Loans_granted_contracts_amt_24M',
        'cic_cnt_active_contracts',
        'cic_vel_contract_nongranted_cnt_12on24',
        'cic_days_since_last_inquiry',
        'cic_max_amt_granted_24M',
    )
]

# Dimensions along which the csi is additionally broken down
segment_columns = 'ln_user_type ln_prod_type ln_os_type'.split()

# Bin-level csi for the whole population and for every segment value
bin_results = calculate_segmented_bin_csi(
    df=cicscoredf,
    feature_list=feature_list,
    segment_columns=segment_columns,
)

                              
                                  
# Try to combine with s_apps_score results if they exist (continued)
# NOTE(review): s_apps_score_output_df is not defined anywhere in this notebook.
# It is only picked up when it already exists in the kernel; on a fresh kernel
# the NameError branch below is taken.
try:
    # First ensure the s_apps_score_output_df has the same structure
    if 'MonthSortKey' not in s_apps_score_output_df.columns:
        s_apps_score_output_df['MonthSortKey'] = s_apps_score_output_df['Month']
        # Update DateCategory with prefix (already-prefixed training rows are kept as training)
        s_apps_score_output_df['DateCategory'] = s_apps_score_output_df['DateCategory'].apply(
            lambda x: 'a_Training' if x in ('Training', 'a_Training') else 'b_Monthly'
        )

    # Add segment info to s_apps_score_output_df
    s_apps_score_output_df['segment_type'] = 'Overall'
    s_apps_score_output_df['segment_value'] = 'All'

    # Add bin_value column to s_apps_score_output_df (as 'All' for feature-level csi)
    s_apps_score_output_df['bin_value'] = 'All'

    # Rename csivalues to feature_csi for consistency
    if 'csivalues' in s_apps_score_output_df.columns:
        s_apps_score_output_df = s_apps_score_output_df.rename(columns={'csivalues': 'feature_csi'})

    # Add bin_csi column (same as feature_csi for feature-level csi)
    if 'feature_csi' in s_apps_score_output_df.columns:
        s_apps_score_output_df['bin_csi'] = s_apps_score_output_df['feature_csi']

    # Replace 'scorename' with 'feature' for consistency
    if 'scorename' in s_apps_score_output_df.columns:
        if 'feature' in s_apps_score_output_df.columns:
            s_apps_score_output_df['feature'] = s_apps_score_output_df['feature'].fillna(s_apps_score_output_df['scorename'])
        else:
            # Without a 'feature' column the fillna above would raise KeyError,
            # which the NameError handler below does not catch.
            s_apps_score_output_df['feature'] = s_apps_score_output_df['scorename']
        s_apps_score_output_df = s_apps_score_output_df.drop('scorename', axis=1)

    # Combine with bin_results
    combined_results = pd.concat([s_apps_score_output_df, bin_results], ignore_index=True)
except NameError:
    # If s_apps_score_output_df doesn't exist, just use bin_results
    combined_results = bin_results

# Order rows by segment, feature and bin, then chronologically. MonthSortKey is
# used when available, otherwise the plain Month label.
sort_columns = ['segment_type', 'segment_value', 'feature', 'bin_value']
sort_columns.append('MonthSortKey' if 'MonthSortKey' in combined_results.columns else 'Month')
combined_results = combined_results.sort_values(by=sort_columns)

# Persist the detailed bin-level table
combined_results.to_csv('bin_level_csi_results_cicscore.csv', index=False)

# Quick look at the result
print("Sample of bin-level csi results:")
print(combined_results.head())

# Pivot tables follow
print("\nGenerating pivot tables for bin-level analysis...")

# Function to create pivot table for a given segment and feature
def create_bin_pivot(data, segment_type, segment_value, feature=None):
    """
    Pivot bin-level csi for one segment into a (feature, bin) x month table.

    Args:
        data: Bin-level results with segment_type, segment_value, feature,
            bin_value, bin_csi and MonthSortKey and/or Month columns
        segment_type: Segment dimension to keep (e.g. 'Overall')
        segment_value: Value of that dimension to keep (e.g. 'All')
        feature: Optional feature name; when truthy only that feature is kept

    Returns:
        DataFrame: rows indexed by (feature, bin_value), one column per month
        (MonthSortKey if that column exists, else Month), holding the first
        bin_csi found for each cell
    """
    in_segment = data['segment_type'].eq(segment_type) & data['segment_value'].eq(segment_value)
    selected = data.loc[in_segment]

    if feature:
        selected = selected.loc[selected['feature'] == feature]

    month_axis = 'MonthSortKey' if 'MonthSortKey' in selected.columns else 'Month'

    return selected.pivot_table(
        index=['feature', 'bin_value'],
        columns=[month_axis],
        values='bin_csi',
        aggfunc='first'
    )

# Create bin pivot tables for overall and by segments
unique_segment_combos = combined_results[['segment_type', 'segment_value']].drop_duplicates()
unique_features = combined_results['feature'].unique()


def _excel_sheet_name(prefix, suffix, used_names, max_len=31):
    """
    Build a valid, not-yet-used Excel sheet name of the form '<prefix>_<suffix>'.

    Excel limits sheet names to 31 characters and forbids [ ] : * ? / \\.
    When the name is too long the prefix is shortened first, so the suffix that
    tells sheets apart survives. If a name is still taken, a '~N' tag is added.
    Names are compared case-insensitively. The chosen name is recorded in
    used_names.
    """
    illegal = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
    prefix = str(prefix).translate(illegal)
    suffix = str(suffix).translate(illegal)

    room_for_prefix = max(max_len - len(suffix) - 1, 0)
    base = f"{prefix[:room_for_prefix]}_{suffix}"[:max_len]

    candidate = base
    counter = 1
    while candidate.lower() in used_names:
        counter += 1
        tag = f"~{counter}"
        candidate = base[:max_len - len(tag)] + tag
    used_names.add(candidate.lower())
    return candidate


# Create Excel writer to save all pivots in one file
with pd.ExcelWriter('bin_level_csi_pivots_cicscore.xlsx') as writer:
    # Sheet names already written. Writing the same name twice would put two
    # pivots on one sheet, so every name is made unique before use.
    used_sheet_names = {'overall_all_features'}

    # First, create overall pivot with all features and bins
    overall_pivot = create_bin_pivot(combined_results, 'Overall', 'All')
    overall_pivot.to_excel(writer, sheet_name='Overall_All_Features')
    print("Created overall pivot table for all features")

    # Create separate pivot for each feature (Overall segment only)
    for feature in unique_features:
        feature_pivot = create_bin_pivot(combined_results, 'Overall', 'All', feature)

        # Last 20 chars of the feature name keep the sheet name short
        sheet_name = _excel_sheet_name('Overall', feature[-20:], used_sheet_names)

        feature_pivot.to_excel(writer, sheet_name=sheet_name)
        print(f"Created pivot for feature: {feature}")

    # Create separate pivot for each segment and feature combination
    for segment_type, segment_value in unique_segment_combos.itertuples(index=False, name=None):
        # Skip Overall segment as we already handled it
        if segment_type == 'Overall' and segment_value == 'All':
            continue

        for feature in unique_features:
            # Filter data for this segment and feature
            segment_feature_data = combined_results[
                (combined_results['segment_type'] == segment_type) &
                (combined_results['segment_value'] == segment_value) &
                (combined_results['feature'] == feature)
            ]

            # Skip if no data
            if segment_feature_data.empty:
                continue

            # Rows are bin values, columns are months
            pivot = segment_feature_data.pivot_table(
                index=['bin_value'],
                columns=['MonthSortKey'] if 'MonthSortKey' in segment_feature_data.columns else ['Month'],
                values='bin_csi',
                aggfunc='first'
            )

            # Last 10 chars of the feature name; a long segment name is
            # shortened rather than the feature part being cut off
            sheet_name = _excel_sheet_name(
                f"{segment_type}_{segment_value}", feature[-10:], used_sheet_names
            )

            pivot.to_excel(writer, sheet_name=sheet_name)
            print(f"Created pivot for {segment_type}={segment_value}, feature={feature}")

print("\nAll bin-level csi results and pivot tables have been saved.")

# Summarise, per segment / feature / month, the three bins with the largest csi terms
print("\nGenerating bin contribution summary...")

# One record per (segment, feature, month, rank)
summary_data = []

for segment_type in combined_results['segment_type'].unique():
    type_rows = combined_results[combined_results['segment_type'] == segment_type]

    for segment_value in type_rows['segment_value'].unique():
        for feature in combined_results['feature'].unique():
            # Rows of this segment value and feature
            segment_feature_data = type_rows[
                (type_rows['segment_value'] == segment_value) &
                (type_rows['feature'] == feature)
            ]
            if segment_feature_data.empty:
                continue

            for month in segment_feature_data['Month'].unique():
                month_data = segment_feature_data[segment_feature_data['Month'] == month]
                if month_data.empty:
                    continue

                # Feature-level csi is repeated on every bin row; take the first
                feature_csi = month_data['feature_csi'].iloc[0]

                if 'bin_csi' not in month_data.columns:
                    continue

                # Largest absolute csi terms first, keep the top 3
                top_bins = month_data.sort_values('bin_csi', key=abs, ascending=False).head(3)

                for rank, (bin_value, bin_csi) in enumerate(
                    zip(top_bins['bin_value'], top_bins['bin_csi']), start=1
                ):
                    # Share of the feature csi explained by this bin, in percent
                    if feature_csi != 0:
                        pct_contribution = bin_csi / feature_csi * 100
                    else:
                        pct_contribution = 0

                    summary_data.append({
                        'segment_type': segment_type,
                        'segment_value': segment_value,
                        'feature': feature,
                        'Month': month,
                        'feature_csi': feature_csi,
                        'bin_value': bin_value,
                        'bin_csi': bin_csi,
                        'pct_contribution': pct_contribution,
                        'rank': rank
                    })

# Create summary DataFrame
if not summary_data:
    print("No data available for bin contribution summary")
else:
    summary_df = pd.DataFrame(summary_data)

    # Wide layout: one row per segment/feature/month, one column group per rank
    contribution_pivot = summary_df.pivot_table(
        index=['segment_type', 'segment_value', 'feature', 'Month', 'feature_csi'],
        columns=['rank'],
        values=['bin_value', 'bin_csi', 'pct_contribution'],
        aggfunc='first'
    )

    # Save to Excel
    contribution_pivot.to_excel('bin_contribution_summary_cic_score.xlsx')
    print("Bin contribution summary saved to 'bin_contribution_summary_cic_score.xlsx'")

print("\nAnalysis complete!")

Sample of bin-level csi results:
       Month                                           feature  \
29   2024-10  cic_Personal_Loans_granted_contracts_amt_24M_bin   
56   2024-11  cic_Personal_Loans_granted_contracts_amt_24M_bin   
82   2024-12  cic_Personal_Loans_granted_contracts_amt_24M_bin   
109  2025-01  cic_Personal_Loans_granted_contracts_amt_24M_bin   
135  2025-02  cic_Personal_Loans_granted_contracts_amt_24M_bin   

           bin_value DateCategory  train_pct  test_pct   bin_csi  feature_csi  \
29   a. 0.999-5650.6    b_Monthly   0.043716  0.047084  0.000250     0.005537   
56   a. 0.999-5650.6    b_Monthly   0.043716  0.049279  0.000666     0.012295   
82   a. 0.999-5650.6    b_Monthly   0.043716  0.051597  0.001306     0.007070   
109  a. 0.999-5650.6    b_Monthly   0.043716  0.055197  0.002677     0.007449   
135  a. 0.999-5650.6    b_Monthly   0.043716  0.050185  0.000893     0.005815   

     account_count MonthSortKey segment_type segment_value  
29           23490    

In [8]:
# Display the combined bin-level csi table (overall + segment rows) produced above.
combined_results

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value
29,2024-10,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.047084,0.000250,0.005537,23490,b_2024-10,Overall,All
56,2024-11,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.049279,0.000666,0.012295,23925,c_2024-11,Overall,All
82,2024-12,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.051597,0.001306,0.007070,48782,d_2024-12,Overall,All
109,2025-01,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.055197,0.002677,0.007449,23063,e_2025-01,Overall,All
135,2025-02,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.050185,0.000893,0.005815,15981,f_2025-02,Overall,All
...,...,...,...,...,...,...,...,...,...,...,...,...
220,2024-11,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.630361,0.000955,0.004209,8511,c_2024-11,ln_user_type,3_Applied_Not_Disbursed
245,2024-12,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.581155,0.008865,0.025722,18286,d_2024-12,ln_user_type,3_Applied_Not_Disbursed
271,2025-01,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.525308,0.028674,0.076179,8930,e_2025-01,ln_user_type,3_Applied_Not_Disbursed
296,2025-02,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.546854,0.019563,0.060019,6040,f_2025-02,ln_user_type,3_Applied_Not_Disbursed


In [9]:
# Relabel the training reference rows with the full training window. Training
# rows carry the last training month ('2024-09') and have no MonthSortKey.
combined_results['Month'] = combined_results['Month'].replace('2024-09', '2024-06-2024-09')
combined_results['MonthSortKey'] = combined_results['MonthSortKey'].fillna('a_2024-06-2024-09')
# Strip a trailing ' 00:00:00' left by stringified timestamps. Non-string values
# (NaN, Timestamp) pass through untouched instead of raising TypeError.
combined_results['Month'] = combined_results['Month'].apply(
    lambda x: x.split(' 00:00:00')[0] if isinstance(x, str) else x
)
# Constant metadata columns identifying the score/model this table belongs to
combined_results['scorename'] = 'CIC_Score'
combined_results['Modelname'] = 'SIL CIC Model'
combined_results['Description'] = 'Train period from 2024-06 to 2024-09'
combined_results

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value,scorename,Modelname,Description
29,2024-10,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.047084,0.000250,0.005537,23490,b_2024-10,Overall,All,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
56,2024-11,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.049279,0.000666,0.012295,23925,c_2024-11,Overall,All,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
82,2024-12,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.051597,0.001306,0.007070,48782,d_2024-12,Overall,All,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
109,2025-01,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.055197,0.002677,0.007449,23063,e_2025-01,Overall,All,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
135,2025-02,cic_Personal_Loans_granted_contracts_amt_24M_bin,a. 0.999-5650.6,b_Monthly,0.043716,0.050185,0.000893,0.005815,15981,f_2025-02,Overall,All,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,2024-11,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.630361,0.000955,0.004209,8511,c_2024-11,ln_user_type,3_Applied_Not_Disbursed,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
245,2024-12,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.581155,0.008865,0.025722,18286,d_2024-12,ln_user_type,3_Applied_Not_Disbursed,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
271,2025-01,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.525308,0.028674,0.076179,8930,e_2025-01,ln_user_type,3_Applied_Not_Disbursed,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09
296,2025-02,cic_vel_contract_nongranted_cnt_12on24_bin,d. missing,b_Monthly,0.655137,0.546854,0.019563,0.060019,6040,f_2025-02,ln_user_type,3_Applied_Not_Disbursed,CIC_Score,SIL CIC Model,Train period from 2024-06 to 2024-09


In [10]:
# Show the dtype of every column in combined_results.
combined_results.dtypes

Month             object
feature           object
bin_value         object
DateCategory      object
train_pct        float64
test_pct         float64
bin_csi          float64
feature_csi      float64
account_count      int64
MonthSortKey      object
segment_type      object
segment_value     object
scorename         object
Modelname         object
Description       object
dtype: object

In [11]:
dataset_id = 'dap_ds_poweruser_playground'
table_id = 'F_CSI_MODEL_FEATURES_BIN_TAB'

# Explicit schema for the load job: one field per combined_results column.
schema = [
    bigquery.SchemaField("Month", "string"),
    bigquery.SchemaField("feature", "string"),
    bigquery.SchemaField("bin_value", "string"),
    bigquery.SchemaField("DateCategory", "string"),
    bigquery.SchemaField("train_pct", "float64"),
    bigquery.SchemaField("test_pct", "float64"),
    bigquery.SchemaField("bin_csi", "float64"),
    bigquery.SchemaField("feature_csi", "float64"),
    bigquery.SchemaField("account_count", "int64"),
    bigquery.SchemaField("MonthSortKey", "string"),
    bigquery.SchemaField("segment_type", "string"),
    bigquery.SchemaField("segment_value", "string"),
    bigquery.SchemaField("scorename", "string"),
    bigquery.SchemaField("Modelname", "string"),
    bigquery.SchemaField("Description", "string"),
    ]

# Build the references directly; Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
table_ref = dataset_ref.table(table_id)

# The job APPENDS to the table (creating it when absent). This was already the
# effective behaviour, because no write disposition was set and WRITE_APPEND is the
# load-job default; it is spelled out here so the intent is visible. The table is
# dropped in the first query cell of the notebook, so a full run starts from empty.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# Load the DataFrame into BigQuery and block until the job has finished.
job = client.load_table_from_dataframe(combined_results, table_ref, job_config=job_config)
job.result()
print(f"Table {table_id} created in dataset {dataset_id}.")

Table F_CSI_MODEL_FEATURES_BIN_TAB created in dataset dap_ds_poweruser_playground.


# beta_demo_score

In [12]:
# Query the beta-demo model inputs from the SIL risk master table.
#   * Keeps applications submitted on or after 2023-07-01.
#   * dataselection: 'Train' for 2023-07-01..2024-06-30, 'Test' from 2024-07-01.
#     NOTE(review): given the WHERE filter the 'Other' branch looks unreachable - confirm.
#   * Each raw feature is mapped to a lettered bin label; nulls (and the literal
#     'nan' for marital status / ref2 type) go to a 'missing' bin.
#   * The LIKE patterns contain no wildcards, so they act as exact matches.
#   * NOTE(review): the vas_opted_flag CASE has no ELSE, so any value other than
#     NULL, '1' or '0' yields a NULL bin - confirm that no such values exist.
#   * beta_de_ln_age_bin is selected as-is (no CASE is applied to it here).
sq = """
with sildemo as 
(select 
digitalLoanAccountId,
FORMAT_DATE('%Y-%m', ln_appln_submit_datetime) Application_month,
FORMAT_DATE('%F', DATE_TRUNC(ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
EXTRACT(WEEK(MONDAY) FROM ln_appln_submit_datetime) as Appl_week_number,
ln_user_type,
ln_loan_type,
ln_prod_type, 
ln_os_type,
beta_demo_score,
case when date_trunc(ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
case when beta_de_ln_vas_opted_flag is null then 'c. missing'
     when beta_de_ln_vas_opted_flag = '1' then 'a. 1'
     when beta_de_ln_vas_opted_flag = '0' then 'b. 0'
     end beta_de_ln_vas_opted_flag_bin,
case when beta_de_ln_doc_type_rolled is null then 'i. missing'
     when beta_de_ln_doc_type_rolled like 'Driving License' then 'a. Driving License'
     when beta_de_ln_doc_type_rolled like 'Others' then 'b. Others'
     when beta_de_ln_doc_type_rolled like 'Passport' then 'c. Passport'
     when beta_de_ln_doc_type_rolled like 'Postal ID Card' then 'd. Postal ID Card'
     when beta_de_ln_doc_type_rolled like 'Professional ID Card' then 'e. Professional ID Card'
     when beta_de_ln_doc_type_rolled like 'Social Security Card' then 'f. Social Security Card'
     when beta_de_ln_doc_type_rolled like 'UMID Card' then 'g. UMID Card'
     when beta_de_ln_doc_type_rolled like 'Voter Card' then 'h. Voter Card'
     else 'j. NA' end beta_de_ln_doc_type_rolled_bin,
case when (beta_de_ln_marital_status is null or beta_de_ln_marital_status like 'nan') then 'f. missing'
     when beta_de_ln_marital_status like 'Annulled / Separated' then 'a. Annulled / Separated'
     when beta_de_ln_marital_status like 'Married' then 'b. Married'
     when beta_de_ln_marital_status like 'Single' then 'c. Single'
     when beta_de_ln_marital_status like 'Widow / Widower' then 'd. Widow / Widower'
     when beta_de_ln_marital_status like 'With a Live-in Partner' then 'e. With a Live-in Partner'
     else 'g. NA' end beta_de_ln_marital_status_bin,
beta_de_ln_age_bin,
case when (beta_de_ln_ref2_type is null or beta_de_ln_ref2_type like 'nan') then 'g. missing'
     when beta_de_ln_ref2_type like 'Child' then 'a. Child'
     when beta_de_ln_ref2_type like 'Co-worker' then 'b. Co-worker'
     when beta_de_ln_ref2_type like 'Friend' then 'c. Friend'
     when beta_de_ln_ref2_type like 'Parent' then 'd. Parent'
     when beta_de_ln_ref2_type like 'Sibling' then 'e. Sibling'
     when beta_de_ln_ref2_type like 'Spouse' then 'f. Spouse'
     else 'h. NA' end beta_de_ln_ref2_type_bin
from risk_mart.sil_risk_ds_master_20230101_20250223 
where date_trunc(ln_appln_submit_datetime, day) >= '2023-07-01'
)
select * from sildemo;
"""
# Run the query and download the full result set into a DataFrame (tqdm progress bar).
sildemodf = client.query(sq).to_dataframe(progress_bar_type='tqdm')

Job ID 1372122f-b506-43ed-b59d-9d6dca34b7c6 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [13]:
# Report the (rows, columns) shape of the downloaded frame.
print(f"The shape of the sildemodf is:\t {sildemodf.shape}")

The shape of the sildemodf is:	 (330431, 15)


In [14]:
# Preview the first five rows of the downloaded frame.
sildemodf.head() 

Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_user_type,ln_loan_type,ln_prod_type,ln_os_type,beta_demo_score,dataselection,beta_de_ln_vas_opted_flag_bin,beta_de_ln_doc_type_rolled_bin,beta_de_ln_marital_status_bin,beta_de_ln_age_bin,beta_de_ln_ref2_type_bin
0,8000ae81-7347-4f15-b10f-fdddb0ba98ac,2024-02,2024-02-12,7,3_Applied_Not_Disbursed,SIL-Instore,Mall,Android,0.189784,Train,a. 1,c. Passport,b. Married,"(39.0, 47.0]",a. Child
1,f756a256-7ca9-415c-b675-e3a12716534d,2023-08,2023-08-07,32,3_Applied_Not_Disbursed,SIL-Instore,Appliance,iOS,0.170973,Train,b. 0,g. UMID Card,c. Single,"(33.0, 39.0]",g. missing
2,0a61b7b7-56f3-4dce-aae3-43c86d217c0b,2023-09,2023-09-25,39,3_Applied_Not_Disbursed,SIL-Instore,Mall,Android,0.141437,Train,b. 0,g. UMID Card,b. Married,"(28.0, 33.0]",g. missing
3,d92cb2eb-8b66-4a23-aedb-edf25808feba,2023-10,2023-10-02,40,3_Applied_Not_Disbursed,SIL-Instore,Mall,iOS,0.098141,Train,b. 0,d. Postal ID Card,c. Single,"(-inf, 28.0]",c. Friend
4,a7b9c34b-b1b1-4814-95e3-b43bee7191b9,2023-09,2023-09-18,38,3_Applied_Not_Disbursed,SIL-Instore,Mall,Android,0.150388,Train,b. 0,c. Passport,b. Married,"(39.0, 47.0]",g. missing


In [15]:
# List the column names as an array (the identifier, segment and *_bin feature columns used below).
sildemodf.columns.values

array(['digitalLoanAccountId', 'Application_month',
       'Appl_week_start_date', 'Appl_week_number', 'ln_user_type',
       'ln_loan_type', 'ln_prod_type', 'ln_os_type', 'beta_demo_score',
       'dataselection', 'beta_de_ln_vas_opted_flag_bin',
       'beta_de_ln_doc_type_rolled_bin', 'beta_de_ln_marital_status_bin',
       'beta_de_ln_age_bin', 'beta_de_ln_ref2_type_bin'], dtype=object)

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime

def calculate_categorical_csi(train_dist, test_dist):
    """
    Calculate csi for categorical features.

    csi = sum over categories of (test - train) * ln(test / train).

    Every proportion is floored at 0.0001 before the ratio is taken. This covers
    categories that are absent from one distribution as well as categories that
    are present with a proportion of zero (or below the floor), so the result is
    always finite and uses the same floor as the bin-level csi calculation.

    Args:
        train_dist: Distribution of categories in training set
            (Series indexed by category, values are proportions)
        test_dist: Distribution of categories in test set
            (Series indexed by category, values are proportions)

    Returns:
        float: csi value
    """
    floor = 0.0001  # Smallest proportion allowed; avoids log(0) and division by zero

    # Union of categories in first-seen order (deterministic, unlike iterating a set)
    all_categories = list(dict.fromkeys([*train_dist.index, *test_dist.index]))

    # Align both distributions on the same categories and apply the floor everywhere
    train_dist_aligned = train_dist.reindex(all_categories, fill_value=floor).clip(lower=floor)
    test_dist_aligned = test_dist.reindex(all_categories, fill_value=floor).clip(lower=floor)

    # Calculate csi
    csi_values = (test_dist_aligned - train_dist_aligned) * np.log(test_dist_aligned / train_dist_aligned)
    return csi_values.sum()

def calculate_bin_csi(train_df, test_df, feature):
    """
    Calculate csi for each bin value within a feature.

    The feature-level csi is computed from the non-null value distribution of
    each frame, while each bin's share is its row count divided by the total
    number of rows in the frame (null rows included). Shares below 0.0001 are
    raised to 0.0001 before the ratio is taken.

    Args:
        train_df: Training DataFrame
        test_df: Test DataFrame
        feature: Feature name to calculate bin-level csi for

    Returns:
        DataFrame: csi results for each bin value (columns: feature, bin_value,
        train_pct, test_pct, bin_csi, feature_csi)
    """
    floor = 0.0001
    train_column = train_df[feature]
    test_column = test_df[feature]

    # Every bin seen in either frame, nulls excluded
    all_bins = set(train_column.dropna().unique()) | set(test_column.dropna().unique())

    # Feature-level csi from the normalised non-null counts of each frame
    train_counts = train_column.value_counts(dropna=True)
    test_counts = test_column.value_counts(dropna=True)
    overall_csi = calculate_categorical_csi(
        train_counts / train_counts.sum(),
        test_counts / test_counts.sum(),
    )

    train_total = train_df.shape[0]
    test_total = test_df.shape[0]

    def _floored_share(column, total, bin_value):
        # Share of rows equal to bin_value; the floor stands in for an empty frame
        share = (column == bin_value).sum() / total if total > 0 else floor
        return floor if share < floor else share

    bin_csi_results = []
    for bin_value in all_bins:
        train_bin_pct = _floored_share(train_column, train_total, bin_value)
        test_bin_pct = _floored_share(test_column, test_total, bin_value)
        bin_csi_results.append({
            'feature': feature,
            'bin_value': bin_value,
            'train_pct': train_bin_pct,
            'test_pct': test_bin_pct,
            'bin_csi': (test_bin_pct - train_bin_pct) * np.log(test_bin_pct / train_bin_pct),
            'feature_csi': overall_csi
        })

    return pd.DataFrame(bin_csi_results)

def calculate_segmented_bin_csi(df, feature_list, segment_columns=None):
    """
    Calculate csi for each bin value within multiple features, overall and by segments.

    The unsegmented result is tagged segment_type='Overall' / segment_value='All'.
    For every requested segment column, each non-null value with at least 50 rows
    gets its own result tagged with that column name and value. A segment whose
    calculation raises is reported with a printed message and skipped.

    Args:
        df: DataFrame containing the data
        feature_list: List of feature names to calculate csi for
        segment_columns: List of columns to segment by (e.g., ['ln_user_type', 'ln_os_type'])

    Returns:
        DataFrame: csi results for each bin value by month and segment
    """
    columns_to_segment = [] if segment_columns is None else segment_columns

    # Unsegmented baseline comes first
    result_frames = [
        calculate_feature_bin_csi(df, feature_list).assign(
            segment_type='Overall',
            segment_value='All',
        )
    ]

    for segment_col in columns_to_segment:
        if segment_col not in df.columns:
            print(f"Warning: {segment_col} not found in DataFrame. Skipping.")
            continue

        for segment_val in df[segment_col].dropna().unique():
            segment_df = df[df[segment_col] == segment_val]

            # Segments under 50 rows are skipped (arbitrary threshold)
            row_count = len(segment_df)
            if row_count < 50:
                print(f"Skipping {segment_col}={segment_val} due to insufficient data ({row_count} rows).")
                continue

            try:
                result_frames.append(
                    calculate_feature_bin_csi(segment_df, feature_list).assign(
                        segment_type=segment_col,
                        segment_value=segment_val,
                    )
                )
            except Exception as e:
                print(f"Error calculating csi for {segment_col}={segment_val}: {e}")

    # The overall frame is always present, so there is always something to concatenate
    return pd.concat(result_frames, ignore_index=True)

def calculate_feature_bin_csi(df, feature_list):
    """
    Calculate csi for each bin value within multiple features.

    Rows with dataselection == 'Train' form the baseline; rows with
    dataselection == 'Test' are compared against it one Application_month at a time.

    Output rows:
      * 'a_Training': one row per feature and training bin, labelled with the
        latest training month, with bin_csi and feature_csi fixed at 0.0. These
        rows carry no 'MonthSortKey' value.
      * 'b_Monthly': one row per test month, feature and bin (union of training
        and that month's bins, in a deterministic order). 'MonthSortKey' is the
        month prefixed with b, c, d, ... in chronological order. Shares below
        0.0001 are raised to 0.0001 before the ratio is taken.

    A feature whose monthly calculation raises is reported with a printed
    message and skipped for that month.

    Args:
        df: DataFrame containing the data. Reads the 'dataselection',
            'Application_month' and 'digitalLoanAccountId' columns plus the
            feature columns.
        feature_list: List of feature names to calculate csi for

    Returns:
        DataFrame: csi results for each bin value by month; an empty DataFrame
        when there are no Train rows or no Test rows
    """
    floor = 0.0001  # Smallest share used in the csi ratio (avoids log(0) / division by zero)

    # Boolean filtering already returns new frames and nothing below mutates them,
    # so no defensive copy of the input is needed.
    train_df = df[df['dataselection'] == 'Train']
    test_df = df[df['dataselection'] == 'Test']

    # Skip if either dataset is empty
    if train_df.empty or test_df.empty:
        print("Warning: Either train or test dataset is empty. Skipping csi calculation.")
        return pd.DataFrame()

    # Label for the training rows: the latest training month as 'YYYY-MM'
    last_train_month = train_df['Application_month'].max()
    if isinstance(df['Application_month'].iloc[0], str):
        # Strings such as 'YYYY-MM' or 'YYYY-MM-DD': keep the first 7 characters
        last_train_month_str = str(last_train_month)[:7]
    else:
        try:
            last_train_month_str = pd.to_datetime(last_train_month).strftime('%Y-%m')
        except Exception:
            # Best-effort fallback when the value cannot be formatted as a date
            last_train_month_str = str(last_train_month)

    # Store all bin-level csi results
    all_bin_results = []

    # Training-side quantities do not depend on the test month, so compute them once:
    # feature -> (share per training bin, normalised non-null distribution)
    train_account_count = train_df['digitalLoanAccountId'].nunique()
    train_baselines = {}

    for feature in feature_list:
        if feature not in train_df.columns:
            print(f"Warning: Feature {feature} not found in training data. Skipping.")
            continue

        train_column = train_df[feature]
        train_pct_by_bin = {}
        for bin_value in train_column.dropna().unique():
            bin_share = (train_column == bin_value).mean()
            train_pct_by_bin[bin_value] = bin_share
            all_bin_results.append({
                'Month': last_train_month_str,
                'feature': feature,
                'bin_value': bin_value,
                'DateCategory': 'a_Training',
                'train_pct': bin_share,
                'test_pct': bin_share,  # Same as train for training data
                'bin_csi': 0.0,  # csi against itself is 0
                'feature_csi': 0.0,  # Overall csi against itself is 0
                'account_count': train_account_count
            })

        train_baselines[feature] = (
            train_pct_by_bin,
            train_column.value_counts(dropna=True, normalize=True),
        )

    # Test months in chronological order; position 0 -> 'b', 1 -> 'c', ...
    # ('a' is reserved for Training)
    test_months = sorted(test_df['Application_month'].unique())

    for position, month in enumerate(test_months):
        original_month_str = str(month)
        if isinstance(month, str):
            original_month_str = original_month_str[:7]  # Extract YYYY-MM part
        month_sort_key = f"{chr(98 + position)}_{original_month_str}"  # ASCII: b=98, c=99, etc.

        month_df = test_df[test_df['Application_month'] == month]
        if month_df.empty:
            continue

        month_accounts = month_df['digitalLoanAccountId'].nunique()

        for feature in feature_list:
            if feature not in month_df.columns:
                continue

            try:
                train_pct_by_bin, train_distribution = train_baselines[feature]
                month_column = month_df[feature]

                # Bins from training and from this month, ordered by their text form
                # so the row order is reproducible across runs
                all_bins = sorted(
                    set(train_pct_by_bin) | set(month_column.dropna().unique()),
                    key=str,
                )

                # Feature-level csi for reference
                test_distribution = month_column.value_counts(dropna=True, normalize=True)
                overall_csi = calculate_categorical_csi(train_distribution, test_distribution)

                for bin_value in all_bins:
                    # A bin never seen in training has a share of 0 there
                    train_pct = max(train_pct_by_bin.get(bin_value, 0.0), floor)
                    test_pct = max((month_column == bin_value).mean(), floor)

                    bin_csi = (test_pct - train_pct) * np.log(test_pct / train_pct)

                    all_bin_results.append({
                        'Month': original_month_str,
                        'MonthSortKey': month_sort_key,
                        'feature': feature,
                        'bin_value': bin_value,
                        'DateCategory': 'b_Monthly',
                        'train_pct': train_pct,
                        'test_pct': test_pct,
                        'bin_csi': bin_csi,
                        'feature_csi': overall_csi,
                        'account_count': month_accounts
                    })
            except Exception as e:
                print(f"Error calculating bin csi for {feature} in {month}: {e}")

    # Create the output DataFrame
    return pd.DataFrame(all_bin_results)

# Binned beta-demo features whose distribution shift is tracked
feature_list = [
    'beta_de_ln_vas_opted_flag_bin',
    'beta_de_ln_doc_type_rolled_bin',
    'beta_de_ln_marital_status_bin',
    'beta_de_ln_age_bin',
    'beta_de_ln_ref2_type_bin',
]

# Columns used to break the population into segments
segment_columns = [
    'ln_user_type',
    'ln_prod_type',
    'ln_os_type',
]

# Bin-level csi for the whole population and for every segment value
bin_results = calculate_segmented_bin_csi(
    df=sildemodf,
    feature_list=feature_list,
    segment_columns=segment_columns,
)

                              
                                  
# Combine with the s_apps_score results when that frame exists in the kernel.
# Only the existence check sits inside the try, so a NameError raised by anything
# else is no longer mistaken for "frame not defined".
try:
    s_apps_score_output_df
except NameError:
    # If s_apps_score_output_df doesn't exist, just use bin_results
    combined_results = bin_results
else:
    # First ensure the s_apps_score_output_df has the same structure
    if 'MonthSortKey' not in s_apps_score_output_df.columns:
        s_apps_score_output_df['MonthSortKey'] = s_apps_score_output_df['Month']
        # Update DateCategory with prefix
        s_apps_score_output_df['DateCategory'] = s_apps_score_output_df['DateCategory'].apply(
            lambda x: 'a_Training' if x == 'Training' else 'b_Monthly'
        )

    # Add segment info to s_apps_score_output_df
    s_apps_score_output_df['segment_type'] = 'Overall'
    s_apps_score_output_df['segment_value'] = 'All'

    # Add bin_value column to s_apps_score_output_df (as 'All' for feature-level csi)
    s_apps_score_output_df['bin_value'] = 'All'

    # Rename csivalues to feature_csi for consistency
    if 'csivalues' in s_apps_score_output_df.columns:
        s_apps_score_output_df = s_apps_score_output_df.rename(columns={'csivalues': 'feature_csi'})

    # Add bin_csi column (same as feature_csi for feature-level csi)
    if 'feature_csi' in s_apps_score_output_df.columns:
        s_apps_score_output_df['bin_csi'] = s_apps_score_output_df['feature_csi']

    # Replace 'scorename' with 'feature' for consistency
    if 'scorename' in s_apps_score_output_df.columns:
        if 'feature' in s_apps_score_output_df.columns:
            # Keep existing feature labels; fill only the gaps from scorename
            s_apps_score_output_df['feature'] = s_apps_score_output_df['feature'].fillna(
                s_apps_score_output_df['scorename']
            )
        else:
            # No feature column yet: the score name becomes the feature label
            # (previously this case raised KeyError)
            s_apps_score_output_df['feature'] = s_apps_score_output_df['scorename']
        s_apps_score_output_df = s_apps_score_output_df.drop('scorename', axis=1)

    # Combine with bin_results
    combined_results = pd.concat([s_apps_score_output_df, bin_results], ignore_index=True)

# Order rows by segment, feature and bin, then by MonthSortKey when that column
# exists (otherwise by Month)
month_order_column = 'MonthSortKey' if 'MonthSortKey' in combined_results.columns else 'Month'
sort_columns = ['segment_type', 'segment_value', 'feature', 'bin_value', month_order_column]
combined_results = combined_results.sort_values(sort_columns)

# Persist the detailed bin-level results (row index not written)
combined_results.to_csv('bin_level_csi_results_sildemo.csv', index=False)

# Quick look at the first rows
print("Sample of bin-level csi results:")
print(combined_results.head())

# Pivot tables follow for easier analysis
print("\nGenerating pivot tables for bin-level analysis...")

# Function to create pivot table for a given segment and feature
def create_bin_pivot(data, segment_type, segment_value, feature=None):
    """
    Pivot bin-level csi for one segment into a bins-by-month table.

    Args:
        data: DataFrame with segment_type, segment_value, feature, bin_value,
            bin_csi and a MonthSortKey (or Month) column
        segment_type: Segment type to keep (e.g. 'Overall')
        segment_value: Segment value to keep (e.g. 'All')
        feature: Optional feature name; when given (and truthy) only that
            feature's rows are kept

    Returns:
        DataFrame: rows indexed by (feature, bin_value), one column per
        MonthSortKey (or per Month when MonthSortKey is absent), holding the
        first bin_csi found for each cell
    """
    in_segment = (data['segment_type'] == segment_type) & (data['segment_value'] == segment_value)
    selected = data[in_segment]

    if feature:
        selected = selected[selected['feature'] == feature]

    # Prefer the sortable month key for the column axis
    # NOTE(review): rows whose month key is NaN look like they are left out of the
    # pivot by pandas' grouping - confirm if the training rows are expected here.
    month_axis = 'MonthSortKey' if 'MonthSortKey' in selected.columns else 'Month'

    return selected.pivot_table(
        index=['feature', 'bin_value'],
        columns=[month_axis],
        values='bin_csi',
        aggfunc='first',
    )

# Create bin pivot tables for overall and by segments
unique_segment_combos = combined_results[['segment_type', 'segment_value']].drop_duplicates()
unique_features = combined_results['feature'].unique()

# Excel sheet names are limited to 31 characters and may not contain []:*?/\
EXCEL_SHEET_NAME_LIMIT = 31
_SHEET_NAME_CHAR_FIXES = str.maketrans({bad_char: '_' for bad_char in '[]:*?/\\'})


def _excel_sheet_name(prefix, feature_suffix, used_names):
    """
    Build a unique, Excel-safe sheet name of the form '<prefix>_<feature_suffix>'.

    When the combined name is too long the prefix is shortened, never the feature
    suffix. Truncating the end of the name instead would drop the feature part for
    long segment names (e.g. 'ln_user_type_3_Applied_Not_Disbursed' is already 36
    characters), so every feature of that segment would be written to the same sheet.

    Args:
        prefix: Leading part of the name (segment description)
        feature_suffix: Trailing part of the name (tail of the feature name)
        used_names: Set of lower-cased names already taken; updated in place

    Returns:
        str: sheet name of at most 31 characters not present in used_names
    """
    prefix = str(prefix).translate(_SHEET_NAME_CHAR_FIXES)
    feature_suffix = str(feature_suffix).translate(_SHEET_NAME_CHAR_FIXES)

    room_for_prefix = max(EXCEL_SHEET_NAME_LIMIT - len(feature_suffix) - 1, 0)
    base_name = f"{prefix[:room_for_prefix]}_{feature_suffix}"[:EXCEL_SHEET_NAME_LIMIT]

    # Excel compares sheet names case-insensitively; add a numeric tag on a clash
    candidate = base_name
    duplicate_number = 2
    while candidate.lower() in used_names:
        tag = f"_{duplicate_number}"
        candidate = f"{base_name[:EXCEL_SHEET_NAME_LIMIT - len(tag)]}{tag}"
        duplicate_number += 1

    used_names.add(candidate.lower())
    return candidate


used_sheet_names = {'overall_all_features'}

# Create Excel writer to save all pivots in one file
with pd.ExcelWriter('bin_level_csi_pivots_sildemo.xlsx') as writer:
    # First, create overall pivot with all features and bins
    overall_pivot = create_bin_pivot(combined_results, 'Overall', 'All')
    overall_pivot.to_excel(writer, sheet_name='Overall_All_Features')
    print("Created overall pivot table for all features")

    # Create separate pivot for each feature (Overall segment)
    for feature in unique_features:
        feature_pivot = create_bin_pivot(combined_results, 'Overall', 'All', feature)

        # Last 20 characters of the feature name keep the sheet name short
        sheet_name = _excel_sheet_name('Overall', feature[-20:], used_sheet_names)

        feature_pivot.to_excel(writer, sheet_name=sheet_name)
        print(f"Created pivot for feature: {feature}")

    # Create separate pivot for each segment and feature combination
    for _, segment_row in unique_segment_combos.iterrows():
        segment_type = segment_row['segment_type']
        segment_value = segment_row['segment_value']

        # Skip Overall segment as we already handled it
        if segment_type == 'Overall' and segment_value == 'All':
            continue

        # Create segment-specific pivots for each feature
        for feature in unique_features:
            # Filter data for this segment and feature
            segment_feature_data = combined_results[
                (combined_results['segment_type'] == segment_type) &
                (combined_results['segment_value'] == segment_value) &
                (combined_results['feature'] == feature)
            ]

            # Skip if no data
            if segment_feature_data.empty:
                continue

            # Create pivot - rows are bin values, columns are months
            pivot = segment_feature_data.pivot_table(
                index=['bin_value'],
                columns=['MonthSortKey'] if 'MonthSortKey' in segment_feature_data.columns else ['Month'],
                values='bin_csi',
                aggfunc='first'
            )

            # Last 10 characters of the feature name identify the feature on the sheet tab
            sheet_name = _excel_sheet_name(
                f"{segment_type}_{segment_value}",
                feature[-10:],
                used_sheet_names,
            )

            pivot.to_excel(writer, sheet_name=sheet_name)
            print(f"Created pivot for {segment_type}={segment_value}, feature={feature}")

print("\nAll bin-level csi results and pivot tables have been saved.")

# Create summary table showing which bins are the biggest contributors to csi
print("\nGenerating bin contribution summary...")

# One record per (segment, feature, month, rank) for the three largest bin-level csi values
summary_data = []

for segment_type in combined_results['segment_type'].unique():
    rows_of_segment_type = combined_results[combined_results['segment_type'] == segment_type]

    for segment_value in rows_of_segment_type['segment_value'].unique():
        for feature in combined_results['feature'].unique():
            # Rows for this segment and feature
            segment_feature_data = combined_results[
                (combined_results['segment_type'] == segment_type)
                & (combined_results['segment_value'] == segment_value)
                & (combined_results['feature'] == feature)
            ]
            if segment_feature_data.empty:
                continue

            for month in segment_feature_data['Month'].unique():
                month_data = segment_feature_data[segment_feature_data['Month'] == month]
                if month_data.empty:
                    continue

                # Feature csi (should be same for all bins in this feature/month/segment)
                feature_csi = month_data['feature_csi'].iloc[0]
                if 'bin_csi' not in month_data.columns:
                    continue

                # Three largest contributors by absolute bin_csi
                top_bins = month_data.sort_values('bin_csi', key=abs, ascending=False).head(3)

                for rank, (_, bin_row) in enumerate(top_bins.iterrows(), start=1):
                    bin_csi = bin_row['bin_csi']

                    # Share of the feature csi explained by this bin, in percent
                    if feature_csi != 0:
                        pct_contribution = bin_csi / feature_csi * 100
                    else:
                        pct_contribution = 0

                    summary_data.append({
                        'segment_type': segment_type,
                        'segment_value': segment_value,
                        'feature': feature,
                        'Month': month,
                        'feature_csi': feature_csi,
                        'bin_value': bin_row['bin_value'],
                        'bin_csi': bin_csi,
                        'pct_contribution': pct_contribution,
                        'rank': rank
                    })

# Create summary DataFrame
if not summary_data:
    print("No data available for bin contribution summary")
else:
    summary_df = pd.DataFrame(summary_data)

    # One row per segment/feature/month, one column group per rank
    contribution_pivot = summary_df.pivot_table(
        index=['segment_type', 'segment_value', 'feature', 'Month', 'feature_csi'],
        columns=['rank'],
        values=['bin_value', 'bin_csi', 'pct_contribution'],
        aggfunc='first'
    )

    # Save to Excel
    contribution_pivot.to_excel('bin_contribution_summary_sildemo.xlsx')
    print("Bin contribution summary saved to 'bin_contribution_summary_sildemo.xlsx'")

print("\nAnalysis complete!")

Sample of bin-level csi results:
       Month             feature     bin_value DateCategory  train_pct  \
49   2024-07  beta_de_ln_age_bin  (-inf, 28.0]    b_Monthly   0.264211   
78   2024-08  beta_de_ln_age_bin  (-inf, 28.0]    b_Monthly   0.264211   
107  2024-09  beta_de_ln_age_bin  (-inf, 28.0]    b_Monthly   0.264211   
136  2024-10  beta_de_ln_age_bin  (-inf, 28.0]    b_Monthly   0.264211   
165  2024-11  beta_de_ln_age_bin  (-inf, 28.0]    b_Monthly   0.264211   

     test_pct       bin_csi  feature_csi  account_count MonthSortKey  \
49   0.266755  2.437043e-05     0.003335          20382    b_2024-07   
78   0.269176  9.240946e-05     0.003258          25136    c_2024-08   
107  0.267877  5.050427e-05     0.004813          25284    d_2024-09   
136  0.264095  5.120661e-08     0.009950          23980    e_2024-10   
165  0.239071  2.513809e-03     0.016284          24407    f_2024-11   

    segment_type segment_value  
49       Overall           All  
78       Overall       

In [17]:
# Row counts per (Month, MonthSortKey) pair; dropna=False keeps pairs with a missing
# MonthSortKey visible.
combined_results[['Month','MonthSortKey']].value_counts(dropna=False)

Month    MonthSortKey
2025-02  i_2025-02       252
2024-07  b_2024-07       250
2024-08  c_2024-08       250
2024-06  NaN             249
2024-09  d_2024-09       249
2024-10  e_2024-10       249
2024-11  f_2024-11       249
2024-12  g_2024-12       249
2025-01  h_2025-01       249
Name: count, dtype: int64

In [18]:
# Preview the first rows of the combined CSI results
combined_results.head()

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value
49,2024-07,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.266755,2.437043e-05,0.003335,20382,b_2024-07,Overall,All
78,2024-08,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.269176,9.240946e-05,0.003258,25136,c_2024-08,Overall,All
107,2024-09,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.267877,5.050427e-05,0.004813,25284,d_2024-09,Overall,All
136,2024-10,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.264095,5.120661e-08,0.00995,23980,e_2024-10,Overall,All
165,2024-11,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.239071,0.002513809,0.016284,24407,f_2024-11,Overall,All


In [19]:
# Relabel the '2024-06' rows with the full training window, then drop any ' 00:00:00' time suffix
combined_results['Month'] = (
    combined_results['Month']
    .replace({'2024-06': '2023-07-2024-06'})
    .map(lambda label: label.split(' 00:00:00')[0] if '00:00:00' in label else label)
)

# Rows without a sort key get the 'a_' key so they order ahead of the b_, c_, ... months
combined_results['MonthSortKey'] = combined_results['MonthSortKey'].fillna('a_2023-07-2024-06')

# Constant identification columns for this model's rows
combined_results['scorename'] = 'beta_demo_score'
combined_results['Modelname'] = 'SIL Beta Demo'
combined_results['Description'] = 'Train period from 2023-07 to 2024-06'

combined_results

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value,scorename,Modelname,Description
49,2024-07,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.266755,2.437043e-05,0.003335,20382,b_2024-07,Overall,All,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
78,2024-08,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.269176,9.240946e-05,0.003258,25136,c_2024-08,Overall,All,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
107,2024-09,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.267877,5.050427e-05,0.004813,25284,d_2024-09,Overall,All,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
136,2024-10,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.264095,5.120661e-08,0.009950,23980,e_2024-10,Overall,All,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
165,2024-11,beta_de_ln_age_bin,"(-inf, 28.0]",b_Monthly,0.264211,0.239071,2.513809e-03,0.016284,24407,f_2024-11,Overall,All,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,2024-11,beta_de_ln_vas_opted_flag_bin,b. 0,b_Monthly,0.364409,0.271113,2.759189e-02,0.040370,8952,f_2024-11,ln_user_type,3_Applied_Not_Disbursed,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
436,2024-12,beta_de_ln_vas_opted_flag_bin,b. 0,b_Monthly,0.364409,0.253320,4.039473e-02,0.058289,19051,g_2024-12,ln_user_type,3_Applied_Not_Disbursed,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
465,2025-01,beta_de_ln_vas_opted_flag_bin,b. 0,b_Monthly,0.364409,0.230263,6.158045e-02,0.087269,9272,h_2025-01,ln_user_type,3_Applied_Not_Disbursed,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06
494,2025-02,beta_de_ln_vas_opted_flag_bin,b. 0,b_Monthly,0.364409,0.198197,1.012263e-01,0.139839,6322,i_2025-02,ln_user_type,3_Applied_Not_Disbursed,beta_demo_score,SIL Beta Demo,Train period from 2023-07 to 2024-06


In [20]:
dataset_id = 'dap_ds_poweruser_playground'
table_id = 'F_CSI_MODEL_FEATURES_BIN_TAB'

# Column schema of the destination table
schema = [
    bigquery.SchemaField("Month", "string"),
    bigquery.SchemaField("feature", "string"),
    bigquery.SchemaField("bin_value", "string"),
    bigquery.SchemaField("DateCategory", "string"),
    bigquery.SchemaField("train_pct", "float64"),
    bigquery.SchemaField("test_pct", "float64"),
    bigquery.SchemaField("bin_csi", "float64"),
    bigquery.SchemaField("feature_csi", "float64"),
    bigquery.SchemaField("account_count", "int64"),
    bigquery.SchemaField("MonthSortKey", "string"),
    bigquery.SchemaField("segment_type", "string"),
    bigquery.SchemaField("segment_value", "string"),
    bigquery.SchemaField("scorename", "string"),
    bigquery.SchemaField("Modelname", "string"),
    bigquery.SchemaField("Description", "string"),
    ]

# Build the destination reference directly: Client.dataset() is deprecated in
# google-cloud-bigquery in favour of constructing a DatasetReference.
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
table_ref = dataset_ref.table(table_id)

# Rows are APPENDED, not overwritten. No write disposition was configured before, so the
# load-job default (WRITE_APPEND) applied; it is now spelled out. The table is dropped
# once at the top of the notebook, presumably so each model section can add its own rows.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# Load the DataFrame into BigQuery and block until the job finishes (raises on failure)
job = client.load_table_from_dataframe(combined_results, table_ref, job_config=job_config)
job.result()
print(f"Table {table_id} created in dataset {dataset_id}.")

Table F_CSI_MODEL_FEATURES_BIN_TAB created in dataset dap_ds_poweruser_playground.


# App Score

In [21]:
# App-score extract: application identifiers, month/week keys, segment columns, apps_score,
# the Train/Test/Other flag and five binned features (one CASE expression per feature).
# Bin-label fixes: 'e.' was missing its period and 'f.' advertised 288.2 although the
# bin's upper edge is 288.3.
# NOTE(review): the BETWEEN edges leave small gaps (e.g. 4.1-4.2 or 3532.3-3532.4). For
# the first feature (no ELSE) gap values and values below -2.001 become NULL; for the
# other features they fall into the 'NA' bin. Confirm the source precision makes the
# gaps unreachable.
sq = """
with appscore as 
(select 
digitalLoanAccountId,
FORMAT_DATE('%Y-%m', ln_appln_submit_datetime) Application_month,
FORMAT_DATE('%F', DATE_TRUNC(ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
EXTRACT(WEEK(MONDAY) FROM ln_appln_submit_datetime) as Appl_week_number,
ln_user_type,
ln_loan_type,
ln_prod_type, 
ln_os_type,
apps_score,
case when date_trunc(ln_appln_submit_datetime, day) between '2023-12-01' and '2024-06-30' then 'Train'
       when date_trunc(ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
case when app_first_competitors_install_to_apply_days is null then 'k. missing'
     when app_first_competitors_install_to_apply_days between -2.001 and 4.1 then 'a. -2.001-4.1'
     when app_first_competitors_install_to_apply_days between 4.2 and 40.1 then 'b. 4.2-40.1'
     when app_first_competitors_install_to_apply_days between 40.2 and 88.4 then 'c. 40.2-88.4'
     when app_first_competitors_install_to_apply_days between 88.5 and 143.2 then 'd. 88.5-143.2'
     when app_first_competitors_install_to_apply_days between 143.3 and 206.7 then 'e. 143.3-206.7'
     when app_first_competitors_install_to_apply_days between 206.8 and 288.3 then 'f. 206.8-288.3'
     when app_first_competitors_install_to_apply_days between 288.4 and 391.9 then 'g. 288.4-391.9'
     when app_first_competitors_install_to_apply_days between 392.0 and 547.3 then 'h. 392.0-547.3'
     when app_first_competitors_install_to_apply_days between 547.4 and 826.1 then 'i. 547.4-826.1'
     when app_first_competitors_install_to_apply_days between 826.2 and 5242.2 then 'j. 826.2-5242.2'
     when app_first_competitors_install_to_apply_days > 5242.2 then 'l. >5242.2' end app_first_competitors_install_to_apply_days_bin,
case 
    when app_median_time_bw_installed_mins_30d is null then 'l. missing'
    when app_median_time_bw_installed_mins_30d between -0.001 and 96.022 then 'a. -0.001-96.022'
    when app_median_time_bw_installed_mins_30d between 96.023 and 1166.377 then 'b. 96.023-1166.377'
    when app_median_time_bw_installed_mins_30d between 1166.378 and 2259.803 then 'c. 1166.378-2259.803'
    when app_median_time_bw_installed_mins_30d between 2259.804 and 3532.3 then 'd. 2259.804-3532.3'
    when app_median_time_bw_installed_mins_30d between 3532.4 and 5067.042 then 'e. 3532.4-5067.042'
    when app_median_time_bw_installed_mins_30d between 5067.043 and 7065.507 then 'f. 5067.043-7065.507'
    when app_median_time_bw_installed_mins_30d between 7065.508 and 9891.612 then 'g. 7065.508-9891.612'
    when app_median_time_bw_installed_mins_30d between 9891.613 and 14384.46 then 'h. 9891.613-14384.46'
    when app_median_time_bw_installed_mins_30d between 14384.461 and 20358.378 then 'j. 14384.461-20358.378'
    when app_median_time_bw_installed_mins_30d between 20358.379 and 112663145.6 then 'k. 20358.379-112663145.6'
    else 'm. NA'end app_median_time_bw_installed_mins_30d_bin,
CASE
    WHEN app_cnt_absence_tag_90d is null then 'g. missing'
    WHEN app_cnt_absence_tag_90d BETWEEN -0.001 AND 1.0 THEN 'a. (-0.001, 1.0]'
    WHEN app_cnt_absence_tag_90d BETWEEN 1.001 AND 2.0 THEN 'b. (1.0, 2.0]'
    WHEN app_cnt_absence_tag_90d BETWEEN 2.001 AND 3.0 THEN 'c. (2.0, 3.0]'
    WHEN app_cnt_absence_tag_90d BETWEEN 3.001 AND 4.0 THEN 'd. (3.0, 4.0]'
    WHEN app_cnt_absence_tag_90d BETWEEN 4.001 AND 7.0 THEN 'e. (4.0, 7.0]'
    WHEN app_cnt_absence_tag_90d BETWEEN 7.001 AND 154.0 THEN 'f. (7.0, 154.0]'
    ELSE 'h. NA' END app_cnt_absence_tag_90d_bin,
CASE
    WHEN app_cnt_finance_90d is null then 'f. missing'
    WHEN app_cnt_finance_90d BETWEEN -0.001 AND 1.0 THEN 'a. (-0.001, 1.0]'
    WHEN app_cnt_finance_90d BETWEEN 1.001 AND 2.0 THEN 'b. (1.0, 2.0]'
    WHEN app_cnt_finance_90d BETWEEN 2.001 AND 3.0 THEN 'c. (2.0, 3.0]'
    WHEN app_cnt_finance_90d BETWEEN 3.001 AND 4.0 THEN 'd. (3.0, 4.0]'
    WHEN app_cnt_finance_90d BETWEEN 4.001 AND 30.0 THEN 'e. (4.0, 30.0]'
    ELSE 'g. NA'  
END app_cnt_finance_90d_bin,
CASE
    WHEN app_first_payday_install_to_apply_days is null then 'h. missing'
    WHEN app_first_payday_install_to_apply_days BETWEEN -1.001 AND 0.0 THEN 'a. (-1.001, 0.0]'
    WHEN app_first_payday_install_to_apply_days BETWEEN 0.001 AND 0.1 THEN 'b. (0.0, 0.1]'
    WHEN app_first_payday_install_to_apply_days BETWEEN 0.101 AND 42.1 THEN 'c. (0.1, 42.1]'
    WHEN app_first_payday_install_to_apply_days BETWEEN 42.101 AND 138.8 THEN 'd. (42.1, 138.8]'
    WHEN app_first_payday_install_to_apply_days BETWEEN 138.801 AND 273.54 THEN 'e. (138.8, 273.54]'
    WHEN app_first_payday_install_to_apply_days BETWEEN 273.541 AND 532.2 THEN 'f. (273.54, 532.2]'
    WHEN app_first_payday_install_to_apply_days BETWEEN 532.201 AND 5242.2 THEN 'g. (532.2, 5242.2]'
    ELSE 'i. NA'  -- Add an else clause to handle values outside the defined ranges
END app_first_payday_install_to_apply_days_bin
from risk_mart.sil_risk_ds_master_20230101_20250223 
where date_trunc(ln_appln_submit_datetime, day) >= '2023-12-01'
)
select * from appscore;
"""

# Run the query and download the result into a DataFrame (tqdm progress bar)
appscoredf = client.query(sq).to_dataframe(progress_bar_type='tqdm')

Job ID 61dfaf5c-8603-43f8-934c-84757c1a6628 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [22]:
# Report the (rows, columns) shape of the downloaded app-score extract
print(f"The shape of the appscoredf dataframe is :\t {appscoredf.shape}")

The shape of the appscoredf dataframe is :	 (304245, 15)


In [23]:
# Preview the first rows of the app-score extract
appscoredf.head()

Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_user_type,ln_loan_type,ln_prod_type,ln_os_type,apps_score,dataselection,app_first_competitors_install_to_apply_days_bin,app_median_time_bw_installed_mins_30d_bin,app_cnt_absence_tag_90d_bin,app_cnt_finance_90d_bin,app_first_payday_install_to_apply_days_bin
0,2aeb7d5c-fccf-4953-9fa5-ac56ce53451d,2023-12,2023-11-27,48,1_Repeat Applicant,SIL-Instore,Mall,Android,0.539664,Train,c. 40.2-88.4,g. 7065.508-9891.612,"a. (-0.001, 1.0]","b. (1.0, 2.0]","a. (-1.001, 0.0]"
1,3631d0e1-cc7e-43e2-aead-ef3fcc6d891d,2023-12,2023-11-27,48,1_Repeat Applicant,SIL-Instore,Mall,iOS,,Train,k. missing,l. missing,g. missing,f. missing,h. missing
2,6e64c405-6caa-4870-92db-0e5aa4171374,2023-12,2023-11-27,48,2_New Applicant,SIL-Instore,Mall,Android,0.582709,Train,k. missing,d. 2259.804-3532.3,"d. (3.0, 4.0]","b. (1.0, 2.0]","a. (-1.001, 0.0]"
3,ac3c06de-6ec4-4952-9408-a361e885d53e,2023-12,2023-11-27,48,3_Applied_Not_Disbursed,SIL-Instore,Mall,Android,0.419275,Train,h. 392.0-547.3,a. -0.001-96.022,"e. (4.0, 7.0]","c. (2.0, 3.0]","e. (138.8, 273.54]"
4,0834e57d-a620-448d-aac8-217fa76ef139,2023-12,2023-11-27,48,2_New Applicant,SIL-Instore,Mall,Android,0.544951,Train,g. 288.4-391.9,e. 3532.4-5067.042,"b. (1.0, 2.0]","d. (3.0, 4.0]","c. (0.1, 42.1]"


In [24]:
# List the extract's columns; the *_bin names are the ones passed to the CSI functions
appscoredf.columns

Index(['digitalLoanAccountId', 'Application_month', 'Appl_week_start_date',
       'Appl_week_number', 'ln_user_type', 'ln_loan_type', 'ln_prod_type',
       'ln_os_type', 'apps_score', 'dataselection',
       'app_first_competitors_install_to_apply_days_bin',
       'app_median_time_bw_installed_mins_30d_bin',
       'app_cnt_absence_tag_90d_bin', 'app_cnt_finance_90d_bin',
       'app_first_payday_install_to_apply_days_bin'],
      dtype='object')

In [25]:
import pandas as pd
import numpy as np
from datetime import datetime

def calculate_categorical_csi(train_dist, test_dist):
    """
    Calculate the csi of a categorical feature from two category distributions.

    csi = sum over categories of (test - train) * ln(test / train).

    Every share is floored at 0.0001 before the ratio is taken. This covers both a
    category that is absent from one side and a category that is present with a
    share of zero, which would otherwise produce log(0) or a division by zero.

    Args:
        train_dist: Series of category shares in the training set, indexed by category
        test_dist: Series of category shares in the test set, indexed by category

    Returns:
        float: csi value (0.0 when both distributions are empty)
    """
    floor = 0.0001  # smallest share allowed on either side

    # Union of categories in a reproducible order (plain set order varies between runs)
    all_categories = sorted(set(train_dist.index) | set(test_dist.index), key=str)

    # Align both sides on the same categories, then floor missing, zero and tiny shares
    train_dist_aligned = train_dist.reindex(all_categories, fill_value=floor).clip(lower=floor)
    test_dist_aligned = test_dist.reindex(all_categories, fill_value=floor).clip(lower=floor)

    # Per-category contribution, summed into the feature-level csi
    csi_values = (test_dist_aligned - train_dist_aligned) * np.log(test_dist_aligned / train_dist_aligned)
    return csi_values.sum()

def calculate_bin_csi(train_df, test_df, feature):
    """
    Calculate csi for each bin value within a feature.

    Note that the two levels use different denominators: `feature_csi` is computed
    from distributions normalised over non-null rows only, while `train_pct` and
    `test_pct` divide by the total row count of each frame (nulls included).

    Args:
        train_df: Training DataFrame
        test_df: Test DataFrame
        feature: Feature name to calculate bin-level csi for

    Returns:
        DataFrame: one row per bin with columns feature, bin_value, train_pct,
        test_pct, bin_csi and feature_csi; rows are ordered by str(bin_value)
    """
    floor = 0.0001  # smallest share allowed, avoids log(0) and division by zero

    # Count every bin once; the per-bin loop then only does lookups
    train_counts = train_df[feature].value_counts(dropna=True)
    test_counts = test_df[feature].value_counts(dropna=True)
    train_total = train_df.shape[0]
    test_total = test_df.shape[0]

    # Feature-level csi from the non-null distributions
    overall_csi = calculate_categorical_csi(
        train_counts / train_counts.sum(),
        test_counts / test_counts.sum(),
    )

    # Bins observed on either side, in a reproducible order
    all_bins = sorted(
        set(train_df[feature].dropna().unique()) | set(test_df[feature].dropna().unique()),
        key=str,
    )

    bin_csi_results = []
    for bin_value in all_bins:
        # Share of all rows (nulls included) that fall in this bin, floored
        train_bin_pct = train_counts.get(bin_value, 0) / train_total if train_total > 0 else floor
        test_bin_pct = test_counts.get(bin_value, 0) / test_total if test_total > 0 else floor
        train_bin_pct = max(train_bin_pct, floor)
        test_bin_pct = max(test_bin_pct, floor)

        bin_csi = (test_bin_pct - train_bin_pct) * np.log(test_bin_pct / train_bin_pct)

        bin_csi_results.append({
            'feature': feature,
            'bin_value': bin_value,
            'train_pct': train_bin_pct,
            'test_pct': test_bin_pct,
            'bin_csi': bin_csi,
            'feature_csi': overall_csi
        })

    return pd.DataFrame(bin_csi_results)

def calculate_segmented_bin_csi(df, feature_list, segment_columns=None, min_segment_rows=50):
    """
    Calculate csi for each bin value within multiple features, overall and by segments.

    The overall result is tagged segment_type='Overall' / segment_value='All'; every
    segment result is tagged with its column name and value. Segments whose
    calculation raises are reported with print and skipped (best effort).

    Args:
        df: DataFrame containing the data
        feature_list: List of feature names to calculate csi for
        segment_columns: List of columns to segment by (e.g., ['ln_user_type', 'ln_os_type']);
            None means no segmentation
        min_segment_rows: Segments with fewer rows than this are skipped (default 50,
            the threshold that was previously hard-coded)

    Returns:
        DataFrame: csi results for each bin value by month and segment
    """
    if segment_columns is None:
        segment_columns = []

    # The overall frame is always the first element, so the list is never empty
    overall_results = calculate_feature_bin_csi(df, feature_list)
    overall_results['segment_type'] = 'Overall'
    overall_results['segment_value'] = 'All'
    all_results = [overall_results]

    for segment_col in segment_columns:
        if segment_col not in df.columns:
            print(f"Warning: {segment_col} not found in DataFrame. Skipping.")
            continue

        # Null segment values are not treated as a segment
        for segment_val in df[segment_col].dropna().unique():
            segment_df = df[df[segment_col] == segment_val]

            if len(segment_df) < min_segment_rows:
                print(f"Skipping {segment_col}={segment_val} due to insufficient data ({len(segment_df)} rows).")
                continue

            # One failing segment must not abort the remaining ones
            try:
                segment_results = calculate_feature_bin_csi(segment_df, feature_list)
                segment_results['segment_type'] = segment_col
                segment_results['segment_value'] = segment_val
                all_results.append(segment_results)
            except Exception as e:
                print(f"Error calculating csi for {segment_col}={segment_val}: {e}")

    return pd.concat(all_results, ignore_index=True)

def calculate_feature_bin_csi(df, feature_list):
    """
    Calculate csi for each bin value within multiple features, month by month.

    Rows with dataselection == 'Train' form the reference; rows with
    dataselection == 'Test' are split by Application_month and each month is
    compared against the reference.

    Output rows:
      * one 'a_Training' row per (feature, training bin) with csi 0.0, labelled with
        the last training month. These rows carry no 'MonthSortKey', so that column
        is NaN for them in the returned frame.
      * one 'b_Monthly' row per (test month, feature, bin) with 'MonthSortKey' set to
        '<letter>_<YYYY-MM>', letters starting at 'b' in sorted month order.

    Note that `feature_csi` is computed from distributions normalised over non-null
    rows, while `train_pct` / `test_pct` are shares of all rows (nulls included);
    monthly shares are floored at 0.0001.

    Args:
        df: DataFrame with 'dataselection', 'Application_month',
            'digitalLoanAccountId' and the feature columns
        feature_list: List of feature names to calculate csi for

    Returns:
        DataFrame: csi results for each bin value by month (empty DataFrame when
        either the train or the test subset is empty)
    """
    floor = 0.0001  # smallest share allowed, avoids log(0) and division by zero

    def _month_label(month):
        # String months are cut to 'YYYY-MM'; any other type is only str()-converted
        label = str(month)
        return label[:7] if isinstance(month, str) else label

    # Boolean indexing already yields new frames, so no defensive copy of df is needed
    train_df = df[df['dataselection'] == 'Train']
    test_df = df[df['dataselection'] == 'Test']

    if train_df.empty or test_df.empty:
        print("Warning: Either train or test dataset is empty. Skipping csi calculation.")
        return pd.DataFrame()

    # Label for the training rows: the last month seen in the training subset
    last_train_month = train_df['Application_month'].max()
    if isinstance(df['Application_month'].iloc[0], str):
        last_train_month_str = str(last_train_month)[:7]  # keep the YYYY-MM part
    else:
        try:
            last_train_month_str = pd.to_datetime(last_train_month).strftime('%Y-%m')
        except Exception:
            # Fall back to the raw representation if it cannot be parsed as a date
            last_train_month_str = str(last_train_month)

    # Training shares do not depend on the test month: compute each one once
    train_share_cache = {}

    def _train_share(feature, bin_value):
        key = (feature, bin_value)
        if key not in train_share_cache:
            train_share_cache[key] = (train_df[feature] == bin_value).mean()
        return train_share_cache[key]

    all_bin_results = []

    # Reference rows: training compared against itself, so csi is 0 by definition
    train_accounts = train_df['digitalLoanAccountId'].nunique()
    for feature in feature_list:
        if feature not in train_df.columns:
            print(f"Warning: Feature {feature} not found in training data. Skipping.")
            continue

        for bin_value in train_df[feature].dropna().unique():
            share = _train_share(feature, bin_value)
            all_bin_results.append({
                'Month': last_train_month_str,
                'feature': feature,
                'bin_value': bin_value,
                'DateCategory': 'a_Training',
                'train_pct': share,
                'test_pct': share,  # Same as train for training data
                'bin_csi': 0.0,
                'feature_csi': 0.0,
                'account_count': train_accounts
            })

    # Monthly rows: test months in sorted order, prefixed b, c, d, ... ('a' is training)
    test_months = sorted(test_df['Application_month'].unique())
    for position, month in enumerate(test_months):
        original_month_str = _month_label(month)
        month_str = f"{chr(98 + position)}_{original_month_str}"  # ASCII: b=98, c=99, ...

        month_df = test_df[test_df['Application_month'] == month]
        if month_df.empty:
            continue

        month_accounts = month_df['digitalLoanAccountId'].nunique()

        for feature in feature_list:
            if feature not in month_df.columns:
                continue

            # A failure for one feature/month is reported and does not stop the others
            try:
                # Bins observed in training or in this month, in a reproducible order
                all_bins = sorted(
                    set(train_df[feature].dropna().unique()) | set(month_df[feature].dropna().unique()),
                    key=str,
                )

                # Feature-level csi from the non-null distributions
                train_counts = train_df[feature].value_counts(dropna=True, normalize=True)
                test_counts = month_df[feature].value_counts(dropna=True, normalize=True)
                overall_csi = calculate_categorical_csi(train_counts, test_counts)

                for bin_value in all_bins:
                    # Shares of all rows (nulls included), floored
                    train_pct = max(_train_share(feature, bin_value), floor)
                    test_pct = max((month_df[feature] == bin_value).mean(), floor)

                    bin_csi = (test_pct - train_pct) * np.log(test_pct / train_pct)

                    all_bin_results.append({
                        'Month': original_month_str,
                        'MonthSortKey': month_str,
                        'feature': feature,
                        'bin_value': bin_value,
                        'DateCategory': 'b_Monthly',
                        'train_pct': train_pct,
                        'test_pct': test_pct,
                        'bin_csi': bin_csi,
                        'feature_csi': overall_csi,
                        'account_count': month_accounts
                    })
            except Exception as e:
                print(f"Error calculating bin csi for {feature} in {month}: {e}")

    return pd.DataFrame(all_bin_results)

# Binned app-score features whose distribution shift is tracked
feature_list = [
    'app_first_competitors_install_to_apply_days_bin',
    'app_median_time_bw_installed_mins_30d_bin',
    'app_cnt_absence_tag_90d_bin',
    'app_cnt_finance_90d_bin',
    'app_first_payday_install_to_apply_days_bin',
]

# Columns used to break the results down by segment
segment_columns = ['ln_user_type', 'ln_prod_type', 'ln_os_type']

# Bin-level csi for the whole population and for every segment value
bin_results = calculate_segmented_bin_csi(
    df=appscoredf,
    feature_list=feature_list,
    segment_columns=segment_columns,
)

                              
                                  
# Prepend the score-level results (s_apps_score_output_df) when an earlier cell defined them.
# NOTE(review): this depends on kernel state; on a fresh kernel without that frame the
# NameError branch below is taken and only bin_results is used.
try:
    # First ensure the s_apps_score_output_df has the same structure
    if 'MonthSortKey' not in s_apps_score_output_df.columns:
        s_apps_score_output_df['MonthSortKey'] = s_apps_score_output_df['Month']
        # Update DateCategory with prefix
        s_apps_score_output_df['DateCategory'] = s_apps_score_output_df['DateCategory'].apply(
            lambda x: 'a_Training' if x == 'Training' else 'b_Monthly'
        )

    # Score-level rows describe the whole population
    s_apps_score_output_df['segment_type'] = 'Overall'
    s_apps_score_output_df['segment_value'] = 'All'

    # Score-level rows have no bins, so bin_value is the placeholder 'All'
    s_apps_score_output_df['bin_value'] = 'All'

    # Rename csivalues to feature_csi for consistency
    if 'csivalues' in s_apps_score_output_df.columns:
        s_apps_score_output_df = s_apps_score_output_df.rename(columns={'csivalues': 'feature_csi'})

    # Add bin_csi column (same as feature_csi for feature-level csi)
    if 'feature_csi' in s_apps_score_output_df.columns:
        s_apps_score_output_df['bin_csi'] = s_apps_score_output_df['feature_csi']

    # Move 'scorename' into 'feature'. When no 'feature' column exists yet it is created
    # from 'scorename'; previously this raised an uncaught KeyError.
    if 'scorename' in s_apps_score_output_df.columns:
        if 'feature' in s_apps_score_output_df.columns:
            s_apps_score_output_df['feature'] = s_apps_score_output_df['feature'].fillna(s_apps_score_output_df['scorename'])
        else:
            s_apps_score_output_df['feature'] = s_apps_score_output_df['scorename']
        s_apps_score_output_df = s_apps_score_output_df.drop('scorename', axis=1)

    # Combine with bin_results
    combined_results = pd.concat([s_apps_score_output_df, bin_results], ignore_index=True)
except NameError:
    # If s_apps_score_output_df doesn't exist, just use bin_results
    combined_results = bin_results

# Order rows by segment, feature and bin, then chronologically
# (MonthSortKey when the column exists, otherwise Month)
month_order_column = 'MonthSortKey' if 'MonthSortKey' in combined_results.columns else 'Month'
sort_columns = ['segment_type', 'segment_value', 'feature', 'bin_value', month_order_column]
combined_results = combined_results.sort_values(by=sort_columns)

# Write the detailed bin-level results to CSV (index not written)
combined_results.to_csv(path_or_buf='bin_level_csi_results_appscore.csv', index=False)

# Show a small sample of what was saved
print("Sample of bin-level csi results:")
print(combined_results.head())

# Announce the pivot-table stage that follows
print("\nGenerating pivot tables for bin-level analysis...")

# Function to create pivot table for a given segment and feature
def create_bin_pivot(data, segment_type, segment_value, feature=None):
    """Build a bin-by-month pivot of ``bin_csi`` for one segment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``segment_type``, ``segment_value``, ``feature``,
        ``bin_value`` and ``bin_csi`` columns plus ``MonthSortKey`` or ``Month``.
    segment_type, segment_value
        Only rows whose two segment columns equal these values are kept.
    feature : optional
        When truthy, rows are further restricted to this feature.

    Returns
    -------
    pandas.DataFrame
        Pivot indexed by ``(feature, bin_value)`` with one column per
        ``MonthSortKey`` value (``Month`` is used when ``MonthSortKey`` is
        absent); each cell is the first ``bin_csi`` of that combination.
    """
    in_segment = data['segment_type'].eq(segment_type) & data['segment_value'].eq(segment_value)
    subset = data.loc[in_segment]

    if feature:
        subset = subset.loc[subset['feature'].eq(feature)]

    # Prefer the sortable month key for the column axis when it is available
    month_axis = 'MonthSortKey' if 'MonthSortKey' in subset.columns else 'Month'

    return subset.pivot_table(
        index=['feature', 'bin_value'],
        columns=[month_axis],
        values='bin_csi',
        aggfunc='first',
    )

# Create bin pivot tables for overall and by segments
unique_segment_combos = combined_results[['segment_type', 'segment_value']].drop_duplicates()
# A NaN feature cannot be sliced into a sheet name and never matches an
# equality filter, so it is dropped up front
unique_features = combined_results['feature'].dropna().unique()

SHEET_NAME_MAX_LEN = 31  # Excel rejects longer worksheet names
# Characters Excel forbids in worksheet names; each one is replaced by "_"
_SHEET_NAME_CHAR_MAP = str.maketrans({ch: '_' for ch in '[]:*?/\\'})


def _unique_sheet_name(raw_name, used_names):
    """Return an Excel-safe worksheet name that has not been issued before.

    Parameters
    ----------
    raw_name : str
        Desired name. Forbidden characters are replaced by ``_`` and the
        result is cut to ``SHEET_NAME_MAX_LEN`` characters.
    used_names : set of str
        Lower-cased names issued so far; updated in place with the new name.

    Returns
    -------
    str
        The cleaned name, or the cleaned name shortened and suffixed with
        ``~2``, ``~3``, ... when the cleaned name was already taken.
    """
    base = str(raw_name).translate(_SHEET_NAME_CHAR_MAP)[:SHEET_NAME_MAX_LEN]
    candidate = base
    attempt = 2
    # Excel compares worksheet names case-insensitively, hence the lower-casing
    while candidate.lower() in used_names:
        tag = f"~{attempt}"
        candidate = base[:SHEET_NAME_MAX_LEN - len(tag)] + tag
        attempt += 1
    used_names.add(candidate.lower())
    return candidate


# Create Excel writer to save all pivots in one file
with pd.ExcelWriter('bin_level_csi_pivots_app_score.xlsx') as writer:
    # Every sheet name goes through _unique_sheet_name so that two pivots can
    # never be written to the same worksheet
    used_sheet_names = set()

    # First, create overall pivot with all features and bins
    overall_pivot = create_bin_pivot(combined_results, 'Overall', 'All')
    overall_pivot.to_excel(
        writer,
        sheet_name=_unique_sheet_name('Overall_All_Features', used_sheet_names),
    )
    print("Created overall pivot table for all features")

    # Create separate pivot for each feature (Overall segment only)
    for feature in unique_features:
        feature_pivot = create_bin_pivot(combined_results, 'Overall', 'All', feature)

        # The last 20 characters of the feature keep the name within the limit
        sheet_name = _unique_sheet_name(f"Overall_{str(feature)[-20:]}", used_sheet_names)

        feature_pivot.to_excel(writer, sheet_name=sheet_name)
        print(f"Created pivot for feature: {feature}")

    # Create separate pivot for each segment and feature combination
    for segment_type, segment_value in unique_segment_combos.itertuples(index=False, name=None):
        # Skip Overall segment as we already handled it
        if segment_type == 'Overall' and segment_value == 'All':
            continue

        # Filter the segment once instead of once per feature
        segment_data = combined_results[
            (combined_results['segment_type'] == segment_type) &
            (combined_results['segment_value'] == segment_value)
        ]

        for feature in unique_features:
            segment_feature_data = segment_data[segment_data['feature'] == feature]

            # Skip if no data
            if segment_feature_data.empty:
                continue

            # Rows are bin values, columns are months
            month_axis = 'MonthSortKey' if 'MonthSortKey' in segment_feature_data.columns else 'Month'
            pivot = segment_feature_data.pivot_table(
                index=['bin_value'],
                columns=[month_axis],
                values='bin_csi',
                aggfunc='first'
            )

            # Shorten the segment part, not the feature part: a long segment
            # label (e.g. "ln_user_type_3_Applied_Not_Disbursed") would
            # otherwise push the feature suffix past the 31-character cut and
            # give every feature of that segment the same sheet name
            feature_name = str(feature)[-10:]
            segment_name = f"{segment_type}_{segment_value}"[:SHEET_NAME_MAX_LEN - len(feature_name) - 1]
            sheet_name = _unique_sheet_name(f"{segment_name}_{feature_name}", used_sheet_names)

            pivot.to_excel(writer, sheet_name=sheet_name)
            print(f"Created pivot for {segment_type}={segment_value}, feature={feature}")

print("\nAll bin-level csi results and pivot tables have been saved.")

# Create summary table showing which bins are the biggest contributors to csi
print("\nGenerating bin contribution summary...")

TOP_BINS_PER_GROUP = 3  # how many highest-|bin_csi| bins are reported per group

# One record per (segment, feature, month, rank)
summary_data = []

if 'bin_csi' in combined_results.columns:
    # A single groupby pass replaces re-filtering the whole frame for every
    # segment / feature / month combination. Groups come out in order of first
    # appearance; rows with a missing key are left out, as they were before
    # (an equality filter never matches NaN).
    group_keys = ['segment_type', 'segment_value', 'feature', 'Month']
    for (segment_type, segment_value, feature, month), month_data in combined_results.groupby(group_keys, sort=False):
        if month_data.empty:
            continue

        # Feature csi is taken from the first row of the group (it should be
        # the same for all bins in this feature/month/segment)
        feature_csi = month_data['feature_csi'].iloc[0]

        # Largest absolute bin_csi first, keep only the top contributors
        top_bins = (
            month_data
            .sort_values('bin_csi', key=abs, ascending=False)
            .head(TOP_BINS_PER_GROUP)
        )

        for rank, (bin_value, bin_csi) in enumerate(zip(top_bins['bin_value'], top_bins['bin_csi']), start=1):
            # Share of the feature-level csi explained by this bin; 0 when the
            # feature csi itself is 0 to avoid dividing by zero
            pct_contribution = (bin_csi / feature_csi * 100) if feature_csi != 0 else 0

            summary_data.append({
                'segment_type': segment_type,
                'segment_value': segment_value,
                'feature': feature,
                'Month': month,
                'feature_csi': feature_csi,
                'bin_value': bin_value,
                'bin_csi': bin_csi,
                'pct_contribution': pct_contribution,
                'rank': rank
            })

# Create summary DataFrame
if summary_data:
    summary_df = pd.DataFrame(summary_data)

    # Pivot to get a table with top contributors: one row per
    # segment/feature/month, one column group per rank
    contribution_pivot = summary_df.pivot_table(
        index=['segment_type', 'segment_value', 'feature', 'Month', 'feature_csi'],
        columns=['rank'],
        values=['bin_value', 'bin_csi', 'pct_contribution'],
        aggfunc='first'
    )

    # Save to Excel
    contribution_pivot.to_excel('bin_contribution_summary_appscore.xlsx')
    print("Bin contribution summary saved to 'bin_contribution_summary_appscore.xlsx'")
else:
    print("No data available for bin contribution summary")

print("\nAnalysis complete!")

Sample of bin-level csi results:
       Month                      feature         bin_value DateCategory  \
75   2024-07  app_cnt_absence_tag_90d_bin  a. (-0.001, 1.0]    b_Monthly   
121  2024-08  app_cnt_absence_tag_90d_bin  a. (-0.001, 1.0]    b_Monthly   
168  2024-09  app_cnt_absence_tag_90d_bin  a. (-0.001, 1.0]    b_Monthly   
214  2024-10  app_cnt_absence_tag_90d_bin  a. (-0.001, 1.0]    b_Monthly   
260  2024-11  app_cnt_absence_tag_90d_bin  a. (-0.001, 1.0]    b_Monthly   

     train_pct  test_pct   bin_csi  feature_csi  account_count MonthSortKey  \
75    0.425208  0.435580  0.000250     0.001070          20382    b_2024-07   
121   0.425208  0.433999  0.000180     0.001469          25136    c_2024-08   
168   0.425208  0.442572  0.000695     0.002705          25284    d_2024-09   
214   0.425208  0.438032  0.000381     0.001143          23980    e_2024-10   
260   0.425208  0.427910  0.000017     0.001599          24407    f_2024-11   

    segment_type segment_value  
75

In [26]:
# Inspect the combined bin-level CSI frame (the notebook shows a truncated view)
combined_results

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value
75,2024-07,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.435580,2.499865e-04,0.001070,20382,b_2024-07,Overall,All
121,2024-08,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.433999,1.798999e-04,0.001469,25136,c_2024-08,Overall,All
168,2024-09,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.442572,6.950210e-04,0.002705,25284,d_2024-09,Overall,All
214,2024-10,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.438032,3.810295e-04,0.001143,23980,e_2024-10,Overall,All
260,2024-11,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.427910,1.711630e-05,0.001599,24407,f_2024-11,Overall,All
...,...,...,...,...,...,...,...,...,...,...,...,...
1497,2024-12,app_median_time_bw_installed_mins_30d_bin,l. missing,b_Monthly,0.213916,0.186394,3.790289e-03,0.007939,19051,g_2024-12,ln_user_type,3_Applied_Not_Disbursed
1543,2025-01,app_median_time_bw_installed_mins_30d_bin,l. missing,b_Monthly,0.213916,0.170190,9.998918e-03,0.020670,9272,h_2025-01,ln_user_type,3_Applied_Not_Disbursed
1589,2025-02,app_median_time_bw_installed_mins_30d_bin,l. missing,b_Monthly,0.213916,0.175736,7.506487e-03,0.017265,6322,i_2025-02,ln_user_type,3_Applied_Not_Disbursed
1219,2024-06,app_median_time_bw_installed_mins_30d_bin,l. missing,a_Training,0.213916,0.213916,0.000000e+00,0.000000,39376,,ln_user_type,3_Applied_Not_Disbursed


In [27]:
def _drop_midnight_suffix(month_label):
    """Return the text before ' 00:00:00' when the label carries a midnight time part."""
    if '00:00:00' in month_label:
        return month_label.split(' 00:00:00')[0]
    return month_label


# Relabel the '2024-06' month as the full training window, then strip any
# midnight timestamp suffix from the month labels
relabelled_months = combined_results['Month'].replace('2024-06', '2023-12-2024-06')
combined_results['Month'] = relabelled_months.apply(_drop_midnight_suffix)

# Rows without a sort key get the key that sorts first ('a_' prefix)
combined_results['MonthSortKey'] = combined_results['MonthSortKey'].fillna('a_2023-12-2024-06')

# Constant metadata columns identifying the score and model these rows belong to
for metadata_column, metadata_value in (
    ('scorename', 'apps_score'),
    ('Modelname', 'Android_SIL_Apps_Score'),
    ('Description', 'Train period from 2023-12 to 2024-06'),
):
    combined_results[metadata_column] = metadata_value

combined_results

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value,scorename,Modelname,Description
75,2024-07,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.435580,2.499865e-04,0.001070,20382,b_2024-07,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
121,2024-08,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.433999,1.798999e-04,0.001469,25136,c_2024-08,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
168,2024-09,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.442572,6.950210e-04,0.002705,25284,d_2024-09,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
214,2024-10,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.438032,3.810295e-04,0.001143,23980,e_2024-10,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
260,2024-11,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.427910,1.711630e-05,0.001599,24407,f_2024-11,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1497,2024-12,app_median_time_bw_installed_mins_30d_bin,l. missing,b_Monthly,0.213916,0.186394,3.790289e-03,0.007939,19051,g_2024-12,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
1543,2025-01,app_median_time_bw_installed_mins_30d_bin,l. missing,b_Monthly,0.213916,0.170190,9.998918e-03,0.020670,9272,h_2025-01,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
1589,2025-02,app_median_time_bw_installed_mins_30d_bin,l. missing,b_Monthly,0.213916,0.175736,7.506487e-03,0.017265,6322,i_2025-02,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
1219,2023-12-2024-06,app_median_time_bw_installed_mins_30d_bin,l. missing,a_Training,0.213916,0.213916,0.000000e+00,0.000000,39376,a_2023-12-2024-06,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06


In [28]:
dataset_id = 'dap_ds_poweruser_playground'
table_id = 'F_CSI_MODEL_FEATURES_BIN_TAB'
# Explicit table schema matching the DataFrame columns, so column types do not
# depend on dtype inference
schema = [
    bigquery.SchemaField("Month", "string"),
    bigquery.SchemaField("feature", "string"),
    bigquery.SchemaField("bin_value", "string"),
    bigquery.SchemaField("DateCategory", "string"),
    bigquery.SchemaField("train_pct", "float64"),
    bigquery.SchemaField("test_pct", "float64"),
    bigquery.SchemaField("bin_csi", "float64"),
    bigquery.SchemaField("feature_csi", "float64"),
    bigquery.SchemaField("account_count", "int64"),
    bigquery.SchemaField("MonthSortKey", "string"),
    bigquery.SchemaField("segment_type", "string"),
    bigquery.SchemaField("segment_value", "string"),
    bigquery.SchemaField("scorename", "string"),
    bigquery.SchemaField("Modelname", "string"),
    bigquery.SchemaField("Description", "string"),
    ]
# Build the dataset reference in the client's project; Client.dataset() is
# deprecated in favour of constructing a DatasetReference directly
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)
# Define the table reference
table_ref = dataset_ref.table(table_id)
# Load jobs APPEND to an existing table by default, they do not overwrite it.
# The disposition is spelled out so that is visible: re-running this cell
# without dropping the table first adds the same rows again.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)
# Load the DataFrame into BigQuery
job = client.load_table_from_dataframe(combined_results, table_ref, job_config=job_config)
# Wait for the job to complete (raises if the load failed)
job.result()
print(f"Table {table_id} created in dataset {dataset_id}.")

Table F_CSI_MODEL_FEATURES_BIN_TAB created in dataset dap_ds_poweruser_playground.


In [29]:
# Spot-check: all rows whose Month label is 2024-07
combined_results.loc[combined_results['Month'].eq('2024-07')]

Unnamed: 0,Month,feature,bin_value,DateCategory,train_pct,test_pct,bin_csi,feature_csi,account_count,MonthSortKey,segment_type,segment_value,scorename,Modelname,Description
75,2024-07,app_cnt_absence_tag_90d_bin,"a. (-0.001, 1.0]",b_Monthly,0.425208,0.435580,2.499865e-04,0.001070,20382,b_2024-07,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
71,2024-07,app_cnt_absence_tag_90d_bin,"b. (1.0, 2.0]",b_Monthly,0.144708,0.144932,3.465094e-07,0.001070,20382,b_2024-07,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
69,2024-07,app_cnt_absence_tag_90d_bin,"c. (2.0, 3.0]",b_Monthly,0.098697,0.097782,8.519803e-06,0.001070,20382,b_2024-07,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
72,2024-07,app_cnt_absence_tag_90d_bin,"d. (3.0, 4.0]",b_Monthly,0.062973,0.065793,1.235806e-04,0.001070,20382,b_2024-07,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
74,2024-07,app_cnt_absence_tag_90d_bin,"e. (4.0, 7.0]",b_Monthly,0.095673,0.093514,4.928903e-05,0.001070,20382,b_2024-07,Overall,All,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265,2024-07,app_median_time_bw_installed_mins_30d_bin,g. 7065.508-9891.612,b_Monthly,0.071381,0.074280,1.153758e-04,0.004508,8091,b_2024-07,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
1263,2024-07,app_median_time_bw_installed_mins_30d_bin,h. 9891.613-14384.46,b_Monthly,0.067400,0.063157,2.759833e-04,0.004508,8091,b_2024-07,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
1268,2024-07,app_median_time_bw_installed_mins_30d_bin,j. 14384.461-20358.378,b_Monthly,0.064687,0.065011,1.612970e-06,0.004508,8091,b_2024-07,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
1269,2024-07,app_median_time_bw_installed_mins_30d_bin,k. 20358.379-112663145.6,b_Monthly,0.062963,0.054134,1.333792e-03,0.004508,8091,b_2024-07,ln_user_type,3_Applied_Not_Disbursed,apps_score,Android_SIL_Apps_Score,Train period from 2023-12 to 2024-06
