# <div align="center" style="color: #ff5733;">PSI Monitoring</div>

# Declare Libraries

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

# s_apps_score

In [3]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-08-01' then 'Test'
       Else 'Other' end dataselection,
  a.apps_score s_apps_score,
 from
  risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where dataselection in ('Train', 'Test');"""
#  s_apps_score is not null and
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of df before duplicate drop is:\t {df.shape}")

df = df.drop_duplicates(keep='first')

print(f"The shape of df after duplicate drop is:\t {df.shape}")

Job ID 9de6eb8b-6c8a-4e8a-ac5d-449960f0e508 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of df before duplicate drop is:	 (320422, 7)
The shape of df after duplicate drop is:	 (320380, 7)


In [6]:
df.head()

Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_loan_type,dataselection,s_apps_score
0,cff7a63d-07bf-4183-abe9-ea96653f0714,2023-08,2023-08-07,32,SIL-Instore,Train,0.313732
1,257c2f5c-901b-41ac-8d04-589fe750ec00,2023-08,2023-08-28,35,SIL-Instore,Train,0.582865
2,3df9d156-adaa-4fe3-b70d-392effd36d7f,2023-08,2023-08-28,35,SIL-Instore,Train,0.564581
3,a68613cb-0646-4e22-95b7-7242437dc0f4,2023-09,2023-09-25,39,SIL-Instore,Train,0.366233
4,e1244409-28e0-4033-9441-d01ce36e6ce5,2023-09,2023-08-28,35,SIL-Instore,Train,0.334749


In [13]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.apps_score s_apps_score,
 from 
 risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where dataselection in ('Train', 'Test') and s_apps_score is not null;"""

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_apps_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]

# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_apps_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_apps_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

import pandas as pd
import numpy as np
from datetime import datetime

# Your existing query and initial dataframe setup remains the same
# ...

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    'scorename': 's_apps_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_apps_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            'scorename': 's_apps_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_apps_score_output_df = pd.DataFrame(monthly_psi_results)



dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
Job ID cf4f7c3e-7cd1-4450-9876-ff300d7b0516 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [16]:
s_apps_score_output_df.rename(columns={'psivalues':'s_apps_score_psivalues'}, inplace = True)

In [18]:
s_apps_score_output_df

Unnamed: 0,Month,scorename,DateCategory,s_apps_score_psivalues,account_count
0,2024-06,s_apps_score,Training,0.0,109413
1,2024-07,s_apps_score,Monthly,0.034227,18571
2,2024-08,s_apps_score,Monthly,0.034721,22959
3,2024-09,s_apps_score,Monthly,0.045001,23145
4,2024-10,s_apps_score,Monthly,0.045987,21868
5,2024-11,s_apps_score,Monthly,0.039437,22271
6,2024-12,s_apps_score,Monthly,0.027791,45504
7,2025-01,s_apps_score,Monthly,0.046202,21253
8,2025-02,s_apps_score,Monthly,0.058867,18625
9,2025-03,s_apps_score,Monthly,0.059748,6133


# sb_demo_score

In [19]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_demo_score sb_demo_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_demo_score is not null and dataselection in ('Train', 'Test');"""
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
df.head()

Job ID f2932d76-c933-499c-986e-35095e4878d1 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_loan_type,dataselection,sb_demo_score
0,cff7a63d-07bf-4183-abe9-ea96653f0714,2023-08,2023-08-07,32,SIL-Instore,Train,0.156812
1,257c2f5c-901b-41ac-8d04-589fe750ec00,2023-08,2023-08-28,35,SIL-Instore,Train,0.187182
2,3df9d156-adaa-4fe3-b70d-392effd36d7f,2023-08,2023-08-28,35,SIL-Instore,Train,0.184286
3,a68613cb-0646-4e22-95b7-7242437dc0f4,2023-09,2023-09-25,39,SIL-Instore,Train,0.102756
4,b93a195b-9668-4c15-bdc8-7c607623c117,2024-03,2024-02-26,9,SIL-Instore,Train,0.099354


In [20]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_demo_score sb_demo_score,
 from  risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_demo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sb_demo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sb_demo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sb_demo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sb_demo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sb_demo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sb_demo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sb_demo_score_output_df = pd.DataFrame(monthly_psi_results)
sb_demo_score_output_df.rename(columns={'psivalues':'sb_demo_score_psivalues'}, inplace = True)
sb_demo_score_output_df

Job ID bbbe05b9-a0ad-4fcb-8322-340a06e55e67 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    121661
Name: digitalLoanAccountId, dtype: int64
[0.00933333 0.04168501 0.05392044 0.06440865 0.07482585 0.08556978
 0.09703029 0.10986035 0.1267363  0.15369263 0.46407017]


Unnamed: 0,Month,scorename,DateCategory,sb_demo_score_psivalues,account_count
0,2024-06,sb_demo_score,Training,0.0,121661
1,2024-07,sb_demo_score,Monthly,0.01218,20382
2,2024-08,sb_demo_score,Monthly,0.011622,25136
3,2024-09,sb_demo_score,Monthly,0.016331,25284
4,2024-10,sb_demo_score,Monthly,0.013352,23980
5,2024-11,sb_demo_score,Monthly,0.02344,24407
6,2024-12,sb_demo_score,Monthly,0.013217,49721
7,2025-01,sb_demo_score,Monthly,0.002959,23473
8,2025-02,sb_demo_score,Monthly,0.004903,20636
9,2025-03,sb_demo_score,Monthly,0.011888,6864


# s_cic_score

In [22]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.cic_score s_cic_score,
 from  risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base 
 where s_cic_score is not null and  dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_cic_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)
    
    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_cic_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)
    
    # print(f"distribution_aligned-{distribution_aligned}")
    # print(f"train_dist_aligned - {train_dist_aligned}")

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_cic_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 's_cic_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_cic_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 's_cic_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_cic_score_output_df = pd.DataFrame(monthly_psi_results)
s_cic_score_output_df.rename(columns={'psivalues':'s_cic_score_psivalues'}, inplace = True)
s_cic_score_output_df


[A
[A
[A
Job ID 65a7fe90-1aa8-4f8e-ac9f-1112e04c9fc0 successfully executed: 100%|[32m██████████[0m|





Query is running:   0%|[32m          [0m|

[A
[A
[A
[A
Downloading: 100%|[32m██████████[0m|
dataselection
Test     154386
Train     47681
Name: digitalLoanAccountId, dtype: int64
[0.01708888 0.07036059 0.09294641 0.11136085 0.12356264 0.12566888
 0.14021432 0.16052322 0.17681078 0.19927335 0.52746045]


Unnamed: 0,Month,scorename,DateCategory,s_cic_score_psivalues,account_count
0,2024-06,s_cic_score,Training,0.0,47681
1,2024-07,s_cic_score,Monthly,0.100763,13346
2,2024-08,s_cic_score,Monthly,0.064853,17808
3,2024-09,s_cic_score,Monthly,0.040939,17502
4,2024-10,s_cic_score,Monthly,0.039377,16817
5,2024-11,s_cic_score,Monthly,0.049279,17931
6,2024-12,s_cic_score,Monthly,0.036077,34696
7,2025-01,s_cic_score,Monthly,0.051432,16804
8,2025-02,s_cic_score,Monthly,0.060747,14246
9,2025-03,s_cic_score,Monthly,0.050984,5236


# Checking CIC psi with period testing from jan 2025 to feb2-25

In [23]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  ---FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
 ---- EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  ---a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.cic_score s_cic_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309  a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where dataselection in ('Train', 'Test') and s_cic_score is not null;"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Function to calculate PSI between two periods
def calculate_psi(expected_array, actual_array, bins=10):
    """
    Calculate PSI for two arrays
    
    Parameters:
    -----------
    expected_array : numpy array of expected/training values
    actual_array : numpy array of actual/test values
    bins : number of bins to create
    
    Returns:
    --------
    psi_value : float, the calculated PSI value
    bin_details : DataFrame with binning details
    """
    # Create bins based on the expected array
    quantiles = np.linspace(0, 1, bins+1)
    bin_edges = np.quantile(expected_array, quantiles)
    
    # Ensure bin edges are unique (handle duplicates if they exist)
    bin_edges = np.unique(bin_edges)
    if len(bin_edges) < bins + 1:
        # Add small increments to duplicate values
        temp_edges = np.sort(np.unique(expected_array))
        if len(temp_edges) >= bins + 1:
            bin_edges = np.quantile(temp_edges, quantiles)
        else:
            # If not enough unique values, use min-max range divided into bins
            bin_edges = np.linspace(min(expected_array), max(expected_array), bins+1)
    
    # Create bins for both arrays
    expected_counts, _ = np.histogram(expected_array, bins=bin_edges)
    actual_counts, _ = np.histogram(actual_array, bins=bin_edges)
    
    # Calculate percentages
    expected_percents = expected_counts / len(expected_array) * 100
    actual_percents = actual_counts / len(actual_array) * 100
    
    # Calculate differences and PSI components
    diff = actual_percents - expected_percents
    
    # Safe division and log calculation (avoiding div by zero)
    ratio = np.divide(actual_percents, expected_percents, 
                     out=np.ones_like(actual_percents), 
                     where=expected_percents!=0)
    ln_ratio = np.log(ratio, out=np.zeros_like(ratio), where=ratio>0)
    
    # Calculate PSI components and total
    psi_components = diff / 100 * ln_ratio
    psi_value = np.sum(psi_components)
    
    # Create detailed results DataFrame
    bin_details = pd.DataFrame({
        'Bins': [f"{i+1}" for i in range(len(expected_counts))],
        '# Train': expected_counts,
        '# Train %': expected_percents,
        '# Test': actual_counts,
        '# Test %': actual_percents,
        'A-B': diff,
        'ln(A/B)': ln_ratio,
        'PSI': psi_components * 100
    })
    
    bin_details.loc['Grand Total'] = [
        '', sum(expected_counts), 100.0, sum(actual_counts), 100.0, '', '', psi_value * 100
    ]
    
    return psi_value, bin_details

# Calculate monthly PSI as in your original code
def calculate_monthly_psi():
    # Calculate decile bins for the entire training set
    train_deciles = pd.qcut(train_df['s_cic_score'], 10, labels=False, retbins=True)
    train_decile_bins = train_deciles[1]
    
    # Get the last month of the training set
    last_train_month = train_df['Application_month'].max()
    last_train_month_str = last_train_month.strftime('%Y-%m')
    
    # Calculate monthly PSI for the test set
    monthly_psi_results = []
    
    # Add the train set PSI to the results (with the correct last month)
    monthly_psi_results.append({
        'Month': last_train_month_str,
        'scorename': 's_cic_score',
        'DateCategory': 'Training',
        'psivalues': 0.0  # PSI against itself is 0
    })
    
    # Calculate monthly PSI for the test set
    for month in sorted(test_df['Application_month'].unique()):
        month_str = month.strftime('%Y-%m')
        month_df = test_df[test_df['Application_month'] == month]
        
        if not month_df.empty:
            # Calculate PSI using our function
            month_psi, _ = calculate_psi(train_df['s_cic_score'].values, month_df['s_cic_score'].values)
            
            monthly_psi_results.append({
                'Month': month_str,
                'scorename': 's_cic_score',
                'DateCategory': 'Monthly',
                'psivalues': month_psi
            })
    
    # Create the output DataFrame
    monthly_psi_df = pd.DataFrame(monthly_psi_results)
    monthly_psi_df.rename(columns={'psivalues': 's_cic_score_psivalues'}, inplace=True)
    
    return monthly_psi_df

# Calculate PSI between two specific periods (as shown in the image)
def calculate_period_psi():
    # Define the periods matching the image
    train_period = train_df  # Already defined as 2023-07 to 2024-06
    
    # Filter test data for Jan-Feb 2025 - using datetime objects to avoid the February 29 issue
    jan_2025 = pd.Timestamp('2025-01-01')
    feb_2025 = pd.Timestamp('2025-02-28')  # Using Feb 28 instead of Feb 29
    
    test_period = test_df[(test_df['Application_month'] >= jan_2025) & 
                          (test_df['Application_month'] <= feb_2025)]
    
    # Calculate PSI between periods
    period_psi, psi_details = calculate_psi(train_period['s_cic_score'].values, 
                                           test_period['s_cic_score'].values,
                                           bins=10)
    
    print("PSI between 2023-07 to 2024-06 and 2025-01 to 2025-02:")
    print(f"Overall PSI: {period_psi:.6f}")
    
    return period_psi, psi_details

# Run both calculations
print("Calculating monthly PSI values...")
monthly_psi_results = calculate_monthly_psi()
print(monthly_psi_results)

print("\nCalculating period PSI (matching the image)...")
period_psi, psi_details = calculate_period_psi()
print("\nDetailed PSI calculation by bin:")
print(psi_details)

Job ID 5bd405f7-617f-4a28-ac56-7620ae9ce531 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
Calculating monthly PSI values...
     Month    scorename DateCategory  s_cic_score_psivalues
0  2024-06  s_cic_score     Training               0.000000
1  2024-07  s_cic_score      Monthly               0.089112
2  2024-08  s_cic_score      Monthly               0.059244
3  2024-09  s_cic_score      Monthly               0.035671
4  2024-10  s_cic_score      Monthly               0.033889
5  2024-11  s_cic_score      Monthly               0.034579
6  2024-12  s_cic_score      Monthly               0.025900
7  2025-01  s_cic_score      Monthly               0.040464
8  2025-02  s_cic_score      Monthly               0.042533
9  2025-03  s_cic_score      Monthly               0.039215

Calculating period PSI (matching the image)...
PSI between 2023-07 to 2024-06 and 2025-01 to 2025-02:
Overall PSI: 0.041026

Detailed PSI calculation by bin:
            Bins  # Train   # Train %  # Test    # Test %       A-B   ln(A/B)  \
0              1     4771 

In [24]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.cic_score s_cic_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309  a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where s_cic_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')


Job ID a2559ac9-5f0d-4923-9d68-dfc9b6e9b489 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [25]:
df.groupby(['Application_month', 'dataselection'])['digitalLoanAccountId'].nunique()

Application_month  dataselection
2023-07            Train              521
2023-08            Train             1148
2023-09            Train             1886
2023-10            Train             1836
2023-11            Train             2505
2023-12            Train             4458
2024-01            Train             2506
2024-02            Train             2382
2024-03            Train             3083
2024-04            Train             5851
2024-05            Train             8033
2024-06            Train            13472
2024-07            Test             13346
2024-08            Test             17808
2024-09            Test             17502
2024-10            Test             16817
2024-11            Test             17931
2024-12            Test             34696
2025-01            Test             16804
2025-02            Test             14246
2025-03            Test              5236
Name: digitalLoanAccountId, dtype: int64

In [26]:
df.to_csv("Test.csv")

# sb_stack_score

In [27]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_stack_score sb_stack_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_stack_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sb_stack_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sb_stack_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sb_stack_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sb_stack_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
    
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sb_stack_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sb_stack_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sb_stack_score_output_df = pd.DataFrame(monthly_psi_results)
sb_stack_score_output_df.rename(columns={'psivalues':'sb_stack_score_psivalues'}, inplace = True)
sb_stack_score_output_df

Job ID 6cc9899c-f2aa-457e-80cd-8b98f99e2c92 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00768898 0.02260755 0.03505001 0.04702901 0.05502427 0.06718315
 0.08232039 0.09881382 0.11758208 0.15261032 0.67827956]


Unnamed: 0,Month,scorename,DateCategory,sb_stack_score_psivalues,account_count
0,2024-06,sb_stack_score,Training,0.0,120879
1,2024-07,sb_stack_score,Monthly,0.01644,20382
2,2024-08,sb_stack_score,Monthly,0.017579,25136
3,2024-09,sb_stack_score,Monthly,0.023814,25284
4,2024-10,sb_stack_score,Monthly,0.024434,23980
5,2024-11,sb_stack_score,Monthly,0.016027,24407
6,2024-12,sb_stack_score,Monthly,0.014396,49721
7,2025-01,sb_stack_score,Monthly,0.033761,23473
8,2025-02,sb_stack_score,Monthly,0.04134,20636
9,2025-03,sb_stack_score,Monthly,0.047353,6864


# sa_stack_score

In [28]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.alpha_stack_score sa_stack_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sa_stack_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sa_stack_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sa_stack_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sa_stack_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sa_stack_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sa_stack_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sa_stack_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sa_stack_score_output_df = pd.DataFrame(monthly_psi_results)
sa_stack_score_output_df.rename(columns={'psivalues':'sa_stack_score_psivalues'}, inplace = True)
sa_stack_score_output_df


Job ID f62551fd-7ba6-4163-8a02-790bf4d91636 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     154386
Train     47681
Name: digitalLoanAccountId, dtype: int64
[0.00758834 0.0207969  0.02805118 0.03565707 0.04443132 0.05425189
 0.06539076 0.0784071  0.09551966 0.12476151 0.65014194]


Unnamed: 0,Month,scorename,DateCategory,sa_stack_score_psivalues,account_count
0,2024-06,sa_stack_score,Training,0.0,47681
1,2024-07,sa_stack_score,Monthly,0.047756,13346
2,2024-08,sa_stack_score,Monthly,0.026786,17808
3,2024-09,sa_stack_score,Monthly,0.019113,17502
4,2024-10,sa_stack_score,Monthly,0.02204,16817
5,2024-11,sa_stack_score,Monthly,0.014377,17931
6,2024-12,sa_stack_score,Monthly,0.010192,34696
7,2025-01,sa_stack_score,Monthly,0.036305,16804
8,2025-02,sa_stack_score,Monthly,0.039701,14246
9,2025-03,sa_stack_score,Monthly,0.03676,5236


# c_credo_score_output_df

In [30]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_cash_score c_credo_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where c_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['c_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['c_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['c_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'c_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['c_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'c_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
c_credo_score_output_df = pd.DataFrame(monthly_psi_results)
c_credo_score_output_df.rename(columns={'psivalues':'c_credo_score_psivalues'}, inplace = True)


[A
[A
[A
Job ID b8686585-30f4-4c4c-a42b-85fc82e358f6 successfully executed: 100%|[32m██████████[0m|





Query is running:   0%|[32m          [0m|

[A
[A
[A
[A
[A
[A
Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01944472 0.15413745 0.18755253 0.21863023 0.25137791 0.28296505
 0.31845495 0.35933473 0.40894292 0.47981712 0.85616948]


In [31]:
c_credo_score_output_df

Unnamed: 0,Month,scorename,DateCategory,c_credo_score_psivalues,account_count
0,2024-06,c_credo_score,Training,0.0,120879
1,2024-07,c_credo_score,Monthly,0.00633,20382
2,2024-08,c_credo_score,Monthly,0.003648,25136
3,2024-09,c_credo_score,Monthly,0.003723,25284
4,2024-10,c_credo_score,Monthly,0.002484,23980
5,2024-11,c_credo_score,Monthly,0.002357,24407
6,2024-12,c_credo_score,Monthly,0.002967,49721
7,2025-01,c_credo_score,Monthly,0.002762,23473
8,2025-02,c_credo_score,Monthly,0.004631,20636
9,2025-03,c_credo_score,Monthly,0.005626,6864


# s_credo_score

In [32]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_sil_score s_credo_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where s_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 's_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 's_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_credo_score_output_df = pd.DataFrame(monthly_psi_results)
s_credo_score_output_df.rename(columns= {'psivalues':'s_credo_score_psivalues'}, inplace = True)
s_credo_score_output_df

Job ID 04c99ede-51be-4f3c-8c9e-198457404879 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01777745 0.05814288 0.07035747 0.08057525 0.09078621 0.1010898
 0.11250886 0.12671799 0.14598377 0.17642667 0.44602214]


Unnamed: 0,Month,scorename,DateCategory,s_credo_score_psivalues,account_count
0,2024-06,s_credo_score,Training,0.0,120879
1,2024-07,s_credo_score,Monthly,0.00633,20382
2,2024-08,s_credo_score,Monthly,0.002463,25136
3,2024-09,s_credo_score,Monthly,0.010557,25284
4,2024-10,s_credo_score,Monthly,0.016389,23980
5,2024-11,s_credo_score,Monthly,0.007762,24407
6,2024-12,s_credo_score,Monthly,0.006517,49721
7,2025-01,s_credo_score,Monthly,0.025606,23473
8,2025-02,s_credo_score,Monthly,0.014504,20636
9,2025-03,s_credo_score,Monthly,0.011477,6864


# fu_credo_score

In [33]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_flexup_score fu_credo_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where fu_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['fu_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['fu_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['fu_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'fu_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['fu_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'fu_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
fu_credo_score_output_df = pd.DataFrame(monthly_psi_results)
fu_credo_score_output_df.rename(columns={'psivalues':'fu_credo_score_psivalues'}, inplace = True)
fu_credo_score_output_df

Job ID b5bf31ac-0206-45f5-b1c6-8743f29a6dec successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00198555 0.0284396  0.04201433 0.05616501 0.07195484 0.09119812
 0.11546254 0.14559758 0.18540856 0.23653321 0.86342827]


Unnamed: 0,Month,scorename,DateCategory,fu_credo_score_psivalues,account_count
0,2024-06,fu_credo_score,Training,0.0,120879
1,2024-07,fu_credo_score,Monthly,0.006634,20382
2,2024-08,fu_credo_score,Monthly,0.006883,25136
3,2024-09,fu_credo_score,Monthly,0.008657,25284
4,2024-10,fu_credo_score,Monthly,0.01708,23980
5,2024-11,fu_credo_score,Monthly,0.011944,24407
6,2024-12,fu_credo_score,Monthly,0.010415,49721
7,2025-01,fu_credo_score,Monthly,0.004587,23473
8,2025-02,fu_credo_score,Monthly,0.007908,20636
9,2025-03,fu_credo_score,Monthly,0.011244,6864


# r_credo_score

In [34]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_reloan_score r_credo_score,
 from  risk_mart.sil_risk_ds_master_20230101_20250309 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where r_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['r_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['r_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['r_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'r_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['r_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'r_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
r_credo_score_output_df = pd.DataFrame(monthly_psi_results)
r_credo_score_output_df.rename(columns={'psivalues':'r_credo_score_psivalues'}, inplace = True)
r_credo_score_output_df


Job ID 155a29ad-ed76-43c1-bf9a-76cf33958998 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00715845 0.09849608 0.12675658 0.15480483 0.18613496 0.21970326
 0.25742173 0.30081016 0.3551176  0.43576858 0.87489346]


Unnamed: 0,Month,scorename,DateCategory,r_credo_score_psivalues,account_count
0,2024-06,r_credo_score,Training,0.0,120879
1,2024-07,r_credo_score,Monthly,0.010712,20382
2,2024-08,r_credo_score,Monthly,0.008423,25136
3,2024-09,r_credo_score,Monthly,0.005237,25284
4,2024-10,r_credo_score,Monthly,0.007045,23980
5,2024-11,r_credo_score,Monthly,0.004144,24407
6,2024-12,r_credo_score,Monthly,0.004122,49721
7,2025-01,r_credo_score,Monthly,0.012302,23473
8,2025-02,r_credo_score,Monthly,0.004779,20636
9,2025-03,r_credo_score,Monthly,0.00676,6864


# gen_credo_score

In [35]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_gen_score gen_credo_score,
 from risk_mart.sil_risk_ds_master_20230101_20250309  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where gen_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['gen_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['gen_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['gen_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'gen_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['gen_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'gen_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
gen_credo_score_output_df = pd.DataFrame(monthly_psi_results)
gen_credo_score_output_df.rename(columns={'psivalues':'gen_credo_score_psivalues'}, inplace = True)
gen_credo_score_output_df

Job ID ab37c34e-81fe-423a-ba61-143ae76043b8 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     219883
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01431733 0.0541337  0.06841701 0.08115605 0.09352465 0.10625077
 0.12041919 0.13724732 0.15941183 0.19535887 0.57325398]


Unnamed: 0,Month,scorename,DateCategory,gen_credo_score_psivalues,account_count
0,2024-06,gen_credo_score,Training,0.0,120879
1,2024-07,gen_credo_score,Monthly,0.014749,20382
2,2024-08,gen_credo_score,Monthly,0.008829,25136
3,2024-09,gen_credo_score,Monthly,0.015357,25284
4,2024-10,gen_credo_score,Monthly,0.024187,23980
5,2024-11,gen_credo_score,Monthly,0.015609,24407
6,2024-12,gen_credo_score,Monthly,0.014802,49721
7,2025-01,gen_credo_score,Monthly,0.050278,23473
8,2025-02,gen_credo_score,Monthly,0.025697,20636
9,2025-03,gen_credo_score,Monthly,0.034939,6864


# Combining dataframes

In [36]:
# import functools

# dataframes = [s_apps_score_output_df, sb_demo_score_output_df, s_cic_score_output_df, sb_stack_score_output_df,sa_stack_score_output_df, c_credo_score_output_df, s_credo_score_output_df, fu_credo_score_output_df,
#               r_credo_score_output_df, gen_credo_score_output_df]
# common_columns = ['Month', 'scorename', 'DateCategory', 'psivalues']

# def merge_dataframes(df1, df2):
#     return pd.merge(df1, df2, on=common_columns, how='outer')

# final_df = functools.reduce(merge_dataframes, dataframes)

# final_df.columns.values

In [37]:
import pandas as pd

def concatenate_dataframes(dataframe_list):
    """
    Concatenates a list of Pandas DataFrames into a single DataFrame.

    Args:
        dataframe_list: A list of Pandas DataFrames to concatenate.

    Returns:
        A single concatenated Pandas DataFrame, or None if the input list is empty.
    """
    if not dataframe_list:
        return None  # Return None if the list is empty

    try:
        concatenated_df = pd.concat(dataframe_list, ignore_index=True)
        return concatenated_df
    except Exception as e:
        print(f"An error occurred during concatenation: {e}")
        return None

# Example usage (assuming your DataFrames are already defined):

# Replace these with your actual DataFrames
# s_apps_score_output_df = ...
# sb_demo_score_output_df = ...
# s_cic_score_output_df = ...
# sb_stack_score_output_df = ...
# sa_stack_score_output_df = ...
# c_credo_score_output_df = ...
# s_credo_score_output_df = ...
# fu_credo_score_output_df = ...
# r_credo_score_output_df = ...
# gen_credo_score_output_df = ...

dataframe_list = [
    s_apps_score_output_df,
    sb_demo_score_output_df,
    s_cic_score_output_df,
    sb_stack_score_output_df,
    sa_stack_score_output_df,
    c_credo_score_output_df,
    s_credo_score_output_df,
    fu_credo_score_output_df,
    r_credo_score_output_df,
    gen_credo_score_output_df,
]

concatenated_result = concatenate_dataframes(dataframe_list)

if concatenated_result is not None:
    print(concatenated_result)
else:
    print("Concatenation failed or the input list was empty.")

      Month        scorename DateCategory  s_apps_score_psivalues  \
0   2024-06     s_apps_score     Training                0.000000   
1   2024-07     s_apps_score      Monthly                0.034227   
2   2024-08     s_apps_score      Monthly                0.034721   
3   2024-09     s_apps_score      Monthly                0.045001   
4   2024-10     s_apps_score      Monthly                0.045987   
..      ...              ...          ...                     ...   
95  2024-11  gen_credo_score      Monthly                     NaN   
96  2024-12  gen_credo_score      Monthly                     NaN   
97  2025-01  gen_credo_score      Monthly                     NaN   
98  2025-02  gen_credo_score      Monthly                     NaN   
99  2025-03  gen_credo_score      Monthly                     NaN   

    account_count  sb_demo_score_psivalues  s_cic_score_psivalues  \
0          109413                      NaN                    NaN   
1           18571                

In [38]:
concatenated_result.dtypes

Month                         object
scorename                     object
DateCategory                  object
s_apps_score_psivalues       float64
account_count                  int64
sb_demo_score_psivalues      float64
s_cic_score_psivalues        float64
sb_stack_score_psivalues     float64
sa_stack_score_psivalues     float64
c_credo_score_psivalues      float64
s_credo_score_psivalues      float64
fu_credo_score_psivalues     float64
r_credo_score_psivalues      float64
gen_credo_score_psivalues    float64
dtype: object

In [39]:
sq = """drop table if exists prj-prod-dataplatform.dap_ds_poweruser_playground.Model_Psi;"""

client.query(sq)

QueryJob<project=prj-prod-dataplatform, location=asia-southeast1, id=c89a19ce-048f-459c-b22c-856f248bf8c1>

In [40]:
import pandas as pd
from google.cloud import bigquery

# Create a BigQuery client
client = bigquery.Client('prj-prod-dataplatform')

# Define your table schema
table_schema = [
    bigquery.SchemaField('Month', 'STRING'),
    bigquery.SchemaField('scorename', 'STRING'),
    bigquery.SchemaField('DateCategory', 'STRING'),
    bigquery.SchemaField('s_apps_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('account_count', 'INT64'),
    bigquery.SchemaField('sb_demo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('s_cic_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('sb_stack_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('sa_stack_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('c_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('s_credo_score_psivalue', 'FLOAT64'),
    bigquery.SchemaField('fu_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('r_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('gen_credo_score_psivalues', 'FLOAT64'),
   
]

# Create your BigQuery table
table_id = 'prj-prod-dataplatform.dap_ds_poweruser_playground.Model_Psi'
table = bigquery.Table(table_id, schema=table_schema)
table = client.create_table(table)

# Load your DataFrame into BigQuery
job_config = bigquery.LoadJobConfig(
    write_disposition='WRITE_TRUNCATE'
)

load_job = client.load_table_from_dataframe(
    concatenated_result, table_id, job_config=job_config
)

load_job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=ff08a189-0a16-49bb-8dc0-73df90344786>

In [41]:
concatenated_result.head()

Unnamed: 0,Month,scorename,DateCategory,s_apps_score_psivalues,account_count,sb_demo_score_psivalues,s_cic_score_psivalues,sb_stack_score_psivalues,sa_stack_score_psivalues,c_credo_score_psivalues,s_credo_score_psivalues,fu_credo_score_psivalues,r_credo_score_psivalues,gen_credo_score_psivalues
0,2024-06,s_apps_score,Training,0.0,109413,,,,,,,,,
1,2024-07,s_apps_score,Monthly,0.034227,18571,,,,,,,,,
2,2024-08,s_apps_score,Monthly,0.034721,22959,,,,,,,,,
3,2024-09,s_apps_score,Monthly,0.045001,23145,,,,,,,,,
4,2024-10,s_apps_score,Monthly,0.045987,21868,,,,,,,,,


In [28]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_cic_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where s_cic_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

Job ID 6930aff7-13de-4c22-9c15-e60313b8d400 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [29]:
df.groupby(['dataselection'])['digitalLoanAccountId'].nunique()

dataselection
Test     138094
Train     47680
Name: digitalLoanAccountId, dtype: int64