# <div align="center" style="color: #ff5733;">PSI Monitoring</div>

# Declare Libraries

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

# s_apps_score

In [2]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-08-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_apps_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where dataselection in ('Train', 'Test');"""
#  s_apps_score is not null and
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of df before duplicate drop is:\t {df.shape}")

df = df.drop_duplicates(keep='first')

print(f"The shape of df after duplicate drop is:\t {df.shape}")

Job ID 2882d649-6b94-41fb-9a02-00264f1b3305 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of df before duplicate drop is:	 (297447, 7)
The shape of df after duplicate drop is:	 (297357, 7)


In [3]:
df.head()

Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_loan_type,dataselection,s_apps_score
0,023c2f36-9864-49e0-921f-2602c21cd2f6,2023-09,2023-09-18,38,SIL-Instore,Train,0.51424
1,af910337-e834-4bf6-8b64-55878b7065b4,2023-08,2023-07-31,31,SIL-Instore,Train,0.603107
2,23fc76f3-3926-45ef-9d04-f673064e8f12,2023-09,2023-09-04,36,SIL-Instore,Train,0.648165
3,60ae9748-fe3b-47f5-959a-12ee5b52ddbc,2023-07,2023-07-17,29,SIL-Instore,Train,0.628679
4,1116d784-b243-4d42-ac65-8ccd15b494ac,2023-09,2023-09-04,36,SIL-Instore,Train,0.550248


In [4]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_apps_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where dataselection in ('Train', 'Test') and s_apps_score is not null;"""

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_apps_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]

# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_apps_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_apps_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

import pandas as pd
import numpy as np
from datetime import datetime

# Your existing query and initial dataframe setup remains the same
# ...

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    'scorename': 's_apps_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_apps_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            'scorename': 's_apps_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_apps_score_output_df = pd.DataFrame(monthly_psi_results)



dataselection
Test     176478
Train    120879
Name: digitalLoanAccountId, dtype: int64
Job ID cb611d80-3ed4-4e7f-97ed-8c98d4aa4461 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [5]:
s_apps_score_output_df.rename(columns={'psivalues':'s_apps_score_psivalues'}, inplace = True)

In [6]:
s_apps_score_output_df

Unnamed: 0,Month,scorename,DateCategory,s_apps_score_psivalues,account_count
0,2024-06,s_apps_score,Training,0.0,105027
1,2024-07,s_apps_score,Monthly,0.034506,18549
2,2024-08,s_apps_score,Monthly,0.034966,22940
3,2024-09,s_apps_score,Monthly,0.045057,23136
4,2024-10,s_apps_score,Monthly,0.044652,20461
5,2024-11,s_apps_score,Monthly,0.039977,22232
6,2024-12,s_apps_score,Monthly,0.028287,45418
7,2025-01,s_apps_score,Monthly,0.046706,21220
8,2025-02,s_apps_score,Monthly,0.048738,4038


# sb_demo_score

In [7]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.sb_demo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_demo_score is not null and dataselection in ('Train', 'Test');"""
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
df.head()

Job ID 817ae9f0-6ca1-476a-9b0c-2d1c9eec36ad successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_loan_type,dataselection,sb_demo_score
0,023c2f36-9864-49e0-921f-2602c21cd2f6,2023-09,2023-09-18,38,SIL-Instore,Train,0.15752
1,af910337-e834-4bf6-8b64-55878b7065b4,2023-08,2023-07-31,31,SIL-Instore,Train,0.164215
2,23fc76f3-3926-45ef-9d04-f673064e8f12,2023-09,2023-09-04,36,SIL-Instore,Train,0.084356
3,31d2e125-5b3e-4894-8069-c4b15573f0d3,2023-10,2023-10-23,43,SIL-Instore,Train,0.084557
4,60ae9748-fe3b-47f5-959a-12ee5b52ddbc,2023-07,2023-07-17,29,SIL-Instore,Train,0.126975


In [8]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.sb_demo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_demo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sb_demo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sb_demo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sb_demo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sb_demo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sb_demo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sb_demo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sb_demo_score_output_df = pd.DataFrame(monthly_psi_results)
sb_demo_score_output_df.rename(columns={'psivalues':'sb_demo_score_psivalues'}, inplace = True)
sb_demo_score_output_df

Job ID a93e9f8d-070c-4e52-bc65-8a47a4bd77d9 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    121661
Name: digitalLoanAccountId, dtype: int64
[0.02598842 0.07546623 0.09377793 0.10884758 0.12332053 0.1376085
 0.15237181 0.16863433 0.18702659 0.21413433 0.49832412]


Unnamed: 0,Month,scorename,DateCategory,sb_demo_score_psivalues,account_count
0,2024-06,sb_demo_score,Training,0.0,121661
1,2024-07,sb_demo_score,Monthly,0.006551,20382
2,2024-08,sb_demo_score,Monthly,0.007059,25136
3,2024-09,sb_demo_score,Monthly,0.00879,25284
4,2024-10,sb_demo_score,Monthly,0.00885,23980
5,2024-11,sb_demo_score,Monthly,0.011183,24407
6,2024-12,sb_demo_score,Monthly,0.02471,49721
7,2025-01,sb_demo_score,Monthly,0.046982,23473
8,2025-02,sb_demo_score,Monthly,0.049794,4477


# s_cic_score

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_cic_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base 
 where s_cic_score is not null and  dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_cic_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)
    
    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_cic_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)
    
    # print(f"distribution_aligned-{distribution_aligned}")
    # print(f"train_dist_aligned - {train_dist_aligned}")

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_cic_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 's_cic_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_cic_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 's_cic_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_cic_score_output_df = pd.DataFrame(monthly_psi_results)
s_cic_score_output_df.rename(columns={'psivalues':'s_cic_score_psivalues'}, inplace = True)
s_cic_score_output_df

Job ID b0c8047c-0dc6-450d-a271-ef86fc5945ab successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     138094
Train     47680
Name: digitalLoanAccountId, dtype: int64
[0.03142267 0.06353086 0.072324   0.0814206  0.09121701 0.10908593
 0.13162711 0.14313393 0.15483266 0.18569543 0.63407415]


Unnamed: 0,Month,scorename,DateCategory,s_cic_score_psivalues,account_count
0,2024-06,s_cic_score,Training,0.0,47680
1,2024-07,s_cic_score,Monthly,0.079906,13346
2,2024-08,s_cic_score,Monthly,0.052631,17808
3,2024-09,s_cic_score,Monthly,0.029636,17502
4,2024-10,s_cic_score,Monthly,0.021937,16817
5,2024-11,s_cic_score,Monthly,0.015664,17931
6,2024-12,s_cic_score,Monthly,0.015624,34696
7,2025-01,s_cic_score,Monthly,0.024698,16804
8,2025-02,s_cic_score,Monthly,0.026525,3190


# Checking CIC psi with period testing from jan 2025 to feb2-25

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  ---FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
 ---- EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  ---a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_cic_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where dataselection in ('Train', 'Test') and s_cic_score is not null;"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Function to calculate PSI between two periods
def calculate_psi(expected_array, actual_array, bins=10):
    """
    Calculate PSI for two arrays
    
    Parameters:
    -----------
    expected_array : numpy array of expected/training values
    actual_array : numpy array of actual/test values
    bins : number of bins to create
    
    Returns:
    --------
    psi_value : float, the calculated PSI value
    bin_details : DataFrame with binning details
    """
    # Create bins based on the expected array
    quantiles = np.linspace(0, 1, bins+1)
    bin_edges = np.quantile(expected_array, quantiles)
    
    # Ensure bin edges are unique (handle duplicates if they exist)
    bin_edges = np.unique(bin_edges)
    if len(bin_edges) < bins + 1:
        # Add small increments to duplicate values
        temp_edges = np.sort(np.unique(expected_array))
        if len(temp_edges) >= bins + 1:
            bin_edges = np.quantile(temp_edges, quantiles)
        else:
            # If not enough unique values, use min-max range divided into bins
            bin_edges = np.linspace(min(expected_array), max(expected_array), bins+1)
    
    # Create bins for both arrays
    expected_counts, _ = np.histogram(expected_array, bins=bin_edges)
    actual_counts, _ = np.histogram(actual_array, bins=bin_edges)
    
    # Calculate percentages
    expected_percents = expected_counts / len(expected_array) * 100
    actual_percents = actual_counts / len(actual_array) * 100
    
    # Calculate differences and PSI components
    diff = actual_percents - expected_percents
    
    # Safe division and log calculation (avoiding div by zero)
    ratio = np.divide(actual_percents, expected_percents, 
                     out=np.ones_like(actual_percents), 
                     where=expected_percents!=0)
    ln_ratio = np.log(ratio, out=np.zeros_like(ratio), where=ratio>0)
    
    # Calculate PSI components and total
    psi_components = diff / 100 * ln_ratio
    psi_value = np.sum(psi_components)
    
    # Create detailed results DataFrame
    bin_details = pd.DataFrame({
        'Bins': [f"{i+1}" for i in range(len(expected_counts))],
        '# Train': expected_counts,
        '# Train %': expected_percents,
        '# Test': actual_counts,
        '# Test %': actual_percents,
        'A-B': diff,
        'ln(A/B)': ln_ratio,
        'PSI': psi_components * 100
    })
    
    bin_details.loc['Grand Total'] = [
        '', sum(expected_counts), 100.0, sum(actual_counts), 100.0, '', '', psi_value * 100
    ]
    
    return psi_value, bin_details

# Calculate monthly PSI as in your original code
def calculate_monthly_psi():
    # Calculate decile bins for the entire training set
    train_deciles = pd.qcut(train_df['s_cic_score'], 10, labels=False, retbins=True)
    train_decile_bins = train_deciles[1]
    
    # Get the last month of the training set
    last_train_month = train_df['Application_month'].max()
    last_train_month_str = last_train_month.strftime('%Y-%m')
    
    # Calculate monthly PSI for the test set
    monthly_psi_results = []
    
    # Add the train set PSI to the results (with the correct last month)
    monthly_psi_results.append({
        'Month': last_train_month_str,
        'scorename': 's_cic_score',
        'DateCategory': 'Training',
        'psivalues': 0.0  # PSI against itself is 0
    })
    
    # Calculate monthly PSI for the test set
    for month in sorted(test_df['Application_month'].unique()):
        month_str = month.strftime('%Y-%m')
        month_df = test_df[test_df['Application_month'] == month]
        
        if not month_df.empty:
            # Calculate PSI using our function
            month_psi, _ = calculate_psi(train_df['s_cic_score'].values, month_df['s_cic_score'].values)
            
            monthly_psi_results.append({
                'Month': month_str,
                'scorename': 's_cic_score',
                'DateCategory': 'Monthly',
                'psivalues': month_psi
            })
    
    # Create the output DataFrame
    monthly_psi_df = pd.DataFrame(monthly_psi_results)
    monthly_psi_df.rename(columns={'psivalues': 's_cic_score_psivalues'}, inplace=True)
    
    return monthly_psi_df

# Calculate PSI between two specific periods (as shown in the image)
def calculate_period_psi():
    # Define the periods matching the image
    train_period = train_df  # Already defined as 2023-07 to 2024-06
    
    # Filter test data for Jan-Feb 2025 - using datetime objects to avoid the February 29 issue
    jan_2025 = pd.Timestamp('2025-01-01')
    feb_2025 = pd.Timestamp('2025-02-28')  # Using Feb 28 instead of Feb 29
    
    test_period = test_df[(test_df['Application_month'] >= jan_2025) & 
                          (test_df['Application_month'] <= feb_2025)]
    
    # Calculate PSI between periods
    period_psi, psi_details = calculate_psi(train_period['s_cic_score'].values, 
                                           test_period['s_cic_score'].values,
                                           bins=10)
    
    print("PSI between 2023-07 to 2024-06 and 2025-01 to 2025-02:")
    print(f"Overall PSI: {period_psi:.6f}")
    
    return period_psi, psi_details

# Run both calculations
print("Calculating monthly PSI values...")
monthly_psi_results = calculate_monthly_psi()
print(monthly_psi_results)

print("\nCalculating period PSI (matching the image)...")
period_psi, psi_details = calculate_period_psi()
print("\nDetailed PSI calculation by bin:")
print(psi_details)

Job ID 9f1f504d-4905-476e-b8ad-b3e6c19e1b68 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
Calculating monthly PSI values...
     Month    scorename DateCategory  s_cic_score_psivalues
0  2024-06  s_cic_score     Training               0.000000
1  2024-07  s_cic_score      Monthly               0.080083
2  2024-08  s_cic_score      Monthly               0.052951
3  2024-09  s_cic_score      Monthly               0.029641
4  2024-10  s_cic_score      Monthly               0.021839
5  2024-11  s_cic_score      Monthly               0.015658
6  2024-12  s_cic_score      Monthly               0.015542
7  2025-01  s_cic_score      Monthly               0.024438
8  2025-02  s_cic_score      Monthly               0.026403

Calculating period PSI (matching the image)...
PSI between 2023-07 to 2024-06 and 2025-01 to 2025-02:
Overall PSI: 0.024400

Detailed PSI calculation by bin:
            Bins  # Train   # Train %  # Test    # Test %       A-B   ln(A/B)  \
0              1     4771   10.000000    1668    8.342503 -1.657497 -0.181222   
1    

In [11]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_cic_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where s_cic_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')


Job ID aa8e3c6e-846f-412e-8a24-dfac18be18fc successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [12]:
df.groupby(['Application_month', 'dataselection'])['digitalLoanAccountId'].nunique()

Application_month  dataselection
2023-07            Train              521
2023-08            Train             1148
2023-09            Train             1886
2023-10            Train             1836
2023-11            Train             2505
2023-12            Train             4458
2024-01            Train             2506
2024-02            Train             2382
2024-03            Train             3083
2024-04            Train             5851
2024-05            Train             8033
2024-06            Train            13471
2024-07            Test             13346
2024-08            Test             17808
2024-09            Test             17502
2024-10            Test             16817
2024-11            Test             17931
2024-12            Test             34696
2025-01            Test             16804
2025-02            Test              3190
Name: digitalLoanAccountId, dtype: int64

In [13]:
df.to_csv("Test.csv")

# sb_stack_score

In [14]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.sb_stack_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_stack_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sb_stack_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sb_stack_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sb_stack_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sb_stack_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
    
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sb_stack_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sb_stack_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sb_stack_score_output_df = pd.DataFrame(monthly_psi_results)
sb_stack_score_output_df.rename(columns={'psivalues':'sb_stack_score_psivalues'}, inplace = True)
sb_stack_score_output_df

Job ID e62032a1-88a3-4973-95fe-7b5d37a5eadc successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00485871 0.02681654 0.03643278 0.04736433 0.06184279 0.07756521
 0.09464254 0.11425851 0.14086449 0.18536298 0.69430952]


Unnamed: 0,Month,scorename,DateCategory,sb_stack_score_psivalues,account_count
0,2024-06,sb_stack_score,Training,0.0,120879
1,2024-07,sb_stack_score,Monthly,0.032516,20382
2,2024-08,sb_stack_score,Monthly,0.033468,25136
3,2024-09,sb_stack_score,Monthly,0.042587,25284
4,2024-10,sb_stack_score,Monthly,0.03078,23980
5,2024-11,sb_stack_score,Monthly,0.033394,24407
6,2024-12,sb_stack_score,Monthly,0.030957,49721
7,2025-01,sb_stack_score,Monthly,0.181756,23473
8,2025-02,sb_stack_score,Monthly,0.189094,4477


# sa_stack_score

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.sa_stack_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sa_stack_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sa_stack_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sa_stack_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sa_stack_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sa_stack_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sa_stack_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sa_stack_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sa_stack_score_output_df = pd.DataFrame(monthly_psi_results)
sa_stack_score_output_df.rename(columns={'psivalues':'sa_stack_score_psivalues'}, inplace = True)
sa_stack_score_output_df


Job ID 1c5eb31a-58e0-43a7-8626-2e71f45b5d53 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00588077 0.0303127  0.04349289 0.0536179  0.06704428 0.0832338
 0.09947512 0.12143455 0.14961535 0.19755939 0.79084583]


Unnamed: 0,Month,scorename,DateCategory,sa_stack_score_psivalues,account_count
0,2024-06,sa_stack_score,Training,0.0,120879
1,2024-07,sa_stack_score,Monthly,0.040464,20382
2,2024-08,sa_stack_score,Monthly,0.035464,25136
3,2024-09,sa_stack_score,Monthly,0.039231,25284
4,2024-10,sa_stack_score,Monthly,0.02592,23980
5,2024-11,sa_stack_score,Monthly,0.03229,24407
6,2024-12,sa_stack_score,Monthly,0.026966,49721
7,2025-01,sa_stack_score,Monthly,0.035306,23473
8,2025-02,sa_stack_score,Monthly,0.028978,4477


# c_credo_score_output_df

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.c_credo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where c_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['c_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['c_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['c_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'c_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['c_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'c_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
c_credo_score_output_df = pd.DataFrame(monthly_psi_results)
c_credo_score_output_df.rename(columns={'psivalues':'c_credo_score_psivalues'}, inplace = True)

Job ID 95731880-6af3-4275-8fe9-d4ac83e9db47 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01944472 0.15415312 0.18755253 0.21867693 0.25137791 0.28300081
 0.31850413 0.35944225 0.40895377 0.47980304 0.85616948]


In [17]:
c_credo_score_output_df

Unnamed: 0,Month,scorename,DateCategory,c_credo_score_psivalues,account_count
0,2024-06,c_credo_score,Training,0.0,120879
1,2024-07,c_credo_score,Monthly,0.006339,20382
2,2024-08,c_credo_score,Monthly,0.003617,25136
3,2024-09,c_credo_score,Monthly,0.003706,25284
4,2024-10,c_credo_score,Monthly,0.00244,23980
5,2024-11,c_credo_score,Monthly,0.002303,24407
6,2024-12,c_credo_score,Monthly,0.002678,49721
7,2025-01,c_credo_score,Monthly,0.00262,23473
8,2025-02,c_credo_score,Monthly,0.011446,4477


# s_credo_score

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_credo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where s_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 's_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 's_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_credo_score_output_df = pd.DataFrame(monthly_psi_results)
s_credo_score_output_df.rename(columns= {'psivalues':'s_credo_score_psivalues'}, inplace = True)
s_credo_score_output_df

Job ID 8c6b970e-463c-4d8d-a7e5-20b4919b36ef successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01777745 0.05814829 0.07035747 0.08058057 0.09079362 0.10110136
 0.11250886 0.12671437 0.14598791 0.17644707 0.44602214]


Unnamed: 0,Month,scorename,DateCategory,s_credo_score_psivalues,account_count
0,2024-06,s_credo_score,Training,0.0,120879
1,2024-07,s_credo_score,Monthly,0.006315,20382
2,2024-08,s_credo_score,Monthly,0.002445,25136
3,2024-09,s_credo_score,Monthly,0.01053,25284
4,2024-10,s_credo_score,Monthly,0.016362,23980
5,2024-11,s_credo_score,Monthly,0.007729,24407
6,2024-12,s_credo_score,Monthly,0.006495,49721
7,2025-01,s_credo_score,Monthly,0.025541,23473
8,2025-02,s_credo_score,Monthly,0.015285,4477


# fu_credo_score

In [19]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.fu_credo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where fu_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['fu_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['fu_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['fu_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'fu_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['fu_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'fu_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
fu_credo_score_output_df = pd.DataFrame(monthly_psi_results)
fu_credo_score_output_df.rename(columns={'psivalues':'fu_credo_score_psivalues'}, inplace = True)
fu_credo_score_output_df

Job ID 9a1b41ea-e96b-4602-9bc5-71c896822a6b successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00198555 0.02844122 0.04201382 0.05616849 0.07195195 0.09119812
 0.11547502 0.1455796  0.18542508 0.23653321 0.86342827]


Unnamed: 0,Month,scorename,DateCategory,fu_credo_score_psivalues,account_count
0,2024-06,fu_credo_score,Training,0.0,120879
1,2024-07,fu_credo_score,Monthly,0.006642,20382
2,2024-08,fu_credo_score,Monthly,0.006873,25136
3,2024-09,fu_credo_score,Monthly,0.008629,25284
4,2024-10,fu_credo_score,Monthly,0.017082,23980
5,2024-11,fu_credo_score,Monthly,0.01192,24407
6,2024-12,fu_credo_score,Monthly,0.010395,49721
7,2025-01,fu_credo_score,Monthly,0.004568,23473
8,2025-02,fu_credo_score,Monthly,0.005654,4477


# r_credo_score

In [20]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.r_credo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where r_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['r_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['r_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['r_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'r_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['r_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'r_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
r_credo_score_output_df = pd.DataFrame(monthly_psi_results)
r_credo_score_output_df.rename(columns={'psivalues':'r_credo_score_psivalues'}, inplace = True)
r_credo_score_output_df


Job ID 3a9b055e-64d4-4843-8747-58a5f04a46cb successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00715845 0.09851276 0.12676541 0.15483299 0.18619981 0.21976724
 0.25745943 0.30091241 0.35520741 0.4357053  0.87489346]


Unnamed: 0,Month,scorename,DateCategory,r_credo_score_psivalues,account_count
0,2024-06,r_credo_score,Training,0.0,120879
1,2024-07,r_credo_score,Monthly,0.010765,20382
2,2024-08,r_credo_score,Monthly,0.008318,25136
3,2024-09,r_credo_score,Monthly,0.004315,25284
4,2024-10,r_credo_score,Monthly,0.00619,23980
5,2024-11,r_credo_score,Monthly,0.002788,24407
6,2024-12,r_credo_score,Monthly,0.003019,49721
7,2025-01,r_credo_score,Monthly,0.01147,23473
8,2025-02,r_credo_score,Monthly,0.007871,4477


# gen_credo_score

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.gen_credo_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where gen_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['gen_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['gen_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['gen_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'gen_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['gen_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'gen_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
gen_credo_score_output_df = pd.DataFrame(monthly_psi_results)
gen_credo_score_output_df.rename(columns={'psivalues':'gen_credo_score_psivalues'}, inplace = True)
gen_credo_score_output_df

Job ID fc548c2b-6b16-494a-9e1c-0c51e9d42806 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     196860
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01431733 0.05413615 0.06841584 0.08116088 0.09353156 0.10626519
 0.12042339 0.13726809 0.15942609 0.19533982 0.57325398]


Unnamed: 0,Month,scorename,DateCategory,gen_credo_score_psivalues,account_count
0,2024-06,gen_credo_score,Training,0.0,120879
1,2024-07,gen_credo_score,Monthly,0.014691,20382
2,2024-08,gen_credo_score,Monthly,0.008808,25136
3,2024-09,gen_credo_score,Monthly,0.015344,25284
4,2024-10,gen_credo_score,Monthly,0.024166,23980
5,2024-11,gen_credo_score,Monthly,0.015595,24407
6,2024-12,gen_credo_score,Monthly,0.014782,49721
7,2025-01,gen_credo_score,Monthly,0.050278,23473
8,2025-02,gen_credo_score,Monthly,0.018422,4477


# Combining dataframes

In [22]:
# import functools

# dataframes = [s_apps_score_output_df, sb_demo_score_output_df, s_cic_score_output_df, sb_stack_score_output_df,sa_stack_score_output_df, c_credo_score_output_df, s_credo_score_output_df, fu_credo_score_output_df,
#               r_credo_score_output_df, gen_credo_score_output_df]
# common_columns = ['Month', 'scorename', 'DateCategory', 'psivalues']

# def merge_dataframes(df1, df2):
#     return pd.merge(df1, df2, on=common_columns, how='outer')

# final_df = functools.reduce(merge_dataframes, dataframes)

# final_df.columns.values

In [23]:
import pandas as pd

def concatenate_dataframes(dataframe_list):
    """
    Concatenates a list of Pandas DataFrames into a single DataFrame.

    Args:
        dataframe_list: A list of Pandas DataFrames to concatenate.

    Returns:
        A single concatenated Pandas DataFrame, or None if the input list is empty.
    """
    if not dataframe_list:
        return None  # Return None if the list is empty

    try:
        concatenated_df = pd.concat(dataframe_list, ignore_index=True)
        return concatenated_df
    except Exception as e:
        print(f"An error occurred during concatenation: {e}")
        return None

# Example usage (assuming your DataFrames are already defined):

# Replace these with your actual DataFrames
# s_apps_score_output_df = ...
# sb_demo_score_output_df = ...
# s_cic_score_output_df = ...
# sb_stack_score_output_df = ...
# sa_stack_score_output_df = ...
# c_credo_score_output_df = ...
# s_credo_score_output_df = ...
# fu_credo_score_output_df = ...
# r_credo_score_output_df = ...
# gen_credo_score_output_df = ...

dataframe_list = [
    s_apps_score_output_df,
    sb_demo_score_output_df,
    s_cic_score_output_df,
    sb_stack_score_output_df,
    sa_stack_score_output_df,
    c_credo_score_output_df,
    s_credo_score_output_df,
    fu_credo_score_output_df,
    r_credo_score_output_df,
    gen_credo_score_output_df,
]

concatenated_result = concatenate_dataframes(dataframe_list)

if concatenated_result is not None:
    print(concatenated_result)
else:
    print("Concatenation failed or the input list was empty.")

      Month        scorename DateCategory  s_apps_score_psivalues  \
0   2024-06     s_apps_score     Training                0.000000   
1   2024-07     s_apps_score      Monthly                0.034506   
2   2024-08     s_apps_score      Monthly                0.034966   
3   2024-09     s_apps_score      Monthly                0.045057   
4   2024-10     s_apps_score      Monthly                0.044652   
..      ...              ...          ...                     ...   
85  2024-10  gen_credo_score      Monthly                     NaN   
86  2024-11  gen_credo_score      Monthly                     NaN   
87  2024-12  gen_credo_score      Monthly                     NaN   
88  2025-01  gen_credo_score      Monthly                     NaN   
89  2025-02  gen_credo_score      Monthly                     NaN   

    account_count  sb_demo_score_psivalues  s_cic_score_psivalues  \
0          105027                      NaN                    NaN   
1           18549                

In [24]:
concatenated_result.dtypes

Month                         object
scorename                     object
DateCategory                  object
s_apps_score_psivalues       float64
account_count                  int64
sb_demo_score_psivalues      float64
s_cic_score_psivalues        float64
sb_stack_score_psivalues     float64
sa_stack_score_psivalues     float64
c_credo_score_psivalues      float64
s_credo_score_psivalues      float64
fu_credo_score_psivalues     float64
r_credo_score_psivalues      float64
gen_credo_score_psivalues    float64
dtype: object

In [25]:
sq = """drop table if exists prj-prod-dataplatform.dap_ds_poweruser_playground.Model_Psi;"""

client.query(sq)

QueryJob<project=prj-prod-dataplatform, location=asia-southeast1, id=c544f2c8-229e-4c4d-9ac9-6d17bf04d32c>

In [26]:
import pandas as pd
from google.cloud import bigquery

# Create a BigQuery client
client = bigquery.Client('prj-prod-dataplatform')

# Define your table schema
table_schema = [
    bigquery.SchemaField('Month', 'STRING'),
    bigquery.SchemaField('scorename', 'STRING'),
    bigquery.SchemaField('DateCategory', 'STRING'),
    bigquery.SchemaField('s_apps_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('account_count', 'INT64'),
    bigquery.SchemaField('sb_demo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('s_cic_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('sb_stack_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('sa_stack_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('c_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('s_credo_score_psivalue', 'FLOAT64'),
    bigquery.SchemaField('fu_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('r_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('gen_credo_score_psivalues', 'FLOAT64'),
   
]

# Create your BigQuery table
table_id = 'prj-prod-dataplatform.dap_ds_poweruser_playground.Model_Psi'
table = bigquery.Table(table_id, schema=table_schema)
table = client.create_table(table)

# Load your DataFrame into BigQuery
job_config = bigquery.LoadJobConfig(
    write_disposition='WRITE_TRUNCATE'
)

load_job = client.load_table_from_dataframe(
    concatenated_result, table_id, job_config=job_config
)

load_job.result()

LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=16fe5bed-fce3-4087-afb6-645848c7ea57>

In [27]:
concatenated_result.head()

Unnamed: 0,Month,scorename,DateCategory,s_apps_score_psivalues,account_count,sb_demo_score_psivalues,s_cic_score_psivalues,sb_stack_score_psivalues,sa_stack_score_psivalues,c_credo_score_psivalues,s_credo_score_psivalues,fu_credo_score_psivalues,r_credo_score_psivalues,gen_credo_score_psivalues
0,2024-06,s_apps_score,Training,0.0,105027,,,,,,,,,
1,2024-07,s_apps_score,Monthly,0.034506,18549,,,,,,,,,
2,2024-08,s_apps_score,Monthly,0.034966,22940,,,,,,,,,
3,2024-09,s_apps_score,Monthly,0.045057,23136,,,,,,,,,
4,2024-10,s_apps_score,Monthly,0.044652,20461,,,,,,,,,


In [28]:
sq = """
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.s_cic_score,
 from 
 prj-prod-dataplatform.risk_mart.sil_risk_ds_master_20230101_20250206 a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where s_cic_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

Job ID 3143642e-6251-4821-8fae-c4ea92ec41d6 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [29]:
df.groupby(['dataselection'])['digitalLoanAccountId'].nunique()

dataselection
Test     138094
Train     47680
Name: digitalLoanAccountId, dtype: int64