# <div align="center" style="color: #ff5733;">PSI Monitoring</div>

# Declare Libraries

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import bigquery
import os
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')


# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

In [2]:
a = "`prj-prod-dataplatform.risk_credit_mis.application_score_master`"

# s_apps_score

In [3]:
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-08-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_apps_score s_apps_score,
 from
  {a} a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where dataselection in ('Train', 'Test');"""
#  s_apps_score is not null and
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of df before duplicate drop is:\t {df.shape}")

df = df.drop_duplicates(keep='first')

print(f"The shape of df after duplicate drop is:\t {df.shape}")

Job ID 13e91ca8-d619-4944-b0bb-7f2167a2c605 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of df before duplicate drop is:	 (322861, 7)
The shape of df after duplicate drop is:	 (322861, 7)


In [4]:
df.head()

Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_loan_type,dataselection,s_apps_score
0,1da2fece-eafb-419b-9592-38a2b174296f,2025-05,2025-04-28,17,SIL-Instore,Test,0.4530694487935255
1,e94d2f93-55da-4cef-8da0-ba8af09e219d,2023-07,2023-07-24,30,SIL-Instore,Train,0.5632349436379913
2,1e0c2c1b-b49b-4ab8-a20b-f0e1a8c4729b,2023-10,2023-10-02,40,SIL-Instore,Train,0.5080933044180737
3,257c2f5c-901b-41ac-8d04-589fe750ec00,2023-08,2023-08-28,35,SIL-Instore,Train,0.5828646176268814
4,f5fded59-15e1-447d-acf5-ac17b9eb52b8,2025-04,2025-04-21,16,SIL-Instore,Test,0.5392054933755956


In [5]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_apps_score s_apps_score,
 from 
 {a} a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where dataselection in ('Train', 'Test') and s_apps_score is not null;"""

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# Covert s_apps_score to numeric if it's not already
df['s_apps_score'] = pd.to_numeric(df['s_apps_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_apps_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]

# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_apps_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_apps_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

import pandas as pd
import numpy as np
from datetime import datetime

# Your existing query and initial dataframe setup remains the same
# ...

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    'scorename': 's_apps_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_apps_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            'scorename': 's_apps_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_apps_score_output_df = pd.DataFrame(monthly_psi_results)



dataselection
Test     201982
Train    120879
Name: digitalLoanAccountId, dtype: int64
Job ID 1e40e6b8-6b99-45ae-83b7-a7356fe58b9a successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [6]:
s_apps_score_output_df.rename(columns={'psivalues':'s_apps_score_psivalues'}, inplace = True)

In [7]:
s_apps_score_output_df

Unnamed: 0,Month,scorename,DateCategory,s_apps_score_psivalues,account_count
0,2024-06,s_apps_score,Training,0.0,109413
1,2024-07,s_apps_score,Monthly,0.034295,18571
2,2024-08,s_apps_score,Monthly,0.034844,22959
3,2024-09,s_apps_score,Monthly,0.045087,23145
4,2024-10,s_apps_score,Monthly,0.046058,21868
5,2024-11,s_apps_score,Monthly,0.039535,22271
6,2024-12,s_apps_score,Monthly,0.027855,45503
7,2025-01,s_apps_score,Monthly,0.046292,21253
8,2025-02,s_apps_score,Monthly,0.058903,18625
9,2025-03,s_apps_score,Monthly,0.056474,6494


# sb_demo_score

In [8]:
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_demo_score sb_demo_score,
 from {a} a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_demo_score is not null and dataselection in ('Train', 'Test');"""
 
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
df.head()

Job ID 647cece7-fcb4-4d21-b452-05b8f79c2bb9 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


Unnamed: 0,digitalLoanAccountId,Application_month,Appl_week_start_date,Appl_week_number,ln_loan_type,dataselection,sb_demo_score
0,70be8c19-e29c-4a73-927b-d922b1ba5a06,2024-03,2024-02-26,9,SIL-Instore,Train,0.1087375900503546
1,1da2fece-eafb-419b-9592-38a2b174296f,2025-05,2025-04-28,17,SIL-Instore,Test,0.1694206828
2,e94d2f93-55da-4cef-8da0-ba8af09e219d,2023-07,2023-07-24,30,SIL-Instore,Train,0.1652616241338547
3,1e0c2c1b-b49b-4ab8-a20b-f0e1a8c4729b,2023-10,2023-10-02,40,SIL-Instore,Train,0.0722613840256545
4,257c2f5c-901b-41ac-8d04-589fe750ec00,2023-08,2023-08-28,35,SIL-Instore,Train,0.187182243730859


In [9]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.beta_demo_score sb_demo_score,
 from  {a} a
 where a.ln_loan_applied_flag = 1
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sb_demo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# Covert sb_demo_score to numeric if it's not already
df['sb_demo_score'] = pd.to_numeric(df['sb_demo_score'], errors='coerce')

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sb_demo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sb_demo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sb_demo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sb_demo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sb_demo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sb_demo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sb_demo_score_output_df = pd.DataFrame(monthly_psi_results)
sb_demo_score_output_df.rename(columns={'psivalues':'sb_demo_score_psivalues'}, inplace = True)
sb_demo_score_output_df

Job ID 95ba1314-a2e6-4ed3-95fc-637a35609bfb successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     222364
Train    121661
Name: digitalLoanAccountId, dtype: int64
[0.00933333 0.04167946 0.05390926 0.06440052 0.07481236 0.08554865
 0.09703304 0.10986142 0.12673287 0.1536843  0.46407017]


Unnamed: 0,Month,scorename,DateCategory,sb_demo_score_psivalues,account_count
0,2024-06,sb_demo_score,Training,0.0,121661
1,2024-07,sb_demo_score,Monthly,0.01219,20382
2,2024-08,sb_demo_score,Monthly,0.011571,25136
3,2024-09,sb_demo_score,Monthly,0.016299,25284
4,2024-10,sb_demo_score,Monthly,0.013297,23980
5,2024-11,sb_demo_score,Monthly,0.023381,24407
6,2024-12,sb_demo_score,Monthly,0.013167,49720
7,2025-01,sb_demo_score,Monthly,0.00288,23473
8,2025-02,sb_demo_score,Monthly,0.00491,20636
9,2025-03,sb_demo_score,Monthly,0.016699,7266


# s_cic_score

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.cic_score s_cic_score,
 from  {a} a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base 
 where s_cic_score is not null and  dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Covert s_cic_score to numeric if it's not already
df['s_cic_score'] = pd.to_numeric(df['s_cic_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_cic_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)
    
    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_cic_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)
    
    # print(f"distribution_aligned-{distribution_aligned}")
    # print(f"train_dist_aligned - {train_dist_aligned}")

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_cic_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 's_cic_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_cic_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 's_cic_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_cic_score_output_df = pd.DataFrame(monthly_psi_results)
s_cic_score_output_df.rename(columns={'psivalues':'s_cic_score_psivalues'}, inplace = True)
s_cic_score_output_df

Job ID 33aeca23-fa65-4d48-b7f3-278ecaa09495 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     155674
Train     47681
Name: digitalLoanAccountId, dtype: int64
[0.01708888 0.07034814 0.09291445 0.11136085 0.12356264 0.12566888
 0.14012539 0.16042926 0.17681078 0.19927335 0.52746045]


Unnamed: 0,Month,scorename,DateCategory,s_cic_score_psivalues,account_count
0,2024-06,s_cic_score,Training,0.0,47681
1,2024-07,s_cic_score,Monthly,0.100945,13346
2,2024-08,s_cic_score,Monthly,0.064985,17808
3,2024-09,s_cic_score,Monthly,0.041037,17502
4,2024-10,s_cic_score,Monthly,0.039467,16817
5,2024-11,s_cic_score,Monthly,0.04929,17931
6,2024-12,s_cic_score,Monthly,0.036114,34696
7,2025-01,s_cic_score,Monthly,0.051591,16804
8,2025-02,s_cic_score,Monthly,0.060779,14246
9,2025-03,s_cic_score,Monthly,0.049267,5413


# Checking CIC psi with period testing from jan 2025 to feb2-25

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime

# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  ---FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
 ---- EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  ---a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.cic_score s_cic_score,
 from {a}  a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where dataselection in ('Train', 'Test') and s_cic_score is not null;"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

# convert s_cic_score to numeric if it's not already
df['s_cic_score'] = pd.to_numeric(df['s_cic_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Function to calculate PSI between two periods
def calculate_psi(expected_array, actual_array, bins=10):
    """
    Calculate PSI for two arrays
    
    Parameters:
    -----------
    expected_array : numpy array of expected/training values
    actual_array : numpy array of actual/test values
    bins : number of bins to create
    
    Returns:
    --------
    psi_value : float, the calculated PSI value
    bin_details : DataFrame with binning details
    """
    # Create bins based on the expected array
    quantiles = np.linspace(0, 1, bins+1)
    bin_edges = np.quantile(expected_array, quantiles)
    
    # Ensure bin edges are unique (handle duplicates if they exist)
    bin_edges = np.unique(bin_edges)
    if len(bin_edges) < bins + 1:
        # Add small increments to duplicate values
        temp_edges = np.sort(np.unique(expected_array))
        if len(temp_edges) >= bins + 1:
            bin_edges = np.quantile(temp_edges, quantiles)
        else:
            # If not enough unique values, use min-max range divided into bins
            bin_edges = np.linspace(min(expected_array), max(expected_array), bins+1)
    
    # Create bins for both arrays
    expected_counts, _ = np.histogram(expected_array, bins=bin_edges)
    actual_counts, _ = np.histogram(actual_array, bins=bin_edges)
    
    # Calculate percentages
    expected_percents = expected_counts / len(expected_array) * 100
    actual_percents = actual_counts / len(actual_array) * 100
    
    # Calculate differences and PSI components
    diff = actual_percents - expected_percents
    
    # Safe division and log calculation (avoiding div by zero)
    ratio = np.divide(actual_percents, expected_percents, 
                     out=np.ones_like(actual_percents), 
                     where=expected_percents!=0)
    ln_ratio = np.log(ratio, out=np.zeros_like(ratio), where=ratio>0)
    
    # Calculate PSI components and total
    psi_components = diff / 100 * ln_ratio
    psi_value = np.sum(psi_components)
    
    # Create detailed results DataFrame
    bin_details = pd.DataFrame({
        'Bins': [f"{i+1}" for i in range(len(expected_counts))],
        '# Train': expected_counts,
        '# Train %': expected_percents,
        '# Test': actual_counts,
        '# Test %': actual_percents,
        'A-B': diff,
        'ln(A/B)': ln_ratio,
        'PSI': psi_components * 100
    })
    
    bin_details.loc['Grand Total'] = [
        '', sum(expected_counts), 100.0, sum(actual_counts), 100.0, '', '', psi_value * 100
    ]
    
    return psi_value, bin_details

# Calculate monthly PSI as in your original code
def calculate_monthly_psi():
    # Calculate decile bins for the entire training set
    train_deciles = pd.qcut(train_df['s_cic_score'], 10, labels=False, retbins=True)
    train_decile_bins = train_deciles[1]
    
    # Get the last month of the training set
    last_train_month = train_df['Application_month'].max()
    last_train_month_str = last_train_month.strftime('%Y-%m')
    
    # Calculate monthly PSI for the test set
    monthly_psi_results = []
    
    # Add the train set PSI to the results (with the correct last month)
    monthly_psi_results.append({
        'Month': last_train_month_str,
        'scorename': 's_cic_score',
        'DateCategory': 'Training',
        'psivalues': 0.0  # PSI against itself is 0
    })
    
    # Calculate monthly PSI for the test set
    for month in sorted(test_df['Application_month'].unique()):
        month_str = month.strftime('%Y-%m')
        month_df = test_df[test_df['Application_month'] == month]
        
        if not month_df.empty:
            # Calculate PSI using our function
            month_psi, _ = calculate_psi(train_df['s_cic_score'].values, month_df['s_cic_score'].values)
            
            monthly_psi_results.append({
                'Month': month_str,
                'scorename': 's_cic_score',
                'DateCategory': 'Monthly',
                'psivalues': month_psi
            })
    
    # Create the output DataFrame
    monthly_psi_df = pd.DataFrame(monthly_psi_results)
    monthly_psi_df.rename(columns={'psivalues': 's_cic_score_psivalues'}, inplace=True)
    
    return monthly_psi_df

# Calculate PSI between two specific periods (as shown in the image)
def calculate_period_psi():
    # Define the periods matching the image
    train_period = train_df  # Already defined as 2023-07 to 2024-06
    
    # Filter test data for Jan-Feb 2025 - using datetime objects to avoid the February 29 issue
    jan_2025 = pd.Timestamp('2025-01-01')
    feb_2025 = pd.Timestamp('2025-02-28')  # Using Feb 28 instead of Feb 29
    
    test_period = test_df[(test_df['Application_month'] >= jan_2025) & 
                          (test_df['Application_month'] <= feb_2025)]
    
    # Calculate PSI between periods
    period_psi, psi_details = calculate_psi(train_period['s_cic_score'].values, 
                                           test_period['s_cic_score'].values,
                                           bins=10)
    
    print("PSI between 2023-07 to 2024-06 and 2025-01 to 2025-02:")
    print(f"Overall PSI: {period_psi:.6f}")
    
    return period_psi, psi_details

# Run both calculations
print("Calculating monthly PSI values...")
monthly_psi_results = calculate_monthly_psi()
print(monthly_psi_results)

print("\nCalculating period PSI (matching the image)...")
period_psi, psi_details = calculate_period_psi()
print("\nDetailed PSI calculation by bin:")
print(psi_details)

Job ID 818103f7-dfba-4bd8-81b5-c4762888dfdf successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
Calculating monthly PSI values...
      Month    scorename DateCategory  s_cic_score_psivalues
0   2024-06  s_cic_score     Training               0.000000
1   2024-07  s_cic_score      Monthly               0.089287
2   2024-08  s_cic_score      Monthly               0.059374
3   2024-09  s_cic_score      Monthly               0.035775
4   2024-10  s_cic_score      Monthly               0.034002
5   2024-11  s_cic_score      Monthly               0.034678
6   2024-12  s_cic_score      Monthly               0.026039
7   2025-01  s_cic_score      Monthly               0.040706
8   2025-02  s_cic_score      Monthly               0.042667
9   2025-03  s_cic_score      Monthly               0.037079
10  2025-04  s_cic_score      Monthly               0.082960
11  2025-05  s_cic_score      Monthly               0.069773

Calculating period PSI (matching the image)...
PSI between 2023-07 to 2024-06 and 2025-01 to 2025-02:
Overall PSI: 0.041226

Detaile

In [12]:
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.cic_score s_cic_score,
 from {a}  a
 where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where s_cic_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')


Job ID 75f1ad26-6fd6-4c85-9dd1-528e0c852579 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


In [13]:
df.groupby(['Application_month', 'dataselection'])['digitalLoanAccountId'].nunique()

Application_month  dataselection
2023-07            Train              521
2023-08            Train             1148
2023-09            Train             1886
2023-10            Train             1836
2023-11            Train             2505
2023-12            Train             4458
2024-01            Train             2506
2024-02            Train             2382
2024-03            Train             3083
2024-04            Train             5851
2024-05            Train             8033
2024-06            Train            13472
2024-07            Test             13346
2024-08            Test             17808
2024-09            Test             17502
2024-10            Test             16817
2024-11            Test             17931
2024-12            Test             34696
2025-01            Test             16804
2025-02            Test             14246
2025-03            Test              5413
2025-04            Test               541
2025-05            Test               570
N

In [14]:
df.to_csv("Test.csv")

# sb_stack_score

In [33]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
    with base as 
    (select 
    a.digitalLoanAccountId, 
    FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
    FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
    EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
    a.ln_loan_type,
    case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
        when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' and date_trunc(a.ln_appln_submit_datetime, day) <= '2025-04-14' then 'Test'
        when date_trunc(a.ln_appln_submit_datetime, day) >= '2025-04-15' and beta_api_called_flag = 1 then 'Test'
        Else 'Other' end dataselection,
    a.beta_stack_score sb_stack_score,
    from {a} a
    where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
    and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
    and a.ln_loan_type like 'SIL-Instore'
    )
    select * from base where sb_stack_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# convert sb_stack_score to numeric if it's not already
df['sb_stack_score'] = pd.to_numeric(df['sb_stack_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sb_stack_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sb_stack_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sb_stack_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sb_stack_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
    
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sb_stack_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sb_stack_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sb_stack_score_output_df = pd.DataFrame(monthly_psi_results)
sb_stack_score_output_df.rename(columns={'psivalues':'sb_stack_score_psivalues'}, inplace = True)
sb_stack_score_output_df

Job ID 347355eb-2462-43b1-863c-59736a11fc9e successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     197358
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00768898 0.02260348 0.03504416 0.04701145 0.05500463 0.06715637
 0.08229596 0.09879489 0.1175644  0.15264814 0.67827956]


Unnamed: 0,Month,scorename,DateCategory,sb_stack_score_psivalues,account_count
0,2024-06,sb_stack_score,Training,0.0,120879
1,2024-07,sb_stack_score,Monthly,0.016515,20382
2,2024-08,sb_stack_score,Monthly,0.017257,24161
3,2024-09,sb_stack_score,Monthly,0.023889,21930
4,2024-10,sb_stack_score,Monthly,0.02539,21436
5,2024-11,sb_stack_score,Monthly,0.025993,19339
6,2024-12,sb_stack_score,Monthly,0.022366,43835
7,2025-01,sb_stack_score,Monthly,0.049729,21214
8,2025-02,sb_stack_score,Monthly,0.070291,17473
9,2025-03,sb_stack_score,Monthly,0.084891,5685


# sa_stack_score

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.alpha_stack_score sa_stack_score,
 from {a}  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where sa_stack_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())
# Covert sa_stack_score to numeric if it's not already
df['sa_stack_score'] = pd.to_numeric(df['sa_stack_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['sa_stack_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['sa_stack_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['sa_stack_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'sa_stack_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['sa_stack_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'sa_stack_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
sa_stack_score_output_df = pd.DataFrame(monthly_psi_results)
sa_stack_score_output_df.rename(columns={'psivalues':'sa_stack_score_psivalues'}, inplace = True)
sa_stack_score_output_df


Job ID 8a0fd9fa-561e-4d81-85fd-9b0f462b53a1 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     155674
Train     47681
Name: digitalLoanAccountId, dtype: int64
[0.00758834 0.02078914 0.02803595 0.03564369 0.04440174 0.05423096
 0.06535389 0.07837858 0.09547307 0.12464445 0.65014194]


Unnamed: 0,Month,scorename,DateCategory,sa_stack_score_psivalues,account_count
0,2024-06,sa_stack_score,Training,0.0,47681
1,2024-07,sa_stack_score,Monthly,0.048171,13346
2,2024-08,sa_stack_score,Monthly,0.026966,17808
3,2024-09,sa_stack_score,Monthly,0.019346,17502
4,2024-10,sa_stack_score,Monthly,0.022262,16817
5,2024-11,sa_stack_score,Monthly,0.014487,17931
6,2024-12,sa_stack_score,Monthly,0.010317,34696
7,2025-01,sa_stack_score,Monthly,0.036582,16804
8,2025-02,sa_stack_score,Monthly,0.040083,14246
9,2025-03,sa_stack_score,Monthly,0.032365,5413


# c_credo_score_output_df

In [17]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_quick_score c_credo_score,
 from {a}  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where c_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert c_credo_score to numeric if it's not already
df['c_credo_score'] = pd.to_numeric(df['c_credo_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['c_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['c_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['c_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'c_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['c_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'c_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
c_credo_score_output_df = pd.DataFrame(monthly_psi_results)
c_credo_score_output_df.rename(columns={'psivalues':'c_credo_score_psivalues'}, inplace = True)

Job ID 66c1c635-2804-44f7-adea-59d7c9135121 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     211220
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01944472 0.15411407 0.18755253 0.21858305 0.25137791 0.2829256
 0.31840205 0.35924825 0.40896756 0.47986202 0.85616948]


In [18]:
c_credo_score_output_df

Unnamed: 0,Month,scorename,DateCategory,c_credo_score_psivalues,account_count
0,2024-06,c_credo_score,Training,0.0,120879
1,2024-07,c_credo_score,Monthly,0.006324,20382
2,2024-08,c_credo_score,Monthly,0.00367,25136
3,2024-09,c_credo_score,Monthly,0.003719,25284
4,2024-10,c_credo_score,Monthly,0.002474,23980
5,2024-11,c_credo_score,Monthly,0.002364,24407
6,2024-12,c_credo_score,Monthly,0.003007,49720
7,2025-01,c_credo_score,Monthly,0.002737,23473
8,2025-02,c_credo_score,Monthly,0.004556,16359
9,2025-03,c_credo_score,Monthly,0.020526,402


# s_credo_score

In [19]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_sil_score s_credo_score,
 from {a}  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where s_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert s_credo_score to numeric if it's not already
df['s_credo_score'] = pd.to_numeric(df['s_credo_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['s_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['s_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['s_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 's_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['s_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 's_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
s_credo_score_output_df = pd.DataFrame(monthly_psi_results)
s_credo_score_output_df.rename(columns= {'psivalues':'s_credo_score_psivalues'}, inplace = True)
s_credo_score_output_df

Job ID 391528bf-6f2d-4716-b49a-ce9b10965e3e successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     211220
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01777745 0.05813897 0.07035747 0.08056919 0.09077494 0.10106529
 0.11250886 0.12668458 0.14596628 0.17638774 0.44602214]


Unnamed: 0,Month,scorename,DateCategory,s_credo_score_psivalues,account_count
0,2024-06,s_credo_score,Training,0.0,120879
1,2024-07,s_credo_score,Monthly,0.006408,20382
2,2024-08,s_credo_score,Monthly,0.00251,25136
3,2024-09,s_credo_score,Monthly,0.010615,25284
4,2024-10,s_credo_score,Monthly,0.016447,23980
5,2024-11,s_credo_score,Monthly,0.007789,24407
6,2024-12,s_credo_score,Monthly,0.006564,49720
7,2025-01,s_credo_score,Monthly,0.025764,23473
8,2025-02,s_credo_score,Monthly,0.013218,16359
9,2025-03,s_credo_score,Monthly,0.102192,402


# fu_credo_score

In [20]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_flex_score fu_credo_score,
 from {a} a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where fu_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert fu_credo_score to numeric if it's not already
df['fu_credo_score'] = pd.to_numeric(df['fu_credo_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['fu_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['fu_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['fu_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'fu_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['fu_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'fu_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
fu_credo_score_output_df = pd.DataFrame(monthly_psi_results)
fu_credo_score_output_df.rename(columns={'psivalues':'fu_credo_score_psivalues'}, inplace = True)
fu_credo_score_output_df

Job ID 59c8fe7f-e03d-4a53-9bbb-8d171889ec45 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     211220
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00198555 0.02843186 0.04202477 0.05620001 0.07197959 0.09121869
 0.11549339 0.14562888 0.18544468 0.23653321 0.86342827]


Unnamed: 0,Month,scorename,DateCategory,fu_credo_score_psivalues,account_count
0,2024-06,fu_credo_score,Training,0.0,120879
1,2024-07,fu_credo_score,Monthly,0.006596,20382
2,2024-08,fu_credo_score,Monthly,0.006915,25136
3,2024-09,fu_credo_score,Monthly,0.008612,25284
4,2024-10,fu_credo_score,Monthly,0.017032,23980
5,2024-11,fu_credo_score,Monthly,0.011919,24407
6,2024-12,fu_credo_score,Monthly,0.010394,49720
7,2025-01,fu_credo_score,Monthly,0.004554,23473
8,2025-02,fu_credo_score,Monthly,0.005974,16359
9,2025-03,fu_credo_score,Monthly,0.109135,402


# r_credo_score

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_reloan_score r_credo_score,
 from  {a} a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-07-01'
 )
 select * from base where r_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert r_credo_score to numeric if it's not already
df['r_credo_score'] = pd.to_numeric(df['r_credo_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['r_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['r_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['r_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'r_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['r_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'r_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
r_credo_score_output_df = pd.DataFrame(monthly_psi_results)
r_credo_score_output_df.rename(columns={'psivalues':'r_credo_score_psivalues'}, inplace = True)
r_credo_score_output_df


Job ID b305a418-2df3-4b48-ba49-2bf7d69e6696 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     211220
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.00715845 0.0984758  0.12671816 0.15475974 0.18609169 0.21967947
 0.25740281 0.30077284 0.35506124 0.43564143 0.87489346]


Unnamed: 0,Month,scorename,DateCategory,r_credo_score_psivalues,account_count
0,2024-06,r_credo_score,Training,0.0,120879
1,2024-07,r_credo_score,Monthly,0.010792,20382
2,2024-08,r_credo_score,Monthly,0.008494,25136
3,2024-09,r_credo_score,Monthly,0.005283,25284
4,2024-10,r_credo_score,Monthly,0.007098,23980
5,2024-11,r_credo_score,Monthly,0.004153,24407
6,2024-12,r_credo_score,Monthly,0.004144,49720
7,2025-01,r_credo_score,Monthly,0.012277,23473
8,2025-02,r_credo_score,Monthly,0.005315,16359
9,2025-03,r_credo_score,Monthly,0.028078,402


# gen_credo_score

In [22]:
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your DataFrame is called 'df' and has the structure from the image
# Replace this with your actual DataFrame loading process
sq = f"""
with base as 
(select 
  a.digitalLoanAccountId, 
  FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
  FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
  EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
  a.ln_loan_type,
  case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
       when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
       Else 'Other' end dataselection,
  a.credo_gen_score gen_credo_score,
 from {a}  a
 where a.ln_loan_applied_flag = 1  and ln_dl_rule_reject_flag = 0
 and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
 )
 select * from base where gen_credo_score is not null and dataselection in ('Train', 'Test');"""
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(df.groupby(['dataselection'])['digitalLoanAccountId'].nunique())

# Convert gen_credo_score to numeric if it's not already
df['gen_credo_score'] = pd.to_numeric(df['gen_credo_score'], errors='coerce')

# Convert Application_month to datetime if it's not already
if df['Application_month'].dtype != 'datetime64[ns]':
    df['Application_month'] = pd.to_datetime(df['Application_month'] + '-01')

# Separate train and test data
train_df = df[df['dataselection'] == 'Train']
test_df = df[df['dataselection'] == 'Test']

# Calculate decile bins for the entire training set
train_deciles = pd.qcut(train_df['gen_credo_score'], 10, labels=False, retbins=True)
train_decile_bins = train_deciles[1]
print(train_decile_bins)
# Function to calculate PSI using the pre-defined decile bins
def calculate_psi_with_bins(data_scores, decile_bins):
    """Calculates PSI using pre-defined decile bins."""
    data_deciles = pd.cut(data_scores, bins=decile_bins, labels=False, include_lowest=True)
    distribution = pd.Series(data_deciles).value_counts().sort_index() / len(data_scores)

    # Align with training distribution
    all_bins = range(10)  # Assuming 10 deciles
    distribution_aligned = distribution.reindex(all_bins, fill_value=0)
    train_dist_aligned = pd.Series(train_deciles[0]).value_counts().sort_index() / len(train_df['gen_credo_score'])
    train_dist_aligned = train_dist_aligned.reindex(all_bins, fill_value=0)

    psi_values = (distribution_aligned - train_dist_aligned) * np.log(distribution_aligned / train_dist_aligned)
    return psi_values.sum()

# Calculate PSI for the entire training set
train_psi = calculate_psi_with_bins(train_df['gen_credo_score'], train_decile_bins)

# Get the last month of the training set
last_train_month = train_df['Application_month'].max()
last_train_month_str = last_train_month.strftime('%Y-%m')

# Calculate monthly PSI for the test set
monthly_psi_results = []

# Add the train set PSI to the results (with the correct last month)
monthly_psi_results.append({
    'Month': last_train_month_str,  # Use the last month of the training set
    # 'loan_type': train_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the train set
    'scorename': 'gen_credo_score',
    'DateCategory': 'Training',
    'psivalues': train_psi,
    'account_count': train_df['digitalLoanAccountId'].nunique()  # Add distinct account count
})

# Calculate monthly PSI for the test set
for month in sorted(test_df['Application_month'].unique()):
    month_str = month.strftime('%Y-%m')
    month_df = test_df[test_df['Application_month'] == month]
    if not month_df.empty:
        month_psi = calculate_psi_with_bins(month_df['gen_credo_score'], train_decile_bins)
        monthly_psi_results.append({
            'Month': month_str,
            # 'loan_type': month_df['ln_loan_type'].iloc[0],  # Assuming loan_type is consistent in the month
            'scorename': 'gen_credo_score',
            'DateCategory': 'Monthly',
            'psivalues': month_psi,
            'account_count': month_df['digitalLoanAccountId'].nunique()  # Add distinct account count
        })

# Create the output DataFrame
gen_credo_score_output_df = pd.DataFrame(monthly_psi_results)
gen_credo_score_output_df.rename(columns={'psivalues':'gen_credo_score_psivalues'}, inplace = True)
gen_credo_score_output_df

Job ID e212eea8-02be-4ebd-8878-e15bd849453f successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
dataselection
Test     222362
Train    120879
Name: digitalLoanAccountId, dtype: int64
[0.01431733 0.05413145 0.06841259 0.0811483  0.09350863 0.10622257
 0.1203887  0.13724142 0.15937649 0.195322   0.57325398]


Unnamed: 0,Month,scorename,DateCategory,gen_credo_score_psivalues,account_count
0,2024-06,gen_credo_score,Training,0.0,120879
1,2024-07,gen_credo_score,Monthly,0.014756,20382
2,2024-08,gen_credo_score,Monthly,0.008826,25136
3,2024-09,gen_credo_score,Monthly,0.015472,25284
4,2024-10,gen_credo_score,Monthly,0.024325,23980
5,2024-11,gen_credo_score,Monthly,0.015672,24407
6,2024-12,gen_credo_score,Monthly,0.014879,49720
7,2025-01,gen_credo_score,Monthly,0.050419,23473
8,2025-02,gen_credo_score,Monthly,0.025805,20636
9,2025-03,gen_credo_score,Monthly,0.036191,7266


# Combining dataframes

In [23]:
# import functools

# dataframes = [s_apps_score_output_df, sb_demo_score_output_df, s_cic_score_output_df, sb_stack_score_output_df,sa_stack_score_output_df, c_credo_score_output_df, s_credo_score_output_df, fu_credo_score_output_df,
#               r_credo_score_output_df, gen_credo_score_output_df]
# common_columns = ['Month', 'scorename', 'DateCategory', 'psivalues']

# def merge_dataframes(df1, df2):
#     return pd.merge(df1, df2, on=common_columns, how='outer')

# final_df = functools.reduce(merge_dataframes, dataframes)

# final_df.columns.values

In [24]:
import pandas as pd

def concatenate_dataframes(dataframe_list):
    """
    Concatenates a list of Pandas DataFrames into a single DataFrame.

    Args:
        dataframe_list: A list of Pandas DataFrames to concatenate.

    Returns:
        A single concatenated Pandas DataFrame, or None if the input list is empty.
    """
    if not dataframe_list:
        return None  # Return None if the list is empty

    try:
        concatenated_df = pd.concat(dataframe_list, ignore_index=True)
        return concatenated_df
    except Exception as e:
        print(f"An error occurred during concatenation: {e}")
        return None

# Example usage (assuming your DataFrames are already defined):

# Replace these with your actual DataFrames
# s_apps_score_output_df = ...
# sb_demo_score_output_df = ...
# s_cic_score_output_df = ...
# sb_stack_score_output_df = ...
# sa_stack_score_output_df = ...
# c_credo_score_output_df = ...
# s_credo_score_output_df = ...
# fu_credo_score_output_df = ...
# r_credo_score_output_df = ...
# gen_credo_score_output_df = ...

dataframe_list = [
    s_apps_score_output_df,
    sb_demo_score_output_df,
    s_cic_score_output_df,
    sb_stack_score_output_df,
    sa_stack_score_output_df,
    c_credo_score_output_df,
    s_credo_score_output_df,
    fu_credo_score_output_df,
    r_credo_score_output_df,
    gen_credo_score_output_df,
]

concatenated_result = concatenate_dataframes(dataframe_list)

if concatenated_result is not None:
    print(concatenated_result)
else:
    print("Concatenation failed or the input list was empty.")

       Month        scorename DateCategory  s_apps_score_psivalues  \
0    2024-06     s_apps_score     Training                0.000000   
1    2024-07     s_apps_score      Monthly                0.034295   
2    2024-08     s_apps_score      Monthly                0.034844   
3    2024-09     s_apps_score      Monthly                0.045087   
4    2024-10     s_apps_score      Monthly                0.046058   
..       ...              ...          ...                     ...   
115  2025-01  gen_credo_score      Monthly                     NaN   
116  2025-02  gen_credo_score      Monthly                     NaN   
117  2025-03  gen_credo_score      Monthly                     NaN   
118  2025-04  gen_credo_score      Monthly                     NaN   
119  2025-05  gen_credo_score      Monthly                     NaN   

     account_count  sb_demo_score_psivalues  s_cic_score_psivalues  \
0           109413                      NaN                    NaN   
1            18571 

In [25]:
concatenated_result.dtypes

Month                         object
scorename                     object
DateCategory                  object
s_apps_score_psivalues       float64
account_count                  int64
sb_demo_score_psivalues      float64
s_cic_score_psivalues        float64
sb_stack_score_psivalues     float64
sa_stack_score_psivalues     float64
c_credo_score_psivalues      float64
s_credo_score_psivalues      float64
fu_credo_score_psivalues     float64
r_credo_score_psivalues      float64
gen_credo_score_psivalues    float64
dtype: object

In [26]:
sq = """drop table if exists prj-prod-dataplatform.dap_ds_poweruser_playground.Model_Psi;"""

client.query(sq)

QueryJob<project=prj-prod-dataplatform, location=asia-southeast1, id=60f416af-2f85-4b66-8e27-d6b439c4ff48>

In [27]:
import pandas as pd
from google.cloud import bigquery

# Create a BigQuery client
client = bigquery.Client('prj-prod-dataplatform')

# Define your table schema
table_schema = [
    bigquery.SchemaField('Month', 'STRING'),
    bigquery.SchemaField('scorename', 'STRING'),
    bigquery.SchemaField('DateCategory', 'STRING'),
    bigquery.SchemaField('s_apps_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('account_count', 'INT64'),
    bigquery.SchemaField('sb_demo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('s_cic_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('sb_stack_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('sa_stack_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('c_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('s_credo_score_psivalue', 'FLOAT64'),
    bigquery.SchemaField('fu_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('r_credo_score_psivalues', 'FLOAT64'),
    bigquery.SchemaField('gen_credo_score_psivalues', 'FLOAT64'),
   
]

# Create your BigQuery table
table_id = 'prj-prod-dataplatform.dap_ds_poweruser_playground.Model_Psi'
table = bigquery.Table(table_id, schema=table_schema)
table = client.create_table(table)

# Load your DataFrame into BigQuery
job_config = bigquery.LoadJobConfig(
    write_disposition='WRITE_TRUNCATE'
)

load_job = client.load_table_from_dataframe(
    concatenated_result, table_id, job_config=job_config
)

load_job.result()



LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=e685d9aa-6b7e-4799-966b-2c637f5da98b>

In [28]:
concatenated_result.head()

Unnamed: 0,Month,scorename,DateCategory,s_apps_score_psivalues,account_count,sb_demo_score_psivalues,s_cic_score_psivalues,sb_stack_score_psivalues,sa_stack_score_psivalues,c_credo_score_psivalues,s_credo_score_psivalues,fu_credo_score_psivalues,r_credo_score_psivalues,gen_credo_score_psivalues
0,2024-06,s_apps_score,Training,0.0,109413,,,,,,,,,
1,2024-07,s_apps_score,Monthly,0.034295,18571,,,,,,,,,
2,2024-08,s_apps_score,Monthly,0.034844,22959,,,,,,,,,
3,2024-09,s_apps_score,Monthly,0.045087,23145,,,,,,,,,
4,2024-10,s_apps_score,Monthly,0.046058,21868,,,,,,,,,


In [29]:
# sq = f"""
# with base as 
# (select 
#   a.digitalLoanAccountId, 
#   FORMAT_DATE('%Y-%m', a.ln_appln_submit_datetime) Application_month,
#   FORMAT_DATE('%F', DATE_TRUNC(a.ln_appln_submit_datetime, WEEK(MONDAY))) as Appl_week_start_date,
#   EXTRACT(WEEK(MONDAY) FROM a.ln_appln_submit_datetime) as Appl_week_number,
#   a.ln_loan_type,
#   case when date_trunc(a.ln_appln_submit_datetime, day) between '2023-07-01' and '2024-06-30' then 'Train'
#        when date_trunc(a.ln_appln_submit_datetime, day) >= '2024-07-01' then 'Test'
#        Else 'Other' end dataselection,
#   a.s_cic_score,
#  from 
# #  {a} a
#  where a.ln_loan_applied_flag = 1 and ln_dl_rule_reject_flag = 0
#  and date_trunc(a.ln_appln_submit_datetime, day) >= '2023-04-01'
#  )
#  select * from base where s_cic_score is not null and dataselection in ('Train', 'Test');"""
# df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

In [30]:
df.groupby(['dataselection'])['digitalLoanAccountId'].nunique()

dataselection
Test     222362
Train    120879
Name: digitalLoanAccountId, dtype: int64