# <div align = "center" style="color:rgb(0, 255, 0);"> Gini Calculation for SIL ALPHA BETA for Different Trenches </div>

# Define Library

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
pd.set_option("Display.max_rows", 100)


# Constant

In [2]:
CURRENT_DATE = datetime.now().strftime("%Y%m%d")


# Config

In [3]:
unique_id = str(uuid.uuid4()).replace('-', '')[-12:]
print(f"The unique Id is: {unique_id}")
BUCKETNAME = 'prod-asia-southeast1-tonik-aiml-workspace'
CLOUDPATH = 'DC/Model_Monitoring/Gini_Values'
LOCALPATH = r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values'
VERSION = 'V1'
PROJECT_ID = 'prj-prod-dataplatform'

The unique Id is: 272d02f6d21c


# <div align="left" style="color:rgb(51, 250, 250);"> Functions </div>

## <div align="left" style="color:rgb(51, 250, 250);"> Save the data to google clound storage </div>

In [4]:
def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').
    """

    # Create a temporary file
    if file_format == 'csv':
        temp_file = 'temp.csv'
        df.to_csv(temp_file, index=False)
    elif file_format == 'parquet':
        temp_file = 'temp.parquet'
        df.to_parquet(temp_file, index=False)
    else:
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Upload the file to GCS
    storage_client = storage.Client(project="prj-prod-dataplatform")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(temp_file)

    # Remove the temporary file
    import os
    os.remove(temp_file)
    


## <div align="left" style="color:rgb(51, 250, 250);"> Read the Data from Google Cloud Storage </div>

In [5]:
def read_df_from_gcs(bucket_name, source_blob_name, file_format='csv'):
    """Reads a DataFrame from Google Cloud Storage.

    Args:
        bucket_name: The name of the GCS bucket.
        source_blob_name: The name of the blob to read.
        file_format: The file format to read ('csv' or 'parquet').

    Returns:
        pandas.DataFrame: The data loaded from the GCS file.
    """
    # Create a temporary file name
    temp_file = f'temp.{file_format}'
    
    try:
        # Initialize GCS client
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_blob_name)

        # Download the file to a temporary location
        blob.download_to_filename(temp_file)

        # Read the file into a DataFrame
        if file_format == 'csv':
            df = pd.read_csv(temp_file, low_memory=False)
        elif file_format == 'parquet':
            df = pd.read_parquet(temp_file)
        else:
            raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

        return df

    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

## <div align = "left" style="color:rgb(51, 250, 250);"> Data Quality Report </div>

In [6]:
def data_quality_report(df, target_col='ln_fspd30_flag'):
    # Initialize an empty list to store each row of data
    report_data = []
    # Iterate over each column in the DataFrame to compute metrics
    for col in df.columns:
        # Determine the data type of the column
        data_type = df[col].dtype
       
        # Calculate the number of missing values in the column
        missing_values = df[col].isnull().sum()
       
        # Calculate the percentage of missing values relative to the total number of rows
        missing_percentage = (missing_values / len(df)) * 100
       
        # Calculate the number of unique values in the column
        unique_values = df[col].nunique()
       
        # Calculate the percentage of non-missing values
        non_missing_percentage = ((len(df) - missing_values) / len(df)) * 100
       
        # Check if the column is numeric to compute additional metrics
        if pd.api.types.is_numeric_dtype(df[col]):
            # Compute minimum, maximum, mean, median, mode, mode percentage, standard deviation, and quantiles
            min_value = df[col].min()
            max_value = df[col].max()
            mean_value = df[col].mean()
            median_value = df[col].median()
            mode_value = df[col].mode().iloc[0] if not df[col].mode().empty else None
            mode_percentage = (df[col] == mode_value).sum() / len(df) * 100 if mode_value is not None else None
            std_dev = df[col].std()
            quantile_25 = df[col].quantile(0.25)
            quantile_50 = df[col].quantile(0.50)  # Same as median
            quantile_75 = df[col].quantile(0.75)
            
            # Calculate the Interquartile Range (IQR)
            iqr = quantile_75 - quantile_25
            
            # Calculate Skewness and Kurtosis
            skewness = df[col].skew()
            kurtosis = df[col].kurt()
            
            # Calculate Coefficient of Variation (CV) - standardized measure of dispersion
            cv = (std_dev / mean_value) * 100 if mean_value != 0 else None
            
            # Calculate correlation with target variable if target exists in dataframe
            if target_col in df.columns and col != target_col and pd.api.types.is_numeric_dtype(df[target_col]):
                # Calculate correlation only using rows where both columns have non-null values
                correlation = df[[col, target_col]].dropna().corr().iloc[0, 1]
            else:
                correlation = None
        else:
            # Assign None for non-numeric columns where appropriate
            min_value = None
            max_value = None
            mean_value = None
            median_value = None
            mode_value = df[col].mode().iloc[0] if not df[col].mode().empty else None
            mode_percentage = (df[col] == mode_value).sum() / len(df) * 100 if mode_value is not None else None
            std_dev = None
            quantile_25 = None
            quantile_50 = None
            quantile_75 = None
            iqr = None
            skewness = None
            kurtosis = None
            cv = None
            correlation = None
       
        # Append the computed metrics for the current column to the list
        report_data.append({
            'Column': col,
            'Data Type': data_type,
            'Missing Values': missing_values,
            'Missing Percentage': missing_percentage,
            'Unique Values': unique_values,
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value,
            'Mode': mode_value,
            'Mode Percentage': mode_percentage,
            'Std Dev': std_dev,
            'Non-missing Percentage': non_missing_percentage,
            '25% Quantile': quantile_25,
            '50% Quantile': quantile_50,
            '75% Quantile': quantile_75,
            'IQR': iqr,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'CV (%)': cv,
            f'Correlation with {target_col}': correlation
        })
    # Create the DataFrame from the list of dictionaries
    report = pd.DataFrame(report_data)
   
    # Return the complete data quality report DataFrame
    return report

# <div align = "left" style="color:rgb(51,250,250);"> Upload pickle file to Google Cloud Storage Bucke </div>

In [7]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Uploads a file to Google Cloud Storage"""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    blob.upload_from_filename(source_file_path)
    print(f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name}")

In [8]:
import pickle
import io
from google.cloud import storage
def save_pickle_to_gcs(data, bucket_name, destination_blob_name):
    """
    Save any Python object as a pickle file to Google Cloud Storage
    
    Args:
        data: The Python object to pickle (DataFrame, dict, list, etc.)
        bucket_name: Name of the GCS bucket
        destination_blob_name: Path/filename in the bucket
    """
    # Initialize the GCS client
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    # Serialize the data to pickle format in memory
    pickle_buffer = io.BytesIO()
    pickle.dump(data, pickle_buffer)
    pickle_buffer.seek(0)
    
    # Upload the pickle data to GCS
    blob.upload_from_file(pickle_buffer, content_type='application/octet-stream')
    print(f"Pickle file uploaded to gs://{bucket_name}/{destination_blob_name}")

# save_dataframe_multi_format

In [9]:
def save_dataframe_multi_format(
    dataframe: pd.DataFrame, 
    cloud_path: str, 
    filename: str, 
    client: bigquery.Client = None,
    bucket_name: str = None
) -> dict:
    """
    Save a pandas DataFrame to Google Cloud Storage in multiple formats (CSV, Pickle, Parquet, Joblib).
    
    Args:
        dataframe (pd.DataFrame): The DataFrame to save
        cloud_path (str): The cloud path (e.g., 'DC/Model_Monitoring/cash_beta_trench1_data')
        filename (str): The base filename without extension
        client (bigquery.Client, optional): BigQuery client (for project reference)
        bucket_name (str, optional): GCS bucket name. If None, will try to extract from client
        
    Returns:
        dict: Dictionary with status of each file saved
        
    Example:
        client = bigquery.Client(project='prj-prod-dataplatform')
        CLOUDPATH = 'DC/Model_Monitoring/cash_beta_trench1_data'
        
        results = save_dataframe_multi_format(
            dataframe=d1,
            cloud_path=CLOUDPATH,
            filename='my_data',
            client=client,
            bucket_name='your-bucket-name'  # Replace with your actual bucket name
        )
    """
    
    # Initialize Google Cloud Storage client
    storage_client = storage.Client(project=client.project if client else None)
    
    # You'll need to specify your bucket name here
    # Common bucket names in GCP data platforms might be like:
    # - 'prj-prod-dataplatform-storage'
    # - 'dataplatform-storage'
    # - or similar pattern
    if bucket_name is None:
        # You need to replace this with your actual bucket name
        raise ValueError("Please provide the bucket_name parameter")
    
    bucket = storage_client.bucket(bucket_name)
    
    # Results dictionary to track saves
    results = {}
    
    # Ensure cloud_path doesn't start with '/'
    cloud_path = cloud_path.lstrip('/')
    
    try:
        # 1. Save as CSV
        csv_buffer = io.StringIO()
        dataframe.to_csv(csv_buffer, index=False)
        csv_blob = bucket.blob(f"{cloud_path}/{filename}.csv")
        csv_blob.upload_from_string(csv_buffer.getvalue(), content_type='text/csv')
        results['csv'] = f"gs://{bucket_name}/{cloud_path}/{filename}.csv"
        
        # 2. Save as Pickle
        pickle_buffer = io.BytesIO()
        pickle.dump(dataframe, pickle_buffer)
        pickle_blob = bucket.blob(f"{cloud_path}/{filename}.pkl")
        pickle_blob.upload_from_string(pickle_buffer.getvalue(), content_type='application/octet-stream')
        results['pickle'] = f"gs://{bucket_name}/{cloud_path}/{filename}.pkl"
        
        # 3. Save as Parquet
        parquet_buffer = io.BytesIO()
        dataframe.to_parquet(parquet_buffer, index=False)
        parquet_blob = bucket.blob(f"{cloud_path}/{filename}.parquet")
        parquet_blob.upload_from_string(parquet_buffer.getvalue(), content_type='application/octet-stream')
        results['parquet'] = f"gs://{bucket_name}/{cloud_path}/{filename}.parquet"
        
        # 4. Save as Joblib
        joblib_buffer = io.BytesIO()
        joblib.dump(dataframe, joblib_buffer)
        joblib_blob = bucket.blob(f"{cloud_path}/{filename}.joblib")
        joblib_blob.upload_from_string(joblib_buffer.getvalue(), content_type='application/octet-stream')
        results['joblib'] = f"gs://{bucket_name}/{cloud_path}/{filename}.joblib"
        
        print("All files saved successfully!")
        for format_type, path in results.items():
            print(f"{format_type.upper()}: {path}")
            
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        results['error'] = str(e)
    
    return results

# Test SQL

In [10]:
a = " `prj-prod-dataplatform.risk_credit_mis.application_score_master`"

In [13]:
sq = f"""select * from {a} where beta_stack_score is not null;"""
df = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The shape of the beta_stack_score not null rows are:\t{df.shape}")

  print(f"The shape of the beta_stack_score not null rows are:\{df.shape}")


Job ID 50dc96db-a5a5-41af-9fd6-09af11425509 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of the beta_stack_score not null rows are:\(427123, 194)


In [None]:
WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from {schema1}.{sil_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber

Unnamed: 0,customerId,digitalLoanAccountId,loanAccountNumber,onb_tsa_onboarding_datetime,onb_first_name,onb_middle_name,onb_last_name,onb_age,onb_gender,onb_mobile_no,onb_email,onb_email_verified_flag,onb_place_of_birth,onb_country,onb_province,onb_city,onb_barangay,onb_postalcode,onb_latitude,onb_longitude,onb_kyc_status,onb_kyc_status_upgrade_datetime,ln_user_type,ln_loan_type,ln_prod_type,ln_loan_applied_flag,ln_facta_flag,ln_dl_rule_reject_flag,ln_taran_rule_reject_flag,ln_taran_scorecard_reject_flag,ln_cdd_reject_flag,ln_marked_underwriter_check_flag,ln_underwriting_reject_flag,ln_final_approved_flag,ln_disb_flag,ln_approved_not_disb_flag,ln_appln_submit_datetime,ln_disb_dtime,ln_chosen_principal,ln_chosen_tenor,ln_approved_principal,ln_approved_tenor,ln_cnt_ongoing_tdb_loans,ln_tot_ongoing_tdb_loans_emi,ln_purpose,ln_vas_opted_flag,ln_vas_used_flag,ln_age,ln_mobile_no,ln_alt_mobile_no,ln_osversion,ln_brand,ln_os_type,ln_address,ln_province,ln_city,ln_barangay,ln_postal_code,ln_latitude,ln_longitude,ln_doc_type,ln_doc_number,ln_marital_status,ln_cnt_dependents,ln_education_level,ln_source_funds,ln_source_funds_new,ln_employment_type,ln_employment_type_new,ln_nature_of_work,ln_nature_of_work_new,ln_industry,ln_industry_new,ln_company_name,ln_self_dec_income,ln_salary_scaled_income,ln_ref1_type,ln_ref2_type,cic_called_flag,cic_hit_flag,ln_mature_fpd10_flag,ln_mature_fpd30_flag,ln_mature_fspd30_flag,ln_mature_fstpd30_flag,ln_fpd10_flag,ln_fpd30_flag,ln_fspd30_flag,ln_fstpd30_flag,ln_spd30_flag,ln_tpd30_flag,ln_fpd10_os_principal,ln_fpd30_os_principal,ln_fspd30_os_principal,ln_fstpd30_os_principal,app_cnt_absence_tag_30d,app_cnt_absence_tag_90d,app_cnt_business_ever,app_cnt_competitors_30d,app_cnt_competitors_90d,app_cnt_education_ever,app_cnt_finance_7d,app_cnt_finance_90d,app_cnt_music_and_audio_ever,app_cnt_payday_90d,app_cnt_rated_for_3plus_ever,app_cnt_travel_and_local_ever,app_first_competitors_install_to_apply_days,app_first_payday_install_to_apply_days,app_median_time_bw_installed_mins_30d,app_vel_finance_30_over_365,inc_beta_encoded_company_name_grouped,inc_beta_ln_loan_type,inc_beta_ln_education_level,inc_beta_ln_industry_new,inc_beta_ln_employment_type_new,inc_beta_ln_age,inc_beta_ln_city,inc_beta_ln_brand,inc_beta_ln_purpose,inc_beta_ln_doc_type_rolled,inc_beta_ln_gender,inc_beta_ln_source_of_funds_new,inc_beta_ln_postal_code,inc_beta_ln_osversion_bin,inc_alpha_encoded_company_name_grouped,inc_alpha_ln_loan_prod_type,inc_alpha_ln_education_level,inc_alpha_ln_employment_type_new,inc_alpha_ln_industry_new,inc_alpha_ln_age,inc_alpha_ln_city,inc_alpha_ln_purpose,inc_alpha_ln_osversion_bin,inc_alpha_ln_brand,inc_alpha_doc_type_rolled,inc_alpha_cic_credit_avg_credit_limit,inc_alpha_cic_max_active_contracts_amt,inc_alpha_ln_gender,inc_alpha_ln_cnt_dependents,inc_alpha_ln_source_of_funds_new,inc_alpha_ln_email_primary_domain,inc_alpha_ln_postal_code,beta_de_ln_vas_opted_flag,beta_de_ln_doc_type_rolled,beta_de_ln_marital_status,beta_de_ln_age_bin,beta_de_ln_ref2_type,beta_de_ln_education_level,beta_de_ln_ref1_type,beta_de_ln_industry_new_bin,beta_de_ln_province_bin,beta_de_onb_name_email_match_score,beta_de_ln_appln_day_of_week,beta_de_time_bw_onb_loan_appln_mins,beta_de_ln_employment_type_new_bin,beta_de_ln_telconame,beta_de_ln_source_of_funds_new_bin,beta_de_ln_brand_bin,beta_de_ln_email_primary_domain,cic_Personal_Loans_granted_contracts_amt_24M,cic_days_since_last_inquiry,cic_cnt_active_contracts,cic_vel_contract_nongranted_cnt_12on24,cic_max_amt_granted_24M,cic_zero_non_granted_ever_flag,cic_tot_active_contracts_util,cic_vel_contract_granted_amt_12on24,cic_zero_granted_ever_flag,old_demo_score,old_cic_score,credo_old_gen_score,credo_gen_score,cic_score_range,ts_trusting_social_score,bu_bureau_score,credo_fraud_score,credo_sil_score,credo_quick_score,credo_flex_score,credo_reloan_score,beta_apps_score,beta_demo_score,cic_score,beta_stack_score,alpha_stack_score,gamma_stack_score,gamma_demo_score,gamma_trx_score,beta_estimated_income,alpha_estimated_income,sourceDataAsOf,input_source,beta_api_called_flag,alpha_api_called_flag
0,3647637,f8587fc1-3d32-4e1f-a6e6-e07e159e3d7c,60836476370018.0,2025-08-28 11:52:13,KARYLE MHAY,DUMAWAL,REUNIR,18,F,639126236599,imkarylereunir@gmail.com,V,GMA Cavite,PHL,CAVITE,GEN. MARIANO ALVAREZ,JACINTO LUMBRERAS,4117,14.2837832,120.9996409,S,2025-08-28 12:06:53,2_New Applicant,SIL-Instore,Mall,1,0,0,0,0,0,0,0,1,1,0,2025-08-28 12:14:44,2025-08-28 12:18:06,7555.0,6,7555.0,6.0,0,0.0,"Stereo set, Soundbar, Audio equipment",1,0,18,639126236599,,android11,realme,Android,1329 B7 L30 Havanah st,Cavite,GEN MARIANO ALVAREZ,JACINTO LUMBRERAS,4117,14.2838246,120.9974385,Philippines - Id Card (2020),8108605768473897,Single,No dependents,High School Graduate,Salary,Salary,Employed - Private Employee,Employed - Private Employee,Staff/Rank and File,Staff/Rank and File,Manufacturing,Manufacturing,Rohm Electronics Philippines Inc.,25000,23121.0,Friend,Friend,1,1,,,,,,,,,,,,,,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,0.0,-0.3,-0.3,1457.25,0.0,117660.5460362991,BNPL,High School Graduate,Manufacturing,Employed - Private Employee,18,CF0510,realme,"Stereo set, Soundbar, Audio equipment",ID Card,F,Salary,4117,2-Voyager,,,,,,,,,,,,,,,,,,,1,National ID,Single,"(-inf, 27.0]",Sibling,High School Graduate,Friend,3.0,cavite,100,Thursday,8.0,Employed - Private Employee,Smart,Salary,realme,gmail.com,,,,,,,,,,0.136569,0.202060407876,375.0,0.1572,NH_Ei,,0.0,944.0,0.1695,0.3168,0.012,0.1915,0.5875173661348763,0.0481266529,,0.0844249974142327,,,,,37260.607548416185,,2025-09-18,inference,1,0
1,3609959,878c8ded-eaff-4fc6-9071-f7f3b8c2ad87,,2025-08-09 17:04:38,EDGAR,PADILLA,CASTILA,34,M,639751568702,edgarpcastila@gmail.com,V,Samar,PHL,METRO MANILA,CITY OF MALABON,TANONG (POB.),1470,14.6575156,120.9604752,S,,3_Applied_Not_Disbursed,SIL-Instore,Mall,1,0,0,1,1,0,0,0,0,0,0,2025-08-09 17:11:16,NaT,20498.0,12,,,0,0.0,"Washing machine, dryer or both,Televisions",1,0,34,639751568702,,android15,OPPO,Android,BLOCK 12 LOT 21 2ND STREET BARANGAY TANONG M...,METRO MANILA,CITY OF MALABON,TANONG POB,1470,14.6575156,120.9604752,Philippines - Driving License (2023),N2525003442,With a Live-in Partner,No dependents,Technical/Vocational Graduate,Income from Business,Income from Business,Business Owner,Business Owner,Owner,Owner,Others,Others,Fruits and Vegetables Seller,35000,24034.0,Spouse,Spouse,1,0,,,,,,,,,,,,,,,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,,-0.3,5814.2333333333,0.0,0.0,BNPL,Technical/Vocational Graduate,Others,Business Owner,34,CF1599,OPPO,Televisions,Driving License,M,Income from Business,1470,5-Pioneer,,,,,,,,,,,,,,,,,,,1,Driving License,With a Live-in Partner,"(32.0, 38.0]",Friend,Technical/Vocational Graduate,Spouse,3.0,metro manila,100,Saturday,7.0,New,Globe,Income from Business,oppo,gmail.com,,,,,,,,,,0.166903,0.155400949798,415.0,0.0986,NH_Gi,,0.0,811.0,0.1569,0.3862,0.3756,0.485,0.5402898606369326,0.0855239976,,0.1121108057036463,,,,,7826.479197323941,,2025-09-18,inference,1,0
2,3630890,80549d89-b478-4737-a252-48d964703e0e,60836308900016.0,2025-08-19 15:46:01,ALEXIS,ARCILLA,CULTURA,31,M,639694895753,alexiscultura8@gmail.com,V,Valenzuela city,PHL,NORTHERN SAMAR,ALLEN,LONDRES,6405,12.4897094,124.6396508,S,2025-08-19 16:05:07,2_New Applicant,SIL-Instore,Mall,1,0,0,0,0,0,0,0,1,1,0,2025-08-19 16:12:24,2025-08-19 16:31:13,13491.0,10,13491.0,10.0,0,0.0,Televisions,1,0,31,639694895753,,android15,vivo,Android,Purok 3,Northern Samar,ALLEN,LONDRES,6405,12.4897094,124.6396508,Philippines - Id Card (2020),8974381536014524,With a Live-in Partner,3,Technical/Vocational Graduate,Salary,Salary,Employed - Private Employee,Employed - Private Employee,Delivery Rider / Driver,Delivery Rider / Driver,Others,Others,Amoec Transport,14000,22177.0,Parent,Parent,1,0,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,,139.8,2882.2,0.0,0.0,BNPL,Technical/Vocational Graduate,Others,Employed - Private Employee,31,CF0039,vivo,Televisions,ID Card,M,Salary,6405,5-Pioneer,,,,,,,,,,,,,,,,,,,1,National ID,With a Live-in Partner,"(27.0, 32.0]",Sibling,Technical/Vocational Graduate,Parent,3.0,others,100,Tuesday,8.0,Employed - Private Employee,Smart,Salary,vivo,gmail.com,,,,,,,,,,0.155709,0.155400949798,379.0,0.111,NH_Gi,,0.0,811.0,0.1031,0.4201,0.0337,0.247,0.546122869446438,0.0495521492,,0.0659808025552173,,,,,8465.906973506175,,2025-09-18,inference,1,0
3,3503617,c16a3315-94da-492c-845e-8ed482ee0509,,2025-06-17 16:58:15,SANDRA DANIELLE,CRUZ,ABARRA,27,F,639298044191,sandraabarra3034@gmail.com,V,Bulacan,PHL,RIZAL,BINANGONAN,KALAWAAN,1940,14.48142994957326,121.18733363958202,S,,3_Applied_Not_Disbursed,SIL-Instore,Mall,1,0,0,1,1,0,0,0,0,0,0,2025-06-17 17:08:23,NaT,13995.0,5,,,0,0.0,Air conditioners,1,0,27,639298044191,,ios15.5,Apple,iOS,0010 fuentes compd brgy darangan kalawaan bina...,Rizal,BINANGONAN,KALAWAAN,1940,14.48142994957326,121.18733363958202,Philippines - ePassport (2016),P8204198A,Single,No dependents,College Undergraduate,Income from Business,Income from Business,Business Owner,Business Owner,Owner,Owner,Wholesale and Retail Trade,Wholesale and Retail Trade,Pawss and Clawss Animal Daycare Services,40000,21784.0,Friend,Friend,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,BNPL,College Undergraduate,Wholesale and Retail Trade,Business Owner,27,CF0208,Apple,Air conditioners,Passport,F,Income from Business,1940,2-Voyager,,,,,,,,,,,,,,,,,,,1,Passport,Single,"(-inf, 27.0]",Friend,College Undergraduate,Friend,2.0,rizal,100,Tuesday,11.0,New,Smart,Income from Business,apple,gmail.com,,,,,,,,,,0.089285,0.155400949798,,0.1544,NH_Fi,,0.0,,0.1263,0.2514,0.1493,0.1358,,0.1948612518,,0.2335424519028009,,,,,9233.529800327247,,2025-09-18,inference,1,0
4,3518666,2115a499-98cc-4011-8e38-65a2efe955a2,,2025-06-25 09:12:28,LUCIA,TANDO,REYES,55,F,639567510770,lucialumboreyes@gmail.com,V,Aklan,PHL,METRO MANILA,QUEZON CITY,SAN JOSE,1115,14.6227638,120.9991927,S,,3_Applied_Not_Disbursed,SIL-Instore,Mall,1,0,0,1,1,0,0,0,0,0,0,2025-07-12 12:24:38,NaT,5269.0,6,,,0,0.0,"Bed frames or Cribs,Accessories or Peripherals",1,0,55,639567510770,,android12,TECNO,Android,36 D TINAGAN ST SAN JOSE QUEZON CITY NCR S...,METRO MANILA,QUEZON CITY,SAN JOSE,1115,14.6227638,120.9991927,Philippines - Id Card (2020),2914897018023163,Married,3,High School Graduate,Remittance,Remittance,Remittance Beneficiary,Remittance Beneficiary,,,,,,20000,157991.0,Sibling,Sibling,1,1,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,2.0,13.0,0.0,214.8,11.6,4344.0,0.5,30892.5062448281,BNPL,High School Graduate,missing,Remittance Beneficiary,55,CF1512,TECNO,Bed frames or Cribs,ID Card,F,Remittance,1115,2-Voyager,,,,,,,,,,,,,,,,,,,1,National ID,Married,"(46.0, inf]",Friend,High School Graduate,Sibling,,metro manila,100,Saturday,24673.0,Remittance Beneficiary,Globe,Remittance,tecno,gmail.com,,,,,,,,,,0.108845,0.202060407876,444.0,0.0759,NH_Ii,,0.0,811.0,0.1231,0.2664,0.1344,0.395,0.5009334622964323,0.149371321,,0.1271378408009575,,,,,25923.028543455785,,2025-09-18,inference,1,0


# calculate_gini_for_table

In [15]:
import pandas as pd
from google.cloud import bigquery
from sklearn.metrics import roc_auc_score
from typing import Dict

def calculate_gini_for_table(
    df,
    date_column: str,
    score_column: str,
    target_column: str,
    target_maturity_column: str,
    data_periods_dict: Dict
):
    """
    Calculate Gini coefficient for different time periods.
    
    Args:
        project_id: BigQuery project ID
        table_name: Full table name (dataset.table)
        date_column: Name of the date column
        score_column: Name of the score column
        target_column: Name of the target column
        target_maturity_column: Name of the target maturity column
        data_periods_dict: Dictionary with periods, e.g.:
            {'Train': {'start': '2024-01-01', 'end': '2025-01-31'}, 
             'Test': {'start': '2025-02-01', 'end': '2025-12-31'}}
    
    Returns:
        pandas.DataFrame: Table with Gini coefficients for each period
    """
           
    # Get all data
    dt = df.copy()
    
    # Convert date column to datetime and extract just the date part
    dt[date_column] = pd.to_datetime(dt[date_column]).dt.date
    
    # Initialize results
    gini_results = []
    
    print("Gini Coefficient Results:")
    print("=" * 50)
    
    # Calculate Gini for each period
    for period_name, period_info in data_periods_dict.items():
        start_date = pd.to_datetime(period_info['start']).date()
        end_date = pd.to_datetime(period_info['end']).date()
        
        # Filter data for the current period
        period_mask = (dt[date_column] >= start_date) & (dt[date_column] <= end_date)
        period_data = dt[period_mask].copy()
        
        if len(period_data) == 0:
            print(f"{period_name}: No data available for period {start_date.date()} to {end_date.date()}")
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': 0,
                'Gini_Coefficient': None
            })
            continue
        
        # Check if we have both classes (0 and 1) in target
        unique_targets = period_data[target_column].unique()
        if len(unique_targets) < 2:
            print(f"{period_name}: Only one class present in target variable. Cannot calculate Gini.")
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': len(period_data),
                'Gini_Coefficient': None
            })
            continue
        
        # Calculate Gini coefficient
        try:
            auc = roc_auc_score(period_data[target_column], period_data[score_column])
            gini = 2 * auc - 1
            
            print(f"{period_name}: {round(gini, 4)} (Sample size: {len(period_data):,})")
            
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': len(period_data),
                'Gini_Coefficient': round(gini, 4)
            })
            
        except Exception as e:
            print(f"{period_name}: Error calculating Gini - {str(e)}")
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': len(period_data),
                'Gini_Coefficient': None
            })
    
    # Create results DataFrame
    results_df = pd.DataFrame(gini_results)
    
    print("\n" + "=" * 50)
    print("Summary Table:")
    print(results_df.to_string(index=False))
    
    return results_df

# SIL Beta Trench1 Android

In [22]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category = 1
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 1 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 4e670e47-4ce2-4715-b394-e3925620fa26 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 1 - Android

Gini Coefficient Results:
Train: 0.3429 (Sample size: 89,859)
OOT 1: 0.2955 (Sample size: 8,992)
OOT 2: 0.3009 (Sample size: 7,051)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05        89859            0.3429
 OOT 1 2025-06-06 2025-06-30         8992            0.2955
 OOT 2 2025-07-01 2025-07-18         7051            0.3009


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,89859,0.3429
1,OOT 1,2025-06-06,2025-06-30,8992,0.2955
2,OOT 2,2025-07-01,2025-07-18,7051,0.3009


# SIL Beta Trench1 IOS

In [23]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category = 1
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 1 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 738374fe-b645-44fa-8a07-41340c8e372b successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 1 - IOS

Gini Coefficient Results:
Train: 0.215 (Sample size: 9,166)
OOT 1: 0.2855 (Sample size: 1,228)
OOT 2: 0.2992 (Sample size: 974)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         9166            0.2150
 OOT 1 2025-06-06 2025-06-30         1228            0.2855
 OOT 2 2025-07-01 2025-07-18          974            0.2992


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,9166,0.215
1,OOT 1,2025-06-06,2025-06-30,1228,0.2855
2,OOT 2,2025-07-01,2025-07-18,974,0.2992


# Trench2 Beta

# SIL Beta Trench2 Android

In [25]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category = 2
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 2 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID ab76de5f-fee3-4a1f-ad7c-ab046bcb09af successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 2 - Android

Gini Coefficient Results:
Train: 0.2733 (Sample size: 1,375)
OOT 1: 0.192 (Sample size: 204)
OOT 2: 0.3979 (Sample size: 188)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         1375            0.2733
 OOT 1 2025-06-06 2025-06-30          204            0.1920
 OOT 2 2025-07-01 2025-07-18          188            0.3979


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,1375,0.2733
1,OOT 1,2025-06-06,2025-06-30,204,0.192
2,OOT 2,2025-07-01,2025-07-18,188,0.3979


# SIL Beta Trench1 IOS

In [26]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category = 2
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 2 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID f58d8645-3b98-415f-84e1-78bc81d3106e successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 2 - IOS

Gini Coefficient Results:
Train: 0.1662 (Sample size: 226)
OOT 1: 0.3023 (Sample size: 46)
OOT 2: 0.4167 (Sample size: 34)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05          226            0.1662
 OOT 1 2025-06-06 2025-06-30           46            0.3023
 OOT 2 2025-07-01 2025-07-18           34            0.4167


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,226,0.1662
1,OOT 1,2025-06-06,2025-06-30,46,0.3023
2,OOT 2,2025-07-01,2025-07-18,34,0.4167


# Trench3 Beta

# SIL Beta Trench3 Android

In [27]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category = 3
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 3 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID ce18d833-1f38-4d93-8ae5-c0e1c75aeb06 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 3 - Android

Gini Coefficient Results:
Train: 0.2606 (Sample size: 3,563)
OOT 1: 0.1238 (Sample size: 414)
OOT 2: 0.3061 (Sample size: 335)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         3563            0.2606
 OOT 1 2025-06-06 2025-06-30          414            0.1238
 OOT 2 2025-07-01 2025-07-18          335            0.3061


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,3563,0.2606
1,OOT 1,2025-06-06,2025-06-30,414,0.1238
2,OOT 2,2025-07-01,2025-07-18,335,0.3061


# SIL Beta Trench3 IOS

In [28]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category = 3
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 3 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 0e05a527-c36d-4048-9378-0fbe726ba6c1 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 3 - IOS

Gini Coefficient Results:
Train: 0.2742 (Sample size: 402)
OOT 1: 0.6981 (Sample size: 55)
OOT 2: 0.5357 (Sample size: 57)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05          402            0.2742
 OOT 1 2025-06-06 2025-06-30           55            0.6981
 OOT 2 2025-07-01 2025-07-18           57            0.5357


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,402,0.2742
1,OOT 1,2025-06-06,2025-06-30,55,0.6981
2,OOT 2,2025-07-01,2025-07-18,57,0.5357


<!-- # All Trench Beta -->

<!-- # SIL Beta All Trench Android -->

In [None]:
-- sq = f"""WITH
--   bss AS (
--   SELECT
--     digitalLoanAccountId,
--     beta_stack_score
--   FROM
--     prj-prod-dataplatform.risk_credit_mis.application_score_master
--   WHERE
--     beta_stack_score IS NOT NULL )
-- Select 
-- a1.digitalLoanAccountId,
-- a1.ln_os_type, 
-- a1.ln_appln_submit_datetime,
-- case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
-- case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
-- bss.beta_stack_score prod_beta_stack_score
-- from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
-- inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
-- inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
-- inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
-- where a1.ln_os_type like 'Android'
-- and a1.trench_category in (1,2,3)
-- and bss.beta_stack_score is not null
-- ;
-- """
-- d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


-- print('SIL Beta Trench 3 - Android')
-- print("\n" + "=" * 50)

-- data_periods_dict = {
--     'Train': {'start': '2024-09-01', 'end': '2025-02-28'}, 
--     'OOT 1': {'start': '2025-03-01', 'end': '2025-03-31'},
--     'OOT 2': {'start': '2025-04-01', 'end': '2025-04-30'},
--     'OOT 3': {'start': '2025-05-01', 'end': '2025-05-31'},
--  }

-- calculate_gini_for_table(
--     d1,
--     date_column = 'ln_appln_submit_datetime',
--     score_column = 'prod_beta_stack_score',
--     target_column = 'fspd30',
--     target_maturity_column = 'flag_mature_fspd30',
--     data_periods_dict = data_periods_dict
-- )

Job ID 307dac1b-fd57-4011-9d5f-4fad860de7b6 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 3 - Android

Gini Coefficient Results:
Train: 0.337 (Sample size: 89,307)
OOT 1: 0.3148 (Sample size: 3,919)
OOT 2: 0.2063 (Sample size: 567)
OOT 3: 0.2608 (Sample size: 844)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-02-28        89307            0.3370
 OOT 1 2025-03-01 2025-03-31         3919            0.3148
 OOT 2 2025-04-01 2025-04-30          567            0.2063
 OOT 3 2025-05-01 2025-05-31          844            0.2608


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-02-28,89307,0.337
1,OOT 1,2025-03-01,2025-03-31,3919,0.3148
2,OOT 2,2025-04-01,2025-04-30,567,0.2063
3,OOT 3,2025-05-01,2025-05-31,844,0.2608


<!-- # SIL Beta All Trench IOS -->

In [None]:
-- sq = f"""WITH
--   bss AS (
--   SELECT
--     digitalLoanAccountId,
--     beta_stack_score
--   FROM
--     prj-prod-dataplatform.risk_credit_mis.application_score_master
--   WHERE
--     beta_stack_score IS NOT NULL )
-- Select 
-- a1.digitalLoanAccountId,
-- a1.ln_os_type, 
-- a1.ln_appln_submit_datetime,
-- case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
-- case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
-- bss.beta_stack_score prod_beta_stack_score
-- from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
-- inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
-- inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
-- inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
-- where a1.ln_os_type not like 'Android'
-- and a1.trench_category in (1,2,3)
-- and bss.beta_stack_score is not null
-- ;
-- """
-- d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


-- print('SIL Beta All Trench - IOS')
-- print("\n" + "=" * 50)

-- data_periods_dict = {
--     'Train': {'start': '2024-09-01', 'end': '2025-02-28'}, 
--     'OOT 1': {'start': '2025-03-01', 'end': '2025-03-31'},
--     'OOT 2': {'start': '2025-04-01', 'end': '2025-04-30'},
--     'OOT 3': {'start': '2025-05-01', 'end': '2025-05-31'},
--  }

-- calculate_gini_for_table(
--     d1,
--     date_column = 'ln_appln_submit_datetime',
--     score_column = 'prod_beta_stack_score',
--     target_column = 'fspd30',
--     target_maturity_column = 'flag_mature_fspd30',
--     data_periods_dict = data_periods_dict
-- )

Job ID ca3a5bce-89ca-4f43-8a46-27049b4e9c73 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta All Trench - IOS

Gini Coefficient Results:
Train: 0.1844 (Sample size: 9,016)
OOT 1: 0.3067 (Sample size: 570)
OOT 2: 0.3676 (Sample size: 72)
OOT 3: 0.1736 (Sample size: 105)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-02-28         9016            0.1844
 OOT 1 2025-03-01 2025-03-31          570            0.3067
 OOT 2 2025-04-01 2025-04-30           72            0.3676
 OOT 3 2025-05-01 2025-05-31          105            0.1736


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-02-28,9016,0.1844
1,OOT 1,2025-03-01,2025-03-31,570,0.3067
2,OOT 2,2025-04-01,2025-04-30,72,0.3676
3,OOT 3,2025-05-01,2025-05-31,105,0.1736


# Alpha

In [29]:
alphabackscore = 'worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730'

# SIL Alpha Trench1 Android

In [30]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category = 1
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 1 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID cfb44f0f-e2ef-4b8e-8a66-816c3ac5d50c successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 1 - Android

Gini Coefficient Results:
Train: 0.3957 (Sample size: 67,037)
OOT 1: 0.3487 (Sample size: 5,555)
OOT 2: 0.3464 (Sample size: 4,288)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05        67037            0.3957
 OOT 1 2025-06-06 2025-06-30         5555            0.3487
 OOT 2 2025-07-01 2025-07-18         4288            0.3464


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,67037,0.3957
1,OOT 1,2025-06-06,2025-06-30,5555,0.3487
2,OOT 2,2025-07-01,2025-07-18,4288,0.3464


# SIL Alpha Trench1 IOS

In [31]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category = 1
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 1 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 4770de0d-b07c-4c71-a20b-bf956e44bd6c successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 1 - IOS

Gini Coefficient Results:
Train: 0.2928 (Sample size: 6,958)
OOT 1: 0.3101 (Sample size: 788)
OOT 2: 0.1568 (Sample size: 592)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         6958            0.2928
 OOT 1 2025-06-06 2025-06-30          788            0.3101
 OOT 2 2025-07-01 2025-07-18          592            0.1568


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,6958,0.2928
1,OOT 1,2025-06-06,2025-06-30,788,0.3101
2,OOT 2,2025-07-01,2025-07-18,592,0.1568


# Trench2 Alpha

# SIL Alpha Trench2 Android

In [32]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category = 2
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 2 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 4eebb0b7-19a3-4260-b420-b5d9e6e0455e successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 2 - Android

Gini Coefficient Results:
Train: 0.2739 (Sample size: 1,283)
OOT 1: 0.3419 (Sample size: 167)
OOT 2: 0.4325 (Sample size: 148)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         1283            0.2739
 OOT 1 2025-06-06 2025-06-30          167            0.3419
 OOT 2 2025-07-01 2025-07-18          148            0.4325


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,1283,0.2739
1,OOT 1,2025-06-06,2025-06-30,167,0.3419
2,OOT 2,2025-07-01,2025-07-18,148,0.4325


# SIL Alpha Trench2 IOS

In [50]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score,
a1.trench_category
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category = 2
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 2 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID f6ea7976-195f-48a3-8916-578e6a482a99 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 2 - IOS

Gini Coefficient Results:
Train: 0.218 (Sample size: 208)
OOT 1: 0.3158 (Sample size: 41)
OOT 2: -0.52 (Sample size: 28)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05          208            0.2180
 OOT 1 2025-06-06 2025-06-30           41            0.3158
 OOT 2 2025-07-01 2025-07-18           28           -0.5200


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,208,0.218
1,OOT 1,2025-06-06,2025-06-30,41,0.3158
2,OOT 2,2025-07-01,2025-07-18,28,-0.52


In [51]:
dd.query("""select 
ln_os_type,  trench_category,  count(digitalLoanAccountId)cntloan ,
sum(fpd30)fpd30,
sum(flag_mature_fpd30)flag_mature_fpd30
from d1 
         where date(ln_appln_submit_datetime) between '2025-07-01' and '2025-07-18'
         group by 1,2;""").to_df()

Unnamed: 0,ln_os_type,trench_category,cntloan,fpd30,flag_mature_fpd30
0,iOS,2,27,3.0,18.0


# Trench3 Alpha

# SIL Alpha Trench3 Android

In [34]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category = 3
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 3 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 68d9e7ca-cc94-438b-8c88-d614d7f8ac33 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 3 - Android

Gini Coefficient Results:
Train: 0.2945 (Sample size: 3,364)
OOT 1: 0.2514 (Sample size: 377)
OOT 2: 0.2067 (Sample size: 285)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         3364            0.2945
 OOT 1 2025-06-06 2025-06-30          377            0.2514
 OOT 2 2025-07-01 2025-07-18          285            0.2067


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,3364,0.2945
1,OOT 1,2025-06-06,2025-06-30,377,0.2514
2,OOT 2,2025-07-01,2025-07-18,285,0.2067


# SIL Alpha Trench3 IOS

In [46]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score,
a1.trench_category
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category = 3
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 3 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID d67175c3-63ec-444e-b6aa-fd81ff6e29ff successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 3 - IOS

Gini Coefficient Results:
Train: 0.1046 (Sample size: 381)
OOT 1: 0.4898 (Sample size: 51)
OOT 2: -0.5111 (Sample size: 46)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05          381            0.1046
 OOT 1 2025-06-06 2025-06-30           51            0.4898
 OOT 2 2025-07-01 2025-07-18           46           -0.5111


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,381,0.1046
1,OOT 1,2025-06-06,2025-06-30,51,0.4898
2,OOT 2,2025-07-01,2025-07-18,46,-0.5111


In [47]:
d1.columns

Index(['digitalLoanAccountId', 'ln_os_type', 'ln_appln_submit_datetime',
       'flag_mature_fpd30', 'fpd30', 'prod_alpha_stack_score',
       'trench_category'],
      dtype='object')

In [49]:
dd.query("""select 
ln_os_type,  trench_category,  count(digitalLoanAccountId)cntloan ,
sum(fpd30)fpd30,
sum(flag_mature_fpd30)flag_mature_fpd30
from d1 
         where date(ln_appln_submit_datetime) between '2025-07-01' and '2025-07-18'
         group by 1,2;""").to_df()

Unnamed: 0,ln_os_type,trench_category,cntloan,fpd30,flag_mature_fpd30
0,iOS,3,46,1.0,36.0


# Trench1 + Trench2 + Trench3 Beta Android


# Beta

# Android

In [37]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category in (1,2,3)
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 1,2,3 - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID aff7bdcf-875c-4b2b-b62a-93f77691dffb successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 1,2,3 - Android

Gini Coefficient Results:
Train: 0.3419 (Sample size: 94,797)
OOT 1: 0.2808 (Sample size: 9,610)
OOT 2: 0.2928 (Sample size: 7,574)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05        94797            0.3419
 OOT 1 2025-06-06 2025-06-30         9610            0.2808
 OOT 2 2025-07-01 2025-07-18         7574            0.2928


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,94797,0.3419
1,OOT 1,2025-06-06,2025-06-30,9610,0.2808
2,OOT 2,2025-07-01,2025-07-18,7574,0.2928


# IOS

In [38]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category in (1,2,3)
and bss.beta_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Beta Trench 1,2,3 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 8080a8f9-2039-4ec4-9278-081f8ef17ab1 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Beta Trench 1,2,3 - IOS

Gini Coefficient Results:
Train: 0.1955 (Sample size: 9,794)
OOT 1: 0.2637 (Sample size: 1,329)
OOT 2: 0.3204 (Sample size: 1,065)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         9794            0.1955
 OOT 1 2025-06-06 2025-06-30         1329            0.2637
 OOT 2 2025-07-01 2025-07-18         1065            0.3204


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,9794,0.1955
1,OOT 1,2025-06-06,2025-06-30,1329,0.2637
2,OOT 2,2025-07-01,2025-07-18,1065,0.3204


# Alpha

# Alpha Android

In [None]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category in (1,2,3)
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 1,2,3 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID b100f33c-8dd4-41fb-a2ee-6a3aa45bc5bc successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 1,2,3 - IOS

Gini Coefficient Results:
Train: 0.3935 (Sample size: 71,684)
OOT 1: 0.3388 (Sample size: 6,099)
OOT 2: 0.3191 (Sample size: 4,415)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05        71684            0.3935
 OOT 1 2025-06-06 2025-06-30         6099            0.3388
 OOT 2 2025-07-01 2025-07-18         4415            0.3191


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,71684,0.3935
1,OOT 1,2025-06-06,2025-06-30,6099,0.3388
2,OOT 2,2025-07-01,2025-07-18,4415,0.3191


# IOS

In [91]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    alpha_stack_score
  FROM
    prj-prod-dataplatform.risk_credit_mis.application_score_master
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
a1.ln_appln_submit_datetime,
case when ldd.obs_min_inst_def30 >=1 then 1 else 0 end flag_mature_fpd30,
case when ldd.obs_min_inst_def30 >=1 and ldd.min_inst_def30 in (1) then 1 else 0 end fpd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category in (1,2,3) 
and bss.alpha_stack_score is not null
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')


print('SIL Alpha Trench 1,2,3 - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-06-05'}, 
    'OOT 1': {'start': '2025-06-06', 'end': '2025-06-30'},
    'OOT 2': {'start': '2025-07-01', 'end': '2025-07-18'},
 }

calculate_gini_for_table(
    d1,
    date_column = 'ln_appln_submit_datetime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fpd30',
    target_maturity_column = 'flag_mature_fpd30',
    data_periods_dict = data_periods_dict
)

Job ID 1cbf38b8-52a3-43d9-846e-8adf52ca6fb9 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
SIL Alpha Trench 1,2,3 - IOS

Gini Coefficient Results:
Train: 0.277 (Sample size: 7,547)
OOT 1: 0.3201 (Sample size: 880)
OOT 2: 0.1108 (Sample size: 666)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-06-05         7547            0.2770
 OOT 1 2025-06-06 2025-06-30          880            0.3201
 OOT 2 2025-07-01 2025-07-18          666            0.1108


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-06-05,7547,0.277
1,OOT 1,2025-06-06,2025-06-30,880,0.3201
2,OOT 2,2025-07-01,2025-07-18,666,0.1108


In [93]:
dd.query("""select ln_os_type, count(distinct digitalLoanAccountId) cntloans, sum(fpd30)fpd30, sum(flag_mature_fpd30) from d1
where date(ln_appln_submit_datetime) between '2025-07-01' and '2025-07-18'
group by 1;
""").to_df()

Unnamed: 0,ln_os_type,cntloans,fpd30,sum(flag_mature_fpd30)
0,iOS,657,31.0,624.0


In [67]:
import pandas as pd
from google.cloud import bigquery
from sklearn.metrics import roc_auc_score
from typing import Dict

def calculate_gini_for_table1(
    df,
    date_column: str,
    score_column: str,
    target_column: str,
    target_maturity_column: str,
    data_periods_dict: Dict
):
    """
    Calculate Gini coefficient for different time periods.
    
    Args:
        project_id: BigQuery project ID
        table_name: Full table name (dataset.table)
        date_column: Name of the date column
        score_column: Name of the score column
        target_column: Name of the target column
        target_maturity_column: Name of the target maturity column
        data_periods_dict: Dictionary with periods, e.g.:
            {'Train': {'start': '2024-01-01', 'end': '2025-01-31'}, 
             'Test': {'start': '2025-02-01', 'end': '2025-12-31'}}
    
    Returns:
        pandas.DataFrame: Table with Gini coefficients for each period
    """
           
    # Get all data
    dt = df.copy()
    
    # Convert date column to datetime and extract just the date part
    dt[date_column] = pd.to_datetime(dt[date_column]).dt.date
    
    # Initialize results
    gini_results = []
    
    print("Gini Coefficient Results:")
    print("=" * 50)
    
    # Calculate Gini for each period
    for period_name, period_info in data_periods_dict.items():
        start_date = pd.to_datetime(period_info['start']).date()
        end_date = pd.to_datetime(period_info['end']).date()
        
        # Filter data for the current period
        period_mask = (dt[date_column] >= start_date) & (dt[date_column] <= end_date)
        period_data = dt[period_mask].copy()
        
        if len(period_data) == 0:
            # Fixed: removed .date() calls since start_date and end_date are already date objects
            print(f"{period_name}: No data available for period {start_date} to {end_date}")
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': 0,
                'Gini_Coefficient': None
            })
            continue
        
        # Check if we have both classes (0 and 1) in target
        unique_targets = period_data[target_column].unique()
        if len(unique_targets) < 2:
            print(f"{period_name}: Only one class present in target variable. Cannot calculate Gini.")
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': len(period_data),
                'Gini_Coefficient': None
            })
            continue
        
        # Calculate Gini coefficient
        try:
            auc = roc_auc_score(period_data[target_column], period_data[score_column])
            gini = 2 * auc - 1
            
            print(f"{period_name}: {round(gini, 4)} (Sample size: {len(period_data):,})")
            
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': len(period_data),
                'Gini_Coefficient': round(gini, 4)
            })
            
        except Exception as e:
            print(f"{period_name}: Error calculating Gini - {str(e)}")
            gini_results.append({
                'Period': period_name,
                'Start_Date': start_date,
                'End_Date': end_date,
                'Sample_Size': len(period_data),
                'Gini_Coefficient': None
            })
    
    # Create results DataFrame
    results_df = pd.DataFrame(gini_results)
    
    print("\n" + "=" * 50)
    print("Summary Table:")
    print(results_df.to_string(index=False))
    
    return results_df

# All Trench Alpha

# SIL Alpha All Trench Android

In [94]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score,
    alpha_stack_score,
    ln_fpd30_flag,
    ln_mature_fspd30_flag
  FROM
    risk_mart.sil_risk_ds_master_20230101_20250831
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
datetime(a1.ln_appln_submit_datetime) ln_appln_submit_datetime,
datetime(lmt.disbursementDateTime)disbursementDateTime,
bss.ln_fpd30_flag,
bss.ln_mature_fspd30_flag,
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category in (1,2,3)
and bss.alpha_stack_score is not null
and case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end = 1
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The shape of the dataframe after reading query is:\t{d1.shape}")


print('SIL Alpha All Trenches - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-02-28'}, 
    'OOT 1': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 2': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 3': {'start': '2025-05-01', 'end': '2025-05-31'},
 }

calculate_gini_for_table1(
    d1,
    date_column = 'disbursementDateTime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fspd30',
    target_maturity_column = 'flag_mature_fspd30',
    data_periods_dict = data_periods_dict
)

Job ID e600eb75-3152-4d00-8038-9934ace41e1a successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of the dataframe after reading query is:	(106901, 9)
SIL Alpha All Trenches - Android

Gini Coefficient Results:
Train: 0.4008 (Sample size: 67,623)
OOT 1: 0.3461 (Sample size: 9,988)
OOT 2: 0.405 (Sample size: 11,184)
OOT 3: 0.3423 (Sample size: 11,683)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-02-28        67623            0.4008
 OOT 1 2025-03-01 2025-03-31         9988            0.3461
 OOT 2 2025-04-01 2025-04-30        11184            0.4050
 OOT 3 2025-05-01 2025-05-31        11683            0.3423


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-02-28,67623,0.4008
1,OOT 1,2025-03-01,2025-03-31,9988,0.3461
2,OOT 2,2025-04-01,2025-04-30,11184,0.405
3,OOT 3,2025-05-01,2025-05-31,11683,0.3423


In [96]:
dd.query("""SELECT
  ln_os_type, 
  STRFTIME(disbursementDateTime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fpd30_flag) AS ln_fpd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d1
GROUP BY 1, 2
order by 2 desc
;
"""
).to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloans,ln_fpd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,2025-06,6423,334.0,83.0,668.0,6423.0
1,Android,2025-05,11683,0.0,0.0,1423.0,11683.0
2,Android,2025-04,11184,30.0,0.0,1233.0,11184.0
3,Android,2025-03,9988,492.0,0.0,1165.0,9988.0
4,Android,2025-02,8021,50.0,0.0,910.0,8021.0
5,Android,2025-01,9046,552.0,1299.0,976.0,9046.0
6,Android,2024-12,19599,1160.0,19599.0,1966.0,19599.0
7,Android,2024-11,10808,680.0,10808.0,1194.0,10808.0
8,Android,2024-10,9781,805.0,9781.0,1313.0,9781.0
9,Android,2024-09,10368,865.0,10368.0,1419.0,10368.0


In [66]:
d1.head()

Unnamed: 0,digitalLoanAccountId,ln_os_type,ln_appln_submit_datetime,ln_fpd30_flag,ln_mature_fspd30_flag,flag_mature_fspd30,fspd30,prod_alpha_stack_score
0,cb3c2b5e-cc35-4abe-9767-a636fdd30db0,Android,2024-12-26 14:19:44,0,1,1,0,0.245589
1,2cc2650b-b140-4b55-a971-d1c713bd2196,Android,2024-12-06 10:04:25,0,1,1,0,0.242041
2,30e4872f-feda-4159-9531-ef72016252fc,Android,2024-12-01 14:50:57,0,1,1,0,0.209461
3,5bb4a09a-3c41-4c53-acca-cd928bff77ad,Android,2024-09-06 13:45:17,0,1,1,1,0.136079
4,ef94fb5b-6b62-4d4c-9a6d-d6193696751a,Android,2024-10-22 12:59:03,1,1,1,1,0.249948


# SIL Alpha All Trench IOS

In [97]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score,
    alpha_stack_score,
    ln_fpd30_flag,
    ln_mature_fspd30_flag
  FROM
    risk_mart.sil_risk_ds_master_20230101_20250831
  WHERE
    alpha_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
datetime(a1.ln_appln_submit_datetime) ln_appln_submit_datetime,\
datetime(lmt.disbursementDateTime)disbursementDateTime,
bss.ln_fpd30_flag,
bss.ln_mature_fspd30_flag,
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
bss.alpha_stack_score prod_alpha_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category in (1,2,3)
and bss.alpha_stack_score is not null
and case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end = 1
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The shape of the dataframe after reading query is:\t{d1.shape}")


print('SIL Alpha All Trenches - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-02-28'}, 
    'OOT 1': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 2': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 3': {'start': '2025-05-01', 'end': '2025-05-31'},
 }

calculate_gini_for_table1(
    d1,
    date_column = 'disbursementDateTime',
    score_column = 'prod_alpha_stack_score',
    target_column = 'fspd30',
    target_maturity_column = 'flag_mature_fspd30',
    data_periods_dict = data_periods_dict
)

Job ID 5576cd9e-411e-471f-9892-45e3d76201a2 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of the dataframe after reading query is:	(12811, 9)
SIL Alpha All Trenches - IOS

Gini Coefficient Results:
Train: 0.2743 (Sample size: 6,971)
OOT 1: 0.2646 (Sample size: 1,476)
OOT 2: 0.3345 (Sample size: 1,801)
OOT 3: 0.2712 (Sample size: 1,630)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-02-28         6971            0.2743
 OOT 1 2025-03-01 2025-03-31         1476            0.2646
 OOT 2 2025-04-01 2025-04-30         1801            0.3345
 OOT 3 2025-05-01 2025-05-31         1630            0.2712


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-02-28,6971,0.2743
1,OOT 1,2025-03-01,2025-03-31,1476,0.2646
2,OOT 2,2025-04-01,2025-04-30,1801,0.3345
3,OOT 3,2025-05-01,2025-05-31,1630,0.2712


In [98]:
dd.query("""SELECT
  ln_os_type, 
  STRFTIME(disbursementDateTime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fpd30_flag) AS ln_fpd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d1
GROUP BY 1, 2
order by 2 desc
;
"""
).to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloans,ln_fpd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,2025-06,933,53.0,19.0,97.0,933.0
1,iOS,2025-05,1630,0.0,0.0,140.0,1630.0
2,iOS,2025-04,1801,2.0,0.0,194.0,1801.0
3,iOS,2025-03,1476,80.0,0.0,163.0,1476.0
4,iOS,2025-02,1031,11.0,0.0,113.0,1031.0
5,iOS,2025-01,1106,54.0,152.0,88.0,1106.0
6,iOS,2024-12,1994,119.0,1994.0,187.0,1994.0
7,iOS,2024-11,1089,56.0,1089.0,98.0,1089.0
8,iOS,2024-10,881,61.0,881.0,94.0,881.0
9,iOS,2024-09,870,65.0,870.0,95.0,870.0


# All Trench Beta

# SIL Beta All Trench Android

In [99]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score,
    alpha_stack_score,
    ln_fpd30_flag,
    ln_mature_fspd30_flag
  FROM
    risk_mart.sil_risk_ds_master_20230101_20250831
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
datetime(a1.ln_appln_submit_datetime) ln_appln_submit_datetime,
datetime(lmt.disbursementDateTime)disbursementDateTime,
bss.ln_fpd30_flag,
bss.ln_mature_fspd30_flag,
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type like 'Android'
and a1.trench_category in (1,2,3)
and bss.beta_stack_score is not null
and case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end = 1
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The shape of the dataframe after reading query is:\t{d1.shape}")


print('SIL Beta All Trenches - Android')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-02-28'}, 
    'OOT 1': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 2': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 3': {'start': '2025-05-01', 'end': '2025-05-31'},
 }

calculate_gini_for_table1(
    d1,
    date_column = 'disbursementDateTime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fspd30',
    target_maturity_column = 'flag_mature_fspd30',
    data_periods_dict = data_periods_dict
)

Job ID 8381b226-ccaf-45f4-9420-a286600f801f successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of the dataframe after reading query is:	(106901, 9)
SIL Beta All Trenches - Android

Gini Coefficient Results:
Train: 0.3613 (Sample size: 67,623)
OOT 1: 0.3266 (Sample size: 9,988)
OOT 2: 0.37 (Sample size: 11,184)
OOT 3: 0.343 (Sample size: 11,683)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-02-28        67623            0.3613
 OOT 1 2025-03-01 2025-03-31         9988            0.3266
 OOT 2 2025-04-01 2025-04-30        11184            0.3700
 OOT 3 2025-05-01 2025-05-31        11683            0.3430


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-02-28,67623,0.3613
1,OOT 1,2025-03-01,2025-03-31,9988,0.3266
2,OOT 2,2025-04-01,2025-04-30,11184,0.37
3,OOT 3,2025-05-01,2025-05-31,11683,0.343


In [100]:
dd.query("""SELECT
  ln_os_type, 
  STRFTIME(disbursementDateTime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fpd30_flag) AS ln_fpd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d1
GROUP BY 1, 2
order by 2 desc
;
"""
).to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloans,ln_fpd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,2025-06,6423,334.0,83.0,668.0,6423.0
1,Android,2025-05,11683,0.0,0.0,1423.0,11683.0
2,Android,2025-04,11184,30.0,0.0,1233.0,11184.0
3,Android,2025-03,9988,492.0,0.0,1165.0,9988.0
4,Android,2025-02,8021,50.0,0.0,910.0,8021.0
5,Android,2025-01,9046,552.0,1299.0,976.0,9046.0
6,Android,2024-12,19599,1160.0,19599.0,1966.0,19599.0
7,Android,2024-11,10808,680.0,10808.0,1194.0,10808.0
8,Android,2024-10,9781,805.0,9781.0,1313.0,9781.0
9,Android,2024-09,10368,865.0,10368.0,1419.0,10368.0


In [88]:
d1.head()

Unnamed: 0,digitalLoanAccountId,ln_os_type,ln_appln_submit_datetime,ln_fpd30_flag,ln_mature_fspd30_flag,flag_mature_fspd30,fspd30,prod_beta_stack_score
0,e29d7d69-47cf-4cef-92b6-a9040513aa40,Android,2025-05-28 17:51:01,,,1,1,0.026367
1,9982eebe-2086-4d37-a1f1-b6c477ccf4c5,Android,2025-05-26 19:51:14,,,1,1,0.161328
2,a7b2dfad-9a4b-4c78-8e15-4c8568a834f4,Android,2025-05-20 09:08:55,,,1,0,0.053212
3,271e4af4-12f6-4f59-b58b-79cbc7565310,Android,2025-05-15 15:42:01,,,1,0,0.018862
4,6f2415a9-6f66-4455-9169-cddb618cc5a7,Android,2025-05-21 09:14:41,,,1,0,0.016873


# SIL Beta All Trench IOS

In [101]:
sq = f"""WITH
  bss AS (
  SELECT
    digitalLoanAccountId,
    beta_stack_score,
    alpha_stack_score,
    ln_fpd30_flag,
    ln_mature_fspd30_flag
  FROM
    risk_mart.sil_risk_ds_master_20230101_20250831
  WHERE
    beta_stack_score IS NOT NULL )
Select 
a1.digitalLoanAccountId,
a1.ln_os_type, 
datetime(a1.ln_appln_submit_datetime) ln_appln_submit_datetime,
datetime(lmt.disbursementDateTime)disbursementDateTime,
bss.ln_fpd30_flag,
bss.ln_mature_fspd30_flag,
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
bss.beta_stack_score prod_beta_stack_score
from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join bss on bss.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
where a1.ln_os_type not like 'Android'
and a1.trench_category in (1,2,3)
and bss.beta_stack_score is not null
and case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end = 1
;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The shape of the dataframe after reading query is:\t{d1.shape}")


print('SIL Beta All Trenches - IOS')
print("\n" + "=" * 50)

data_periods_dict = {
    'Train': {'start': '2024-09-01', 'end': '2025-02-28'}, 
    'OOT 1': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 2': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 3': {'start': '2025-05-01', 'end': '2025-05-31'},
 }

calculate_gini_for_table1(
    d1,
    date_column = 'disbursementDateTime',
    score_column = 'prod_beta_stack_score',
    target_column = 'fspd30',
    target_maturity_column = 'flag_mature_fspd30',
    data_periods_dict = data_periods_dict
)

Job ID 0c4ef40d-c329-4379-bf1f-6913955ca2d8 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of the dataframe after reading query is:	(12811, 9)
SIL Beta All Trenches - IOS

Gini Coefficient Results:
Train: 0.2416 (Sample size: 6,971)
OOT 1: 0.242 (Sample size: 1,476)
OOT 2: 0.2493 (Sample size: 1,801)
OOT 3: 0.306 (Sample size: 1,630)

Summary Table:
Period Start_Date   End_Date  Sample_Size  Gini_Coefficient
 Train 2024-09-01 2025-02-28         6971            0.2416
 OOT 1 2025-03-01 2025-03-31         1476            0.2420
 OOT 2 2025-04-01 2025-04-30         1801            0.2493
 OOT 3 2025-05-01 2025-05-31         1630            0.3060


Unnamed: 0,Period,Start_Date,End_Date,Sample_Size,Gini_Coefficient
0,Train,2024-09-01,2025-02-28,6971,0.2416
1,OOT 1,2025-03-01,2025-03-31,1476,0.242
2,OOT 2,2025-04-01,2025-04-30,1801,0.2493
3,OOT 3,2025-05-01,2025-05-31,1630,0.306


In [102]:
dd.query("""SELECT
  ln_os_type, 
  STRFTIME(disbursementDateTime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fpd30_flag) AS ln_fpd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d1
GROUP BY 1, 2
order by 2 desc
;
"""
).to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloans,ln_fpd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,2025-06,933,53.0,19.0,97.0,933.0
1,iOS,2025-05,1630,0.0,0.0,140.0,1630.0
2,iOS,2025-04,1801,2.0,0.0,194.0,1801.0
3,iOS,2025-03,1476,80.0,0.0,163.0,1476.0
4,iOS,2025-02,1031,11.0,0.0,113.0,1031.0
5,iOS,2025-01,1106,54.0,152.0,88.0,1106.0
6,iOS,2024-12,1994,119.0,1994.0,187.0,1994.0
7,iOS,2024-11,1089,56.0,1089.0,98.0,1089.0
8,iOS,2024-10,881,61.0,881.0,94.0,881.0
9,iOS,2024-09,870,65.0,870.0,95.0,870.0
