# <div align = "center" style="color:rgb(0, 255, 0);"> Gini Calculation for Gen Credo Score and Credo Score for Different Trenches in SIL</div>

# Define Library

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

# Local Application Default Credentials file, used only as a fallback: an
# already-exported GOOGLE_APPLICATION_CREDENTIALS takes precedence so the notebook
# also runs on machines where this user-specific path does not exist.
# NOTE(review): absolute, user-specific path -- consider moving it out of the notebook.
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', path)

# Export the default project before any client is created so that clients built
# without an explicit project (e.g. storage.Client()) resolve to the same one.
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
client = bigquery.Client(project='prj-prod-dataplatform')
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
# Canonical lower-case option key (the previous "Display.max_rows" only resolved
# through pandas' case-insensitive pattern matching).
pd.set_option('display.max_rows', 100)


# Constant

In [2]:
# Run date as a compact YYYYMMDD string (local time at the moment the cell runs).
CURRENT_DATE = f"{datetime.now():%Y%m%d}"


# Config

In [3]:
# Short random tag for this run: the last 12 hex characters of a fresh UUID4.
unique_id = uuid.uuid4().hex[-12:]
print(f"The unique Id is: {unique_id}")

# Project, storage locations and output version used by the rest of the notebook.
PROJECT_ID = 'prj-prod-dataplatform'
BUCKETNAME = 'prod-asia-southeast1-tonik-aiml-workspace'
CLOUDPATH = 'DC/Model_Monitoring/Gini_Values'
LOCALPATH = r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values'
VERSION = 'V1'

The unique Id is: 6d6f2422efe8


# <div align="left" style="color:rgb(51, 250, 250);"> Functions </div>

## <div align="left" style="color:rgb(51, 250, 250);"> Save the data to Google Cloud Storage </div>

In [4]:
def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    The frame is written to a uniquely named temporary file, uploaded, and the
    temporary file is removed afterwards even if writing or uploading fails.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'.
    """
    writers = {
        'csv': lambda target: df.to_csv(target, index=False),
        'parquet': lambda target: df.to_parquet(target, index=False),
    }
    # Reject unsupported formats before touching the disk or the network.
    if file_format not in writers:
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # A unique temp file avoids clobbering a 'temp.csv'/'temp.parquet' in the
    # working directory and collisions between concurrent runs.
    fd, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    os.close(fd)  # pandas reopens the path itself; release our handle first (needed on Windows)
    try:
        writers[file_format](temp_file)

        # Project is resolved from the environment, as in the other GCS helpers.
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(temp_file)
    finally:
        # Always clean up, including when the write or the upload raised.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    


## <div align="left" style="color:rgb(51, 250, 250);"> Read the Data from Google Cloud Storage </div>

In [5]:
def read_df_from_gcs(bucket_name, source_blob_name, file_format='csv'):
    """Reads a DataFrame from Google Cloud Storage.

    The blob is downloaded to a uniquely named temporary file, parsed with
    pandas, and the temporary file is always removed afterwards.

    Args:
        bucket_name: The name of the GCS bucket.
        source_blob_name: The name of the blob to read.
        file_format: The file format to read ('csv' or 'parquet').

    Returns:
        pandas.DataFrame: The data loaded from the GCS file.

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'.
    """
    readers = {
        'csv': lambda source: pd.read_csv(source, low_memory=False),
        'parquet': lambda source: pd.read_parquet(source),
    }
    # Validate up front so an unsupported format never triggers a download.
    if file_format not in readers:
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Unique temp file instead of a fixed 'temp.<format>' in the working directory.
    fd, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    os.close(fd)  # the GCS client and pandas reopen the path themselves
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_blob_name)

        # Download the blob, then parse it from the local copy.
        blob.download_to_filename(temp_file)
        return readers[file_format](temp_file)
    finally:
        # Clean up the temporary file whether or not the read succeeded.
        if os.path.exists(temp_file):
            os.remove(temp_file)

## <div align = "left" style="color:rgb(51, 250, 250);"> Data Quality Report </div>

In [6]:
def data_quality_report(df, target_col='ln_fspd30_flag'):
    """Build a per-column data quality summary for a DataFrame.

    Every column gets its dtype, missing/non-missing counts and percentages,
    number of unique values, and mode (with the share of rows equal to it).
    Numeric columns additionally get min, max, mean, median, standard deviation,
    quartiles, IQR, skewness, kurtosis, coefficient of variation and, when
    ``target_col`` is a numeric column of ``df``, the correlation with it
    (computed on rows where both values are present). Those numeric-only
    metrics are ``None`` for non-numeric columns.

    Args:
        df: The pandas DataFrame to profile.
        target_col: Name of the target column used for the correlation metric.

    Returns:
        pandas.DataFrame: One row per column of ``df``.
    """
    n_rows = len(df)

    def _pct_of_rows(count):
        # Guard the zero-row case so an empty frame yields NaN without dividing by zero.
        return (count / n_rows) * 100 if n_rows else float('nan')

    # Loop-invariant: the target is usable for correlation only if present and numeric.
    target_is_numeric = (
        target_col in df.columns
        and pd.api.types.is_numeric_dtype(df[target_col])
    )

    report_data = []
    for col in df.columns:
        series = df[col]
        missing_values = series.isnull().sum()

        # mode() is comparatively expensive; compute it once per column.
        modes = series.mode()
        mode_value = modes.iloc[0] if not modes.empty else None
        mode_percentage = (
            _pct_of_rows((series == mode_value).sum()) if mode_value is not None else None
        )

        # Numeric-only metrics default to None and are filled in below.
        min_value = max_value = mean_value = median_value = std_dev = None
        quantile_25 = quantile_50 = quantile_75 = iqr = None
        skewness = kurtosis = cv = correlation = None

        if pd.api.types.is_numeric_dtype(series):
            min_value = series.min()
            max_value = series.max()
            mean_value = series.mean()
            median_value = series.median()
            std_dev = series.std()
            quantile_25 = series.quantile(0.25)
            quantile_50 = series.quantile(0.50)  # Same as median
            quantile_75 = series.quantile(0.75)
            iqr = quantile_75 - quantile_25
            skewness = series.skew()
            kurtosis = series.kurt()

            # Coefficient of variation is undefined for a zero mean.
            cv = (std_dev / mean_value) * 100 if mean_value != 0 else None

            if target_is_numeric and col != target_col:
                # Pairwise-complete correlation: drop rows where either value is missing.
                correlation = df[[col, target_col]].dropna().corr().iloc[0, 1]

        report_data.append({
            'Column': col,
            'Data Type': series.dtype,
            'Missing Values': missing_values,
            'Missing Percentage': _pct_of_rows(missing_values),
            'Unique Values': series.nunique(),
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value,
            'Mode': mode_value,
            'Mode Percentage': mode_percentage,
            'Std Dev': std_dev,
            'Non-missing Percentage': _pct_of_rows(n_rows - missing_values),
            '25% Quantile': quantile_25,
            '50% Quantile': quantile_50,
            '75% Quantile': quantile_75,
            'IQR': iqr,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'CV (%)': cv,
            f'Correlation with {target_col}': correlation
        })

    return pd.DataFrame(report_data)

# <div align = "left" style="color:rgb(51,250,250);"> Upload pickle file to Google Cloud Storage Bucket </div>

In [7]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket and print the destination.

    Args:
        bucket_name: Name of the target GCS bucket.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Blob name (path inside the bucket) to create.
    """
    # Resolve the destination blob in one chain: client -> bucket -> blob.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)

    print(f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name}")

In [8]:
import pickle
import io
from google.cloud import storage
def save_pickle_to_gcs(data, bucket_name, destination_blob_name):
    """
    Pickle an arbitrary Python object and upload it to Google Cloud Storage.

    The object is serialized in memory; no local file is written.

    Args:
        data: The Python object to pickle (DataFrame, dict, list, etc.)
        bucket_name: Name of the GCS bucket
        destination_blob_name: Path/filename in the bucket
    """
    # In-memory stream holding the pickled bytes, positioned at the start.
    payload = io.BytesIO(pickle.dumps(data))

    # Resolve the destination blob and stream the payload to it.
    gcs_client = storage.Client()
    target_blob = gcs_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_file(payload, content_type='application/octet-stream')

    print(f"Pickle file uploaded to gs://{bucket_name}/{destination_blob_name}")

# save_dataframe_multi_format

In [9]:
def save_dataframe_multi_format(
    dataframe: pd.DataFrame, 
    cloud_path: str, 
    filename: str, 
    client: bigquery.Client = None,
    bucket_name: str = None
) -> dict:
    """
    Save a pandas DataFrame to Google Cloud Storage in multiple formats (CSV, Pickle, Parquet, Joblib).

    Each format is serialized in memory and uploaded to
    ``gs://<bucket_name>/<cloud_path>/<filename>.<ext>``. Formats are written in the
    order csv, pickle, parquet, joblib; the first failure stops the remaining uploads
    and is reported under the ``'error'`` key instead of being raised.

    Args:
        dataframe (pd.DataFrame): The DataFrame to save
        cloud_path (str): The cloud path (e.g., 'DC/Model_Monitoring/sil_beta_trench1_data')
        filename (str): The base filename without extension
        client (bigquery.Client, optional): BigQuery client; only its ``project`` is used,
            to create the storage client. If omitted the project comes from the environment.
        bucket_name (str): GCS bucket name. Required -- it is not inferred from ``client``.

    Returns:
        dict: Maps each saved format ('csv', 'pickle', 'parquet', 'joblib') to its
        ``gs://`` URI, plus an ``'error'`` entry with the message if an upload failed.

    Raises:
        ValueError: If ``bucket_name`` is None.

    Example:
        client = bigquery.Client(project='prj-prod-dataplatform')
        CLOUDPATH = 'DC/Model_Monitoring/sil_beta_trench1_data'

        results = save_dataframe_multi_format(
            dataframe=d1,
            cloud_path=CLOUDPATH,
            filename='my_data',
            client=client,
            bucket_name='your-bucket-name'  # Replace with your actual bucket name
        )
    """
    # Fail fast on the missing bucket before creating any cloud client.
    if bucket_name is None:
        raise ValueError("Please provide the bucket_name parameter")

    storage_client = storage.Client(project=client.project if client else None)
    bucket = storage_client.bucket(bucket_name)

    # Ensure cloud_path doesn't start with '/'
    cloud_path = cloud_path.lstrip('/')

    def _binary_payload(write):
        # Run a "write into this buffer" callable and return the bytes it produced.
        buffer = io.BytesIO()
        write(buffer)
        return buffer.getvalue()

    binary_type = 'application/octet-stream'
    # (result key, file extension, content type, serializer) -- order defines upload order.
    formats = (
        ('csv', 'csv', 'text/csv',
         lambda: dataframe.to_csv(index=False)),
        ('pickle', 'pkl', binary_type,
         lambda: _binary_payload(lambda buf: pickle.dump(dataframe, buf))),
        ('parquet', 'parquet', binary_type,
         lambda: _binary_payload(lambda buf: dataframe.to_parquet(buf, index=False))),
        ('joblib', 'joblib', binary_type,
         lambda: _binary_payload(lambda buf: joblib.dump(dataframe, buf))),
    )

    results = {}
    try:
        for key, extension, content_type, serialize in formats:
            blob_path = f"{cloud_path}/{filename}.{extension}"
            bucket.blob(blob_path).upload_from_string(serialize(), content_type=content_type)
            results[key] = f"gs://{bucket_name}/{blob_path}"

        print("All files saved successfully!")
        for format_type, path in results.items():
            print(f"{format_type.upper()}: {path}")

    except Exception as e:
        # Deliberate best-effort: report the failure to the caller instead of raising.
        print(f"Error occurred: {str(e)}")
        results['error'] = str(e)

    return results

# calculate_gini

In [10]:
import pandas as pd
from sklearn.metrics import roc_auc_score

def calculate_gini(df, date_column, target_column, periods_dict):
    """Compute per-period Gini coefficients for ``credo_score`` and ``credolabScore``.

    For every period the rows whose date falls inside ``[start, end]`` (both ends
    inclusive, compared at day granularity) are selected, the two score columns are
    coerced to numeric, and rows with a missing target or score are dropped.
    Gini is ``2 * AUC - 1``. ``credolabScore`` is negated before the AUC is computed
    because a higher score means lower risk.

    Args:
        df: DataFrame containing ``date_column``, ``target_column``, ``credo_score``
            and ``credolabScore``. It is not modified.
        date_column: Name of the date/datetime column used to assign rows to periods.
        target_column: Name of the binary target column.
        periods_dict: Mapping ``{period_name: {'start': ..., 'end': ...}}``.

    Returns:
        pandas.DataFrame: One row per period with the columns ``Period``,
        ``Start_Date``, ``End_Date``, ``credo_score_gini``, ``credolabScore_gini`` and
        ``sample_size``. The Gini values are ``None`` when the period has no usable
        rows, the target has a single class, or the AUC computation fails.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    if df[date_column].dtype == 'object':
        # Strings: keep only the leading YYYY-MM-DD part before parsing.
        try:
            df[date_column] = pd.to_datetime(df[date_column].str[:10])
        except AttributeError:
            # .str is unavailable for non-string object data; parse the values directly.
            df[date_column] = pd.to_datetime(df[date_column])
    else:
        df[date_column] = pd.to_datetime(df[date_column])

    # Reduce to a tz-naive calendar date at midnight so the period bounds compare
    # on whole days. Dropping the timezone keeps the wall-clock date unchanged.
    if df[date_column].dt.tz is not None:
        df[date_column] = df[date_column].dt.tz_localize(None)
    df[date_column] = df[date_column].dt.normalize()

    def _result_row(period, dates, gini_credo=None, gini_credolab=None, sample_size=0):
        # Single place that defines the output schema, so every branch emits the same keys.
        return {
            'Period': period,
            'Start_Date': dates['start'],
            'End_Date': dates['end'],
            'credo_score_gini': gini_credo,
            'credolabScore_gini': gini_credolab,
            'sample_size': sample_size,
        }

    def _gini(y_true, y_score, label, period):
        # Gini = 2 * AUC - 1; returns None (and reports why) if the AUC cannot be computed.
        try:
            return 2 * roc_auc_score(y_true, y_score) - 1
        except (ValueError, TypeError) as e:
            print(f"Error calculating Gini for {label} in {period}: {e}")
            return None

    results = []

    for period, dates in periods_dict.items():
        start_date = pd.to_datetime(dates['start'])
        end_date = pd.to_datetime(dates['end'])

        # Rows whose date lies in the inclusive window.
        period_mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
        period_df = df.loc[period_mask].copy()

        # Non-numeric score values become NaN and are dropped together with missing targets.
        period_df['credo_score'] = pd.to_numeric(period_df['credo_score'], errors='coerce')
        period_df['credolabScore'] = pd.to_numeric(period_df['credolabScore'], errors='coerce')
        period_df = period_df.dropna(subset=[target_column, 'credo_score', 'credolabScore'])

        if len(period_df) == 0:
            results.append(_result_row(period, dates))
            continue

        # AUC is undefined unless both target classes are present.
        if len(period_df[target_column].unique()) < 2:
            print(f"Warning: {period} has no variation in target variable (all {period_df[target_column].iloc[0]})")
            results.append(_result_row(period, dates, sample_size=len(period_df)))
            continue

        # credo_score is used as-is (probability score).
        gini_credo = _gini(period_df[target_column], period_df['credo_score'], 'credo_score', period)
        # credolabScore is reversed since higher score = lower risk.
        gini_credolab = _gini(period_df[target_column], -period_df['credolabScore'], 'credolabScore', period)

        results.append(_result_row(period, dates, gini_credo, gini_credolab, len(period_df)))

    return pd.DataFrame(results)

# sil_beta_applied_loans_backscored_20240901_20250730

# Table

In [31]:
# Dataset and table holding the back-scored SIL beta applied loans;
# both names are interpolated into the BigQuery SQL of the query cells.
schema1, sil_beta_trench1 = (
    'worktable_data_analysis',
    'sil_beta_applied_loans_backscored_20240901_20250730',
)

# Query Android

In [None]:
# Android, trench_category in (1, 2): pull one row per joined loan with both scores
# and the FSPD30 target.
#   * base CTE: {schema1}.{sil_beta_trench1} inner-joined to loan_master_table and
#     loan_deliquency_data, left-joined to applied_loans_20230101_20250831 (al),
#     credolab_trace_insight_all_loans (ctial) and application_score_master (asm).
#   * credolabScore = coalesce(ctial.score_all_score, lmt.credolabScore).
#   * flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
#   * Outer filter keeps mature loans with non-null scores.
# NOTE(review): `flagDisbursement` is unqualified -- confirm which joined table supplies it.
# NOTE(review): `al` and `asm` are not referenced in the select list (asm only in the
# commented-out line); if either is not one-row-per-loan the left joins can duplicate rows.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (1,2)
;
"""
# Run the query and load the result; `d1` is reassigned by every query cell in this notebook.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed here is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{sil_beta_trench1} table is:\t {d1.shape}")

# Evaluation windows as (label, first day, last day); both ends are inclusive.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: inspect the date column (dtype, sample values, null count) before using it.
print(
    "Date column info:",
    f"Data type: {d1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per window for both scores, using fspd30 as the target.
gini_df = calculate_gini(
    d1,
    date_column='ln_appln_submit_datetime',
    target_column='fspd30',
    periods_dict=data_periods_dict,
)
print("\nGini Results:")
print(gini_df)


[A
[A
[A
Job ID f6d699ac-b2fd-453b-b3a9-6049214b3935 successfully executed: 100%|[32m██████████[0m|





[A
[A
[A
[A
[A
Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 table is:	 (144953, 11)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-02 18:23:57+00:00
1   2025-06-10 13:13:18+00:00
2   2025-06-14 15:53:41+00:00
3   2025-06-11 12:13:49+00:00
4   2025-06-02 18:14:53+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0
Query is running:   0%|[32m          [0m|

Gini Results:
  Period  Start_Date    End_Date  credo_score_gini  credolabScore_gini  \
0  Train  2024-08-13  2025-01-31          0.356765            0.101649   
1  OOT 1  2025-02-01  2025-02-28          0.392927            0.111620   
2  OOT 2  2025-03-01  2025-03-31          0.354315            0.102810   
3  OOT 3  2025-04-01  2025-04-30          0.364523            0.309256   
4  OOT 4  2025-05-01  2025-05-31          0.344795            0.290010   

   sample_size  
0        8390

Total count of Android loans for table worktable_data_analysis.sil_beta_trench1_applied_loans_backscored_20241001_20250831 - (201845, 6)

when credo_score is not null -  (201845, 6)

flag_disbursement = 1 - (5591, 6)

after added fspd30 table =  (5411, 8)

after adding flag_mature_fspd30 = 1 -   (4583, 13)






In [16]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# NOTE(review): calculate_gini is already defined in an earlier cell of this notebook;
# this later definition shadows it. Keep a single definition to avoid the two drifting apart.
def calculate_gini(df, date_column, target_column, periods_dict):
    """Compute per-period Gini coefficients for ``credo_score`` and ``credolabScore``.

    For every period the rows whose date falls inside ``[start, end]`` (both ends
    inclusive, compared at day granularity) are selected, the two score columns are
    coerced to numeric, and rows with a missing target or score are dropped.
    Gini is ``2 * AUC - 1``. ``credolabScore`` is negated before the AUC is computed
    because a higher score means lower risk.

    Args:
        df: DataFrame containing ``date_column``, ``target_column``, ``credo_score``
            and ``credolabScore``. It is not modified.
        date_column: Name of the date/datetime column used to assign rows to periods.
        target_column: Name of the binary target column.
        periods_dict: Mapping ``{period_name: {'start': ..., 'end': ...}}``.

    Returns:
        pandas.DataFrame: One row per period with the columns ``Period``,
        ``Start_Date``, ``End_Date``, ``credo_score_gini``, ``credolabScore_gini`` and
        ``sample_size``. The Gini values are ``None`` when the period has no usable
        rows, the target has a single class, or the AUC computation fails.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    if df[date_column].dtype == 'object':
        # Strings: keep only the leading YYYY-MM-DD part before parsing.
        try:
            df[date_column] = pd.to_datetime(df[date_column].str[:10])
        except AttributeError:
            # .str is unavailable for non-string object data; parse the values directly.
            df[date_column] = pd.to_datetime(df[date_column])
    else:
        df[date_column] = pd.to_datetime(df[date_column])

    # Reduce to a tz-naive calendar date at midnight so the period bounds compare
    # on whole days. Dropping the timezone keeps the wall-clock date unchanged.
    if df[date_column].dt.tz is not None:
        df[date_column] = df[date_column].dt.tz_localize(None)
    df[date_column] = df[date_column].dt.normalize()

    def _result_row(period, dates, gini_credo=None, gini_credolab=None, sample_size=0):
        # Single place that defines the output schema, so every branch emits the same keys.
        return {
            'Period': period,
            'Start_Date': dates['start'],
            'End_Date': dates['end'],
            'credo_score_gini': gini_credo,
            'credolabScore_gini': gini_credolab,
            'sample_size': sample_size,
        }

    def _gini(y_true, y_score, label, period):
        # Gini = 2 * AUC - 1; returns None (and reports why) if the AUC cannot be computed.
        try:
            return 2 * roc_auc_score(y_true, y_score) - 1
        except (ValueError, TypeError) as e:
            print(f"Error calculating Gini for {label} in {period}: {e}")
            return None

    results = []

    for period, dates in periods_dict.items():
        start_date = pd.to_datetime(dates['start'])
        end_date = pd.to_datetime(dates['end'])

        # Rows whose date lies in the inclusive window.
        period_mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
        period_df = df.loc[period_mask].copy()

        # Non-numeric score values become NaN and are dropped together with missing targets.
        period_df['credo_score'] = pd.to_numeric(period_df['credo_score'], errors='coerce')
        period_df['credolabScore'] = pd.to_numeric(period_df['credolabScore'], errors='coerce')
        period_df = period_df.dropna(subset=[target_column, 'credo_score', 'credolabScore'])

        if len(period_df) == 0:
            results.append(_result_row(period, dates))
            continue

        # AUC is undefined unless both target classes are present.
        if len(period_df[target_column].unique()) < 2:
            print(f"Warning: {period} has no variation in target variable (all {period_df[target_column].iloc[0]})")
            results.append(_result_row(period, dates, sample_size=len(period_df)))
            continue

        # credo_score is used as-is (probability score).
        gini_credo = _gini(period_df[target_column], period_df['credo_score'], 'credo_score', period)
        # credolabScore is reversed since higher score = lower risk.
        gini_credolab = _gini(period_df[target_column], -period_df['credolabScore'], 'credolabScore', period)

        results.append(_result_row(period, dates, gini_credo, gini_credolab, len(period_df)))

    return pd.DataFrame(results)


# Evaluation windows as (label, first day, last day); both ends are inclusive.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: inspect the date column (dtype, sample values, null count) before using it.
print(
    "Date column info:",
    f"Data type: {d1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per window for both scores, using fspd30 as the target.
gini_df = calculate_gini(
    d1,
    date_column='ln_appln_submit_datetime',
    target_column='fspd30',
    periods_dict=data_periods_dict,
)
print("\nGini Results:")
print(gini_df)

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-02 18:23:57+00:00
1   2025-06-10 13:13:18+00:00
2   2025-06-14 15:53:41+00:00
3   2025-06-11 12:13:49+00:00
4   2025-06-02 18:14:53+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:
  Period  Start_Date    End_Date  credo_score_gini  credolabScore_gini  \
0  Train  2024-08-13  2025-01-31          0.356765            0.101649   
1  OOT 1  2025-02-01  2025-02-28          0.392927            0.111620   
2  OOT 2  2025-03-01  2025-03-31          0.354315            0.102810   
3  OOT 3  2025-04-01  2025-04-30          0.364523            0.309256   
4  OOT 4  2025-05-01  2025-05-31          0.344795            0.290010   

   sample_size  
0        83901  
1        10262  
2        12513  
3        13312  
4        13563  


In [17]:
# Snapshot the current Gini table under its own name; gini_df is reassigned by later cells.
sil_beta_trench1_df = gini_df.copy(deep=True)
print("\nGini Results:")
gini_df


Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.356765,0.101649,83901
1,OOT 1,2025-02-01,2025-02-28,0.392927,0.11162,10262
2,OOT 2,2025-03-01,2025-03-31,0.354315,0.10281,12513
3,OOT 3,2025-04-01,2025-04-30,0.364523,0.309256,13312
4,OOT 4,2025-05-01,2025-05-31,0.344795,0.29001,13563


# Query IOS

In [18]:
# Non-Android (`ln_os_type not like 'Android'`), trench_category in (1, 2): same base CTE
# as the Android query, only the OS filter differs.
#   * credolabScore = coalesce(ctial.score_all_score, lmt.credolabScore).
#   * flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
# NOTE(review): `flagDisbursement` is unqualified -- confirm which joined table supplies it.
# NOTE(review): `al` and `asm` are not referenced in the select list (asm only in the
# commented-out line); if either is not one-row-per-loan the left joins can duplicate rows.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (1,2)
;
"""
# Run the query; this overwrites the `d1` produced by the previous query cell.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed here is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{sil_beta_trench1} table is:\t {d1.shape}")

Job ID b7d1e8ec-e6b3-4d41-91a2-7f4282b21c80 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 table is:	 (5853, 11)


original without Android -  (8626, 13)
credo_score not null - (8626, 13)
flag_disbursement = 1 - (8626, 13)
flag_mature_fspd30 -  (7169, 13)


In [19]:
# Define your periods
# Evaluation windows as (label, first day, last day); both ends are inclusive.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: inspect the date column (dtype, sample values, null count) before using it.
print(
    "Date column info:",
    f"Data type: {d1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per window for both scores, using fspd30 as the target.
gini_df = calculate_gini(
    d1,
    date_column='ln_appln_submit_datetime',
    target_column='fspd30',
    periods_dict=data_periods_dict,
)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-11 11:46:21+00:00
1   2025-05-31 14:22:23+00:00
2   2025-06-08 14:33:08+00:00
3   2025-06-14 12:46:21+00:00
4   2025-05-29 19:45:14+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.118673,0.00654,752
3,OOT 3,2025-04-01,2025-04-30,0.196648,0.067934,2243
4,OOT 4,2025-05-01,2025-05-31,0.141179,0.143005,2002


In [20]:
# Snapshot the current Gini table under its own name; gini_df is reassigned by later cells.
sil_beta_trench1_ios_df = gini_df.copy(deep=True)

# Trench 3 Beta

# Query Android

In [26]:
# Android, trench_category in (3): same base CTE as the earlier query cells, only the
# trench filter differs.
#   * credolabScore = coalesce(ctial.score_all_score, lmt.credolabScore).
#   * flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
# NOTE(review): `flagDisbursement` is unqualified -- confirm which joined table supplies it.
# NOTE(review): `al` and `asm` are not referenced in the select list (asm only in the
# commented-out line); if either is not one-row-per-loan the left joins can duplicate rows.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (3)
;
"""
# Run the query; this overwrites the `d1` produced by the previous query cell.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed here is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{sil_beta_trench1} table is:\t {d1.shape}")

Job ID dc46e395-b972-4b02-9c08-fe513c17406c successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 table is:	 (5985, 11)


In [27]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-21 16:52:10+00:00
1   2025-06-01 13:35:02+00:00
2   2025-05-25 19:53:54+00:00
3   2025-05-26 09:26:32+00:00
4   2025-05-20 11:46:56+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.249174,0.131183,3139
1,OOT 1,2025-02-01,2025-02-28,0.314477,-0.083232,435
2,OOT 2,2025-03-01,2025-03-31,0.155844,-0.030114,582
3,OOT 3,2025-04-01,2025-04-30,0.421416,0.463418,747
4,OOT 4,2025-05-01,2025-05-31,0.101634,0.129521,574


In [28]:
# Cross-check with DuckDB on the in-memory d1 frame: loan count, sum of fspd30 and
# sum of flag_mature_fspd30 per OS type and trench, for submissions dated
# 2025-03-01 to 2025-03-31 (the 'OOT 2' window).
dd.query("""select 
ln_os_type,  trench_category,  count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-03-31'
         group by 1,2;""")

┌────────────┬─────────────────┬─────────┬────────┬─────────────────────────┐
│ ln_os_type │ trench_category │ cntloan │ fspd30 │ sum(flag_mature_fspd30) │
│  varchar   │      int64      │  int64  │ int128 │         int128          │
├────────────┼─────────────────┼─────────┼────────┼─────────────────────────┤
│ Android    │               3 │     582 │     22 │                     582 │
└────────────┴─────────────────┴─────────┴────────┴─────────────────────────┘

# Query IOS

In [23]:
# Build the modelling frame for the beta table, non-Android devices, trench 3:
# - credo_score is a1.s_credo_score; credolabScore falls back from
#   ctial.score_all_score to lmt.credolabScore.
# - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 additionally
#   requires min_inst_def30 in (1, 2).
# - The outer select keeps rows with flag_mature_fspd30 = 1 and both scores non-null.
# NOTE(review): `not like 'Android'` keeps every other non-null ln_os_type value,
# not only iOS, and drops rows where ln_os_type is NULL.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (3)
;
"""
# Run on BigQuery and load the result into d1 (overwrites any earlier d1).
# The printed shape is that of the filtered result set, not of the source table.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{sil_beta_trench1} table is:\t {d1.shape}")

Job ID ddbb38f0-bd9e-405c-887c-f73c66567c6f successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730 table is:	 (223, 11)


In [24]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-19 17:10:54+00:00
1   2025-06-15 09:07:14+00:00
2   2025-04-15 13:18:24+00:00
3   2025-04-20 18:01:52+00:00
4   2025-06-06 16:58:18+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.259259,-0.185185,21
3,OOT 3,2025-04-01,2025-04-30,0.6875,0.45,82
4,OOT 4,2025-05-01,2025-05-31,0.277778,0.161616,69


In [25]:
# Cross-check with DuckDB on the in-memory d1 frame: loan count, sum of fspd30 and
# sum of flag_mature_fspd30 per OS type and trench, for submissions dated
# 2025-03-01 to 2025-03-31 (the 'OOT 2' window).
dd.query("""select 
ln_os_type,  trench_category,  count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-03-31'
         group by 1,2;""")

┌────────────┬─────────────────┬─────────┬────────┬─────────────────────────┐
│ ln_os_type │ trench_category │ cntloan │ fspd30 │ sum(flag_mature_fspd30) │
│  varchar   │      int64      │  int64  │ int128 │         int128          │
├────────────┼─────────────────┼─────────┼────────┼─────────────────────────┤
│ iOS        │               3 │      21 │      3 │                      21 │
└────────────┴─────────────────┴─────────┴────────┴─────────────────────────┘

# SIL Alpha

## Table

In [30]:
# Source location for the SIL alpha analysis below; both names are interpolated
# into the BigQuery SQL as {schema1}.{sil_alpha_trench1}.
schema1 = 'worktable_data_analysis'
# Plain literal: the previous f-prefix had no placeholders to interpolate.
# Despite the "trench1" suffix, the cells below query this table for trench 1, 2 and 3.
sil_alpha_trench1 = 'sil_alpha_applied_loans_backscored_20240801_20250730'

# Trench1 + Trench2 Alpha Android

In [33]:
# Build the modelling frame for the alpha table, Android, trench 1 and 2:
# - credo_score is a1.s_credo_score; credolabScore falls back from
#   ctial.score_all_score to lmt.credolabScore.
# - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 additionally
#   requires min_inst_def30 in (1, 2).
# - The outer select keeps rows with flag_mature_fspd30 = 1 and both scores non-null.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
# NOTE(review): `like 'Android'` has no wildcard, so it acts as an equality test.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (1,2)
;
"""
# Run on BigQuery and load the result into d1 (overwrites any earlier d1).
# The printed shape is that of the filtered result set, not of the source table.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{sil_alpha_trench1} table is:\t {d1.shape}")



Job ID 4468a1f3-397e-4a11-9dcd-e304f83df8f3 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 table is:	 (100417, 11)


In [35]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-05 18:48:09+00:00
1   2025-05-24 13:26:13+00:00
2   2025-06-03 12:50:55+00:00
3   2025-05-30 16:30:10+00:00
4   2025-05-15 18:57:59+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.363112,0.114442,56870
1,OOT 1,2025-02-01,2025-02-28,0.398656,0.121757,7634
2,OOT 2,2025-03-01,2025-03-31,0.351226,0.106961,9444
3,OOT 3,2025-04-01,2025-04-30,0.375895,0.323388,10496
4,OOT 4,2025-05-01,2025-05-31,0.343023,0.280994,11114


# Trench1 + Trench2 Alpha IOS

In [36]:
# Build the modelling frame for the alpha table, non-Android devices, trench 1 and 2:
# - credo_score is a1.s_credo_score; credolabScore falls back from
#   ctial.score_all_score to lmt.credolabScore.
# - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 additionally
#   requires min_inst_def30 in (1, 2).
# - The outer select keeps rows with flag_mature_fspd30 = 1 and both scores non-null.
# NOTE(review): `not like 'Android'` keeps every other non-null ln_os_type value,
# not only iOS, and drops rows where ln_os_type is NULL.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (1,2)
;
"""
# Run on BigQuery and load the result into d1 (overwrites any earlier d1).
# The printed shape is that of the filtered result set, not of the source table.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{sil_alpha_trench1} table is:\t {d1.shape}")



Job ID 46e9625c-b3f2-46ff-ada8-6561dadc1faf successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 table is:	 (4483, 11)


In [37]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-03 16:14:45+00:00
1   2025-04-01 16:21:34+00:00
2   2025-03-28 09:22:22+00:00
3   2025-03-30 16:11:39+00:00
4   2025-04-13 13:18:22+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.068678,0.027751,494
3,OOT 3,2025-04-01,2025-04-30,0.17169,0.06752,1710
4,OOT 4,2025-05-01,2025-05-31,0.15372,0.170335,1563


# Trench3 Alpha Android

In [38]:
# Build the modelling frame for the alpha table, Android, trench 3:
# - credo_score is a1.s_credo_score; credolabScore falls back from
#   ctial.score_all_score to lmt.credolabScore.
# - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 additionally
#   requires min_inst_def30 in (1, 2).
# - The outer select keeps rows with flag_mature_fspd30 = 1 and both scores non-null.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
# NOTE(review): `like 'Android'` has no wildcard, so it acts as an equality test.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (3)
;
"""
# Run on BigQuery and load the result into d1 (overwrites any earlier d1).
# The printed shape is that of the filtered result set, not of the source table.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{sil_alpha_trench1} table is:\t {d1.shape}")



Job ID 3cc5f8b5-0fc5-4755-abca-6ab84b9a1643 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 table is:	 (5280, 11)


In [39]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-21 19:29:11+00:00
1   2025-05-18 10:00:52+00:00
2   2025-05-25 17:56:41+00:00
3   2025-06-14 15:41:21+00:00
4   2025-05-23 14:49:09+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.252707,0.144726,2740
1,OOT 1,2025-02-01,2025-02-28,0.310041,-0.035714,389
2,OOT 2,2025-03-01,2025-03-31,0.157505,-0.051885,538
3,OOT 3,2025-04-01,2025-04-30,0.43181,0.452148,694
4,OOT 4,2025-05-01,2025-05-31,0.101124,0.129158,568


In [40]:
# Cross-check with DuckDB on the in-memory d1 frame: loan count, sum of fspd30 and
# sum of flag_mature_fspd30 per OS type and trench, for submissions dated
# 2025-03-01 to 2025-03-31 (the 'OOT 2' window).
dd.query("""select 
ln_os_type,  trench_category,  count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-03-31'
         group by 1,2;""")

┌────────────┬─────────────────┬─────────┬────────┬─────────────────────────┐
│ ln_os_type │ trench_category │ cntloan │ fspd30 │ sum(flag_mature_fspd30) │
│  varchar   │      int64      │  int64  │ int128 │         int128          │
├────────────┼─────────────────┼─────────┼────────┼─────────────────────────┤
│ Android    │               3 │     538 │     22 │                     538 │
└────────────┴─────────────────┴─────────┴────────┴─────────────────────────┘

# Trench3 Alpha IOS

In [41]:
# Build the modelling frame for the alpha table, non-Android devices, trench 3:
# - credo_score is a1.s_credo_score; credolabScore falls back from
#   ctial.score_all_score to lmt.credolabScore.
# - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 additionally
#   requires min_inst_def30 in (1, 2).
# - The outer select keeps rows with flag_mature_fspd30 = 1 and both scores non-null.
# NOTE(review): `not like 'Android'` keeps every other non-null ln_os_type value,
# not only iOS, and drops rows where ln_os_type is NULL.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
sq = f"""
with base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.s_credo_score credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{sil_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (3)
;
"""
# Run on BigQuery and load the result into d1 (overwrites any earlier d1).
# The printed shape is that of the filtered result set, not of the source table.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{sil_alpha_trench1} table is:\t {d1.shape}")



Job ID a3109086-e21f-48de-9f7d-e3d74962e552 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730 table is:	 (214, 11)


In [42]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-15 18:43:40+00:00
1   2025-05-11 19:17:59+00:00
2   2025-04-15 13:18:44+00:00
3   2025-06-12 11:06:18+00:00
4   2025-06-18 17:14:19+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.25,-0.208333,19
3,OOT 3,2025-04-01,2025-04-30,0.666667,0.48,77
4,OOT 4,2025-05-01,2025-05-31,0.276923,0.169231,68


In [43]:
# Cross-check with DuckDB on the in-memory d1 frame: loan count, sum of fspd30 and
# sum of flag_mature_fspd30 per OS type and trench, for submissions dated
# 2025-03-01 to 2025-03-31 (the 'OOT 2' window).
dd.query("""select 
ln_os_type,  trench_category,  count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-03-31'
         group by 1,2;""")

┌────────────┬─────────────────┬─────────┬────────┬─────────────────────────┐
│ ln_os_type │ trench_category │ cntloan │ fspd30 │ sum(flag_mature_fspd30) │
│  varchar   │      int64      │  int64  │ int128 │         int128          │
├────────────┼─────────────────┼─────────┼────────┼─────────────────────────┤
│ iOS        │               3 │      19 │      3 │                      19 │
└────────────┴─────────────────┴─────────┴────────┴─────────────────────────┘

# Slide19 - SIL Credo Score FSPD30 Gini Performance Improvement over Current Model in Prod (latest Specialized Credo Score for SIL vs previous Specialized Credo Score for SIL)

# Trench1 + Trench2 Android

In [54]:
# Combined beta + alpha frame for Android, trench 1 and 2:
# - `un` stacks the beta and alpha backscored tables and tags each row with `tab`.
# - `un1` keeps trench 1/2 rows and one row per digitalLoanAccountId. Ordering by
#   `tab` ascending puts 'alpha' before 'beta', so an alpha row wins when a loan is
#   in both tables; ties then go to the higher trench_category and the latest
#   ln_appln_submit_datetime.
# - `base` derives credolabScore (ctial.score_all_score, else lmt.credolabScore),
#   flag_mature_fspd30 (obs_min_inst_def30 >= 2) and fspd30 (also requires
#   min_inst_def30 in (1, 2)).
# - ln_loan_type, ln_fspd30_flag and ln_mature_fspd30_flag are carried through
#   `un` but not selected in `base`.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
# NOTE(review): `like 'Android'` has no wildcard, so it acts as an equality test.
sq = f"""
with un as 
(Select digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type,  s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730
union all 
select  digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'alpha' as tab   from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730
)
-- select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
where trench_category in (1,2)
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
),
base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (1,2)
;
"""
# Run on BigQuery and load the filtered result into d1 (overwrites any earlier d1).
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {d1.shape}")



Job ID 72212e6e-2cf4-4751-9505-4c538dd5b3c7 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (146804, 11)


In [55]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-22 16:03:36+00:00
1   2025-05-31 17:33:31+00:00
2   2025-06-11 10:48:26+00:00
3   2025-06-02 18:28:32+00:00
4   2025-06-08 17:32:10+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.356765,0.101649,83901
1,OOT 1,2025-02-01,2025-02-28,0.392927,0.11162,10262
2,OOT 2,2025-03-01,2025-03-31,0.354315,0.10281,12513
3,OOT 3,2025-04-01,2025-04-30,0.364523,0.309256,13312
4,OOT 4,2025-05-01,2025-05-31,0.344795,0.29001,13563


In [59]:
# Monthly cross-check with DuckDB on the in-memory d1 frame: loan count, sum of
# fspd30 and sum of flag_mature_fspd30 per OS type and submit month, for
# submissions dated 2025-03-01 to 2025-05-31.
# `order by 1,2` pins the row order; `group by` alone does not guarantee one.
# NOTE(review): date() is applied to a tz-aware UTC column. The monthly counts shown
# here differ slightly from sample_size in the Gini table, possibly because the
# DuckDB session time zone is not UTC — confirm which month boundary is intended.
dd.query("""select 
ln_os_type,  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month, count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-05-31'
         group by 1,2
         order by 1,2;""").to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloan,fspd30,sum(flag_mature_fspd30)
0,Android,2025-03,12512,1469.0,12512.0
1,Android,2025-04,13307,1465.0,13307.0
2,Android,2025-05,13574,1619.0,13574.0


# Trench1 + Trench2 iOS

In [60]:
# Combined beta + alpha frame for non-Android devices, trench 1 and 2:
# - `un` stacks the beta and alpha backscored tables and tags each row with `tab`.
# - `un1` keeps trench 1/2 rows and one row per digitalLoanAccountId. Ordering by
#   `tab` ascending puts 'alpha' before 'beta', so an alpha row wins when a loan is
#   in both tables; ties then go to the higher trench_category and the latest
#   ln_appln_submit_datetime.
# - `base` derives credolabScore (ctial.score_all_score, else lmt.credolabScore),
#   flag_mature_fspd30 (obs_min_inst_def30 >= 2) and fspd30 (also requires
#   min_inst_def30 in (1, 2)).
# - ln_loan_type, ln_fspd30_flag and ln_mature_fspd30_flag are carried through
#   `un` but not selected in `base`.
# NOTE(review): `not like 'Android'` keeps every other non-null ln_os_type value,
# not only iOS, and drops rows where ln_os_type is NULL.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
sq = f"""
with un as 
(Select digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type,  s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730
union all 
select  digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'alpha' as tab   from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730
)
-- select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
where trench_category in (1,2)
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
),
base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (1,2)
;
"""
# Run on BigQuery and load the filtered result into d1 (overwrites any earlier d1).
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {d1.shape}")



Job ID aa9061b0-1ece-451d-84b3-60854ff65aa9 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (6091, 11)


In [61]:
# Monitoring windows: label -> start/end date strings handed to calculate_gini.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Sanity-check the submission-timestamp column (dtype, sample, nulls) before windowing on it.
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Per-window Gini table for the frame currently held in d1; shown as the cell output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-22 18:02:08+00:00
1   2025-04-16 16:44:32+00:00
2   2025-04-10 17:43:21+00:00
3   2025-04-25 10:51:22+00:00
4   2025-04-09 10:52:58+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.118673,0.00654,752
3,OOT 3,2025-04-01,2025-04-30,0.196648,0.067934,2243
4,OOT 4,2025-05-01,2025-05-31,0.141179,0.143005,2002


In [62]:
# Monthly cross-check with DuckDB on the in-memory d1 frame: loan count, sum of
# fspd30 and sum of flag_mature_fspd30 per OS type and submit month, for
# submissions dated 2025-03-01 to 2025-05-31.
# `order by 1,2` pins the row order; without it the months came back unsorted
# (2025-05 was listed before 2025-04).
# NOTE(review): date() is applied to a tz-aware UTC column. The monthly counts shown
# here differ slightly from sample_size in the Gini table, possibly because the
# DuckDB session time zone is not UTC — confirm which month boundary is intended.
dd.query("""select 
ln_os_type,  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month, count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-05-31'
         group by 1,2
         order by 1,2;""").to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloan,fspd30,sum(flag_mature_fspd30)
0,iOS,2025-03,745,83.0,745.0
1,iOS,2025-05,2003,181.0,2003.0
2,iOS,2025-04,2242,244.0,2242.0


# Trench3 Android

In [63]:
# Combined beta + alpha frame for Android, trench 3:
# - `un` stacks the beta and alpha backscored tables and tags each row with `tab`.
# - `un1` keeps trench 3 rows and one row per digitalLoanAccountId. Ordering by
#   `tab` ascending puts 'alpha' before 'beta', so an alpha row wins when a loan is
#   in both tables; ties then go to the latest ln_appln_submit_datetime.
# - `base` derives credolabScore (ctial.score_all_score, else lmt.credolabScore),
#   flag_mature_fspd30 (obs_min_inst_def30 >= 2) and fspd30 (also requires
#   min_inst_def30 in (1, 2)).
# - ln_loan_type, ln_fspd30_flag and ln_mature_fspd30_flag are carried through
#   `un` but not selected in `base`.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm appears only
# in the commented-out line). The unqualified flagDisbursement may resolve through
# one of them, so confirm its source table before dropping either join.
# NOTE(review): `like 'Android'` has no wildcard, so it acts as an equality test.
sq = f"""
with un as 
(Select digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type,  s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730
union all 
select  digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'alpha' as tab   from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730
)
-- select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
where trench_category in (3)
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
),
base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (3)
;
"""
# Run on BigQuery and load the filtered result into d1 (overwrites any earlier d1).
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {d1.shape}")



Job ID 7ce76f48-6dfd-4107-b08b-784cfb4e3663 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (6082, 11)


In [64]:
# Evaluation windows keyed by label; each label maps to its 'start' / 'end' date string.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Quick look at the submit-datetime column (dtype, first rows, null count)
# before it is handed to calculate_gini as the date column.
print(
    "Date column info:",
    f"Data type: {d1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Per-period Gini table; the bare last expression makes it the cell's displayed output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-23 11:41:24+00:00
1   2025-05-18 13:06:34+00:00
2   2025-06-11 11:15:24+00:00
3   2025-06-04 11:26:42+00:00
4   2025-05-26 18:00:10+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.249174,0.131183,3139
1,OOT 1,2025-02-01,2025-02-28,0.314477,-0.083232,435
2,OOT 2,2025-03-01,2025-03-31,0.155844,-0.030114,582
3,OOT 3,2025-04-01,2025-04-30,0.421416,0.463418,747
4,OOT 4,2025-05-01,2025-05-31,0.101634,0.129521,574


In [65]:
# Sanity check on the frame currently held in `d1`: per OS type and submit month
# (submit dates 2025-03-01 .. 2025-05-31) count the loans and sum the
# fspd30 / flag_mature_fspd30 flags.
# The result is ordered explicitly: GROUP BY alone gives no row-order guarantee,
# so without ORDER BY the months can print out of sequence.
dd.query("""select 
ln_os_type,  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month, count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-05-31'
         group by 1,2
         order by 1,2;""").to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloan,fspd30,sum(flag_mature_fspd30)
0,Android,2025-03,582,22.0,582.0
1,Android,2025-04,747,32.0,747.0
2,Android,2025-05,574,34.0,574.0


# Trench3 iOS

In [66]:
# Trench 3 / non-Android: load the loans used for the Gini comparison into `d1`.
#   un   - union of the beta and alpha backscored applied-loan tables, each row
#          tagged with its origin in `tab`
#   un1  - trench_category 3 only, one row per digitalLoanAccountId; the ordering
#          puts tab 'alpha' ahead of 'beta', then the latest submit datetime
#   base - joins loan master and delinquency data, keeps flagDisbursement = 1, and derives
#            flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#            fspd30             = 1 when additionally min_inst_def30 is 1 or 2
#          credolabScore is coalesce(ctial.score_all_score, lmt.credolabScore)
# The final filter keeps mature rows with both scores present whose ln_os_type is
# not 'Android'; `not like` also drops rows where ln_os_type is NULL.
# NOTE(review): the `al` and `asm` joins feed no selected column (asm only appears in
# the commented-out score line) and `flagDisbursement` is unqualified - confirm which
# table supplies it before dropping either join.
# Plain string on purpose: the SQL has no placeholders, so an f-string prefix would
# only make literal braces in the query unsafe.
sq = """
with un as 
(Select digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type,  s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.sil_beta_applied_loans_backscored_20240901_20250730
union all 
select  digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, s_credo_score credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'alpha' as tab   from worktable_data_analysis.sil_alpha_applied_loans_backscored_20240801_20250730
)
-- select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
where trench_category in (3)
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
),
base as 
(select a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
--, asm.credo_gen_score credolabScore
, ldd.obs_min_inst_def30, ldd.min_inst_def30
, a1.trench_category
, case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
left join prj-prod-dataplatform.risk_credit_mis.application_score_master asm on asm.digitalLoanAccountId = a1.digitalLoanAccountId
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
and trench_category in (3)
;
"""
# Run on BigQuery and materialise the result as a DataFrame (tqdm progress bar).
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {d1.shape}")



Job ID cddf6579-6bfb-4ba3-aad8-c3831e34776b successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (233, 11)


In [67]:
# Evaluation windows keyed by label; each label maps to its 'start' / 'end' date string.
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Quick look at the submit-datetime column (dtype, first rows, null count)
# before it is handed to calculate_gini as the date column.
print(
    "Date column info:",
    f"Data type: {d1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Per-period Gini table; the bare last expression makes it the cell's displayed output.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-08 13:39:27+00:00
1   2025-05-25 18:38:25+00:00
2   2025-06-12 13:41:26+00:00
3   2025-06-01 17:22:58+00:00
4   2025-06-05 11:24:04+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.259259,-0.185185,21
3,OOT 3,2025-04-01,2025-04-30,0.6875,0.45,82
4,OOT 4,2025-05-01,2025-05-31,0.277778,0.161616,69


In [69]:
# Sanity check on the frame currently held in `d1`: per OS type and submit month
# (submit dates 2025-03-01 .. 2025-05-31) count the loans and sum the
# fspd30 / flag_mature_fspd30 flags, ordered by submit month.
# NOTE(review): in the saved run the 2025-05 loan count here (67) differs from the
# OOT 4 sample_size reported by calculate_gini (69); the two may bucket dates
# differently (e.g. time zone or end-date handling) - confirm.
dd.query("""select 
ln_os_type,  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month, count(digitalLoanAccountId)cntloan ,
sum(fspd30)fspd30,
sum(flag_mature_fspd30)
from d1 
         where date(ln_appln_submit_datetime) between '2025-03-01' and '2025-05-31'
         group by 1,2 order by 2;""").to_df()

Unnamed: 0,ln_os_type,submit_year_month,cntloan,fspd30,sum(flag_mature_fspd30)
0,iOS,2025-03,21,3.0,21.0
1,iOS,2025-04,82,2.0,82.0
2,iOS,2025-05,67,3.0,67.0
