# <div align = "center" style="color:rgb(0, 255, 0);"> Gini Calculation for Gen Credo Score and Credo Score for Different Trenches </div>

# Define Library

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

# NOTE(review): this credential path is machine- and user-specific; the notebook
# only runs as-is on the author's workstation. Consider supplying
# GOOGLE_APPLICATION_CREDENTIALS from the environment instead of hard-coding it.
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
# Export the default project before any client is created so that clients built
# without an explicit ``project=`` argument can pick it up from the environment.
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
client = bigquery.Client(project='prj-prod-dataplatform')
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
# Use the canonical lower-case option key rather than relying on pattern matching.
pd.set_option("display.max_rows", 100)


# Constant

In [2]:
# Date stamp of this run, formatted as YYYYMMDD.
CURRENT_DATE = format(datetime.now(), "%Y%m%d")


# Config

In [3]:
# Short run identifier: the last 12 hex characters of a random UUID4.
unique_id = uuid.uuid4().hex[-12:]
print(f"The unique Id is: {unique_id}")

# Storage locations and identifiers used by the notebook.
PROJECT_ID = 'prj-prod-dataplatform'
BUCKETNAME = 'prod-asia-southeast1-tonik-aiml-workspace'
CLOUDPATH = 'DC/Model_Monitoring/Gini_Values'
LOCALPATH = r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values'
VERSION = 'V1'

The unique Id is: 954d7fb89bdf


# <div align="left" style="color:rgb(51, 250, 250);"> Functions </div>

## <div align="left" style="color:rgb(51, 250, 250);"> Save the data to Google Cloud Storage </div>

In [4]:
def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    The frame is written to a uniquely named temporary file, uploaded, and the
    temporary file is removed afterwards even when writing or uploading fails.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'.
    """
    if file_format not in ('csv', 'parquet'):
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # A unique file in the system temp directory avoids clobbering a 'temp.csv'
    # in the working directory and collisions between concurrent runs. The
    # suffix is kept so the upload can still infer a content type from the name.
    fd, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    os.close(fd)  # pandas re-opens the file by path

    try:
        if file_format == 'csv':
            df.to_csv(temp_file, index=False)
        else:
            df.to_parquet(temp_file, index=False)

        # Upload the file to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(temp_file)
    finally:
        # Remove the temporary file whether or not the upload succeeded
        if os.path.exists(temp_file):
            os.remove(temp_file)
    


## <div align="left" style="color:rgb(51, 250, 250);"> Read the Data from Google Cloud Storage </div>

In [5]:
def read_df_from_gcs(bucket_name, source_blob_name, file_format='csv'):
    """Reads a DataFrame from Google Cloud Storage.

    The blob is downloaded to a uniquely named temporary file, parsed with
    pandas, and the temporary file is always removed afterwards.

    Args:
        bucket_name: The name of the GCS bucket.
        source_blob_name: The name of the blob to read.
        file_format: The file format to read ('csv' or 'parquet').

    Returns:
        pandas.DataFrame: The data loaded from the GCS file.

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'.
    """
    readers = {
        'csv': lambda file_path: pd.read_csv(file_path, low_memory=False),
        'parquet': lambda file_path: pd.read_parquet(file_path),
    }
    # Reject unsupported formats before contacting GCS, so nothing is
    # downloaded just to be thrown away.
    if file_format not in readers:
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # A unique file in the system temp directory avoids collisions with other
    # runs and with the fixed-name temp files used elsewhere in this notebook.
    fd, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    os.close(fd)  # the storage client re-opens the file by path

    try:
        # Initialize GCS client
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_blob_name)

        # Download the file to a temporary location
        blob.download_to_filename(temp_file)

        # Read the file into a DataFrame
        return readers[file_format](temp_file)
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

## <div align = "left" style="color:rgb(51, 250, 250);"> Data Quality Report </div>

In [6]:
def data_quality_report(df, target_col='ln_fspd30_flag'):
    """Build a one-row-per-column data quality summary of ``df``.

    Every column gets its dtype, missing/non-missing share, distinct count and
    mode. Numeric columns additionally get min/max/mean/median, standard
    deviation, quartiles, IQR, skewness, kurtosis, coefficient of variation and,
    when ``target_col`` is a numeric column of ``df``, the correlation with it.
    Statistics that do not apply to a column are reported as ``None``.

    Args:
        df: The DataFrame to profile.
        target_col: Name of the column used for the correlation metric.

    Returns:
        pandas.DataFrame: One row per column of ``df``.
    """
    total_rows = len(df)
    correlation_label = f'Correlation with {target_col}'
    numeric_only_stats = (
        'Min', 'Max', 'Mean', 'Median', 'Std Dev',
        '25% Quantile', '50% Quantile', '75% Quantile',
        'IQR', 'Skewness', 'Kurtosis', 'CV (%)', correlation_label,
    )

    rows = []
    for name in df.columns:
        series = df[name]
        n_missing = series.isnull().sum()

        # Numeric-only statistics default to None and are filled in below.
        stats = dict.fromkeys(numeric_only_stats)
        if pd.api.types.is_numeric_dtype(series):
            average = series.mean()
            spread = series.std()
            lower_q = series.quantile(0.25)
            upper_q = series.quantile(0.75)
            stats['Min'] = series.min()
            stats['Max'] = series.max()
            stats['Mean'] = average
            stats['Median'] = series.median()
            stats['Std Dev'] = spread
            stats['25% Quantile'] = lower_q
            stats['50% Quantile'] = series.quantile(0.50)  # same as the median
            stats['75% Quantile'] = upper_q
            stats['IQR'] = upper_q - lower_q
            stats['Skewness'] = series.skew()
            stats['Kurtosis'] = series.kurt()
            # Coefficient of variation is undefined for a zero mean.
            if average != 0:
                stats['CV (%)'] = (spread / average) * 100

            has_numeric_target = (
                target_col in df.columns
                and name != target_col
                and pd.api.types.is_numeric_dtype(df[target_col])
            )
            if has_numeric_target:
                # Only rows where both columns are present enter the correlation.
                paired = df[[name, target_col]].dropna()
                stats[correlation_label] = paired.corr().iloc[0, 1]

        # The mode is reported for every column, numeric or not.
        modes = series.mode()
        if modes.empty:
            top_value = None
            top_value_pct = None
        else:
            top_value = modes.iloc[0]
            top_value_pct = (
                (series == top_value).sum() / total_rows * 100
                if top_value is not None else None
            )

        rows.append({
            'Column': name,
            'Data Type': series.dtype,
            'Missing Values': n_missing,
            'Missing Percentage': (n_missing / total_rows) * 100,
            'Unique Values': series.nunique(),
            'Min': stats['Min'],
            'Max': stats['Max'],
            'Mean': stats['Mean'],
            'Median': stats['Median'],
            'Mode': top_value,
            'Mode Percentage': top_value_pct,
            'Std Dev': stats['Std Dev'],
            'Non-missing Percentage': ((total_rows - n_missing) / total_rows) * 100,
            '25% Quantile': stats['25% Quantile'],
            '50% Quantile': stats['50% Quantile'],
            '75% Quantile': stats['75% Quantile'],
            'IQR': stats['IQR'],
            'Skewness': stats['Skewness'],
            'Kurtosis': stats['Kurtosis'],
            'CV (%)': stats['CV (%)'],
            correlation_label: stats[correlation_label],
        })

    return pd.DataFrame(rows)

# <div align = "left" style="color:rgb(51,250,250);"> Upload pickle file to Google Cloud Storage Bucket </div>

In [7]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket and print a confirmation.

    Args:
        bucket_name: Name of the destination GCS bucket.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Name of the blob to create in the bucket.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)
    print(f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name}")

In [8]:
import pickle
import io
from google.cloud import storage
def save_pickle_to_gcs(data, bucket_name, destination_blob_name):
    """
    Pickle a Python object in memory and upload it to Google Cloud Storage.

    Args:
        data: The Python object to pickle (DataFrame, dict, list, etc.)
        bucket_name: Name of the GCS bucket
        destination_blob_name: Path/filename in the bucket
    """
    # Resolve the destination blob first, then build the payload.
    gcs_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # The buffer is created already positioned at the start of the pickled bytes.
    payload = io.BytesIO(pickle.dumps(data))

    gcs_blob.upload_from_file(payload, content_type='application/octet-stream')
    print(f"Pickle file uploaded to gs://{bucket_name}/{destination_blob_name}")

# save_dataframe_multi_format

In [9]:
def save_dataframe_multi_format(
    dataframe: pd.DataFrame,
    cloud_path: str,
    filename: str,
    client: "bigquery.Client" = None,
    bucket_name: str = None
) -> dict:
    """
    Save a pandas DataFrame to Google Cloud Storage in multiple formats (CSV, Pickle, Parquet, Joblib).

    The formats are written in that order. If any step fails, the error is
    printed, recorded under the 'error' key, and the remaining formats are
    skipped; formats already uploaded stay in the returned dictionary.

    Args:
        dataframe (pd.DataFrame): The DataFrame to save
        cloud_path (str): The cloud path (e.g., 'DC/Model_Monitoring/cash_beta_trench1_data')
        filename (str): The base filename without extension
        client (bigquery.Client, optional): BigQuery client; only its ``project``
            is used, to create the storage client
        bucket_name (str): GCS bucket name. Required despite the ``None`` default.

    Returns:
        dict: Maps each saved format ('csv', 'pickle', 'parquet', 'joblib') to its
        ``gs://`` URI, plus an 'error' key holding the message if a step failed.

    Raises:
        ValueError: If ``bucket_name`` is not provided.

    Example:
        client = bigquery.Client(project='prj-prod-dataplatform')
        CLOUDPATH = 'DC/Model_Monitoring/cash_beta_trench1_data'

        results = save_dataframe_multi_format(
            dataframe=d1,
            cloud_path=CLOUDPATH,
            filename='my_data',
            client=client,
            bucket_name='your-bucket-name'  # Replace with your actual bucket name
        )
    """
    # Validate the arguments before creating any cloud client, so a missing
    # bucket name is reported without needing credentials or network access.
    if bucket_name is None:
        raise ValueError("Please provide the bucket_name parameter")

    storage_client = storage.Client(project=client.project if client else None)
    bucket = storage_client.bucket(bucket_name)

    # Ensure cloud_path doesn't start with '/'
    cloud_path = cloud_path.lstrip('/')

    def _csv_payload():
        # CSV is uploaded as text.
        buffer = io.StringIO()
        dataframe.to_csv(buffer, index=False)
        return buffer.getvalue()

    def _binary_payload(write):
        # Run ``write(buffer)`` against an in-memory buffer and return its bytes.
        buffer = io.BytesIO()
        write(buffer)
        return buffer.getvalue()

    # (result key, file extension, content type, payload factory)
    binary_type = 'application/octet-stream'
    formats = [
        ('csv', 'csv', 'text/csv', _csv_payload),
        ('pickle', 'pkl', binary_type,
         lambda: _binary_payload(lambda buf: pickle.dump(dataframe, buf))),
        ('parquet', 'parquet', binary_type,
         lambda: _binary_payload(lambda buf: dataframe.to_parquet(buf, index=False))),
        ('joblib', 'joblib', binary_type,
         lambda: _binary_payload(lambda buf: joblib.dump(dataframe, buf))),
    ]

    # Results dictionary to track saves
    results = {}

    try:
        for format_key, extension, content_type, build_payload in formats:
            blob_name = f"{cloud_path}/{filename}.{extension}"
            bucket.blob(blob_name).upload_from_string(build_payload(), content_type=content_type)
            results[format_key] = f"gs://{bucket_name}/{blob_name}"

        print("All files saved successfully!")
        for format_type, path in results.items():
            print(f"{format_type.upper()}: {path}")

    except Exception as e:
        # Deliberate best-effort: report the failure to the caller instead of raising.
        print(f"Error occurred: {str(e)}")
        results['error'] = str(e)

    return results

# calculate_gini

In [10]:
import pandas as pd
from sklearn.metrics import roc_auc_score

def calculate_gini(df, date_column, target_column, periods_dict):
    """Compute per-period Gini coefficients for ``credo_score`` and ``credolabScore``.

    Gini is derived from ROC AUC as ``2 * AUC - 1``. ``credo_score`` is used as
    is; ``credolabScore`` is negated before scoring (higher score = lower risk).

    Args:
        df: DataFrame holding ``date_column``, ``target_column`` and the two
            score columns ``credo_score`` and ``credolabScore``. Not modified.
        date_column: Name of the date/datetime column used to assign rows to
            periods. Only the calendar date is compared.
        target_column: Name of the binary target column.
        periods_dict: Mapping ``{period_name: {'start': ..., 'end': ...}}``;
            both bounds are inclusive.

    Returns:
        pandas.DataFrame: One row per period with the columns ``Period``,
        ``Start_Date``, ``End_Date``, ``credo_score_gini``,
        ``credolabScore_gini`` and ``sample_size``. A Gini value is ``None``
        when the period has no usable rows, the target has a single class, or
        the AUC computation fails.
    """
    # Make a copy to avoid modifying the original dataframe
    df = df.copy()

    # Handle different date column formats
    if df[date_column].dtype == 'object':
        # If it's string/object, try to extract date part and convert
        try:
            df[date_column] = pd.to_datetime(df[date_column].str[:10])
        except AttributeError:
            # If .str fails, it might be mixed types, convert directly
            df[date_column] = pd.to_datetime(df[date_column])
    else:
        # If it's already datetime or numeric, convert to datetime
        df[date_column] = pd.to_datetime(df[date_column])

    # Keep only the calendar date (drops time of day and any timezone)
    df[date_column] = pd.to_datetime(df[date_column].dt.date)

    # Coerce the score columns once, up front; values that cannot be parsed
    # become NaN and are dropped from each period below.
    score_columns = ['credo_score', 'credolabScore']
    for score_column in score_columns:
        df[score_column] = pd.to_numeric(df[score_column], errors='coerce')

    def _result_row(period, dates, gini_credo, gini_credolab, sample_size):
        # Every outcome produces the same set of keys so the output schema is stable.
        return {
            'Period': period,
            'Start_Date': dates['start'],
            'End_Date': dates['end'],
            'credo_score_gini': gini_credo,
            'credolabScore_gini': gini_credolab,
            'sample_size': sample_size,
        }

    def _gini(period, label, y_true, y_score):
        # Gini = 2 * AUC - 1; returns None if the AUC cannot be computed.
        try:
            return 2 * roc_auc_score(y_true, y_score) - 1
        except (ValueError, TypeError) as e:
            print(f"Error calculating Gini for {label} in {period}: {e}")
            return None

    results = []

    for period, dates in periods_dict.items():
        start_date = pd.to_datetime(dates['start'])
        end_date = pd.to_datetime(dates['end'])

        # Filter data for the current period (inclusive bounds) and drop rows
        # with a missing target or score
        period_mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
        period_df = df.loc[period_mask].dropna(subset=[target_column] + score_columns)
        sample_size = len(period_df)

        if sample_size == 0:
            results.append(_result_row(period, dates, None, None, 0))
            continue

        # AUC is undefined unless the target has both classes
        if len(period_df[target_column].unique()) < 2:
            print(f"Warning: {period} has no variation in target variable (all {period_df[target_column].iloc[0]})")
            results.append(_result_row(period, dates, None, None, sample_size))
            continue

        # credo_score is a probability score and is used as is
        gini_credo = _gini(period, 'credo_score',
                           period_df[target_column], period_df['credo_score'])
        # credolabScore is reversed since higher score = lower risk
        gini_credolab = _gini(period, 'credolabScore',
                              period_df[target_column], -period_df['credolabScore'])

        results.append(_result_row(period, dates, gini_credo, gini_credolab, sample_size))

    return pd.DataFrame(results)

# cash_beta_trench1_applied_loans_backscored_20241001_20250831

# Table

In [11]:
# BigQuery dataset and table holding the back-scored cash beta trench 1 loans.
schema1 = 'worktable_data_analysis'
cash_beta_trench1 = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'

# Query Android

In [21]:
# Android cohort query.
# The `base` CTE inner-joins the back-scored table to loan_master_table and
# loan_deliquency_data, left-joins two further tables, and keeps rows with
# flagDisbursement = 1. It derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps mature Android loans with both scores present.
# NOTE(review): the `al` left join is referenced only by a commented-out select
# line; confirm it is still needed and cannot duplicate rows.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query on the module-level BigQuery client and load the result into pandas.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench1} table is:\t {d1.shape}")

# Define your periods
# Both bounds of each period are inclusive calendar dates (see calculate_gini).
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
# The target is the `fspd30` flag derived in the query, not `ln_fspd30_flag`.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
print(gini_df)

Job ID 70e4f01d-ebb2-4195-9744-b16665aff9b5 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (4630, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-15 10:14:21+00:00
1   2024-10-31 20:42:11+00:00
2   2024-11-15 15:12:37+00:00
3   2024-10-01 23:09:06+00:00
4   2024-10-30 10:56:41+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:
  Period  Start_Date    End_Date  credo_score_gini  credolabScore_gini  \
0  Train  2024-08-13  2025-01-31          0.301360            0.090981   
1  OOT 1  2025-02-01  2025-02-28          0.285793            0.063103   
2  OOT 2  2025-03-01  2025-03-31          0.336065            0.261779   
3  OOT 3  2025-04-01  2025-04-30          0.271644            0.043692   
4  OOT 4  2025-05-01  2025-05-31          0.290000            0.127838   

   sample_size  
0         2762  
1          548  
2          335  
3          334  
4  

Total count of Android loans for table worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 - (201845, 6)

when credo_score is not null -  (201845, 6)

flag_disbursement = 1 - (5591, 6)

after added fspd30 table =  (5411, 8)

after adding flag_mature_fspd30 = 1 -   (4583, 13)






In [15]:
# Export the current `d1` frame to a CSV file under LOCALPATH (without the index).
# NOTE(review): `d1` is reassigned by several query cells; the exported content
# depends on which of them ran last.
d1.to_csv(rf"{LOCALPATH}/credonewandold.csv", index = False)

In [None]:
# import pandas as pd
# from google.cloud import bigquery
# from sklearn.metrics import roc_auc_score
# from typing import Dict

# def calculate_gini_for_table(
#     project_id: str,
#     table_name: str,
#     date_column: str,
#     score_column: str,
#     target_column: str,
#     target_maturity_column: str,
#     data_periods_dict: Dict
# ):
#     """
#     Calculate Gini coefficient for different time periods.
    
#     Args:
#         project_id: BigQuery project ID
#         table_name: Full table name (dataset.table)
#         date_column: Name of the date column
#         score_column: Name of the score column
#         target_column: Name of the target column
#         target_maturity_column: Name of the target maturity column
#         data_periods_dict: Dictionary with periods, e.g.:
#             {'Train': {'start': '2024-01-01', 'end': '2025-01-31'}, 
#              'Test': {'start': '2025-02-01', 'end': '2025-12-31'}}
    
#     Returns:
#         pandas.DataFrame: Table with Gini coefficients for each period
#     """
    
#     client = bigquery.Client(project_id)
    
#     # Base query to get all data
#     sql_query = f"""
#     WITH base as (
#         SELECT DISTINCT 
#             digitalLoanAccountId, 
#             {date_column}, 
#             {score_column}, 
#             {target_column}
#         FROM {table_name}
#         WHERE {score_column} IS NOT NULL
#         AND {target_maturity_column} = 1
#     )
#     SELECT {date_column}, {score_column}, {target_column}
#     FROM base
#     ORDER BY {date_column}
#     """
    
#     # Get all data
#     dt = client.query(sql_query).to_dataframe()
    
#     # Convert date column to datetime and extract just the date part
#     dt[date_column] = pd.to_datetime(dt[date_column]).dt.date
    
#     # Initialize results
#     gini_results = []
    
#     print("Gini Coefficient Results:")
#     print("=" * 50)
    
#     # Calculate Gini for each period
#     for period_name, period_info in data_periods_dict.items():
#         start_date = pd.to_datetime(period_info['start']).date()
#         end_date = pd.to_datetime(period_info['end']).date()
        
#         # Filter data for the current period
#         period_mask = (dt[date_column] >= start_date) & (dt[date_column] <= end_date)
#         period_data = dt[period_mask].copy()
        
#         if len(period_data) == 0:
#             print(f"{period_name}: No data available for period {start_date.date()} to {end_date.date()}")
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': 0,
#                 'Gini_Coefficient': None
#             })
#             continue
        
#         # Check if we have both classes (0 and 1) in target
#         unique_targets = period_data[target_column].unique()
#         if len(unique_targets) < 2:
#             print(f"{period_name}: Only one class present in target variable. Cannot calculate Gini.")
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': len(period_data),
#                 'Gini_Coefficient': None
#             })
#             continue
        
#         # Calculate Gini coefficient
#         try:
#             auc = roc_auc_score(period_data[target_column], period_data[score_column])
#             gini = 2 * auc - 1
            
#             print(f"{period_name}: {round(gini, 4)} (Sample size: {len(period_data):,})")
            
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': len(period_data),
#                 'Gini_Coefficient': round(gini, 4)
#             })
            
#         except Exception as e:
#             print(f"{period_name}: Error calculating Gini - {str(e)}")
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': len(period_data),
#                 'Gini_Coefficient': None
#             })
    
#     # Create results DataFrame
#     results_df = pd.DataFrame(gini_results)
    
#     print("\n" + "=" * 50)
#     print("Summary Table:")
#     print(results_df.to_string(index=False))
    
#     return results_df

In [22]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# NOTE(review): calculate_gini is also defined in an earlier cell of this
# notebook; this definition shadows it once this cell has run.
def calculate_gini(df, date_column, target_column, periods_dict):
    """Compute per-period Gini coefficients for ``credo_score`` and ``credolabScore``.

    Gini is derived from ROC AUC as ``2 * AUC - 1``. ``credo_score`` is used as
    is; ``credolabScore`` is negated before scoring (higher score = lower risk).

    Args:
        df: DataFrame holding ``date_column``, ``target_column`` and the two
            score columns ``credo_score`` and ``credolabScore``. Not modified.
        date_column: Name of the date/datetime column used to assign rows to
            periods. Only the calendar date is compared.
        target_column: Name of the binary target column.
        periods_dict: Mapping ``{period_name: {'start': ..., 'end': ...}}``;
            both bounds are inclusive.

    Returns:
        pandas.DataFrame: One row per period with the columns ``Period``,
        ``Start_Date``, ``End_Date``, ``credo_score_gini``,
        ``credolabScore_gini`` and ``sample_size``. A Gini value is ``None``
        when the period has no usable rows, the target has a single class, or
        the AUC computation fails.
    """
    # Make a copy to avoid modifying the original dataframe
    df = df.copy()

    # Handle different date column formats
    if df[date_column].dtype == 'object':
        # If it's string/object, try to extract date part and convert
        try:
            df[date_column] = pd.to_datetime(df[date_column].str[:10])
        except AttributeError:
            # If .str fails, it might be mixed types, convert directly
            df[date_column] = pd.to_datetime(df[date_column])
    else:
        # If it's already datetime or numeric, convert to datetime
        df[date_column] = pd.to_datetime(df[date_column])

    # Keep only the calendar date (drops time of day and any timezone)
    df[date_column] = pd.to_datetime(df[date_column].dt.date)

    # Coerce the score columns once, up front; values that cannot be parsed
    # become NaN and are dropped from each period below.
    score_columns = ['credo_score', 'credolabScore']
    for score_column in score_columns:
        df[score_column] = pd.to_numeric(df[score_column], errors='coerce')

    def _result_row(period, dates, gini_credo, gini_credolab, sample_size):
        # Every outcome produces the same set of keys so the output schema is stable.
        return {
            'Period': period,
            'Start_Date': dates['start'],
            'End_Date': dates['end'],
            'credo_score_gini': gini_credo,
            'credolabScore_gini': gini_credolab,
            'sample_size': sample_size,
        }

    def _gini(period, label, y_true, y_score):
        # Gini = 2 * AUC - 1; returns None if the AUC cannot be computed.
        try:
            return 2 * roc_auc_score(y_true, y_score) - 1
        except (ValueError, TypeError) as e:
            print(f"Error calculating Gini for {label} in {period}: {e}")
            return None

    results = []

    for period, dates in periods_dict.items():
        start_date = pd.to_datetime(dates['start'])
        end_date = pd.to_datetime(dates['end'])

        # Filter data for the current period (inclusive bounds) and drop rows
        # with a missing target or score
        period_mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
        period_df = df.loc[period_mask].dropna(subset=[target_column] + score_columns)
        sample_size = len(period_df)

        if sample_size == 0:
            results.append(_result_row(period, dates, None, None, 0))
            continue

        # AUC is undefined unless the target has both classes
        if len(period_df[target_column].unique()) < 2:
            print(f"Warning: {period} has no variation in target variable (all {period_df[target_column].iloc[0]})")
            results.append(_result_row(period, dates, None, None, sample_size))
            continue

        # credo_score is a probability score and is used as is
        gini_credo = _gini(period, 'credo_score',
                           period_df[target_column], period_df['credo_score'])
        # credolabScore is reversed since higher score = lower risk
        gini_credolab = _gini(period, 'credolabScore',
                              period_df[target_column], -period_df['credolabScore'])

        results.append(_result_row(period, dates, gini_credo, gini_credolab, sample_size))

    return pd.DataFrame(results)


# Define your periods
# Both bounds of each period are inclusive calendar dates (see calculate_gini).
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
# Uses whatever frame `d1` currently holds; the target is the `fspd30` column.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
print(gini_df)

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-15 10:14:21+00:00
1   2024-10-31 20:42:11+00:00
2   2024-11-15 15:12:37+00:00
3   2024-10-01 23:09:06+00:00
4   2024-10-30 10:56:41+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:
  Period  Start_Date    End_Date  credo_score_gini  credolabScore_gini  \
0  Train  2024-08-13  2025-01-31          0.301360            0.090981   
1  OOT 1  2025-02-01  2025-02-28          0.285793            0.063103   
2  OOT 2  2025-03-01  2025-03-31          0.336065            0.261779   
3  OOT 3  2025-04-01  2025-04-30          0.271644            0.043692   
4  OOT 4  2025-05-01  2025-05-31          0.290000            0.127838   

   sample_size  
0         2762  
1          548  
2          335  
3          334  
4          371  


In [23]:
# Keep an independent copy of the current Gini table under a cohort-specific
# name, because `gini_df` is reassigned by later cells.
cash_beta_trench1_df = gini_df.copy()
print("\nGini Results:")
# Last expression of the cell: rendered as the cell output.
gini_df


Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.30136,0.090981,2762
1,OOT 1,2025-02-01,2025-02-28,0.285793,0.063103,548
2,OOT 2,2025-03-01,2025-03-31,0.336065,0.261779,335
3,OOT 3,2025-04-01,2025-04-30,0.271644,0.043692,334
4,OOT 4,2025-05-01,2025-05-31,0.29,0.127838,371


In [24]:
# List the columns of the current query result.
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'loanAccountNumber',
       'ln_os_type', 'ln_appln_submit_datetime', 'credo_score',
       'credolabScore', 'ln_fspd30_flag', 'ln_mature_fspd30_flag',
       'obs_min_inst_def30', 'min_inst_def30', 'flag_mature_fspd30', 'fspd30'],
      dtype='object')

In [31]:
# Monthly summary of `d1` computed with DuckDB (`dd`), restricted to submit
# months 2025-03 to 2025-05. The query groups by OS type, two score-presence
# flags and the submit year-month, counts distinct loans and sums the four
# flag columns.
# The SQL refers to the pandas frame by its Python variable name `d1`.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d1
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,374,65.0,318.0,76.0,374.0
1,Android,1,1,2025-04,334,63.0,334.0,63.0,334.0
2,Android,1,1,2025-03,335,70.0,335.0,70.0,335.0


# Query IOS

In [32]:
# Non-Android cohort query: same `base` CTE and derived flags as the Android
# query, but the outer filter is `ln_os_type not like 'Android'`.
# NOTE: this reassigns `sq` and `d1`, so the Android result held in `d1` is
# replaced once this cell runs.
# NOTE(review): the `al` left join is referenced only by a commented-out select
# line; confirm it is still needed and cannot duplicate rows.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench1}  a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query on the module-level BigQuery client and load the result into pandas.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench1} table is:\t {d1.shape}")

Job ID 8cf5e66c-7156-4cc3-9b46-1008fdcb7a90 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (1580, 13)


original without Android -  (8626, 13)
credo_score not null - (8626, 13)
flag_disbursement = 1 - (8626, 13)
flag_mature_fspd30 -  (7169, 13)


In [33]:
# Submission-date windows (start/end date strings) for the training sample and each
# out-of-time (OOT) month passed to calculate_gini.
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'},
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: check the date column's dtype, a few sample values and the null count before slicing
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients per period against the fspd30 target
# (calculate_gini is defined elsewhere in this notebook).
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)

# Snapshot the iOS result in the same cell that computes it (as the cash_alpha cells do), so the
# saved table cannot pick up a stale gini_df when cells are run out of order.
cash_beta_trench1_ios_df = gini_df.copy()

print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-16 15:45:33+00:00
1   2025-05-07 14:20:41+00:00
2   2025-05-07 12:03:25+00:00
3   2025-05-23 14:00:45+00:00
4   2025-05-28 17:48:53+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.170189,0.132925,153
3,OOT 3,2025-04-01,2025-04-30,0.086147,0.108097,477
4,OOT 4,2025-05-01,2025-05-31,0.211641,0.074699,552


In [34]:
# Monthly sanity check run by DuckDB directly against the in-memory `d1` frame (iOS extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d1
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the Android trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,552,104.0,477.0,114.0,552.0
1,iOS,1,1,2025-04,477,92.0,477.0,92.0,477.0
2,iOS,1,1,2025-03,148,28.0,148.0,28.0,148.0


In [20]:
# Keep an independent copy of the trench-1 iOS Gini table.
# NOTE(review): gini_df must hold the iOS results when this runs — execute it only after the iOS
# calculate_gini cell above (the saved execution order suggests this cell was run earlier).
cash_beta_trench1_ios_df = gini_df.copy()

# cash_beta_trench2_applied_loans_backscored_20241001_20250831

# Table

In [35]:
# BigQuery dataset and back-scored applied-loans table for cash beta trench 2
schema1 = "worktable_data_analysis"
cash_beta_trench2 = "cash_beta_trench2_applied_loans_backscored_20241001_20250831"

# Query Android

In [36]:
# Build the BigQuery extract for cash beta trench 2, Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured Android loans that have both scores.
# NOTE(review): `al` is joined but no selected column uses it (only a commented-out line does);
#   if that table is not unique per digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result into `d2` (Android extract).
d2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_beta_trench2} table is:\t {d2.shape}")

Job ID e0bdaf98-6378-4881-a92c-af033c722884 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831 table is:	 (3903, 13)


In [37]:
# Submission-date windows (start/end date strings) for the training sample and each OOT month
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: dtype, sample values and null count of the date column, emitted as four lines
print(
    "Date column info:",
    f"Data type: {d2['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d2['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d2['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per period against the fspd30 target (calculate_gini is defined elsewhere in this notebook)
gini_df2 = calculate_gini(d2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-27 21:01:45+00:00
1   2024-10-26 14:49:22+00:00
2   2024-11-08 16:03:27+00:00
3   2024-10-24 08:36:32+00:00
4   2024-11-11 13:46:59+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.266925,0.158478,1933
1,OOT 1,2025-02-01,2025-02-28,0.423827,0.235004,463
2,OOT 2,2025-03-01,2025-03-31,0.352999,0.034759,473
3,OOT 3,2025-04-01,2025-04-30,0.295992,0.187835,397
4,OOT 4,2025-05-01,2025-05-31,0.306634,0.171568,387


In [38]:
# Keep an independent copy of the trench-2 Android Gini table; gini_df2 is reassigned by the
# iOS run further down.
# The misspelled name ("andriod") is kept because other cells may already reference it.
cash_beta_trench2_andriod_df = gini_df2.copy()
# Correctly spelled name, consistent with cash_beta_trench3_android_df and the cash_alpha_*_android_df
# snapshots; prefer this one in new code.
cash_beta_trench2_android_df = gini_df2.copy()

In [40]:
# Monthly sanity check run by DuckDB directly against the in-memory `d2` frame (Android extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d2
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,383,86.0,345.0,93.0,383.0
1,Android,1,1,2025-04,399,94.0,399.0,94.0,399.0
2,Android,1,1,2025-03,481,118.0,481.0,118.0,481.0


# Query IOS

In [41]:
# Build the BigQuery extract for cash beta trench 2, non-Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured loans whose OS is not 'Android' and that have both scores.
# NOTE(review): `al` is joined but no selected column uses it (only a commented-out line does);
#   if that table is not unique per digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
# NOTE(review): `not like 'Android'` also drops rows whose ln_os_type is NULL.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result; `d2` is overwritten here, so it now holds the iOS extract.
d2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_beta_trench2} table is:\t {d2.shape}")

Job ID 0a7f3630-ff8e-4cab-98e8-d7662d52499c successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831 table is:	 (1232, 13)


In [42]:
# Submission-date windows (start/end date strings) for the training sample and each OOT month
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: dtype, sample values and null count of the date column, emitted as four lines
print(
    "Date column info:",
    f"Data type: {d2['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d2['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d2['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per period against the fspd30 target; this reassigns gini_df2 with the iOS results
gini_df2 = calculate_gini(d2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-06 14:25:55+00:00
1   2025-04-29 15:56:30+00:00
2   2025-03-26 08:45:33+00:00
3   2025-05-17 09:50:55+00:00
4   2025-04-28 22:20:29+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.177807,0.105615,100
3,OOT 3,2025-04-01,2025-04-30,0.096853,0.030743,430
4,OOT 4,2025-05-01,2025-05-31,0.067697,0.025946,422


In [43]:
# Keep an independent copy of the trench-2 iOS Gini table (gini_df2 is reused across runs)
cash_beta_trench2_ios_df = gini_df2.copy()

In [44]:
# Monthly sanity check run by DuckDB directly against the in-memory `d2` frame (iOS extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d2
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the Android trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,424,81.0,365.0,94.0,424.0
1,iOS,1,1,2025-04,425,93.0,424.0,93.0,425.0
2,iOS,1,1,2025-03,100,34.0,100.0,34.0,100.0


# cash_beta_trench3_applied_loans_backscored_20241001_20250831

# Table

In [45]:
# BigQuery dataset and back-scored applied-loans table for cash beta trench 3
schema1 = "worktable_data_analysis"
cash_beta_trench3 = "cash_beta_trench3_applied_loans_backscored_20241001_20250831"

# Query Android

In [46]:
# Build the BigQuery extract for cash beta trench 3, Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured Android loans that have both scores.
# NOTE(review): `al` is joined but no selected column uses it; if that table is not unique per
#   digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result into `d3` (Android extract).
d3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_beta_trench3} table is:\t {d3.shape}")

Job ID 9846c344-2d66-4304-902b-ad89a41617b3 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831 table is:	 (3343, 13)


In [47]:
# Submission-date windows (start/end date strings) for the training sample and each OOT month
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: dtype, sample values and null count of the date column, emitted as four lines
print(
    "Date column info:",
    f"Data type: {d3['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d3['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d3['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per period against the fspd30 target (calculate_gini is defined elsewhere in this notebook)
gini_df3 = calculate_gini(d3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-01-27 12:38:32+00:00
1   2024-11-05 19:44:52+00:00
2   2025-04-14 12:13:45+00:00
3   2025-05-09 11:56:09+00:00
4   2025-04-06 11:34:21+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.166167,0.084071,1374
1,OOT 1,2025-02-01,2025-02-28,0.255133,-0.03466,391
2,OOT 2,2025-03-01,2025-03-31,0.297123,0.069909,425
3,OOT 3,2025-04-01,2025-04-30,0.13861,0.159289,388
4,OOT 4,2025-05-01,2025-05-31,0.053406,0.236495,483


In [48]:
# Keep an independent copy of the trench-3 Android Gini table; gini_df3 is reassigned by the iOS run below
cash_beta_trench3_android_df = gini_df3.copy()

In [49]:
# Bad rate (fspd30 / matured loans) per distinct credolabScore for May 2025 submissions in `d3`.
# The window is half-open [2025-05-01, 2025-06-01): ln_appln_submit_datetime is a timestamp, so
# `BETWEEN '2025-05-01' AND '2025-05-31'` stopped at midnight at the start of 31 May and dropped
# every loan submitted later that day.
dfd = dd.query("""
select credolabScore,
       sum(fspd30)/sum(flag_mature_fspd30) fspd30
from d3
where ln_appln_submit_datetime >= '2025-05-01'
  and ln_appln_submit_datetime < '2025-06-01'
group by 1
order by 1;
""").to_df()
dfd

Unnamed: 0,credolabScore,fspd30
0,415.0,0.0
1,440.0,0.0
2,447.0,0.0
3,448.0,1.0
4,449.0,0.0
...,...,...
113,581.0,0.0
114,583.0,0.0
115,584.0,0.0
116,588.0,0.0


In [50]:
# Monthly sanity check run by DuckDB directly against the in-memory `d3` frame (Android extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d3
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,482,57.0,427.0,63.0,482.0
1,Android,1,1,2025-04,392,48.0,392.0,48.0,392.0
2,Android,1,1,2025-03,421,55.0,421.0,55.0,421.0


# Query IOS

In [51]:
# Build the BigQuery extract for cash beta trench 3, non-Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured loans whose OS is not 'Android' and that have both scores.
# NOTE(review): `al` is joined but no selected column uses it (only a commented-out line does);
#   if that table is not unique per digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
# NOTE(review): `not like 'Android'` also drops rows whose ln_os_type is NULL.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result; `d3` is overwritten here, so it now holds the iOS extract.
d3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_beta_trench3} table is:\t {d3.shape}")

Job ID bc699ded-dcca-4436-884e-0ab8a384248b successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831 table is:	 (1315, 13)


In [52]:
# Submission-date windows (start/end date strings) for the training sample and each OOT month
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: dtype, sample values and null count of the date column, emitted as four lines
print(
    "Date column info:",
    f"Data type: {d3['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {d3['ln_appln_submit_datetime'].head()}",
    f"Any null values: {d3['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per period against the fspd30 target; this reassigns gini_df3 with the iOS results
gini_df3 = calculate_gini(d3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-10 19:34:04+00:00
1   2025-06-11 13:22:50+00:00
2   2025-05-17 11:37:44+00:00
3   2025-06-16 09:33:03+00:00
4   2025-06-15 16:24:56+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.258491,-0.203774,63
3,OOT 3,2025-04-01,2025-04-30,0.105058,-0.046354,390
4,OOT 4,2025-05-01,2025-05-31,-0.06349,-0.03645,520


In [53]:
# Keep an independent copy of the trench-3 iOS Gini table (gini_df3 is reused across runs)
cash_beta_trench3_ios_df = gini_df3.copy()

In [54]:
# Monthly sanity check run by DuckDB directly against the in-memory `d3` frame (iOS extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM d3
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the Android trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,522,63.0,463.0,73.0,522.0
1,iOS,1,1,2025-04,387,54.0,387.0,54.0,387.0
2,iOS,1,1,2025-03,62,10.0,62.0,10.0,62.0


# cash_alpha_trench1_applied_loans_backscored_20241001_20250831

# Table

In [55]:
# BigQuery dataset and back-scored applied-loans table for cash alpha trench 1
schema1 = "worktable_data_analysis"
cash_alpha_trench1 = "cash_alpha_trench1_applied_loans_backscored_20241001_20250831"

# Query Android

In [56]:
# Build the BigQuery extract for cash alpha trench 1, Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured Android loans that have both scores.
# NOTE(review): `al` is joined but no selected column uses it (only a commented-out line does);
#   if that table is not unique per digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result into `da1` (Android extract).
da1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_alpha_trench1} table is:\t {da1.shape}")

Job ID 6bdbb2a3-4bf0-447f-930d-3120c115c91f successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831 table is:	 (4488, 13)


In [57]:
# Submission-date windows (start/end date strings) for the training sample and each OOT month
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: dtype, sample values and null count of the date column, emitted as four lines
print(
    "Date column info:",
    f"Data type: {da1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {da1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {da1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per period against the fspd30 target (calculate_gini is defined elsewhere in this notebook)
gini_dfa1 = calculate_gini(da1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
# Independent copy of the Android result, taken before gini_dfa1 is reused for iOS
cash_alpha_trench1_android_df = gini_dfa1.copy()
gini_dfa1


Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-17 13:08:36+00:00
1   2024-11-20 14:52:04+00:00
2   2024-10-18 22:42:11+00:00
3   2024-12-16 18:35:31+00:00
4   2024-10-09 20:27:49+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.297348,0.086991,2671
1,OOT 1,2025-02-01,2025-02-28,0.266585,0.041626,526
2,OOT 2,2025-03-01,2025-03-31,0.330236,0.272928,331
3,OOT 3,2025-04-01,2025-04-30,0.27977,0.0391,322
4,OOT 4,2025-05-01,2025-05-31,0.274099,0.094342,366


In [58]:
# Monthly sanity check run by DuckDB directly against the in-memory `da1` frame (Android extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM da1
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the cash *beta* trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,369,62.0,314.0,72.0,369.0
1,Android,1,1,2025-04,321,62.0,321.0,62.0,321.0
2,Android,1,1,2025-03,332,69.0,332.0,69.0,332.0


# Query IOS

In [59]:
# Build the BigQuery extract for cash alpha trench 1, non-Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured loans whose OS is not 'Android' and that have both scores.
# NOTE(review): `al` is joined but no selected column uses it (only a commented-out line does);
#   if that table is not unique per digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
# NOTE(review): `not like 'Android'` also drops rows whose ln_os_type is NULL.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result; `da1` is overwritten here, so it now holds the iOS extract.
da1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_alpha_trench1} table is:\t {da1.shape}")

Job ID fa8aa5a6-840c-4dd1-9303-d02ef8aa747b successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831 table is:	 (1511, 13)


In [60]:
# Submission-date windows (start/end date strings) for the training sample and each OOT month
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Debug: dtype, sample values and null count of the date column, emitted as four lines
print(
    "Date column info:",
    f"Data type: {da1['ln_appln_submit_datetime'].dtype}",
    f"Sample values: {da1['ln_appln_submit_datetime'].head()}",
    f"Any null values: {da1['ln_appln_submit_datetime'].isnull().sum()}",
    sep="\n",
)

# Gini per period against the fspd30 target; this reassigns gini_dfa1 with the iOS results
gini_dfa1 = calculate_gini(da1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
# Independent copy of the iOS result
cash_alpha_trench1_ios_df = gini_dfa1.copy()
gini_dfa1


Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-03 10:21:29+00:00
1   2025-05-11 10:41:34+00:00
2   2025-05-24 11:49:37+00:00
3   2025-03-21 15:51:43+00:00
4   2025-06-10 14:22:05+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.135904,0.099786,148
3,OOT 3,2025-04-01,2025-04-30,0.094354,0.077344,432
4,OOT 4,2025-05-01,2025-05-31,0.20022,0.066995,543


In [61]:
# Monthly sanity check run by DuckDB directly against the in-memory `da1` frame (iOS extract):
# distinct loans and delinquency-flag totals per OS / score-availability flag / submit month,
# restricted to applications submitted in 2025-03 .. 2025-05.
# NOTE(review): ln_appln_submit_datetime is timezone-aware; STRFTIME may render it in the DuckDB
# session time zone rather than UTC, shifting loans near month boundaries — confirm if the counts
# disagree with calculate_gini's sample_size.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM da1
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# NOTE: the disabled export below still names the Android cash *beta* trench-1 file; change the path before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,543,101.0,468.0,111.0,543.0
1,iOS,1,1,2025-04,432,81.0,432.0,81.0,432.0
2,iOS,1,1,2025-03,143,26.0,143.0,26.0,143.0


# cash_alpha_trench2_applied_loans_backscored_20241001_20250831

# Table

In [62]:
# BigQuery dataset and back-scored applied-loans table for cash alpha trench 2
schema1 = "worktable_data_analysis"
cash_alpha_trench2 = "cash_alpha_trench2_applied_loans_backscored_20241001_20250831"

# Query Android

In [63]:
# Build the BigQuery extract for cash alpha trench 2, Android loans.
# The `base` CTE joins the back-scored trench table to the loan master and delinquency tables and derives
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when additionally min_inst_def30 is 1 or 2
# The outer query keeps matured Android loans that have both scores.
# NOTE(review): `al` is joined but no selected column uses it (only a commented-out line does);
#   if that table is not unique per digitalLoanAccountId the join would multiply rows — confirm.
# NOTE(review): flagDisbursement is unqualified; presumably it comes from loan_master_table — confirm.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and load the result into `da2` (Android extract).
da2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the source table itself.
print(f"The shape of {schema1}.{cash_alpha_trench2} table is:\t {da2.shape}")

Job ID 8e9ab12d-07d7-4c68-8798-6365de4b3539 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831 table is:	 (3759, 13)


In [64]:
# Evaluation windows: the training span followed by four monthly out-of-time (OOT) windows.
period_bounds = [
    ('Train', '2024-08-13', '2025-01-31'),
    ('OOT 1', '2025-02-01', '2025-02-28'),
    ('OOT 2', '2025-03-01', '2025-03-31'),
    ('OOT 3', '2025-04-01', '2025-04-30'),
    ('OOT 4', '2025-05-01', '2025-05-31'),
]
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in period_bounds
}

# Quick inspection of the submit-datetime column before it drives the period split.
submit_ts = da2['ln_appln_submit_datetime']
print("Date column info:")
print(f"Data type: {submit_ts.dtype}")
print(f"Sample values: {submit_ts.head()}")
print(f"Any null values: {submit_ts.isnull().sum()}")

# Per-period Gini via calculate_gini ('fspd30' is passed as the third argument,
# presumably the target column); a copy is kept under a trench/OS-specific name.
gini_dfa2 = calculate_gini(da2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
cash_alpha_trench2_android_df = gini_dfa2.copy()
gini_dfa2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-27 21:01:45+00:00
1   2024-11-08 16:03:27+00:00
2   2024-10-26 14:49:22+00:00
3   2024-10-24 08:36:32+00:00
4   2024-11-11 13:46:59+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.26919,0.168788,1829
1,OOT 1,2025-02-01,2025-02-28,0.402986,0.233028,447
2,OOT 2,2025-03-01,2025-03-31,0.353773,0.038355,467
3,OOT 3,2025-04-01,2025-04-30,0.296702,0.18587,382
4,OOT 4,2025-05-01,2025-05-31,0.309598,0.178941,386


In [65]:
# Monthly sanity check on the in-memory frame `da2` (DuckDB resolves the name used in FROM):
# for submissions in 2025-03..2025-05, count distinct loans and sum the FSPD30 flags per
# OS type / score-availability flags / submit month, newest month first.
# NOTE(review): the recorded counts differ from the Gini table's sample_size (e.g. 475 here
# vs 467 for 2025-03) — the month bucketing here may not use the same boundaries or time
# zone as the period filter inside calculate_gini; confirm.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM da2
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# Optional CSV export, currently disabled. NOTE(review): the file name below is a leftover
# ("androidcashbetatrench1") and does not describe this cell's data.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,382,85.0,344.0,92.0,382.0
1,Android,1,1,2025-04,384,91.0,384.0,91.0,384.0
2,Android,1,1,2025-03,475,117.0,475.0,117.0,475.0


# Query iOS

In [66]:
# Pull disbursed, FSPD30-mature non-Android loans from the cash-alpha trench-2 backscored table.
#   - `base` inner-joins the trench table to loan_master_table and loan_deliquency_data,
#     and left-joins the applied-loans and Credolab trace-insight tables.
#   - credolabScore = trace-insight score when present, otherwise the loan-master score.
#   - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
#   - The final filter keeps mature rows whose ln_os_type is anything other than 'Android'
#     and whose two scores are both non-null.
# NOTE(review): the `al` join is only referenced by a commented-out column, so it is
# currently unused by the select list.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# NOTE(review): this rebinds `da2`, replacing the Android frame loaded by the earlier cell;
# every later cell that reads `da2` therefore depends on execution order.
da2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench2} table is:\t {da2.shape}")

Job ID 1d7f7ec0-12b2-4ea3-8d1f-06d40ddab275 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831 table is:	 (1186, 13)


In [67]:
# Evaluation windows: the training span followed by four monthly out-of-time (OOT) windows.
period_bounds = [
    ('Train', '2024-08-13', '2025-01-31'),
    ('OOT 1', '2025-02-01', '2025-02-28'),
    ('OOT 2', '2025-03-01', '2025-03-31'),
    ('OOT 3', '2025-04-01', '2025-04-30'),
    ('OOT 4', '2025-05-01', '2025-05-31'),
]
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in period_bounds
}

# Quick inspection of the submit-datetime column before it drives the period split.
submit_ts = da2['ln_appln_submit_datetime']
print("Date column info:")
print(f"Data type: {submit_ts.dtype}")
print(f"Sample values: {submit_ts.head()}")
print(f"Any null values: {submit_ts.isnull().sum()}")

# Per-period Gini via calculate_gini ('fspd30' is passed as the third argument,
# presumably the target column); a copy is kept under a trench/OS-specific name.
# In the recorded run the Train and OOT 1 windows contain no rows, so their Ginis are empty.
gini_dfa2 = calculate_gini(da2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
cash_alpha_trench2_ios_df = gini_dfa2.copy()
gini_dfa2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-22 13:26:53+00:00
1   2025-05-28 15:14:15+00:00
2   2025-04-22 12:06:37+00:00
3   2025-06-07 15:47:01+00:00
4   2025-04-11 11:37:22+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.177807,0.105615,100
3,OOT 3,2025-04-01,2025-04-30,0.075477,-0.006093,389
4,OOT 4,2025-05-01,2025-05-31,0.063732,0.01689,419


In [68]:
# Monthly sanity check on the in-memory frame `da2` (DuckDB resolves the name used in FROM):
# for submissions in 2025-03..2025-05, count distinct loans and sum the FSPD30 flags per
# OS type / score-availability flags / submit month, newest month first.
# NOTE(review): the recorded counts differ from the Gini table's sample_size (e.g. 384 here
# vs 389 for 2025-04) — the month bucketing here may not use the same boundaries or time
# zone as the period filter inside calculate_gini; confirm.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM da2
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# Optional CSV export, currently disabled. NOTE(review): the file name below is a leftover
# ("androidcashbetatrench1") and does not describe this cell's data.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,421,80.0,362.0,93.0,421.0
1,iOS,1,1,2025-04,384,80.0,383.0,80.0,384.0
2,iOS,1,1,2025-03,100,34.0,100.0,34.0,100.0


# cash_alpha_trench3_applied_loans_backscored_20241001_20250831

# Table

In [69]:
# BigQuery dataset and table holding the backscored cash-alpha trench-3 applied loans.
# The table name has no placeholders, so it is a plain string rather than an f-string.
schema1 = 'worktable_data_analysis'
cash_alpha_trench3 = 'cash_alpha_trench3_applied_loans_backscored_20241001_20250831'

# Query Android

In [70]:
# Pull disbursed, FSPD30-mature Android loans from the cash-alpha trench-3 backscored table.
#   - `base` inner-joins the trench table to loan_master_table and loan_deliquency_data,
#     and left-joins the applied-loans and Credolab trace-insight tables.
#   - credolabScore = trace-insight score when present, otherwise the loan-master score.
#   - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
#   - The final filter keeps mature rows with ln_os_type 'Android' and both scores non-null.
# NOTE(review): the `al` join is only referenced by a commented-out column, so it is
# currently unused by the select list.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query and materialise the result as a DataFrame; the printed shape is that of
# the filtered query result, not of the underlying table.
da3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench3} table is:\t {da3.shape}")

Job ID 6e37534d-8886-456e-b86a-8fbb83fa2c55 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831 table is:	 (3216, 13)


In [71]:
# Evaluation windows: the training span followed by four monthly out-of-time (OOT) windows.
period_bounds = [
    ('Train', '2024-08-13', '2025-01-31'),
    ('OOT 1', '2025-02-01', '2025-02-28'),
    ('OOT 2', '2025-03-01', '2025-03-31'),
    ('OOT 3', '2025-04-01', '2025-04-30'),
    ('OOT 4', '2025-05-01', '2025-05-31'),
]
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in period_bounds
}

# Quick inspection of the submit-datetime column before it drives the period split.
submit_ts = da3['ln_appln_submit_datetime']
print("Date column info:")
print(f"Data type: {submit_ts.dtype}")
print(f"Sample values: {submit_ts.head()}")
print(f"Any null values: {submit_ts.isnull().sum()}")

# Per-period Gini via calculate_gini ('fspd30' is passed as the third argument,
# presumably the target column); a copy is kept under a trench/OS-specific name.
gini_dfa3 = calculate_gini(da3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
cash_alpha_trench3_android_df = gini_dfa3.copy()
gini_dfa3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-12 08:40:32+00:00
1   2025-05-01 13:09:06+00:00
2   2025-04-15 20:31:16+00:00
3   2025-02-24 11:08:38+00:00
4   2025-04-10 09:17:11+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.157475,0.089785,1305
1,OOT 1,2025-02-01,2025-02-28,0.243067,-0.037301,376
2,OOT 2,2025-03-01,2025-03-31,0.291165,0.065186,409
3,OOT 3,2025-04-01,2025-04-30,0.193138,0.15735,364
4,OOT 4,2025-05-01,2025-05-31,0.052198,0.233544,481


In [72]:
# Monthly sanity check on the in-memory frame `da3` (DuckDB resolves the name used in FROM):
# for submissions in 2025-03..2025-05, count distinct loans and sum the FSPD30 flags per
# OS type / score-availability flags / submit month, newest month first.
# NOTE(review): the recorded counts differ from the Gini table's sample_size (e.g. 405 here
# vs 409 for 2025-03) — the month bucketing here may not use the same boundaries or time
# zone as the period filter inside calculate_gini; confirm.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM da3
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# Optional CSV export, currently disabled. NOTE(review): the file name below is a leftover
# ("androidcashbetatrench1") and does not describe this cell's data.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,480,57.0,425.0,63.0,480.0
1,Android,1,1,2025-04,368,46.0,368.0,46.0,368.0
2,Android,1,1,2025-03,405,55.0,405.0,55.0,405.0


# Query iOS

In [73]:
# Pull disbursed, FSPD30-mature non-Android loans from the cash-alpha trench-3 backscored table.
#   - `base` inner-joins the trench table to loan_master_table and loan_deliquency_data,
#     and left-joins the applied-loans and Credolab trace-insight tables.
#   - credolabScore = trace-insight score when present, otherwise the loan-master score.
#   - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
#   - The final filter keeps mature rows whose ln_os_type is anything other than 'Android'
#     and whose two scores are both non-null.
# NOTE(review): the `al` join is only referenced by a commented-out column, so it is
# currently unused by the select list.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# NOTE(review): this rebinds `da3`, replacing the Android frame loaded by the earlier cell;
# every later cell that reads `da3` therefore depends on execution order.
da3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench3} table is:\t {da3.shape}")

Job ID 65d1896b-1388-4531-9843-9b8eed579ec2 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831 table is:	 (1275, 13)


In [74]:
# Evaluation windows: the training span followed by four monthly out-of-time (OOT) windows.
period_bounds = [
    ('Train', '2024-08-13', '2025-01-31'),
    ('OOT 1', '2025-02-01', '2025-02-28'),
    ('OOT 2', '2025-03-01', '2025-03-31'),
    ('OOT 3', '2025-04-01', '2025-04-30'),
    ('OOT 4', '2025-05-01', '2025-05-31'),
]
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in period_bounds
}

# Quick inspection of the submit-datetime column before it drives the period split.
submit_ts = da3['ln_appln_submit_datetime']
print("Date column info:")
print(f"Data type: {submit_ts.dtype}")
print(f"Sample values: {submit_ts.head()}")
print(f"Any null values: {submit_ts.isnull().sum()}")

# Per-period Gini via calculate_gini ('fspd30' is passed as the third argument,
# presumably the target column); a copy is kept under a trench/OS-specific name.
# In the recorded run the Train and OOT 1 windows contain no rows, so their Ginis are empty.
gini_dfa3 = calculate_gini(da3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
cash_alpha_trench3_ios_df = gini_dfa3.copy()
gini_dfa3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-29 22:13:51+00:00
1   2025-05-21 19:04:18+00:00
2   2025-04-17 09:23:58+00:00
3   2025-03-30 06:50:10+00:00
4   2025-05-31 13:12:51+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.258491,-0.203774,63
3,OOT 3,2025-04-01,2025-04-30,0.162034,-0.020437,355
4,OOT 4,2025-05-01,2025-05-31,-0.058195,-0.035268,518


In [75]:
# Monthly sanity check on the in-memory frame `da3` (DuckDB resolves the name used in FROM):
# for submissions in 2025-03..2025-05, count distinct loans and sum the FSPD30 flags per
# OS type / score-availability flags / submit month, newest month first.
# NOTE(review): the recorded counts differ from the Gini table's sample_size (e.g. 62 here
# vs 63 for 2025-03) — the month bucketing here may not use the same boundaries or time
# zone as the period filter inside calculate_gini; confirm.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM da3
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# Optional CSV export, currently disabled. NOTE(review): the file name below is a leftover
# ("androidcashbetatrench1") and does not describe this cell's data.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,520,62.0,461.0,72.0,520.0
1,iOS,1,1,2025-04,352,50.0,352.0,50.0,352.0
2,iOS,1,1,2025-03,62,10.0,62.0,10.0,62.0


# Testing

In [51]:
# Scratch cell: pull the cash-beta trench-1 Android loans and compute per-period Ginis
# in one go. The commented-out select-list lines inside the SQL are alternative sources
# for `credolabScore`; the active one mixes the trace-insight score with the loan-master score.
schema1 = 'worktable_data_analysis'
# Plain string: the table name has no placeholders, so no f-prefix is needed.
tab = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
sq = f"""with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore  --- when selecting only credolabscore from loan master table
-- , al.gen_credo_score credolabScore when selecting only gen_credo_score from Bala's applied table 
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore  -- when mixing loan master and trace credo score -- current approach
-- , ctial.score_all_score credolabScore  -- when selecting only trace table
-- , coalesce(cast(ctial.score_all_probability as numeric),cast(al.gen_credo_score as numeric))credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{tab} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;"""

# The printed shape is that of the filtered query result, not of the underlying table.
dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{tab} table is:\t {dfd.shape}")

# Define your periods
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {dfd['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {dfd['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {dfd['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
gini_dfd = calculate_gini(dfd, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_dfd



Job ID 013f92e5-c61f-4b51-b3d3-ea2ee0b431a9 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (4597, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-15 10:14:21+00:00
1   2024-10-31 20:42:11+00:00
2   2024-11-15 15:12:37+00:00
3   2024-10-01 23:09:06+00:00
4   2024-10-30 10:56:41+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.30136,0.090981,2762
1,OOT 1,2025-02-01,2025-02-28,0.285793,0.063103,548
2,OOT 2,2025-03-01,2025-03-31,0.336065,0.261779,335
3,OOT 3,2025-04-01,2025-04-30,0.271644,0.043692,334
4,OOT 4,2025-05-01,2025-05-31,0.29,0.127838,371


In [None]:
# Preview the first rows of `dfd`.
dfd.head()

In [None]:
# Snapshot `dfd` to local disk twice: first as a pickle, then as a parquet file.
for write_snapshot, target_path in (
    (dfd.to_pickle, r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\data.pkl"),
    (dfd.to_parquet, r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\data.parquet"),
):
    write_snapshot(target_path)

# Slide 9 - Cash Credo Score FSPD30 Gini Performance Improvement over Current Model in Prod (latest Specialized Credo Score for Quick vs old generic Credo Score)


# Trench1 + Trench2 Android

# Android

In [79]:
# Combined Android view over the cash beta/alpha trench-1 and trench-2 backscored tables.
#   - `un` stacks the four tables and tags each row with its source family in `tab`.
#   - `un1` keeps one row per digitalLoanAccountId, ordered by tab (ascending, so an
#     'alpha' row wins over a 'beta' row), then trench_category desc, then latest submit time.
#   - `base` attaches loan-master, delinquency and Credolab trace-insight data;
#     credolabScore = trace-insight score when present, otherwise the loan-master score;
#     flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
#   - The final filter keeps mature rows with ln_os_type 'Android' and both scores non-null.
# The SQL has no Python placeholders, so it is a plain string rather than an f-string.
# NOTE(review): the `al` join is only referenced by a commented-out column.
sq = """with un as 
(Select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'beta' as tab   from worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag, 'alpha' as tab 
from worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag, 'alpha' as tab 
from worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831
)
--select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
),
base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;"""

# Materialise the query result as a DataFrame and report its shape.
dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {dfd.shape}")

# Define your periods
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {dfd['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {dfd['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {dfd['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
gini_dfd = calculate_gini(dfd, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_dfd



Job ID 7a676949-4163-41fe-a6e1-080d9757a1f3 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (8562, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-12-05 05:13:56+00:00
1   2024-11-11 13:46:59+00:00
2   2024-11-20 09:57:26+00:00
3   2024-11-15 15:12:37+00:00
4   2024-10-27 21:01:45+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.287428,0.120566,4695
1,OOT 1,2025-02-01,2025-02-28,0.352513,0.14004,1011
2,OOT 2,2025-03-01,2025-03-31,0.344804,0.114754,808
3,OOT 3,2025-04-01,2025-04-30,0.276026,0.123664,731
4,OOT 4,2025-05-01,2025-05-31,0.289064,0.150242,758


In [80]:
# Monthly sanity check on the in-memory frame `dfd` (DuckDB resolves the name used in FROM):
# for submissions in 2025-03..2025-05, count distinct loans and sum the FSPD30 flags per
# OS type / score-availability flags / submit month, newest month first.
# NOTE(review): the recorded counts differ from the Gini table's sample_size (e.g. 816 here
# vs 808 for 2025-03) — the month bucketing here may not use the same boundaries or time
# zone as the period filter inside calculate_gini; confirm.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM dfd
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# Optional CSV export, currently disabled. NOTE(review): the file name below is a leftover
# ("androidcashbetatrench1") and does not describe this cell's data.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,757,151.0,663.0,169.0,757.0
1,Android,1,1,2025-04,733,157.0,733.0,157.0,733.0
2,Android,1,1,2025-03,816,188.0,816.0,188.0,816.0


# Trench1 + Trench2 iOS

# iOS

In [81]:
# Combined non-Android view over the cash beta/alpha trench-1 and trench-2 backscored tables.
#   - `un` stacks the four tables and tags each row with its source family in `tab`.
#   - `un1` keeps one row per digitalLoanAccountId, ordered by tab (ascending, so an
#     'alpha' row wins over a 'beta' row), then trench_category desc, then latest submit time.
#   - `base` attaches loan-master, delinquency and Credolab trace-insight data;
#     credolabScore = trace-insight score when present, otherwise the loan-master score;
#     flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2; fspd30 = 1 when, in addition,
#     min_inst_def30 is 1 or 2.
#   - The final filter keeps mature rows whose ln_os_type is anything other than 'Android'
#     and whose two scores are both non-null.
# The SQL has no Python placeholders, so it is a plain string rather than an f-string.
# NOTE(review): the `al` join is only referenced by a commented-out column.
sq = """with un as 
(Select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'beta' as tab   from worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag, 'alpha' as tab 
from worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag, 'alpha' as tab 
from worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831
)
--select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
),
base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;"""

# Materialise the query result as a DataFrame and report its shape.
dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {dfd.shape}")

# Define your periods
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {dfd['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {dfd['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {dfd['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
# In the recorded run the Train and OOT 1 windows contain no rows, so their Ginis are empty.
gini_dfd = calculate_gini(dfd, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_dfd



Job ID 871e13b3-8c39-4e2b-bfaf-05d60c4080b1 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (2846, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-13 21:29:59+00:00
1   2025-05-20 09:46:44+00:00
2   2025-06-13 19:39:27+00:00
3   2025-05-12 09:05:47+00:00
4   2025-06-19 14:02:27+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.178279,0.121053,253
3,OOT 3,2025-04-01,2025-04-30,0.091937,0.073505,907
4,OOT 4,2025-05-01,2025-05-31,0.150312,0.052811,974


In [82]:
# Monthly sanity check on the in-memory frame `dfd` (DuckDB resolves the name used in FROM):
# for submissions in 2025-03..2025-05, count distinct loans and sum the FSPD30 flags per
# OS type / score-availability flags / submit month, newest month first.
# NOTE(review): the recorded counts differ from the Gini table's sample_size (e.g. 248 here
# vs 253 for 2025-03) — the month bucketing here may not use the same boundaries or time
# zone as the period filter inside calculate_gini; confirm.
dd.query("""SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM dfd
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""
).to_df()

# Optional CSV export, currently disabled. NOTE(review): the file name below is a leftover
# ("androidcashbetatrench1") and does not describe this cell's data.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,976,185.0,842.0,208.0,976.0
1,iOS,1,1,2025-04,902,185.0,901.0,185.0,902.0
2,iOS,1,1,2025-03,248,62.0,248.0,62.0,248.0


# Trench3 Android

In [83]:
# Trench3 / Android: load disbursed, matured loans and compute period-wise Gini.
#
# What the SQL below does:
#   un   - stacks the beta and alpha trench3 back-scored applied-loan tables,
#          tagging each row with its origin in `tab`.
#   un1  - keeps one row per digitalLoanAccountId: ordered by `tab` ascending
#          ('alpha' sorts before 'beta'), then trench_category desc, then the
#          latest ln_appln_submit_datetime.
#   base - joins the loan master table and the delinquency table (inner joins),
#          plus two left joins; credolabScore is score_all_score when present,
#          otherwise lmt.credolabScore. flag_mature_fspd30 is 1 when
#          obs_min_inst_def30 >= 2; fspd30 is 1 when the loan is mature and
#          min_inst_def30 is 1 or 2. Only flagDisbursement = 1 rows are kept.
#   final filter - mature loans, ln_os_type 'Android', both scores not null.
#
# NOTE(review): no column of the `al` left join is selected (its only use is the
# commented-out gen_credo_score line). It is left in place because the
# unqualified `flagDisbursement` may resolve to it - confirm before removing.
#
# Plain string on purpose: the query has no placeholders, and an f-string would
# misinterpret any literal brace added to the SQL later.
sq = """with un as 
(Select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'alpha' as tab   from worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831
)
-- select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
)
,
base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {dfd.shape}")

# Validation: un1 is de-duplicated per loan, but the joins in `base` could match
# a loan more than once; a repeated loan would be counted twice in the Gini.
# Warn (do not fail) so the cell keeps its previous behaviour when data is clean.
n_dup_loan_rows = int(dfd['digitalLoanAccountId'].duplicated().sum())
if n_dup_loan_rows:
    print(f"WARNING: {n_dup_loan_rows} rows repeat a digitalLoanAccountId; check the joins before trusting the Gini values.")

# Define your periods
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {dfd['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {dfd['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {dfd['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients (target column: fspd30) for each period above
gini_dfd = calculate_gini(dfd, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
# Last expression of the cell, so the Gini table is rendered as the output.
gini_dfd



Job ID 8f2b8fe3-7825-4832-a02e-139426890f71 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (3360, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-07 07:02:15+00:00
1   2025-06-04 11:42:34+00:00
2   2025-06-03 01:17:28+00:00
3   2025-06-18 09:08:14+00:00
4   2025-06-03 18:12:29+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.166167,0.084071,1374
1,OOT 1,2025-02-01,2025-02-28,0.255133,-0.03466,391
2,OOT 2,2025-03-01,2025-03-31,0.297123,0.069909,425
3,OOT 3,2025-04-01,2025-04-30,0.13861,0.159289,388
4,OOT 4,2025-05-01,2025-05-31,0.053406,0.236495,483


In [84]:
# Monthly sanity check of the Trench3 Android frame loaded by the previous cell.
# The query runs in-process through DuckDB (`dd`) and reads the pandas frame
# `dfd` directly (FROM dfd). Per OS type / score-availability flags / submit
# month it counts distinct loans and sums the four delinquency flags, keeping
# only submissions from 2025-03, 2025-04 and 2025-05.
# NOTE(review): the submit column is tz-aware (UTC) per the dtype printed by the
# previous cell; STRFTIME may bucket months in DuckDB's session time zone, which
# could explain why these counts differ slightly from the Gini sample_size
# values - confirm.
monthly_flag_summary_sql = """SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM dfd
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""

# Last expression of the cell, so the resulting frame is rendered as the output.
dd.query(monthly_flag_summary_sql).to_df()

# NOTE(review): disabled export kept as-is; the file name says "cashbetatrench1"
# while this frame comes from the trench3 alpha+beta tables, so it looks copied
# from another section - rename before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,Android,1,1,2025-05,482,57.0,427.0,63.0,482.0
1,Android,1,1,2025-04,392,48.0,392.0,48.0,392.0
2,Android,1,1,2025-03,421,55.0,421.0,55.0,421.0


# Trench3 iOS


In [85]:
# Trench3 / non-Android (iOS): load disbursed, matured loans and compute
# period-wise Gini.
#
# What the SQL below does:
#   un   - stacks the beta and alpha trench3 back-scored applied-loan tables,
#          tagging each row with its origin in `tab`.
#   un1  - keeps one row per digitalLoanAccountId: ordered by `tab` ascending
#          ('alpha' sorts before 'beta'), then trench_category desc, then the
#          latest ln_appln_submit_datetime.
#   base - joins the loan master table and the delinquency table (inner joins),
#          plus two left joins; credolabScore is score_all_score when present,
#          otherwise lmt.credolabScore. flag_mature_fspd30 is 1 when
#          obs_min_inst_def30 >= 2; fspd30 is 1 when the loan is mature and
#          min_inst_def30 is 1 or 2. Only flagDisbursement = 1 rows are kept.
#   final filter - mature loans, ln_os_type NOT LIKE 'Android', both scores
#          not null.
#
# NOTE(review): `not like 'Android'` keeps every non-null OS value other than
# 'Android', not just iOS (rows with a NULL OS are dropped). Use
# `ln_os_type = 'iOS'` if only iOS is intended - confirm.
# NOTE(review): no column of the `al` left join is selected (its only use is the
# commented-out gen_credo_score line). It is left in place because the
# unqualified `flagDisbursement` may resolve to it - confirm before removing.
#
# Plain string on purpose: the query has no placeholders, and an f-string would
# misinterpret any literal brace added to the SQL later.
sq = """with un as 
(Select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag , ln_mature_fspd30_flag, 'beta' as tab 
from worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831
union all 
select customer_id, digitalLoanAccountId, ln_appln_submit_datetime, ln_os_type, credo_score ,
ln_loan_type, trench_category, ln_fspd30_flag, ln_mature_fspd30_flag ,'alpha' as tab   from worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831
)
-- select digitalLoanAccountId, count(digitalLoanAccountId) from un group by 1 having count(digitalLoanAccountId) > 1 order by 2 desc;
,
un1 as 
(select * from un 
qualify row_number() over(partition by digitalLoanAccountId order by tab, trench_category desc, ln_appln_submit_datetime desc) = 1
)
,
base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from un1 a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of table is:\t {dfd.shape}")

# Validation: un1 is de-duplicated per loan, but the joins in `base` could match
# a loan more than once; a repeated loan would be counted twice in the Gini.
# Warn (do not fail) so the cell keeps its previous behaviour when data is clean.
n_dup_loan_rows = int(dfd['digitalLoanAccountId'].duplicated().sum())
if n_dup_loan_rows:
    print(f"WARNING: {n_dup_loan_rows} rows repeat a digitalLoanAccountId; check the joins before trusting the Gini values.")

# Define your periods
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {dfd['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {dfd['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {dfd['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients (target column: fspd30) for each period above
gini_dfd = calculate_gini(dfd, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
# Last expression of the cell, so the Gini table is rendered as the output.
gini_dfd



Job ID fa0342bd-140b-47a8-a661-25fbc6714a30 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of table is:	 (1331, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-23 10:47:21+00:00
1   2025-05-19 18:06:18+00:00
2   2025-04-14 00:49:05+00:00
3   2025-05-24 11:18:56+00:00
4   2025-06-12 17:55:24+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.258491,-0.203774,63
3,OOT 3,2025-04-01,2025-04-30,0.105058,-0.046354,390
4,OOT 4,2025-05-01,2025-05-31,-0.06349,-0.03645,520


In [86]:
# Monthly sanity check of the Trench3 non-Android frame loaded by the previous
# cell. The query runs in-process through DuckDB (`dd`) and reads the pandas
# frame `dfd` directly (FROM dfd). Per OS type / score-availability flags /
# submit month it counts distinct loans and sums the four delinquency flags,
# keeping only submissions from 2025-03, 2025-04 and 2025-05.
# NOTE(review): the submit column is tz-aware (UTC) per the dtype printed by the
# previous cell; STRFTIME may bucket months in DuckDB's session time zone, which
# could explain why these counts differ slightly from the Gini sample_size
# values - confirm.
monthly_flag_summary_sql = """SELECT
  ln_os_type, 
  case when credo_score is not null then 1 else 0 end credo_score_flag, 
  case when credolabScore is not null then 1 else 0 end credolabscore_flag,
  STRFTIME(ln_appln_submit_datetime, '%Y-%m') AS submit_year_month,
  COUNT(DISTINCT digitalLoanAccountId) AS cntloans, 
  SUM(ln_fspd30_flag) AS ln_fspd30_flag,
  SUM(ln_mature_fspd30_flag) AS ln_mature_fspd30_flag,
  SUM(fspd30) AS fspd30,
  SUM(flag_mature_fspd30) AS flag_mature_fspd30
FROM dfd
         where STRFTIME(ln_appln_submit_datetime, '%Y-%m') in ('2025-03', '2025-04', '2025-05')
GROUP BY 1, 2,3,4
order by 4 desc
;
"""

# Last expression of the cell, so the resulting frame is rendered as the output.
dd.query(monthly_flag_summary_sql).to_df()

# NOTE(review): disabled export kept as-is; the file name says
# "androidcashbetatrench1" while this frame holds the trench3 non-Android rows,
# so it looks copied from another section - rename before re-enabling.
# .to_csv(r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values\androidcashbetatrench1.csv", index = False)

Unnamed: 0,ln_os_type,credo_score_flag,credolabscore_flag,submit_year_month,cntloans,ln_fspd30_flag,ln_mature_fspd30_flag,fspd30,flag_mature_fspd30
0,iOS,1,1,2025-05,522,63.0,463.0,73.0,522.0
1,iOS,1,1,2025-04,387,54.0,387.0,54.0,387.0
2,iOS,1,1,2025-03,62,10.0,62.0,10.0,62.0
