# <div align = "center" style="color:rgb(0, 255, 0);"> Gini Calculation for Gen Credo Score and Credo Score for Different Trenches </div>

# Define Library

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

# Location of a local Application Default Credentials (ADC) JSON file.
# NOTE(review): this is an absolute, user-specific Windows path; it presumably
# will not resolve on another machine — consider supplying it via the
# environment instead of hardcoding it in the notebook.
path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
# The credentials variable is exported BEFORE the client below is constructed;
# keep this order (presumably the client reads it at construction — confirm).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
# Module-level BigQuery client; later cells use it via `client.query(...)`.
client = bigquery.Client(project='prj-prod-dataplatform')
# Export the project id as well. The GCS helpers in this notebook call
# `storage.Client()` without a project — presumably they rely on this; verify.
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Limit displayed rows to 100.
# NOTE(review): the key is spelled with a capital "D", unlike the line above;
# presumably pandas matches option names case-insensitively — TODO confirm.
pd.set_option("Display.max_rows", 100)


# Constant

In [2]:
# Date of notebook execution as a compact YYYYMMDD string.
CURRENT_DATE = f"{datetime.now():%Y%m%d}"


# Config

In [3]:
# Last 12 hexadecimal characters of a freshly generated UUID4.
unique_id = uuid.uuid4().hex[-12:]
print(f"The unique Id is: {unique_id}")

# Project, bucket and path constants.
PROJECT_ID = 'prj-prod-dataplatform'
BUCKETNAME = 'prod-asia-southeast1-tonik-aiml-workspace'
CLOUDPATH = 'DC/Model_Monitoring/Gini_Values'
LOCALPATH = r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\Gini_Values'
VERSION = 'V1'

The unique Id is: 8a3faa744398


# <div align="left" style="color:rgb(51, 250, 250);"> Functions </div>

## <div align="left" style="color:rgb(51, 250, 250);"> Save the data to Google Cloud Storage </div>

In [4]:
def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    The frame is written to a uniquely named temporary file, that file is
    uploaded to the bucket, and the temporary file is removed afterwards,
    including when the write or the upload raises.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'. This is
            checked before anything is written or any client is created.
    """
    if file_format not in ('csv', 'parquet'):
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # A unique temp file (instead of a fixed 'temp.csv' in the working directory)
    # cannot clobber, or be clobbered by, another file or a concurrent run.
    # The suffix keeps the original extension on the uploaded file name.
    fd, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    # Only the path is needed; close the descriptor so pandas can reopen the file.
    os.close(fd)

    try:
        if file_format == 'csv':
            df.to_csv(temp_file, index=False)
        else:
            df.to_parquet(temp_file, index=False)

        # Upload the file to GCS. The original code built a project-pinned client
        # and immediately replaced it with this default one; only the effective
        # (default) client is kept.
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(temp_file)
    finally:
        # Always remove the temporary file, even if writing or uploading failed.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    


## <div align="left" style="color:rgb(51, 250, 250);"> Read the Data from Google Cloud Storage </div>

In [5]:
def read_df_from_gcs(bucket_name, source_blob_name, file_format='csv'):
    """Reads a DataFrame from Google Cloud Storage.

    The blob is downloaded to a uniquely named temporary file, parsed with
    pandas, and the temporary file is removed afterwards in all cases.

    Args:
        bucket_name: The name of the GCS bucket.
        source_blob_name: The name of the blob to read.
        file_format: The file format to read ('csv' or 'parquet').

    Returns:
        pandas.DataFrame: The data loaded from the GCS file.

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'. This is
            checked up front, before any client is created or data downloaded.
    """
    # Validate first: previously an unsupported format was only rejected after
    # the whole blob had already been downloaded.
    if file_format not in ('csv', 'parquet'):
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Unique temp file instead of a fixed 'temp.<format>' in the working directory.
    fd, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    # Only the path is needed; close the descriptor so the download can reopen it.
    os.close(fd)

    try:
        # Initialize GCS client
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_blob_name)

        # Download the file to the temporary location
        blob.download_to_filename(temp_file)

        # Read the file into a DataFrame
        if file_format == 'csv':
            return pd.read_csv(temp_file, low_memory=False)
        return pd.read_parquet(temp_file)

    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

## <div align = "left" style="color:rgb(51, 250, 250);"> Data Quality Report </div>

In [6]:
def data_quality_report(df, target_col='ln_fspd30_flag'):
    """Build a per-column data quality summary of a DataFrame.

    Every column gets its dtype, missing / non-missing counts and percentages,
    number of unique values, mode and mode percentage. Columns that pandas
    treats as numeric additionally get min, max, mean, median, standard
    deviation, quartiles, IQR, skewness, kurtosis, coefficient of variation
    and, when ``target_col`` is a different numeric column of ``df``, the
    correlation with it. Those statistics are ``None`` for other columns.

    Args:
        df: The DataFrame to profile.
        target_col: Name of the column to correlate numeric columns against.

    Returns:
        pandas.DataFrame: One row per column of ``df``.
    """
    total_rows = len(df)
    rows = []

    for col in df.columns:
        series = df[col]
        n_missing = series.isnull().sum()

        # Most frequent value (first one on ties) and the share of rows holding it
        modes = series.mode()
        mode_value = None if modes.empty else modes.iloc[0]
        mode_percentage = None
        if mode_value is not None:
            mode_percentage = (series == mode_value).sum() / total_rows * 100

        # Statistics that only exist for numeric columns start out as None
        min_value = max_value = mean_value = median_value = std_dev = None
        quantile_25 = quantile_50 = quantile_75 = iqr = None
        skewness = kurtosis = cv = correlation = None

        if pd.api.types.is_numeric_dtype(series):
            min_value = series.min()
            max_value = series.max()
            mean_value = series.mean()
            median_value = series.median()
            std_dev = series.std()
            quantile_25 = series.quantile(0.25)
            quantile_50 = series.quantile(0.50)  # Same as median
            quantile_75 = series.quantile(0.75)

            # Interquartile range
            iqr = quantile_75 - quantile_25

            skewness = series.skew()
            kurtosis = series.kurt()

            # Coefficient of variation; undefined when the mean is exactly zero
            if mean_value != 0:
                cv = (std_dev / mean_value) * 100

            # Correlation with the target, using only rows where both are present
            if target_col in df.columns and col != target_col and pd.api.types.is_numeric_dtype(df[target_col]):
                pair = df[[col, target_col]].dropna()
                correlation = pair.corr().iloc[0, 1]

        rows.append({
            'Column': col,
            'Data Type': series.dtype,
            'Missing Values': n_missing,
            'Missing Percentage': (n_missing / total_rows) * 100,
            'Unique Values': series.nunique(),
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value,
            'Mode': mode_value,
            'Mode Percentage': mode_percentage,
            'Std Dev': std_dev,
            'Non-missing Percentage': ((total_rows - n_missing) / total_rows) * 100,
            '25% Quantile': quantile_25,
            '50% Quantile': quantile_50,
            '75% Quantile': quantile_75,
            'IQR': iqr,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'CV (%)': cv,
            f'Correlation with {target_col}': correlation
        })

    return pd.DataFrame(rows)

# <div align = "left" style="color:rgb(51,250,250);"> Upload pickle file to Google Cloud Storage Bucket </div>

In [7]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination GCS bucket.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Name of the blob to create in the bucket.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)

    # Confirm the upload on stdout
    print(f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name}")

In [8]:
import pickle
import io
from google.cloud import storage
def save_pickle_to_gcs(data, bucket_name, destination_blob_name):
    """
    Pickle a Python object in memory and upload it to Google Cloud Storage.

    Args:
        data: The Python object to pickle (DataFrame, dict, list, etc.)
        bucket_name: Name of the GCS bucket
        destination_blob_name: Path/filename in the bucket
    """
    # Resolve the destination blob
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # Serialize to an in-memory buffer positioned at its start
    payload = io.BytesIO(pickle.dumps(data))

    # Upload the pickled bytes as a generic binary object
    target_blob.upload_from_file(payload, content_type='application/octet-stream')
    print(f"Pickle file uploaded to gs://{bucket_name}/{destination_blob_name}")

# save_dataframe_multi_format

In [9]:
def save_dataframe_multi_format(
    dataframe: pd.DataFrame, 
    cloud_path: str, 
    filename: str, 
    client: bigquery.Client = None,
    bucket_name: str = None
) -> dict:
    """
    Save a pandas DataFrame to Google Cloud Storage in multiple formats (CSV, Pickle, Parquet, Joblib).

    The formats are written in that order to
    ``gs://<bucket_name>/<cloud_path>/<filename>.<ext>``. If any step raises,
    the error is printed, recorded under ``results['error']`` and the
    remaining formats are skipped; files uploaded before the failure are
    left in place.

    Args:
        dataframe (pd.DataFrame): The DataFrame to save
        cloud_path (str): The cloud path (e.g., 'DC/Model_Monitoring/cash_beta_trench1_data').
            Leading '/' characters are stripped.
        filename (str): The base filename without extension
        client (bigquery.Client, optional): BigQuery client; only its ``project``
            attribute is used, to select the project of the storage client.
        bucket_name (str): GCS bucket name. Required despite the ``None``
            default: it is NOT derived from ``client``.

    Returns:
        dict: Maps each successfully saved format ('csv', 'pickle', 'parquet',
        'joblib') to its ``gs://`` URI, plus an 'error' key with the message
        if a step failed.

    Raises:
        ValueError: If ``bucket_name`` is None. Raised before any storage
            client is created.

    Example:
        client = bigquery.Client(project='prj-prod-dataplatform')
        CLOUDPATH = 'DC/Model_Monitoring/cash_beta_trench1_data'

        results = save_dataframe_multi_format(
            dataframe=d1,
            cloud_path=CLOUDPATH,
            filename='my_data',
            client=client,
            bucket_name='your-bucket-name'  # Replace with your actual bucket name
        )
    """
    # Fail fast on the missing bucket before constructing a storage client, so
    # the caller sees this message rather than an unrelated client/auth error.
    if bucket_name is None:
        raise ValueError("Please provide the bucket_name parameter")

    storage_client = storage.Client(project=client.project if client else None)
    bucket = storage_client.bucket(bucket_name)

    # Ensure cloud_path doesn't start with '/'
    cloud_path = cloud_path.lstrip('/')

    def _csv_payload():
        buffer = io.StringIO()
        dataframe.to_csv(buffer, index=False)
        return buffer.getvalue()

    def _pickle_payload():
        buffer = io.BytesIO()
        pickle.dump(dataframe, buffer)
        return buffer.getvalue()

    def _parquet_payload():
        buffer = io.BytesIO()
        dataframe.to_parquet(buffer, index=False)
        return buffer.getvalue()

    def _joblib_payload():
        buffer = io.BytesIO()
        joblib.dump(dataframe, buffer)
        return buffer.getvalue()

    # (result key, file extension, content type, serializer), in upload order
    formats = [
        ('csv', 'csv', 'text/csv', _csv_payload),
        ('pickle', 'pkl', 'application/octet-stream', _pickle_payload),
        ('parquet', 'parquet', 'application/octet-stream', _parquet_payload),
        ('joblib', 'joblib', 'application/octet-stream', _joblib_payload),
    ]

    # Results dictionary to track saves
    results = {}

    try:
        for result_key, extension, content_type, serialize in formats:
            payload = serialize()
            blob_path = f"{cloud_path}/{filename}.{extension}"
            bucket.blob(blob_path).upload_from_string(payload, content_type=content_type)
            results[result_key] = f"gs://{bucket_name}/{blob_path}"

        print("All files saved successfully!")
        for format_type, path in results.items():
            print(f"{format_type.upper()}: {path}")

    except Exception as e:
        # Deliberate best effort: report the failure in the result instead of raising.
        print(f"Error occurred: {str(e)}")
        results['error'] = str(e)

    return results

# calculate_gini

In [12]:
import pandas as pd
from sklearn.metrics import roc_auc_score

def calculate_gini(df, date_column, target_column, periods_dict):
    """Compute per-period Gini coefficients for ``credo_score`` and ``credolabScore``.

    Gini is derived as ``2 * AUC - 1`` from ``roc_auc_score``. ``credo_score``
    is used as-is and ``credolabScore`` is negated before scoring.

    NOTE(review): a later cell of this notebook defines ``calculate_gini``
    again with the same signature; whichever cell ran last is the one in
    effect. Consider keeping a single definition.

    Args:
        df: DataFrame holding ``date_column``, ``target_column`` and the score
            columns ``credo_score`` and ``credolabScore``. It is copied, so the
            caller's frame is not modified.
        date_column: Column used to assign rows to periods. Values are
            converted to datetimes and truncated to the calendar date.
        target_column: Column passed to ``roc_auc_score`` as ``y_true``.
        periods_dict: Mapping of period label to ``{'start': ..., 'end': ...}``;
            both bounds are inclusive.

    Returns:
        pandas.DataFrame: One row per period with the columns ``Period``,
        ``Start_Date``, ``End_Date``, ``credo_score_gini``,
        ``credolabScore_gini`` and ``sample_size``. A Gini value is ``None``
        when the period has no usable rows, when the target takes a single
        value, or when ``roc_auc_score`` raises ``ValueError``/``TypeError``.
    """
    # Make a copy to avoid modifying the original dataframe
    df = df.copy()

    # Handle different date column formats
    if df[date_column].dtype == 'object':
        # If it's string/object, try to extract date part and convert
        try:
            df[date_column] = pd.to_datetime(df[date_column].str[:10])
        except AttributeError:
            # If .str fails, it might be mixed types, convert directly
            df[date_column] = pd.to_datetime(df[date_column])
    else:
        # If it's already datetime or numeric, convert to datetime
        df[date_column] = pd.to_datetime(df[date_column])

    # Extract just the date part (remove time if present)
    df[date_column] = df[date_column].dt.date
    df[date_column] = pd.to_datetime(df[date_column])

    def _gini_or_none(y_true, y_score, score_name, period):
        """Return 2*AUC-1, or None (after printing the error) if AUC cannot be computed."""
        try:
            return 2 * roc_auc_score(y_true, y_score) - 1
        except (ValueError, TypeError) as e:
            print(f"Error calculating Gini for {score_name} in {period}: {e}")
            return None

    results = []

    for period, dates in periods_dict.items():
        start_date = pd.to_datetime(dates['start'])
        end_date = pd.to_datetime(dates['end'])

        # Filter data for the current period
        period_mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
        period_df = df.loc[period_mask].copy()

        # Convert score columns to numeric, handling any non-numeric values
        period_df['credo_score'] = pd.to_numeric(period_df['credo_score'], errors='coerce')
        period_df['credolabScore'] = pd.to_numeric(period_df['credolabScore'], errors='coerce')

        # Drop rows with missing target or scores (including those that couldn't be converted to numeric)
        period_df = period_df.dropna(subset=[target_column, 'credo_score', 'credolabScore'])

        # Every row carries the same keys, so the result schema is identical
        # whichever branch produced it (the single-class branch used to omit
        # Start_Date / End_Date).
        row = {
            'Period': period,
            'Start_Date': dates['start'],
            'End_Date': dates['end'],
            'credo_score_gini': None,
            'credolabScore_gini': None,
            'sample_size': len(period_df)
        }

        if len(period_df) == 0:
            results.append(row)
            continue

        # Check if target column has variation (both 0s and 1s)
        if len(period_df[target_column].unique()) < 2:
            print(f"Warning: {period} has no variation in target variable (all {period_df[target_column].iloc[0]})")
            results.append(row)
            continue

        # Gini for credo_score, used as-is
        row['credo_score_gini'] = _gini_or_none(
            period_df[target_column], period_df['credo_score'], 'credo_score', period
        )

        # Gini for credolabScore (reverse the score since higher score = lower risk)
        row['credolabScore_gini'] = _gini_or_none(
            period_df[target_column], -period_df['credolabScore'], 'credolabScore', period
        )

        results.append(row)

    return pd.DataFrame(results)

# cash_beta_trench1_applied_loans_backscored_20241001_20250831

# Table

In [13]:
# Dataset and table name interpolated into the SQL queries of the cells below.
schema1, cash_beta_trench1 = (
    'worktable_data_analysis',
    'cash_beta_trench1_applied_loans_backscored_20241001_20250831',
)

# Query Android

In [14]:
# Android cohort: pull disbursed loans (flagDisbursement = 1) from the
# back-scored table, joined to the loan master and delinquency tables.
# - credolabScore = coalesce(ctial.score_all_score, lmt.credolabScore)
# - flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
# - fspd30 = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 is 1 or 2
# The outer query keeps rows with flag_mature_fspd30 = 1, ln_os_type like
# 'Android' and both scores present.
# NOTE(review): the `al` left join contributes no selected column (its only
# use is commented out); if it is not unique per digitalLoanAccountId it may
# duplicate rows — verify or drop the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run the query with the module-level BigQuery client and load the result.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result held in d1, not of
# the full source table named in the message.
print(f"The shape of {schema1}.{cash_beta_trench1} table is:\t {d1.shape}")

# Define your periods
# Each label maps to a start/end date; calculate_gini treats both bounds as inclusive.
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
# The computed `fspd30` flag from the query is the target, not `ln_fspd30_flag`.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
print(gini_df)

Job ID 5cd42831-71dc-43d9-a28b-695cc3259e38 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (4597, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-15 10:14:21+00:00
1   2024-10-31 20:42:11+00:00
2   2024-11-15 15:12:37+00:00
3   2024-10-01 23:09:06+00:00
4   2024-10-30 10:56:41+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:
  Period  Start_Date    End_Date  credo_score_gini  credolabScore_gini  \
0  Train  2024-08-13  2025-01-31          0.301360            0.090981   
1  OOT 1  2025-02-01  2025-02-28          0.285793            0.063103   
2  OOT 2  2025-03-01  2025-03-31          0.336065            0.261779   
3  OOT 3  2025-04-01  2025-04-30          0.271644            0.043692   
4  OOT 4  2025-05-01  2025-05-31          0.290000            0.127838   

   sample_size  
0         2762  
1          548  
2          335  
3          334  
4  

Total count of Android loans for table worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 - (201845, 6)

when credo_score is not null -  (201845, 6)

flag_disbursement = 1 - (5591, 6)

after added fspd30 table =  (5411, 8)

after adding flag_mature_fspd30 = 1 -   (4583, 13)






In [15]:
# Write the current query result `d1` (the Android result when cells run top
# to bottom) to a local CSV without the index.
# NOTE(review): LOCALPATH is an absolute Windows path joined here with '/';
# this presumably only works on the author's machine — confirm before reuse.
d1.to_csv(rf"{LOCALPATH}/credonewandold.csv", index = False)

In [None]:
# import pandas as pd
# from google.cloud import bigquery
# from sklearn.metrics import roc_auc_score
# from typing import Dict

# def calculate_gini_for_table(
#     project_id: str,
#     table_name: str,
#     date_column: str,
#     score_column: str,
#     target_column: str,
#     target_maturity_column: str,
#     data_periods_dict: Dict
# ):
#     """
#     Calculate Gini coefficient for different time periods.
    
#     Args:
#         project_id: BigQuery project ID
#         table_name: Full table name (dataset.table)
#         date_column: Name of the date column
#         score_column: Name of the score column
#         target_column: Name of the target column
#         target_maturity_column: Name of the target maturity column
#         data_periods_dict: Dictionary with periods, e.g.:
#             {'Train': {'start': '2024-01-01', 'end': '2025-01-31'}, 
#              'Test': {'start': '2025-02-01', 'end': '2025-12-31'}}
    
#     Returns:
#         pandas.DataFrame: Table with Gini coefficients for each period
#     """
    
#     client = bigquery.Client(project_id)
    
#     # Base query to get all data
#     sql_query = f"""
#     WITH base as (
#         SELECT DISTINCT 
#             digitalLoanAccountId, 
#             {date_column}, 
#             {score_column}, 
#             {target_column}
#         FROM {table_name}
#         WHERE {score_column} IS NOT NULL
#         AND {target_maturity_column} = 1
#     )
#     SELECT {date_column}, {score_column}, {target_column}
#     FROM base
#     ORDER BY {date_column}
#     """
    
#     # Get all data
#     dt = client.query(sql_query).to_dataframe()
    
#     # Convert date column to datetime and extract just the date part
#     dt[date_column] = pd.to_datetime(dt[date_column]).dt.date
    
#     # Initialize results
#     gini_results = []
    
#     print("Gini Coefficient Results:")
#     print("=" * 50)
    
#     # Calculate Gini for each period
#     for period_name, period_info in data_periods_dict.items():
#         start_date = pd.to_datetime(period_info['start']).date()
#         end_date = pd.to_datetime(period_info['end']).date()
        
#         # Filter data for the current period
#         period_mask = (dt[date_column] >= start_date) & (dt[date_column] <= end_date)
#         period_data = dt[period_mask].copy()
        
#         if len(period_data) == 0:
#             print(f"{period_name}: No data available for period {start_date.date()} to {end_date.date()}")
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': 0,
#                 'Gini_Coefficient': None
#             })
#             continue
        
#         # Check if we have both classes (0 and 1) in target
#         unique_targets = period_data[target_column].unique()
#         if len(unique_targets) < 2:
#             print(f"{period_name}: Only one class present in target variable. Cannot calculate Gini.")
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': len(period_data),
#                 'Gini_Coefficient': None
#             })
#             continue
        
#         # Calculate Gini coefficient
#         try:
#             auc = roc_auc_score(period_data[target_column], period_data[score_column])
#             gini = 2 * auc - 1
            
#             print(f"{period_name}: {round(gini, 4)} (Sample size: {len(period_data):,})")
            
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': len(period_data),
#                 'Gini_Coefficient': round(gini, 4)
#             })
            
#         except Exception as e:
#             print(f"{period_name}: Error calculating Gini - {str(e)}")
#             gini_results.append({
#                 'Period': period_name,
#                 'Start_Date': start_date,
#                 'End_Date': end_date,
#                 'Sample_Size': len(period_data),
#                 'Gini_Coefficient': None
#             })
    
#     # Create results DataFrame
#     results_df = pd.DataFrame(gini_results)
    
#     print("\n" + "=" * 50)
#     print("Summary Table:")
#     print(results_df.to_string(index=False))
    
#     return results_df

In [16]:
import pandas as pd
from sklearn.metrics import roc_auc_score

def calculate_gini(df, date_column, target_column, periods_dict):
    """Compute per-period Gini coefficients for ``credo_score`` and ``credolabScore``.

    Gini is derived as ``2 * AUC - 1`` from ``roc_auc_score``. ``credo_score``
    is used as-is and ``credolabScore`` is negated before scoring.

    NOTE(review): an earlier cell of this notebook already defines
    ``calculate_gini`` with the same signature; this definition replaces it
    when the cell runs. Consider keeping a single definition.

    Args:
        df: DataFrame holding ``date_column``, ``target_column`` and the score
            columns ``credo_score`` and ``credolabScore``. It is copied, so the
            caller's frame is not modified.
        date_column: Column used to assign rows to periods. Values are
            converted to datetimes and truncated to the calendar date.
        target_column: Column passed to ``roc_auc_score`` as ``y_true``.
        periods_dict: Mapping of period label to ``{'start': ..., 'end': ...}``;
            both bounds are inclusive.

    Returns:
        pandas.DataFrame: One row per period with the columns ``Period``,
        ``Start_Date``, ``End_Date``, ``credo_score_gini``,
        ``credolabScore_gini`` and ``sample_size``. A Gini value is ``None``
        when the period has no usable rows, when the target takes a single
        value, or when ``roc_auc_score`` raises ``ValueError``/``TypeError``.
    """
    # Make a copy to avoid modifying the original dataframe
    df = df.copy()

    # Handle different date column formats
    if df[date_column].dtype == 'object':
        # If it's string/object, try to extract date part and convert
        try:
            df[date_column] = pd.to_datetime(df[date_column].str[:10])
        except AttributeError:
            # If .str fails, it might be mixed types, convert directly
            df[date_column] = pd.to_datetime(df[date_column])
    else:
        # If it's already datetime or numeric, convert to datetime
        df[date_column] = pd.to_datetime(df[date_column])

    # Extract just the date part (remove time if present)
    df[date_column] = df[date_column].dt.date
    df[date_column] = pd.to_datetime(df[date_column])

    def _gini_or_none(y_true, y_score, score_name, period):
        """Return 2*AUC-1, or None (after printing the error) if AUC cannot be computed."""
        try:
            return 2 * roc_auc_score(y_true, y_score) - 1
        except (ValueError, TypeError) as e:
            print(f"Error calculating Gini for {score_name} in {period}: {e}")
            return None

    results = []

    for period, dates in periods_dict.items():
        start_date = pd.to_datetime(dates['start'])
        end_date = pd.to_datetime(dates['end'])

        # Filter data for the current period
        period_mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
        period_df = df.loc[period_mask].copy()

        # Convert score columns to numeric, handling any non-numeric values
        period_df['credo_score'] = pd.to_numeric(period_df['credo_score'], errors='coerce')
        period_df['credolabScore'] = pd.to_numeric(period_df['credolabScore'], errors='coerce')

        # Drop rows with missing target or scores (including those that couldn't be converted to numeric)
        period_df = period_df.dropna(subset=[target_column, 'credo_score', 'credolabScore'])

        # Every row carries the same keys, so the result schema is identical
        # whichever branch produced it (the single-class branch used to omit
        # Start_Date / End_Date).
        row = {
            'Period': period,
            'Start_Date': dates['start'],
            'End_Date': dates['end'],
            'credo_score_gini': None,
            'credolabScore_gini': None,
            'sample_size': len(period_df)
        }

        if len(period_df) == 0:
            results.append(row)
            continue

        # Check if target column has variation (both 0s and 1s)
        if len(period_df[target_column].unique()) < 2:
            print(f"Warning: {period} has no variation in target variable (all {period_df[target_column].iloc[0]})")
            results.append(row)
            continue

        # Gini for credo_score, used as-is
        row['credo_score_gini'] = _gini_or_none(
            period_df[target_column], period_df['credo_score'], 'credo_score', period
        )

        # Gini for credolabScore (reverse the score since higher score = lower risk)
        row['credolabScore_gini'] = _gini_or_none(
            period_df[target_column], -period_df['credolabScore'], 'credolabScore', period
        )

        results.append(row)

    return pd.DataFrame(results)


# Define your periods
# NOTE(review): this dictionary, the debug prints and the call below repeat
# the code at the end of the Android query cell; `d1` still holds that cell's
# result here, so the same numbers are produced again.
data_periods_dict = {
    'Train': {'start': '2024-08-13', 'end': '2025-01-31'}, 
    'OOT 1': {'start': '2025-02-01', 'end': '2025-02-28'},
    'OOT 2': {'start': '2025-03-01', 'end': '2025-03-31'},
    'OOT 3': {'start': '2025-04-01', 'end': '2025-04-30'},
    'OOT 4': {'start': '2025-05-01', 'end': '2025-05-31'},
}

# Debug: Check your date column format first
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isnull().sum()}")

# Calculate Gini coefficients
# Target is the `fspd30` flag computed in the SQL query.
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
print(gini_df)

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-15 10:14:21+00:00
1   2024-10-31 20:42:11+00:00
2   2024-11-15 15:12:37+00:00
3   2024-10-01 23:09:06+00:00
4   2024-10-30 10:56:41+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:
  Period  Start_Date    End_Date  credo_score_gini  credolabScore_gini  \
0  Train  2024-08-13  2025-01-31          0.301360            0.090981   
1  OOT 1  2025-02-01  2025-02-28          0.285793            0.063103   
2  OOT 2  2025-03-01  2025-03-31          0.336065            0.261779   
3  OOT 3  2025-04-01  2025-04-30          0.271644            0.043692   
4  OOT 4  2025-05-01  2025-05-31          0.290000            0.127838   

   sample_size  
0         2762  
1          548  
2          335  
3          334  
4          371  


In [17]:
# Keep a copy of the current Gini table under its own name; a later cell
# reassigns `gini_df` with the non-Android results.
cash_beta_trench1_df = gini_df.copy()
print("\nGini Results:")
# Bare last expression so the notebook renders the table.
gini_df


Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.30136,0.090981,2762
1,OOT 1,2025-02-01,2025-02-28,0.285793,0.063103,548
2,OOT 2,2025-03-01,2025-03-31,0.336065,0.261779,335
3,OOT 3,2025-04-01,2025-04-30,0.271644,0.043692,334
4,OOT 4,2025-05-01,2025-05-31,0.29,0.127838,371


# Query IOS

In [18]:
# Same query as the Android cell except for the OS filter, which here is
# `ln_os_type not like 'Android'`.
# NOTE(review): the section is titled "Query IOS" but the filter keeps every
# OS value other than 'Android'; rows with a NULL ln_os_type are presumably
# dropped by the NOT LIKE comparison — confirm this is intended.
# NOTE(review): the `al` left join contributes no selected column; if it is
# not unique per digitalLoanAccountId it may duplicate rows — verify.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench1}  a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# This rebinds `d1`, replacing the Android result loaded by the earlier cell.
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# The shape printed is that of the filtered query result, not of the full table.
print(f"The shape of {schema1}.{cash_beta_trench1} table is:\t {d1.shape}")

Job ID c467720e-117d-4ba2-a511-a5809ce1281e successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (1540, 13)


original without Android - (8626, 13)
credo_score not null - (8626, 13)
flag_disbursement = 1 - (8626, 13)
flag_mature_fspd30 -  (7169, 13)


In [19]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {d1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d1['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to d1
gini_df = calculate_gini(d1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-02 14:37:13+00:00
1   2025-05-09 14:15:53+00:00
2   2025-03-20 20:54:49+00:00
3   2025-05-02 14:39:58+00:00
4   2025-03-21 20:37:45+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.170189,0.132925,153
3,OOT 3,2025-04-01,2025-04-30,0.086147,0.108097,477
4,OOT 4,2025-05-01,2025-05-31,0.212114,0.071524,550


In [20]:
# Snapshot the Gini table from the preceding cell under an iOS-specific name
cash_beta_trench1_ios_df = gini_df.copy(deep=True)

# cash_beta_trench2_applied_loans_backscored_20241001_20250831

# Table

In [21]:
# Dataset and backscored trench-2 table name interpolated into the queries that follow
schema1 = "worktable_data_analysis"
cash_beta_trench2 = "cash_beta_trench2_applied_loans_backscored_20241001_20250831"

# Query Android

In [22]:
# Android cohort for {schema1}.{cash_beta_trench2}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result into `d2`. The printed shape is that
# of the filtered query result, not of the source table.
d2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench2} table is:\t {d2.shape}")

Job ID b9262198-df41-43a9-89f3-41b6bfc47d34 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831 table is:	 (3877, 13)


In [23]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {d2['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d2['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d2['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to d2
gini_df2 = calculate_gini(d2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-27 21:01:45+00:00
1   2024-10-26 14:49:22+00:00
2   2024-11-08 16:03:27+00:00
3   2024-10-24 08:36:32+00:00
4   2024-11-11 13:46:59+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.266925,0.158478,1933
1,OOT 1,2025-02-01,2025-02-28,0.423827,0.235004,463
2,OOT 2,2025-03-01,2025-03-31,0.352999,0.034759,473
3,OOT 3,2025-04-01,2025-04-30,0.295992,0.187835,397
4,OOT 4,2025-05-01,2025-05-31,0.306634,0.171568,387


In [24]:
# Snapshot the trench-2 Android Gini table. The correctly spelled name matches the
# sibling *_android_df variables; the original misspelled name is kept as an alias
# (same object) so any cell that already refers to it keeps working.
cash_beta_trench2_android_df = gini_df2.copy()
cash_beta_trench2_andriod_df = cash_beta_trench2_android_df

# Query iOS

In [25]:
# Non-Android cohort (labelled iOS in this notebook) for {schema1}.{cash_beta_trench2}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type not like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result; this rebinds `d2`, replacing the
# Android frame loaded by the earlier query cell. The printed shape is that of
# the filtered query result, not of the source table.
d2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench2} table is:\t {d2.shape}")

Job ID 0689d298-8c37-41df-8980-161957963653 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831 table is:	 (1191, 13)


In [26]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {d2['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d2['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d2['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to d2
gini_df2 = calculate_gini(d2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-10 10:50:24+00:00
1   2025-05-25 22:34:05+00:00
2   2025-05-16 05:49:33+00:00
3   2025-05-18 14:08:43+00:00
4   2025-05-24 09:20:17+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.177807,0.105615,100
3,OOT 3,2025-04-01,2025-04-30,0.096853,0.030743,430
4,OOT 4,2025-05-01,2025-05-31,0.067697,0.025946,422


In [27]:
# Snapshot the Gini table from the preceding cell under an iOS-specific name
cash_beta_trench2_ios_df = gini_df2.copy(deep=True)

# cash_beta_trench3_applied_loans_backscored_20241001_20250831

# Table

In [28]:
# Dataset and backscored trench-3 table name interpolated into the queries that follow
schema1 = "worktable_data_analysis"
cash_beta_trench3 = "cash_beta_trench3_applied_loans_backscored_20241001_20250831"

# Query Android

In [29]:
# Android cohort for {schema1}.{cash_beta_trench3}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column — confirm the
# unqualified flagDisbursement does not resolve to it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result into `d3`. The printed shape is that
# of the filtered query result, not of the source table.
d3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench3} table is:\t {d3.shape}")

Job ID fe4bd414-e78b-449a-b8b2-c8b886be27eb successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831 table is:	 (3325, 13)


In [30]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {d3['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d3['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d3['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to d3
gini_df3 = calculate_gini(d3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-11-02 12:54:38+00:00
1   2024-11-18 09:34:49+00:00
2   2024-11-23 15:46:19+00:00
3   2025-04-26 11:53:55+00:00
4   2025-01-13 09:00:30+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.166167,0.084071,1374
1,OOT 1,2025-02-01,2025-02-28,0.255133,-0.03466,391
2,OOT 2,2025-03-01,2025-03-31,0.297123,0.069909,425
3,OOT 3,2025-04-01,2025-04-30,0.13861,0.159289,388
4,OOT 4,2025-05-01,2025-05-31,0.053406,0.236495,483


In [31]:
# Snapshot the Gini table from the preceding cell under an Android-specific name
cash_beta_trench3_android_df = gini_df3.copy(deep=True)

In [32]:
# Share of fspd30 = 1 among mature loans, per distinct credolabScore value, for
# applications submitted in May 2025.
#
# The window is half-open (>= May 1, < June 1). The previous
# `between '2025-05-01' and '2025-05-31'` compared a timestamp column against a
# date-only upper bound, which stops at midnight at the start of May 31 and so
# dropped applications submitted later that day.
# NOTE(review): DuckDB casts these date strings for comparison with the tz-aware
# column — confirm the resulting boundaries match the window calculate_gini uses.
#
# NOTE: DuckDB reads whatever frame is bound to `d3` when this cell runs; a later
# cell rebinds `d3` to the non-Android query result, so run order matters.
dfd = dd.query("""
    select
        credolabScore,
        sum(fspd30) / sum(flag_mature_fspd30) as fspd30
    from d3
    where ln_appln_submit_datetime >= '2025-05-01'
      and ln_appln_submit_datetime <  '2025-06-01'
    group by 1
    order by 1;
""").to_df()
dfd

Unnamed: 0,credolabScore,fspd30
0,415.0,0.0
1,440.0,0.0
2,447.0,0.0
3,448.0,1.0
4,449.0,0.0
...,...,...
113,581.0,0.0
114,583.0,0.0
115,584.0,0.0
116,588.0,0.0


# Query iOS

In [33]:
# Non-Android cohort (labelled iOS in this notebook) for {schema1}.{cash_beta_trench3}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type not like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_beta_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result; this rebinds `d3`, replacing the
# Android frame loaded by the earlier query cell. The printed shape is that of
# the filtered query result, not of the source table.
d3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench3} table is:\t {d3.shape}")

Job ID cd96bc13-9627-4358-880f-37b84f199a43 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831 table is:	 (1279, 13)


In [34]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {d3['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {d3['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {d3['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to d3
gini_df3 = calculate_gini(d3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_df3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-06-02 12:35:33+00:00
1   2025-05-17 21:19:45+00:00
2   2025-06-04 07:58:58+00:00
3   2025-06-12 16:00:30+00:00
4   2025-06-11 22:40:07+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.258491,-0.203774,63
3,OOT 3,2025-04-01,2025-04-30,0.105058,-0.046354,390
4,OOT 4,2025-05-01,2025-05-31,-0.06349,-0.03645,520


In [35]:
# Snapshot the Gini table from the preceding cell under an iOS-specific name
cash_beta_trench3_ios_df = gini_df3.copy(deep=True)

# cash_alpha_trench1_applied_loans_backscored_20241001_20250831

# Table

In [36]:
# Dataset and backscored alpha trench-1 table name interpolated into the queries that follow
schema1 = "worktable_data_analysis"
cash_alpha_trench1 = "cash_alpha_trench1_applied_loans_backscored_20241001_20250831"

# Query Android

In [37]:
# Android cohort for {schema1}.{cash_alpha_trench1}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result into `da1`. The printed shape is that
# of the filtered query result, not of the source table.
da1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench1} table is:\t {da1.shape}")

Job ID f688cb60-f869-4959-bbc1-3f22e747fa7d successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831 table is:	 (4455, 13)


In [38]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {da1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {da1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {da1['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to da1, snapshotted under an Android-specific name
gini_dfa1 = calculate_gini(da1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
cash_alpha_trench1_android_df = gini_dfa1.copy(deep=True)
print("\nGini Results:")
gini_dfa1


Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-17 13:08:36+00:00
1   2024-11-20 14:52:04+00:00
2   2024-10-18 22:42:11+00:00
3   2024-10-09 20:27:49+00:00
4   2024-10-08 23:36:36+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.297348,0.086991,2671
1,OOT 1,2025-02-01,2025-02-28,0.266585,0.041626,526
2,OOT 2,2025-03-01,2025-03-31,0.330236,0.272928,331
3,OOT 3,2025-04-01,2025-04-30,0.27977,0.0391,322
4,OOT 4,2025-05-01,2025-05-31,0.274099,0.094342,366


# Query iOS

In [39]:
# Non-Android cohort (labelled iOS in this notebook) for {schema1}.{cash_alpha_trench1}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type not like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench1} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result; this rebinds `da1`, replacing the
# Android frame loaded by the earlier query cell. The printed shape is that of
# the filtered query result, not of the source table.
da1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench1} table is:\t {da1.shape}")

Job ID 87243922-2645-4e06-b00c-8501a35d6038 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831 table is:	 (1473, 13)


In [40]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {da1['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {da1['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {da1['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to da1, snapshotted under an iOS-specific name
gini_dfa1 = calculate_gini(da1, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
cash_alpha_trench1_ios_df = gini_dfa1.copy(deep=True)
print("\nGini Results:")
gini_dfa1


Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-03-24 14:50:02+00:00
1   2025-06-17 04:18:38+00:00
2   2025-04-30 10:44:39+00:00
3   2025-05-07 14:50:22+00:00
4   2025-05-08 11:02:52+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.135904,0.099786,148
3,OOT 3,2025-04-01,2025-04-30,0.094354,0.077344,432
4,OOT 4,2025-05-01,2025-05-31,0.200716,0.06377,541


# cash_alpha_trench2_applied_loans_backscored_20241001_20250831

# Table

In [41]:
# Dataset and backscored alpha trench-2 table name interpolated into the queries that follow
schema1 = "worktable_data_analysis"
cash_alpha_trench2 = "cash_alpha_trench2_applied_loans_backscored_20241001_20250831"

# Query Android

In [42]:
# Android cohort for {schema1}.{cash_alpha_trench2}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result into `da2`. The printed shape is that
# of the filtered query result, not of the source table.
da2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench2} table is:\t {da2.shape}")

Job ID 35adb798-7a4a-4b63-aa89-dcc5feb7ad1a successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831 table is:	 (3733, 13)


In [43]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {da2['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {da2['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {da2['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to da2, snapshotted under an Android-specific name
gini_dfa2 = calculate_gini(da2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
cash_alpha_trench2_android_df = gini_dfa2.copy(deep=True)
print("\nGini Results:")
gini_dfa2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-27 21:01:45+00:00
1   2024-11-08 16:03:27+00:00
2   2024-10-26 14:49:22+00:00
3   2024-10-24 08:36:32+00:00
4   2024-11-11 13:46:59+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.26919,0.168788,1829
1,OOT 1,2025-02-01,2025-02-28,0.402986,0.233028,447
2,OOT 2,2025-03-01,2025-03-31,0.353773,0.038355,467
3,OOT 3,2025-04-01,2025-04-30,0.296702,0.18587,382
4,OOT 4,2025-05-01,2025-05-31,0.309598,0.178941,386


# Query iOS

In [44]:
# Non-Android cohort (labelled iOS in this notebook) for {schema1}.{cash_alpha_trench2}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type not like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench2} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result; this rebinds `da2`, replacing the
# Android frame loaded by the earlier query cell. The printed shape is that of
# the filtered query result, not of the source table.
da2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench2} table is:\t {da2.shape}")

Job ID dff885b8-7021-42f7-8f57-22db62c6da8f successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831 table is:	 (1145, 13)


In [45]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {da2['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {da2['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {da2['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to da2, snapshotted under an iOS-specific name
gini_dfa2 = calculate_gini(da2, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
cash_alpha_trench2_ios_df = gini_dfa2.copy(deep=True)
print("\nGini Results:")
gini_dfa2

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-05 11:03:20+00:00
1   2025-05-04 19:51:16+00:00
2   2025-04-23 10:41:57+00:00
3   2025-03-25 19:12:03+00:00
4   2025-06-02 05:39:43+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.177807,0.105615,100
3,OOT 3,2025-04-01,2025-04-30,0.075477,-0.006093,389
4,OOT 4,2025-05-01,2025-05-31,0.063732,0.01689,419


# cash_alpha_trench3_applied_loans_backscored_20241001_20250831

# Table

In [46]:
# Dataset and backscored alpha trench-3 table name interpolated into the queries that follow
schema1 = "worktable_data_analysis"
cash_alpha_trench3 = "cash_alpha_trench3_applied_loans_backscored_20241001_20250831"

# Query Android

In [47]:
# Android cohort for {schema1}.{cash_alpha_trench3}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result into `da3`. The printed shape is that
# of the filtered query result, not of the source table.
da3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench3} table is:\t {da3.shape}")

Job ID c7266b7b-d347-4457-83c0-c3ac358b8430 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831 table is:	 (3198, 13)


In [48]:
# Named application-date windows (one 'Train' window and four 'OOT' windows)
data_periods_dict = {
    period: {'start': window_start, 'end': window_end}
    for period, window_start, window_end in (
        ('Train', '2024-08-13', '2025-01-31'),
        ('OOT 1', '2025-02-01', '2025-02-28'),
        ('OOT 2', '2025-03-01', '2025-03-31'),
        ('OOT 3', '2025-04-01', '2025-04-30'),
        ('OOT 4', '2025-05-01', '2025-05-31'),
    )
}

# Inspect the application timestamp column (dtype, first rows, null count) before windowing on it
print("Date column info:")
print(f"Data type: {da3['ln_appln_submit_datetime'].dtype}")
print(f"Sample values: {da3['ln_appln_submit_datetime'].head()}")
print(f"Any null values: {da3['ln_appln_submit_datetime'].isna().sum()}")

# Per-period Gini table for the frame currently bound to da3, snapshotted under an Android-specific name
gini_dfa3 = calculate_gini(da3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
cash_alpha_trench3_android_df = gini_dfa3.copy(deep=True)
print("\nGini Results:")
gini_dfa3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-05-16 18:13:24+00:00
1   2025-02-27 12:42:50+00:00
2   2024-10-20 05:23:03+00:00
3   2025-01-30 13:32:47+00:00
4   2025-01-08 06:20:39+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.157475,0.089785,1305
1,OOT 1,2025-02-01,2025-02-28,0.243067,-0.037301,376
2,OOT 2,2025-03-01,2025-03-31,0.291165,0.065186,409
3,OOT 3,2025-04-01,2025-04-30,0.193138,0.15735,364
4,OOT 4,2025-05-01,2025-05-31,0.052198,0.233544,481


# Query iOS

In [49]:
# Non-Android cohort (labelled iOS in this notebook) for {schema1}.{cash_alpha_trench3}.
# The `base` CTE inner-joins the trench table to loan_master_table (on
# digitalLoanAccountId) and loan_deliquency_data (on loanAccountNumber), keeps
# flagDisbursement = 1, and derives:
#   credolabScore      = coalesce(ctial.score_all_score, lmt.credolabScore)
#   flag_mature_fspd30 = 1 when obs_min_inst_def30 >= 2
#   fspd30             = 1 when obs_min_inst_def30 >= 2 and min_inst_def30 in (1, 2)
# The outer select keeps rows with flag_mature_fspd30 = 1, ln_os_type not like
# 'Android', and both scores non-null.
# NOTE(review): the left join to `al` feeds no selected column (its only use is
# commented out) — confirm the unqualified flagDisbursement does not resolve to
# it before removing the join.
sq = f"""
with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore
-- , al.gen_credo_score credolabScore
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{cash_alpha_trench3} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type not like 'Android'
and credo_score is not null
and credolabScore is not null
;
"""
# Run on BigQuery and download the result; this rebinds `da3`, replacing the
# Android frame loaded by the earlier query cell. The printed shape is that of
# the filtered query result, not of the source table.
da3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench3} table is:\t {da3.shape}")

Job ID b1228e10-48c3-49b3-8ed5-86e3249bc0c5 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831 table is:	 (1239, 13)


In [50]:
# Evaluation windows as (label, first day, last day); expanded below into the
# {'start': ..., 'end': ...} mapping that calculate_gini receives.
_period_bounds = (
    ('Train', '2024-08-13', '2025-01-31'),
    ('OOT 1', '2025-02-01', '2025-02-28'),
    ('OOT 2', '2025-03-01', '2025-03-31'),
    ('OOT 3', '2025-04-01', '2025-04-30'),
    ('OOT 4', '2025-05-01', '2025-05-31'),
)
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in _period_bounds
}

# Quick diagnostics on the application-submit timestamp before it is used to
# assign rows to periods: dtype, a few sample values, and the null count.
_submit_ts = da3['ln_appln_submit_datetime']
print("Date column info:")
print(f"Data type: {_submit_ts.dtype}")
print(f"Sample values: {_submit_ts.head()}")
print(f"Any null values: {_submit_ts.isnull().sum()}")

# Per-period Gini with fspd30 as the target; a copy is kept under an
# iOS-specific name, and the frame itself is the cell's displayed output.
gini_dfa3 = calculate_gini(da3, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
cash_alpha_trench3_ios_df = gini_dfa3.copy()
gini_dfa3

Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2025-04-20 03:00:04+00:00
1   2025-03-22 14:16:05+00:00
2   2025-04-20 11:35:53+00:00
3   2025-04-23 17:00:34+00:00
4   2025-05-25 11:02:27+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,,,0
1,OOT 1,2025-02-01,2025-02-28,,,0
2,OOT 2,2025-03-01,2025-03-31,0.258491,-0.203774,63
3,OOT 3,2025-04-01,2025-04-30,0.162034,-0.020437,355
4,OOT 4,2025-05-01,2025-05-31,-0.058195,-0.035268,518


# Testing

In [51]:
# Scratch/test run: Android slice of the Cash Beta trench-1 back-scored table.
# NOTE(review): this rebinds the notebook-level `schema1`, `sq` and
# `data_periods_dict` names as well as creating `tab`, `dfd` and `gini_dfd`.
schema1 = 'worktable_data_analysis'
tab = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'

# Same query shape as the trench extracts: inner joins to loan master and
# delinquency data, left joins to applied loans and the Credolab trace table.
# credolabScore is the trace score when available, else the loan-master score;
# the commented-out select lines are the alternative score sources.
sq = f"""with base as 
(select a1.customer_id, a1.digitalLoanAccountId,
lmt.loanAccountNumber,a1.ln_os_type, a1.ln_appln_submit_datetime, a1.credo_score
-- ,lmt.credolabScore  --- when selecting only credolabscore from loan master table
-- , al.gen_credo_score credolabScore when selecting only gen_credo_score from Bala's applied table 
, coalesce(ctial.score_all_score, lmt.credolabScore) credolabScore  -- when mixing loan master and trace credo score -- current approach
-- , ctial.score_all_score credolabScore  -- when selecting only trace table
-- , coalesce(cast(ctial.score_all_probability as numeric),cast(al.gen_credo_score as numeric))credolabScore
,a1.ln_fspd30_flag,a1.ln_mature_fspd30_flag, ldd.obs_min_inst_def30, ldd.min_inst_def30, 
case when ldd.obs_min_inst_def30 >=2 then 1 else 0 end flag_mature_fspd30,
case when ldd.obs_min_inst_def30 >=2 and ldd.min_inst_def30 in (1,2) then 1 else 0 end fspd30,
from {schema1}.{tab} a1
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = a1.digitalLoanAccountId
inner join prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd on lmt.loanAccountNumber = ldd.loanAccountNumber
left join risk_mart.applied_loans_20230101_20250831 al on al.digitalLoanAccountId = a1.digitalLoanAccountId
left join risk_credit_mis.credolab_trace_insight_all_loans ctial on ctial.referenceNumber = lmt.credolabRefNumber
where flagDisbursement = 1
)
select * from base 
where 
flag_mature_fspd30 = 1
and ln_os_type like 'Android'
and credo_score is not null
and credolabScore is not null
;"""

# Execute and load the filtered result; the printed shape is the result's shape.
dfd = (
    client
    .query(sq)
    .to_dataframe(progress_bar_type='tqdm')
)
print(f"The shape of {schema1}.{tab} table is:\t {dfd.shape}")

# Evaluation windows as (label, first day, last day), expanded into the
# {'start': ..., 'end': ...} mapping passed to calculate_gini.
_period_bounds = (
    ('Train', '2024-08-13', '2025-01-31'),
    ('OOT 1', '2025-02-01', '2025-02-28'),
    ('OOT 2', '2025-03-01', '2025-03-31'),
    ('OOT 3', '2025-04-01', '2025-04-30'),
    ('OOT 4', '2025-05-01', '2025-05-31'),
)
data_periods_dict = {
    label: {'start': first_day, 'end': last_day}
    for label, first_day, last_day in _period_bounds
}

# Diagnostics on the submit timestamp: dtype, sample values, null count.
_submit_ts = dfd['ln_appln_submit_datetime']
print("Date column info:")
print(f"Data type: {_submit_ts.dtype}")
print(f"Sample values: {_submit_ts.head()}")
print(f"Any null values: {_submit_ts.isnull().sum()}")

# Per-period Gini with fspd30 as the target; the frame is the cell's output.
gini_dfd = calculate_gini(dfd, 'ln_appln_submit_datetime', 'fspd30', data_periods_dict)
print("\nGini Results:")
gini_dfd



Job ID 013f92e5-c61f-4b51-b3d3-ea2ee0b431a9 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (4597, 13)
Date column info:
Data type: datetime64[us, UTC]
Sample values: 0   2024-10-15 10:14:21+00:00
1   2024-10-31 20:42:11+00:00
2   2024-11-15 15:12:37+00:00
3   2024-10-01 23:09:06+00:00
4   2024-10-30 10:56:41+00:00
Name: ln_appln_submit_datetime, dtype: datetime64[us, UTC]
Any null values: 0

Gini Results:


Unnamed: 0,Period,Start_Date,End_Date,credo_score_gini,credolabScore_gini,sample_size
0,Train,2024-08-13,2025-01-31,0.30136,0.090981,2762
1,OOT 1,2025-02-01,2025-02-28,0.285793,0.063103,548
2,OOT 2,2025-03-01,2025-03-31,0.336065,0.261779,335
3,OOT 3,2025-04-01,2025-04-30,0.271644,0.043692,334
4,OOT 4,2025-05-01,2025-05-31,0.29,0.127838,371


In [None]:
# Preview the first five rows of the test extract.
# NOTE(review): rows include customer_id / loan account identifiers, so clear
# this output before sharing the notebook.
dfd.head(n=5)

In [None]:
# Snapshot the test extract to local disk in two formats (pickle and parquet).
# NOTE(review): both targets are absolute, user-specific paths; consider
# deriving them from the LOCALPATH setting in the Config cell.
for _writer, _target in (
    (dfd.to_pickle, r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\data.pkl"),
    (dfd.to_parquet, r"D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data\data.parquet"),
):
    _writer(_target)