# Define Library

In [36]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
pd.set_option("Display.max_rows", 100)


# Constant

In [37]:
CURRENT_DATE = datetime.now().strftime("%Y%m%d")


# Config

In [38]:
unique_id = str(uuid.uuid4()).replace('-', '')[-12:]
print(f"The unique Id is: {unique_id}")
BUCKETNAME = 'prod-asia-southeast1-tonik-aiml-workspace'
CLOUDPATH = 'DC/Model_Monitoring/Model_Tables'
LOCALPATH = r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data'
VERSION = 'V1'

The unique Id is: 44dbc309ac1b


# <div align="left" style="color:rgb(51, 250, 250);"> Functions </div>

## <div align="left" style="color:rgb(51, 250, 250);"> Save the data to google clound storage </div>

In [39]:
def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').
    """

    # Create a temporary file
    if file_format == 'csv':
        temp_file = 'temp.csv'
        df.to_csv(temp_file, index=False)
    elif file_format == 'parquet':
        temp_file = 'temp.parquet'
        df.to_parquet(temp_file, index=False)
    else:
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Upload the file to GCS
    storage_client = storage.Client(project="prj-prod-dataplatform")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(temp_file)

    # Remove the temporary file
    import os
    os.remove(temp_file)
    


## <div align="left" style="color:rgb(51, 250, 250);"> Read the Data from Google Cloud Storage </div>

In [40]:
def read_df_from_gcs(bucket_name, source_blob_name, file_format='csv'):
    """Reads a DataFrame from Google Cloud Storage.

    Args:
        bucket_name: The name of the GCS bucket.
        source_blob_name: The name of the blob to read.
        file_format: The file format to read ('csv' or 'parquet').

    Returns:
        pandas.DataFrame: The data loaded from the GCS file.
    """
    # Create a temporary file name
    temp_file = f'temp.{file_format}'
    
    try:
        # Initialize GCS client
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_blob_name)

        # Download the file to a temporary location
        blob.download_to_filename(temp_file)

        # Read the file into a DataFrame
        if file_format == 'csv':
            df = pd.read_csv(temp_file, low_memory=False)
        elif file_format == 'parquet':
            df = pd.read_parquet(temp_file)
        else:
            raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

        return df

    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

## <div align = "left" style="color:rgb(51, 250, 250);"> Data Quality Report </div>

In [41]:
def data_quality_report(df, target_col='ln_fspd30_flag'):
    # Initialize an empty list to store each row of data
    report_data = []
    # Iterate over each column in the DataFrame to compute metrics
    for col in df.columns:
        # Determine the data type of the column
        data_type = df[col].dtype
       
        # Calculate the number of missing values in the column
        missing_values = df[col].isnull().sum()
       
        # Calculate the percentage of missing values relative to the total number of rows
        missing_percentage = (missing_values / len(df)) * 100
       
        # Calculate the number of unique values in the column
        unique_values = df[col].nunique()
       
        # Calculate the percentage of non-missing values
        non_missing_percentage = ((len(df) - missing_values) / len(df)) * 100
       
        # Check if the column is numeric to compute additional metrics
        if pd.api.types.is_numeric_dtype(df[col]):
            # Compute minimum, maximum, mean, median, mode, mode percentage, standard deviation, and quantiles
            min_value = df[col].min()
            max_value = df[col].max()
            mean_value = df[col].mean()
            median_value = df[col].median()
            mode_value = df[col].mode().iloc[0] if not df[col].mode().empty else None
            mode_percentage = (df[col] == mode_value).sum() / len(df) * 100 if mode_value is not None else None
            std_dev = df[col].std()
            quantile_25 = df[col].quantile(0.25)
            quantile_50 = df[col].quantile(0.50)  # Same as median
            quantile_75 = df[col].quantile(0.75)
            
            # Calculate the Interquartile Range (IQR)
            iqr = quantile_75 - quantile_25
            
            # Calculate Skewness and Kurtosis
            skewness = df[col].skew()
            kurtosis = df[col].kurt()
            
            # Calculate Coefficient of Variation (CV) - standardized measure of dispersion
            cv = (std_dev / mean_value) * 100 if mean_value != 0 else None
            
            # Calculate correlation with target variable if target exists in dataframe
            if target_col in df.columns and col != target_col and pd.api.types.is_numeric_dtype(df[target_col]):
                # Calculate correlation only using rows where both columns have non-null values
                correlation = df[[col, target_col]].dropna().corr().iloc[0, 1]
            else:
                correlation = None
        else:
            # Assign None for non-numeric columns where appropriate
            min_value = None
            max_value = None
            mean_value = None
            median_value = None
            mode_value = df[col].mode().iloc[0] if not df[col].mode().empty else None
            mode_percentage = (df[col] == mode_value).sum() / len(df) * 100 if mode_value is not None else None
            std_dev = None
            quantile_25 = None
            quantile_50 = None
            quantile_75 = None
            iqr = None
            skewness = None
            kurtosis = None
            cv = None
            correlation = None
       
        # Append the computed metrics for the current column to the list
        report_data.append({
            'Column': col,
            'Data Type': data_type,
            'Missing Values': missing_values,
            'Missing Percentage': missing_percentage,
            'Unique Values': unique_values,
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value,
            'Mode': mode_value,
            'Mode Percentage': mode_percentage,
            'Std Dev': std_dev,
            'Non-missing Percentage': non_missing_percentage,
            '25% Quantile': quantile_25,
            '50% Quantile': quantile_50,
            '75% Quantile': quantile_75,
            'IQR': iqr,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'CV (%)': cv,
            f'Correlation with {target_col}': correlation
        })
    # Create the DataFrame from the list of dictionaries
    report = pd.DataFrame(report_data)
   
    # Return the complete data quality report DataFrame
    return report

# <div align = "left" style="color:rgb(51,250,250);"> Upload pickle file to Google Cloud Storage Bucke </div>

In [42]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Uploads a file to Google Cloud Storage"""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    blob.upload_from_filename(source_file_path)
    print(f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name}")

In [43]:
import pickle
import io
from google.cloud import storage
def save_pickle_to_gcs(data, bucket_name, destination_blob_name):
    """
    Save any Python object as a pickle file to Google Cloud Storage
    
    Args:
        data: The Python object to pickle (DataFrame, dict, list, etc.)
        bucket_name: Name of the GCS bucket
        destination_blob_name: Path/filename in the bucket
    """
    # Initialize the GCS client
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    # Serialize the data to pickle format in memory
    pickle_buffer = io.BytesIO()
    pickle.dump(data, pickle_buffer)
    pickle_buffer.seek(0)
    
    # Upload the pickle data to GCS
    blob.upload_from_file(pickle_buffer, content_type='application/octet-stream')
    print(f"Pickle file uploaded to gs://{bucket_name}/{destination_blob_name}")

# save_dataframe_multi_format

In [44]:
def save_dataframe_multi_format(
    dataframe: pd.DataFrame, 
    cloud_path: str, 
    filename: str, 
    client: bigquery.Client = None,
    bucket_name: str = None
) -> dict:
    """
    Save a pandas DataFrame to Google Cloud Storage in multiple formats (CSV, Pickle, Parquet, Joblib).
    
    Args:
        dataframe (pd.DataFrame): The DataFrame to save
        cloud_path (str): The cloud path (e.g., 'DC/Model_Monitoring/cash_beta_trench1_data')
        filename (str): The base filename without extension
        client (bigquery.Client, optional): BigQuery client (for project reference)
        bucket_name (str, optional): GCS bucket name. If None, will try to extract from client
        
    Returns:
        dict: Dictionary with status of each file saved
        
    Example:
        client = bigquery.Client(project='prj-prod-dataplatform')
        CLOUDPATH = 'DC/Model_Monitoring/cash_beta_trench1_data'
        
        results = save_dataframe_multi_format(
            dataframe=d1,
            cloud_path=CLOUDPATH,
            filename='my_data',
            client=client,
            bucket_name='your-bucket-name'  # Replace with your actual bucket name
        )
    """
    
    # Initialize Google Cloud Storage client
    storage_client = storage.Client(project=client.project if client else None)
    
    # You'll need to specify your bucket name here
    # Common bucket names in GCP data platforms might be like:
    # - 'prj-prod-dataplatform-storage'
    # - 'dataplatform-storage'
    # - or similar pattern
    if bucket_name is None:
        # You need to replace this with your actual bucket name
        raise ValueError("Please provide the bucket_name parameter")
    
    bucket = storage_client.bucket(bucket_name)
    
    # Results dictionary to track saves
    results = {}
    
    # Ensure cloud_path doesn't start with '/'
    cloud_path = cloud_path.lstrip('/')
    
    try:
        # 1. Save as CSV
        csv_buffer = io.StringIO()
        dataframe.to_csv(csv_buffer, index=False)
        csv_blob = bucket.blob(f"{cloud_path}/{filename}.csv")
        csv_blob.upload_from_string(csv_buffer.getvalue(), content_type='text/csv')
        results['csv'] = f"gs://{bucket_name}/{cloud_path}/{filename}.csv"
        
        # 2. Save as Pickle
        pickle_buffer = io.BytesIO()
        pickle.dump(dataframe, pickle_buffer)
        pickle_blob = bucket.blob(f"{cloud_path}/{filename}.pkl")
        pickle_blob.upload_from_string(pickle_buffer.getvalue(), content_type='application/octet-stream')
        results['pickle'] = f"gs://{bucket_name}/{cloud_path}/{filename}.pkl"
        
        # 3. Save as Parquet
        parquet_buffer = io.BytesIO()
        dataframe.to_parquet(parquet_buffer, index=False)
        parquet_blob = bucket.blob(f"{cloud_path}/{filename}.parquet")
        parquet_blob.upload_from_string(parquet_buffer.getvalue(), content_type='application/octet-stream')
        results['parquet'] = f"gs://{bucket_name}/{cloud_path}/{filename}.parquet"
        
        # 4. Save as Joblib
        joblib_buffer = io.BytesIO()
        joblib.dump(dataframe, joblib_buffer)
        joblib_blob = bucket.blob(f"{cloud_path}/{filename}.joblib")
        joblib_blob.upload_from_string(joblib_buffer.getvalue(), content_type='application/octet-stream')
        results['joblib'] = f"gs://{bucket_name}/{cloud_path}/{filename}.joblib"
        
        print("All files saved successfully!")
        for format_type, path in results.items():
            print(f"{format_type.upper()}: {path}")
            
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        results['error'] = str(e)
    
    return results

# cash_beta_trench1_applied_loans_backscored_20241001_20250831

# Table

In [45]:
schema1 = 'worktable_data_analysis'
cash_beta_trench1 = f'cash_beta_trench1_applied_loans_backscored_20241001_20250831'

# Query

In [46]:
sq = f"""
select * from worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench1} table is:\t {d1.shape}")

Job ID ae6335d2-3b6c-4001-aac4-975ceba394b6 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (296480, 34)


In [47]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'apps_score', 'credo_score',
       'stack_score'], dtype=object)

In [48]:
d1['demo_score'] = np.nan
d1['cic_score'] = np.nan
d1['trx_score'] = np.nan

In [49]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,demo_score,trench_category,ln_loan_type,ln_disb_dtime,c_app_cnt_health_and_fitness_ever,c_app_cnt_shopping_ever,c_app_median_time_bw_installed_mins_ever,c_app_avg_time_bw_installed_mins_3d,c_app_cnt_crypto_ever,c_app_cnt_driver_ever,c_app_cnt_payday_180d,c_app_cnt_gambling_180d,apps_score,credo_score,stack_score,cic_score,trx_score
0,3461645,6061f751-bb86-49a3-9b6b-0ea44b4306d6,0,2025-05-27 13:22:29+00:00,Android,1,28500,17,salary,2_New Applicant,1.0,Single,national id,College Undergraduate,Friend,gmail.com,others,,,,Trench 1,Quick,NaT,0.0,2.0,0.033333,,0.0,0.0,0.0,0.0,0.642715,0.240666,0.895453,,
1,3292475,80468034-cc21-4da8-b2aa-6e35c84ac387,1,2025-02-28 13:24:31+00:00,Android,1,15000,17,income from business,2_New Applicant,1.0,Single,national id,College Graduate,Sibling,gmail.com,others,,,,Trench 1,Quick,NaT,2.0,0.0,1499.5,1136.783333,0.0,0.0,3.0,0.0,0.532071,0.393578,0.697355,,
2,3192798,03cf7323-cf3a-44be-9975-ac302d61e7ed,0,2025-01-16 03:58:30+00:00,iOS,1,50000,18,salary,2_New Applicant,2.0,Single,driving license,College Graduate,Sibling,gmail.com,others,,,,Trench 1,Quick,NaT,,,,,,,,,,0.220727,0.610448,,
3,3573079,0e938f32-94e3-4473-9beb-9469807b1edf,0,2025-07-23 08:23:10+00:00,iOS,1,6000,18,remittance,2_New Applicant,missing,Single,national id,College Graduate,Sibling,gmail.com,others,,,,Trench 1,Quick,NaT,,,,,,,,,,0.24371,0.705892,,
4,3631270,3eb7e8c9-1412-46b2-bdfe-aa71f1ac9a14,0,2025-08-19 18:56:40+00:00,iOS,1,5000,18,income from business,2_New Applicant,1.0,Single,national id,College Undergraduate,Friend,gmail.com,others,,,,Trench 1,Quick,NaT,,,,,,,,,,0.280322,0.770477,,


In [50]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d'
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Create prediction JSON with the score columns
        prediction = {
            "demo_score": row.get('demo_score', 0),
            "apps_score": row.get('apps_score', 0),
            "credo_score": row.get('credo_score', 0),
            "stack_score": row.get('stack_score', 0),
            "cic_score": row.get('cic_score', 0),
            "trx_score":row.get('trx_score', 0),
        }
        
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": json.dumps(prediction),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench1_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [51]:
transformeddata = f'{cash_beta_trench1}'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (296480, 13)


In [52]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,3461645,6061f751-bb86-49a3-9b6b-0ea44b4306d6,12bcb144-a8dc-43e1-a1be-b934fa74c8c1,"{""demo_score"": NaN, ""apps_score"": 0.6427148873...",2025-09-17T10:25:17.939018,2025-09-17T10:25:17.939018,Cash_beta_trench1_backscore,v1,trench alpha beta,ec6fbd17-0ea0-4c9b-a982-4f090f6fc585,2025-09-17T10:25:17.939018,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
1,3292475,80468034-cc21-4da8-b2aa-6e35c84ac387,d9ee1f49-aba0-4e37-b91d-98402905ff9b,"{""demo_score"": NaN, ""apps_score"": 0.5320705003...",2025-09-17T10:25:17.940021,2025-09-17T10:25:17.940021,Cash_beta_trench1_backscore,v1,trench alpha beta,96d28846-fa65-4121-a167-2416e833d9ea,2025-09-17T10:25:17.940021,{},"{""days_on_book"": 1, ""ln_appln_submit_datetime""..."
2,3192798,03cf7323-cf3a-44be-9975-ac302d61e7ed,0783ad74-7746-4e5e-b575-e85eefeae39c,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:25:17.940021,2025-09-17T10:25:17.940021,Cash_beta_trench1_backscore,v1,trench alpha beta,099a8a6e-6829-4967-ac63-69829fa0d17d,2025-09-17T10:25:17.940021,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
3,3573079,0e938f32-94e3-4473-9beb-9469807b1edf,1e010451-0c97-4673-8bd7-5902d6b5e5a5,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:25:17.940021,2025-09-17T10:25:17.940021,Cash_beta_trench1_backscore,v1,trench alpha beta,ebdab44b-4746-46bf-bd03-9993db106590,2025-09-17T10:25:17.940021,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
4,3631270,3eb7e8c9-1412-46b2-bdfe-aa71f1ac9a14,b9393179-8808-4511-8cca-0f1a6d0756b5,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:25:17.941018,2025-09-17T10:25:17.941018,Cash_beta_trench1_backscore,v1,trench alpha beta,c3dbde99-5508-4454-8902-856193dcc6fe,2025-09-17T10:25:17.941018,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."


In [53]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [54]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [55]:
filenames = f'{CURRENT_DATE}_{unique_id}_{cash_beta_trench1}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_44dbc309ac1b_cash_beta_trench1_applied_loans_backscored_20241001_20250831
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench1_applied_loans_backscored_20241001_20250831.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench1_applied_loans_backscored_20241001_20250831.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench1_applied_loans_backscored_20241001_20250831.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench1_applied_loans_backscored_20241001_20250831.joblib


# Insert into a table

In [56]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=53c46b57-71df-400a-ae12-ecf61edb98a1>

# cash_beta_trench2_applied_loans_backscored_20241001_20250831

# Table

In [57]:
schema1 = 'worktable_data_analysis'
cash_beta_trench2 = f'cash_beta_trench2_applied_loans_backscored_20241001_20250831'

# Query

In [58]:
sq = f"""
select * from {schema1}.{cash_beta_trench2};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench2} table is:\t {d1.shape}")

Job ID 90a1cd1d-6627-4f31-8d7a-717a607835fe successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831 table is:	 (111973, 46)


In [59]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d',
       'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE',
       'c_t2_tx_cnt_rejected_loans',
      

In [60]:
d1['demo_score'] = np.nan
d1['cic_score'] = np.nan

In [61]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,demo_score,trench_category,ln_loan_type,ln_disb_dtime,c_app_cnt_health_and_fitness_ever,c_app_cnt_shopping_ever,c_app_median_time_bw_installed_mins_ever,c_app_avg_time_bw_installed_mins_3d,c_app_cnt_crypto_ever,c_app_cnt_driver_ever,c_app_cnt_payday_180d,c_app_cnt_gambling_180d,c_t2_tx_meng_ql_calculator_tot_visit_cnt,c_t2_tx_first_product_user_segment_WOE,c_t2_tx_first_applied_loan_type_bin_WOE,c_t2_tx_cnt_rejected_loans,c_t2_tx_appsflyer_install_to_registration_minutes,c_t2_tx_first_applied_loan_amount,c_t2_tx_deposit_accnt_cnt,c_t2_tx_cnt_cash_in_total,c_t2_tx_cnt_incomplete_loan_apps,c_t2_tx_amt_cash_in_total,c_t2_tx_last_applied_loan_tenor_bin_WOE,trx_score,apps_score,credo_score,stack_score,cic_score
0,2292294,76c329f1-a80c-4f97-905a-cdc50eaefc4f,603,2025-06-28 22:14:14+00:00,iOS,1,35000,33,salary,2_New Applicant,1.0,Married,umid,College Graduate,Co-worker,gmail.com,metro manila,,,,Trench 2,Quick,NaT,,,,,,,,,3.0,-0.00838,0.322748,1.0,4.0,35000.0,,0,0.0,,-0.067628,0.550381,,0.242085,0.621407,
1,3109464,2f55c1fd-edde-4d76-804c-9faa6fe8276f,112,2025-04-07 13:40:27+00:00,iOS,1,20000,27,salary,2_New Applicant,2.0,Single,umid,College Graduate,Parent,gmail.com,others,,,,Trench 2,Quick,NaT,,,,,,,,,12.0,-0.00838,-0.112609,1.0,,12000.0,,0,0.0,,0.045849,0.531987,,0.127825,0.51924,
2,1491167,32ff1b2c-d847-4b05-8636-9c8e4aeba209,846,2024-11-17 14:07:32+00:00,iOS,1,85000,28,salary,2_New Applicant,missing,Single,national id,College Undergraduate,Sibling,gmail.com,others,1.0,0.0,,Trench 2,Quick,2024-11-17 14:16:38,,,,,,,,,32.0,-0.121208,-0.112609,7.0,,25000.0,1.0,28,0.0,95487.42,-0.112035,0.560202,,0.164833,0.452883,
3,2305465,9715ee51-7fac-4eda-b675-310d6ab79ac8,351,2024-11-01 05:17:34+00:00,iOS,0,75000,42,salary,2_New Applicant,3.0,Single,driving license,College Graduate,Friend,gmail.com,negros occidental,,,,Trench 2,Quick,NaT,,,,,,,,,11.0,-0.121208,0.322748,1.0,7.0,25000.0,,0,0.0,,-0.067628,0.515623,,0.177385,0.25649,
4,2080167,567e6cf6-9f0f-4fb8-8ee5-1d0022d34d2a,492,2024-10-10 14:41:19+00:00,iOS,1,32400,34,salary,2_New Applicant,1.0,Single,driving license,College Undergraduate,Co-worker,yahoo.com,metro manila,,,,Trench 2,Quick,NaT,,,,,,,,,10.0,-0.121208,0.322748,1.0,7.0,250000.0,,1,0.0,2000.0,-0.15912,0.452746,,0.206824,0.577553,


In [62]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d',
       'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE',
       'c_t2_tx_cnt_rejected_loans',
       'c_t2_tx_appsflyer_install_to_registration_minutes',
       'c_t2_tx_first_applied_loan_amount', 'c_t2_tx_deposit_accnt_cnt',
       'c_t2_tx_cnt_cash_in_total', 'c_t2_tx_cnt_incomplete_loan_apps',
       'c_t2_tx_amt_cash_in_total',
       'c_t2_tx_last_applied_loan_tenor_bin_WOE',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Create prediction JSON with the score columns
        prediction = {
            "demo_score": row.get('demo_score', 0),
            "apps_score": row.get('apps_score', 0),
            "credo_score": row.get('credo_score', 0),
            "stack_score": row.get('stack_score', 0),
            "cic_score": row.get('cic_score', 0),
            "trx_score":row.get('trx_score', 0),
        }
        
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": json.dumps(prediction),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench2_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [63]:
transformeddata = f'{cash_beta_trench2}'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (111973, 13)


In [64]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2292294,76c329f1-a80c-4f97-905a-cdc50eaefc4f,768c8a60-a0a5-4ac8-8c3c-8a6a7f29b675,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:28:43.933358,2025-09-17T10:28:43.933358,Cash_beta_trench2_backscore,v1,trench alpha beta,152023a1-6876-4992-97a7-f68ce6977d1a,2025-09-17T10:28:43.933358,{},"{""days_on_book"": 603, ""ln_appln_submit_datetim..."
1,3109464,2f55c1fd-edde-4d76-804c-9faa6fe8276f,f3f3039c-d328-4e81-a05c-5cf5408c82d5,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:28:43.933358,2025-09-17T10:28:43.933358,Cash_beta_trench2_backscore,v1,trench alpha beta,863e69cb-607e-4847-a4e2-83a803c17458,2025-09-17T10:28:43.933358,{},"{""days_on_book"": 112, ""ln_appln_submit_datetim..."
2,1491167,32ff1b2c-d847-4b05-8636-9c8e4aeba209,63285186-7fa7-48e5-95c6-f435ce069c8f,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:28:43.934359,2025-09-17T10:28:43.934359,Cash_beta_trench2_backscore,v1,trench alpha beta,5c6bc2aa-81b2-4d72-865e-b066f944b6f9,2025-09-17T10:28:43.934359,{},"{""days_on_book"": 846, ""ln_appln_submit_datetim..."
3,2305465,9715ee51-7fac-4eda-b675-310d6ab79ac8,717b6c15-8f96-40ae-9791-9ea5b7b91739,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:28:43.934359,2025-09-17T10:28:43.934359,Cash_beta_trench2_backscore,v1,trench alpha beta,b3f60f00-902b-4803-9a83-db0e0c47f546,2025-09-17T10:28:43.934359,{},"{""days_on_book"": 351, ""ln_appln_submit_datetim..."
4,2080167,567e6cf6-9f0f-4fb8-8ee5-1d0022d34d2a,4f51a33e-5143-413e-ac53-4cd61bd77b33,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:28:43.934359,2025-09-17T10:28:43.934359,Cash_beta_trench2_backscore,v1,trench alpha beta,6f737bd2-8449-4b80-af58-79f94c8d77d4,2025-09-17T10:28:43.934359,{},"{""days_on_book"": 492, ""ln_appln_submit_datetim..."


In [65]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [66]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [67]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench2_applied_loans_backscored_20241001_20250831.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench2_applied_loans_backscored_20241001_20250831.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench2_applied_loans_backscored_20241001_20250831.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench2_applied_loans_backscored_20241001_20250831.joblib


# Insert the data into a table

In [68]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=f9cf6102-8e8a-4fa7-9767-bf0fbff9f425>

# cash_beta_trench3_applied_loans_backscored_20241001_20250831

# Table

In [69]:
schema1 = 'worktable_data_analysis'
cash_beta_trench3 = f'cash_beta_trench3_applied_loans_backscored_20241001_20250831'

# Query

In [70]:
sq = f"""
select * from {schema1}.{cash_beta_trench3};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench3} table is:\t {d1.shape}")

Job ID a5a39bdf-cdf9-44db-9507-e3b48374e0fe successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831 table is:	 (38621, 48)


In [71]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_ap

In [72]:
d1['demo_score'] = np.nan
d1['cic_score'] = np.nan

In [73]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,demo_score,trench_category,ln_loan_type,ln_disb_dtime,c_app_cnt_absence_tag_365d_binned,c_app_cnt_books_and_reference_ever_binned,c_app_cnt_gaming_180d_binned,c_app_cnt_health_and_fitness_ever_binned,c_app_cnt_productivity_ever_binned,c_app_cnt_rated_for_18plus_ever_binned,c_app_last_payday_install_to_apply_days_binned,c_t3_tx_cnt_installments_paid_tot_with_dpd,c_t3_tx_time_since_last_applied_loan_application_time,c_t3_tx_last_applied_loan_decision,c_t3_tx_min_age_completed_loans,c_t3_tx_dob_observation_date,c_t3_tx_cnt_jira_tickets_created_bin,c_t3_tx_max_ever_dpd,c_t3_tx_amt_cash_in_total,c_t3_tx_last_applied_loan_type_bin,c_t3_tx_cnt_completed_loans,c_t3_tx_meng_no_of_logins,c_t3_tx_last_applied_loan_tenor,c_t3_tx_med_days_bt_cash_out_trans,c_t3_tx_avg_days_bt_cash_in_trans,trx_score,apps_score,credo_score,stack_score,cic_score
0,2688774,96a63ada-7037-4302-be52-c98dc8dd3086,138,2024-12-10 23:26:24+00:00,iOS,0,115000,31,salary,1_Repeat Applicant,3.0,Single,umid,College Graduate,Friend,gmail.com,metro manila,,,,Trench 3,Quick,NaT,,,,,,,,1,138,APPROVED,0,138,Unknown,1.0,32260.0,Quick,0,25,6,1.0,26.25,0.124384,,0.094711,0.435645,
1,2834010,aa0f621c-291a-45be-9f41-38f79d0cfbde,51,2024-10-30 09:56:44+00:00,iOS,0,28000,24,salary,1_Repeat Applicant,2.0,Single,national id,College Graduate,Sibling,gmail.com,batangas,1.0,0.0,,Trench 3,Quick,2024-10-30 10:35:53,,,,,,,,0,25,APPROVED,0,51,2,,5500.0,Quick,0,18,6,25.0,,0.141644,,0.079651,0.492876,
2,1996933,86f87afa-0a6a-43ad-ba06-43c53d14bf93,715,2025-03-30 07:23:06+00:00,iOS,0,23000,31,salary,1_Repeat Applicant,3.0,Single,umid,College Graduate,Friend,gmail.com,metro manila,,,,Trench 3,Quick,NaT,,,,,,,,2,715,APPROVED,0,715,Unknown,12.0,61898.81,Quick,0,50,24,434.0,28.125,0.144358,,0.098734,0.483476,
3,3470262,58d360d3-0b12-4d92-ae3b-2f87e43d521c,27,2025-06-27 16:49:05+00:00,iOS,0,50000,43,salary,1_Repeat Applicant,3.0,Married,national id,College Graduate,Parent,yahoo.com,pampanga,0.0,0.0,,Trench 3,Quick,2025-06-27 16:51:30,,,,,,,,0,27,APPROVED,0,27,2,,0.0,Quick,0,1,6,26.0,,0.163753,,0.136967,0.467202,
4,1829519,7d525e35-28e6-4056-8048-e338442242d7,879,2025-05-18 15:56:27+00:00,iOS,1,50000,24,salary,1_Repeat Applicant,2.0,Single,less_frequent_cat,College Graduate,Co-worker,gmail.com,metro manila,,,,Trench 3,Quick,NaT,,,,,,,,0,176,REJECT,0,879,Unknown,,1061107.89,Quick,1,473,6,0.0,6.30597,0.075721,,0.080976,0.484917,


In [74]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_applied_loan_decision',
       'c_t3_tx_min_age_completed_loans', 'c_t3_tx_dob_observation_date',
       'c_t3_tx_cnt_jira_tickets_created_bin', 'c_t3_tx_max_ever_dpd',
       'c_t3_tx_amt_cash_in_total', 'c_t3_tx_last_applied_loan_type_bin',
       'c_t3_tx_cnt_completed_loans', 'c_t3_tx_meng_no_of_logins',
       'c_t3_tx_last_applied_loan_tenor',
       'c_t3_tx_med_days_bt_cash_out_trans',
       'c_t3_tx_avg_days_bt_cash_in_trans',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Create prediction JSON with the score columns
        prediction = {
            "demo_score": row.get('demo_score', 0),
            "apps_score": row.get('apps_score', 0),
            "credo_score": row.get('credo_score', 0),
            "stack_score": row.get('stack_score', 0),
            "cic_score": row.get('cic_score', 0),
            "trx_score":row.get('trx_score', 0),
        }
        
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": json.dumps(prediction),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench3_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [75]:
transformeddata = f'{cash_beta_trench3}'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (38621, 13)


In [76]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2688774,96a63ada-7037-4302-be52-c98dc8dd3086,88c436d4-8182-4ce7-887d-92efbd00c253,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:30:17.971372,2025-09-17T10:30:17.971372,Cash_beta_trench3_backscore,v1,trench alpha beta,9f5bfcf3-388d-4534-aecf-9c56c458a4d1,2025-09-17T10:30:17.971372,{},"{""days_on_book"": 138, ""ln_appln_submit_datetim..."
1,2834010,aa0f621c-291a-45be-9f41-38f79d0cfbde,c583136f-a3aa-4a20-81de-cf2682cf9459,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:30:17.971372,2025-09-17T10:30:17.971372,Cash_beta_trench3_backscore,v1,trench alpha beta,b1d4d81e-fbd5-443d-afad-08ff9225a9b5,2025-09-17T10:30:17.971372,{},"{""days_on_book"": 51, ""ln_appln_submit_datetime..."
2,1996933,86f87afa-0a6a-43ad-ba06-43c53d14bf93,0e00961b-3337-4f95-a266-fc533de18089,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:30:17.971372,2025-09-17T10:30:17.971372,Cash_beta_trench3_backscore,v1,trench alpha beta,e7808969-506c-4c42-860a-f815679fc20e,2025-09-17T10:30:17.971372,{},"{""days_on_book"": 715, ""ln_appln_submit_datetim..."
3,3470262,58d360d3-0b12-4d92-ae3b-2f87e43d521c,25ed2051-6d1f-4704-8fb4-982c5d63329c,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:30:17.971372,2025-09-17T10:30:17.971372,Cash_beta_trench3_backscore,v1,trench alpha beta,7e1d3ef2-5679-4f28-abf1-65f454afa02b,2025-09-17T10:30:17.971372,{},"{""days_on_book"": 27, ""ln_appln_submit_datetime..."
4,1829519,7d525e35-28e6-4056-8048-e338442242d7,0ef264f1-d3ee-434c-b612-32e846cb390c,"{""demo_score"": NaN, ""apps_score"": NaN, ""credo_...",2025-09-17T10:30:17.972373,2025-09-17T10:30:17.972373,Cash_beta_trench3_backscore,v1,trench alpha beta,62b1d0f7-7265-48f3-8a0c-5a0ef5694230,2025-09-17T10:30:17.972373,{},"{""days_on_book"": 879, ""ln_appln_submit_datetim..."


In [77]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [78]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [79]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench3_applied_loans_backscored_20241001_20250831.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench3_applied_loans_backscored_20241001_20250831.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench3_applied_loans_backscored_20241001_20250831.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_beta_trench3_applied_loans_backscored_20241001_20250831.joblib


# Insert the data into a table

In [80]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=950f9ff8-1978-464a-9152-a97624c3306a>

In [82]:
sq = """select modelDisplayName, count(distinct digitalLoanAccountId) from dap_ds_poweruser_playground.temp_model_transformed_data group by 1 order by 1;"""

d2 = client.query(sq).to_dataframe()
d2



Unnamed: 0,modelDisplayName,f0_
0,Cash_beta_trench1_backscore,296480
1,Cash_beta_trench2_backscore,111973
2,Cash_beta_trench3_backscore,38621


# cash_alpha_trench1_applied_loans_backscored_20241001_20250831

# Table

In [83]:
schema1 = 'worktable_data_analysis'
cash_alpha_trench1 = f'cash_alpha_trench1_applied_loans_backscored_20241001_20250831'

# Query

In [84]:
sq = f"""
select * from {schema1}.{cash_alpha_trench1};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench1} table is:\t {d1.shape}")

Job ID b1dba8cf-314f-47be-8601-43fafe158f4a successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831 table is:	 (62044, 48)


In [85]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
       'ca_app_cnt_health_and_fitness_ever', 'ca_app_cnt_shopping_ever',
       'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d',
       'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_bee

In [86]:
d1['trx_score'] = np.nan

In [87]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,trench_category,ln_loan_type,ln_disb_dtime,ca_app_cnt_health_and_fitness_ever,ca_app_cnt_shopping_ever,ca_app_median_time_bw_installed_mins_ever,ca_app_avg_time_bw_installed_mins_3d,ca_app_cnt_crypto_ever,ca_app_cnt_driver_ever,ca_app_cnt_payday_180d,ca_app_cnt_gambling_180d,ca_cic_max_age_all_contracts_snapshot,ca_cic_ratio_overdue_contracts_to_granted_contracts,ca_cic_ScoreRange,ca_cic_ln_loan_level_user_type,ca_cic_has_ever_been_overdue,ca_cic_latest_granted_contract_overdue_flag,ca_cic_ratio_closed_over_new_granted_cnt_24M,ca_cic_ratio_risky_contracts_to_granted_contracts,ca_cic_Short_and_Term_Loans_granted_contracts_cnt_24M,ca_cic_flg_zero_non_granted_ever,ca_cic_Personal_Loans_granted_contracts_amt_24M,ca_cic_CreditAvgCreditLimit,ca_cic_flg_zero_granted_ever,demo_score,apps_score,credo_score,stack_score,cic_score,trx_score
0,2945653,e7e3e1dc-de0e-4ad8-b374-2cd2ab2bd87a,0,2024-10-16 15:18:51+00:00,iOS,0,62000,24,salary,2_New Applicant,2.0,Single,driving license,College Undergraduate,Co-worker,gmail.com,others,,,Trench 1,Quick,NaT,,,,,,,,,357.0,0.038462,Unknown,2_New Applicant,1.0,0.0,0.653846,0.0,13.0,1,102311.0,0,0,0.504412,,0.259178,0.62085,0.568481,
1,3436627,e07c1df0-56cd-4d5f-bfd4-8deb3afdc95e,0,2025-05-14 14:49:05+00:00,iOS,0,30308,28,salary,2_New Applicant,3.0,Married,national id,College Graduate,Parent,gmail.com,others,,,Trench 1,Quick,NaT,,,,,,,,,133.0,0.444444,NH_Gi,2_New Applicant,1.0,1.0,,0.0,,0,,0,0,0.44743,,0.191818,0.640329,0.71152,
2,3110665,bac4a185-04bf-4c90-b73e-2ee0d6707df4,0,2024-12-16 17:12:22+00:00,iOS,0,50000,37,salary,2_New Applicant,2.0,Single,umid,College Graduate,Sibling,gmail.com,rizal,,,Trench 1,Quick,NaT,,,,,,,,,1740.0,0.238095,Unknown,2_New Applicant,1.0,0.0,1.0,0.0,2.0,0,16463.0,40000,0,0.453835,,0.213692,0.534009,0.583933,
3,3143585,cc03f3f1-4132-4ed7-9e14-8a096139acc6,0,2024-12-27 11:08:27+00:00,iOS,0,75000,56,income from business,2_New Applicant,2.0,Single,driving license,College Graduate,Friend,gmail.com,davao del sur,1.0,0.0,Trench 1,Quick,2024-12-27 12:12:41,,,,,,,,,3493.0,0.097561,Unknown,2_New Applicant,1.0,0.0,1.0,0.04878,9.0,0,41015.0,50000,0,0.395059,,0.160674,0.24343,0.409816,
4,3616924,f3bd7ba1-a1a9-4868-97ab-ad1a75799f60,6,2025-08-18 10:29:47+00:00,iOS,1,70000,33,remittance,2_New Applicant,missing,Married,driving license,Technical/Vocational Graduate,Spouse,gmail.com,others,,,Trench 1,Quick,NaT,,,,,,,,,1688.0,0.018868,Unknown,2_New Applicant,1.0,0.0,0.8,0.018868,,0,150018.0,0,0,0.570916,,0.193953,0.553493,0.507959,


In [88]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
       'ca_app_cnt_health_and_fitness_ever', 'ca_app_cnt_shopping_ever',
       'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d',
       'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
       'ca_cic_latest_granted_contract_overdue_flag',
       'ca_cic_ratio_closed_over_new_granted_cnt_24M',
       'ca_cic_ratio_risky_contracts_to_granted_contracts',
       'ca_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'ca_cic_flg_zero_non_granted_ever',
       'ca_cic_Personal_Loans_granted_contracts_amt_24M',
       'ca_cic_CreditAvgCreditLimit', 'ca_cic_flg_zero_granted_ever',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Create prediction JSON with the score columns
        prediction = {
            "demo_score": row.get('demo_score', 0),
            "apps_score": row.get('apps_score', 0),
            "credo_score": row.get('credo_score', 0),
            "stack_score": row.get('stack_score', 0),
            "cic_score": row.get('cic_score', 0),
            "trx_score":row.get('trx_score', 0),
        }
        
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": json.dumps(prediction),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench1_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [89]:
transformeddata = f'{cash_alpha_trench1}'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (62044, 13)


In [90]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2945653,e7e3e1dc-de0e-4ad8-b374-2cd2ab2bd87a,49e16621-7514-41db-956b-8167a44e47ae,"{""demo_score"": 0.504411926197625, ""apps_score""...",2025-09-17T10:32:17.395819,2025-09-17T10:32:17.395819,Cash_alpha_trench1_backscore,v1,trench alpha beta,7a60fdd8-dba2-4436-bd60-f1917f2b7ea8,2025-09-17T10:32:17.395819,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
1,3436627,e07c1df0-56cd-4d5f-bfd4-8deb3afdc95e,fb874d57-9422-4bbb-be88-2781c29338af,"{""demo_score"": 0.44742975606632246, ""apps_scor...",2025-09-17T10:32:17.395819,2025-09-17T10:32:17.395819,Cash_alpha_trench1_backscore,v1,trench alpha beta,95de696a-cd16-49fa-82a2-b8fcd8adfeae,2025-09-17T10:32:17.395819,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
2,3110665,bac4a185-04bf-4c90-b73e-2ee0d6707df4,e8d497c4-0c09-4b52-bb3a-b7f6a0c21708,"{""demo_score"": 0.45383506885111236, ""apps_scor...",2025-09-17T10:32:17.396825,2025-09-17T10:32:17.396825,Cash_alpha_trench1_backscore,v1,trench alpha beta,9d6c121f-1248-4926-a0ba-9d512820b8e7,2025-09-17T10:32:17.396825,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
3,3143585,cc03f3f1-4132-4ed7-9e14-8a096139acc6,1a8eb9ae-2a1a-4125-82ab-fe70c6d4b681,"{""demo_score"": 0.39505892294939876, ""apps_scor...",2025-09-17T10:32:17.396825,2025-09-17T10:32:17.396825,Cash_alpha_trench1_backscore,v1,trench alpha beta,117d4a91-7cb6-4703-a270-5318a11b7eaf,2025-09-17T10:32:17.396825,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
4,3616924,f3bd7ba1-a1a9-4868-97ab-ad1a75799f60,14e57ce5-a728-4811-bc20-e4c21b3b60f0,"{""demo_score"": 0.5709163633237098, ""apps_score...",2025-09-17T10:32:17.396825,2025-09-17T10:32:17.396825,Cash_alpha_trench1_backscore,v1,trench alpha beta,e76d2cb4-9517-4150-b5a5-a41961d9d7ab,2025-09-17T10:32:17.396825,{},"{""days_on_book"": 6, ""ln_appln_submit_datetime""..."


In [91]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [92]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [93]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench1_applied_loans_backscored_20241001_20250831.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench1_applied_loans_backscored_20241001_20250831.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench1_applied_loans_backscored_20241001_20250831.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench1_applied_loans_backscored_20241001_20250831.joblib


# Insert the data into a table

In [94]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=3251b9de-ba7c-433d-9bfd-e4528ea2677c>

In [95]:
sq = """select modelDisplayName, count(distinct digitalLoanAccountId) from dap_ds_poweruser_playground.temp_model_transformed_data group by 1 order by 1;"""

d2 = client.query(sq).to_dataframe()
d2



Unnamed: 0,modelDisplayName,f0_
0,Cash_alpha_trench1_backscore,62044
1,Cash_beta_trench1_backscore,296480
2,Cash_beta_trench2_backscore,111973
3,Cash_beta_trench3_backscore,38621


# cash_alpha_trench2_applied_loans_backscored_20241001_20250831

# Table

In [96]:
schema1 = 'worktable_data_analysis'
cash_alpha_trench2 = f'cash_alpha_trench2_applied_loans_backscored_20241001_20250831'

# Query

In [97]:
sq = f"""
select * from {schema1}.{cash_alpha_trench2};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench2} table is:\t {d1.shape}")

Job ID 7ba3f382-37f3-4e11-9a7f-f345229ef0fd successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831 table is:	 (39651, 60)


In [98]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
       'ca_app_cnt_health_and_fitness_ever', 'ca_app_cnt_shopping_ever',
       'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d',
       'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       '

In [99]:
d1['trx_score'] = np.nan

In [100]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,trench_category,ln_loan_type,ln_disb_dtime,ca_app_cnt_health_and_fitness_ever,ca_app_cnt_shopping_ever,ca_app_median_time_bw_installed_mins_ever,ca_app_avg_time_bw_installed_mins_3d,ca_app_cnt_crypto_ever,ca_app_cnt_driver_ever,ca_app_cnt_payday_180d,ca_app_cnt_gambling_180d,ca_t2_tx_meng_ql_calculator_tot_visit_cnt,ca_t2_tx_first_product_user_segment_WOE,ca_t2_tx_first_applied_loan_type_bin_WOE,ca_t2_tx_cnt_rejected_loans,ca_t2_tx_appsflyer_install_to_registration_minutes,ca_t2_tx_first_applied_loan_amount,ca_t2_tx_deposit_accnt_cnt,ca_t2_tx_cnt_cash_in_total,ca_t2_tx_cnt_incomplete_loan_apps,ca_t2_tx_amt_cash_in_total,ca_t2_tx_last_applied_loan_tenor_bin_WOE,trx_score,ca_cic_max_age_all_contracts_snapshot,ca_cic_ratio_overdue_contracts_to_granted_contracts,ca_cic_ScoreRange,ca_cic_ln_loan_level_user_type,ca_cic_has_ever_been_overdue,ca_cic_latest_granted_contract_overdue_flag,ca_cic_ratio_closed_over_new_granted_cnt_24M,ca_cic_ratio_risky_contracts_to_granted_contracts,ca_cic_Short_and_Term_Loans_granted_contracts_cnt_24M,ca_cic_flg_zero_non_granted_ever,ca_cic_Personal_Loans_granted_contracts_amt_24M,ca_cic_CreditAvgCreditLimit,ca_cic_flg_zero_granted_ever,demo_score,apps_score,credo_score,cic_score,stack_score
0,2278578,527a7986-f598-4498-8dbe-9f5baa0397ce,421,2024-12-14 00:48:46+00:00,iOS,1,50000,28,salary,2_New Applicant,1.0,Single,umid,College Graduate,Parent,gmail.com,davao del sur,,,Trench 2,Quick,NaT,,,,,,,,,3.0,-0.00838,0.177273,0.0,8.0,,,0,1.0,,0.182395,,730.0,0.571429,Unknown,2_New Applicant,1.0,1.0,,0.142857,,0,,0,0,0.591407,,0.336768,0.74858,0.867653
1,2808330,18122fb1-947f-4004-9033-c4bdcbbca5e6,40,2024-10-09 08:06:30+00:00,iOS,1,26790,48,salary,2_New Applicant,3.0,Married,umid,College Graduate,Friend,gmail.com,others,1.0,0.0,Trench 2,Quick,2024-10-09 18:52:09,,,,,,,,,73.0,-0.00838,0.177273,0.0,11.0,,,0,1.0,,0.182395,,2646.0,0.066667,Bi,2_New Applicant,1.0,0.0,,0.0,,1,,180000,0,0.464558,,0.272219,0.413837,0.203759
2,3403996,a087c785-8d47-49d7-83f1-896dfc0b007a,415,2025-04-26 22:49:01+00:00,iOS,0,27000,25,salary,2_New Applicant,3.0,Single,national id,College Graduate,Sibling,gmail.com,others,,,Trench 2,Quick,NaT,,,,,,,,,6.0,0.555098,0.177273,,,,,0,,,0.182395,,89.0,0.0,Ai,2_New Applicant,0.0,0.0,0.888889,0.0,,0,18000.0,0,0,0.460365,,0.198726,0.506584,0.370856
3,3172423,965de26f-3ab6-4992-b678-f7f6f8219736,31,2025-02-07 16:49:25+00:00,iOS,1,27500,31,salary,2_New Applicant,1.0,Married,passport,College Undergraduate,Friend,gmail.com,laguna,,,Trench 2,Quick,NaT,,,,,,,,,8.0,-0.00838,0.177273,0.0,366.0,,,0,1.0,,0.182395,,471.0,0.0,Di,2_New Applicant,0.0,0.0,0.625,0.0,,0,8020.0,0,0,0.583504,,0.234277,0.487934,0.54
4,2422495,59483165-7e09-4ab2-a038-c13577f02479,241,2024-11-04 23:37:50+00:00,iOS,1,138500,33,salary,2_New Applicant,3.0,Single,umid,College Graduate,Friend,gmail.com,metro manila,1.0,0.0,Trench 2,Quick,2024-11-05 10:29:12,,,,,,,,,23.0,-0.00838,0.177273,,,,,0,,,0.182395,,2085.0,0.0,Bi,2_New Applicant,0.0,0.0,0.333333,0.0,,0,50000.0,209666,0,0.433422,,0.20472,0.394879,0.228603


In [101]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
       'ca_app_cnt_health_and_fitness_ever', 'ca_app_cnt_shopping_ever',
       'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d',
       'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsflyer_install_to_registration_minutes',
       'ca_t2_tx_first_applied_loan_amount', 'ca_t2_tx_deposit_accnt_cnt',
       'ca_t2_tx_cnt_cash_in_total', 'ca_t2_tx_cnt_incomplete_loan_apps',
       'ca_t2_tx_amt_cash_in_total',
       'ca_t2_tx_last_applied_loan_tenor_bin_WOE', 'trx_score',
       'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
       'ca_cic_latest_granted_contract_overdue_flag',
       'ca_cic_ratio_closed_over_new_granted_cnt_24M',
       'ca_cic_ratio_risky_contracts_to_granted_contracts',
       'ca_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'ca_cic_flg_zero_non_granted_ever',
       'ca_cic_Personal_Loans_granted_contracts_amt_24M',
       'ca_cic_CreditAvgCreditLimit', 'ca_cic_flg_zero_granted_ever',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Create prediction JSON with the score columns
        prediction = {
            "demo_score": row.get('demo_score', 0),
            "apps_score": row.get('apps_score', 0),
            "credo_score": row.get('credo_score', 0),
            "stack_score": row.get('stack_score', 0),
            "cic_score": row.get('cic_score', 0),
            "trx_score":row.get('trx_score', 0),
        }
        
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": json.dumps(prediction),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench2_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [102]:
transformeddata = f'{cash_alpha_trench2}'
print(transformeddata)

cash_alpha_trench2_applied_loans_backscored_20241001_20250831


In [103]:
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (39651, 13)


In [104]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2278578,527a7986-f598-4498-8dbe-9f5baa0397ce,d98bf478-4817-4c5b-82ea-dfb9ea980d12,"{""demo_score"": 0.5914070310856829, ""apps_score...",2025-09-17T10:34:00.777594,2025-09-17T10:34:00.777594,Cash_alpha_trench2_backscore,v1,trench alpha beta,5eb9f55a-e772-40bd-92e9-2f8d16dcc45b,2025-09-17T10:34:00.777594,{},"{""days_on_book"": 421, ""ln_appln_submit_datetim..."
1,2808330,18122fb1-947f-4004-9033-c4bdcbbca5e6,5251c97b-2020-481d-bcd6-abb5e45bbe52,"{""demo_score"": 0.4645575141164843, ""apps_score...",2025-09-17T10:34:00.778593,2025-09-17T10:34:00.778593,Cash_alpha_trench2_backscore,v1,trench alpha beta,bd022624-412a-4ce8-93a3-10ae1875534e,2025-09-17T10:34:00.778593,{},"{""days_on_book"": 40, ""ln_appln_submit_datetime..."
2,3403996,a087c785-8d47-49d7-83f1-896dfc0b007a,c0867a05-2186-4836-8bfc-180e712ad0bf,"{""demo_score"": 0.4603653023836028, ""apps_score...",2025-09-17T10:34:00.778593,2025-09-17T10:34:00.778593,Cash_alpha_trench2_backscore,v1,trench alpha beta,00c65a09-698b-48e8-a746-723bb1c697fc,2025-09-17T10:34:00.778593,{},"{""days_on_book"": 415, ""ln_appln_submit_datetim..."
3,3172423,965de26f-3ab6-4992-b678-f7f6f8219736,9ee3bba6-28b7-416a-9c84-5fb9a06cc5b7,"{""demo_score"": 0.5835036923589388, ""apps_score...",2025-09-17T10:34:00.778593,2025-09-17T10:34:00.778593,Cash_alpha_trench2_backscore,v1,trench alpha beta,712a3a57-ce14-4439-9576-9789fe9fb48b,2025-09-17T10:34:00.778593,{},"{""days_on_book"": 31, ""ln_appln_submit_datetime..."
4,2422495,59483165-7e09-4ab2-a038-c13577f02479,68747784-4912-4c38-ae21-418a24ace204,"{""demo_score"": 0.43342198257276704, ""apps_scor...",2025-09-17T10:34:00.778593,2025-09-17T10:34:00.778593,Cash_alpha_trench2_backscore,v1,trench alpha beta,b60679eb-fa34-4b45-8a8c-604b2cad1c20,2025-09-17T10:34:00.778593,{},"{""days_on_book"": 241, ""ln_appln_submit_datetim..."


In [105]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [106]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [107]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench2_applied_loans_backscored_20241001_20250831.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench2_applied_loans_backscored_20241001_20250831.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench2_applied_loans_backscored_20241001_20250831.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench2_applied_loans_backscored_20241001_20250831.joblib


# Insert the data into a table

In [108]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=15fb3d9f-1e11-4299-8a32-8b0ca2e6bf4e>

In [109]:
sq = """select modelDisplayName, count(distinct digitalLoanAccountId) from dap_ds_poweruser_playground.temp_model_transformed_data group by 1 order by 1;"""

d2 = client.query(sq).to_dataframe()
d2



Unnamed: 0,modelDisplayName,f0_
0,Cash_alpha_trench1_backscore,62044
1,Cash_alpha_trench2_backscore,39651
2,Cash_beta_trench1_backscore,296480
3,Cash_beta_trench2_backscore,111973
4,Cash_beta_trench3_backscore,38621


# cash_alpha_trench3_applied_loans_backscored_20241001_20250831

# Table

In [110]:
schema1 = 'worktable_data_analysis'
cash_alpha_trench3 = f'cash_alpha_trench3_applied_loans_backscored_20241001_20250831'

# Query

In [111]:
sq = f"""
select * from {schema1}.{cash_alpha_trench3};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench3} table is:\t {d1.shape}")

Job ID 9e3696a2-5f6e-40af-b23c-17627ec8ef95 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831 table is:	 (16631, 49)


In [112]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_

In [113]:
# d1['trx_score'] = np.nan

In [114]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,trench_category,ln_loan_type,ln_disb_dtime,c_app_cnt_absence_tag_365d_binned,c_app_cnt_books_and_reference_ever_binned,c_app_cnt_gaming_180d_binned,c_app_cnt_health_and_fitness_ever_binned,c_app_cnt_productivity_ever_binned,c_app_cnt_rated_for_18plus_ever_binned,c_app_last_payday_install_to_apply_days_binned,ca_t3_tx_cnt_installments_paid_tot_with_dpd,ca_t3_tx_time_since_last_applied_loan_application_time,ca_t3_tx_last_applied_loan_decision,ca_t3_tx_min_age_completed_loans,ca_t3_tx_dob_observation_date,ca_t3_tx_cnt_jira_tickets_created_bin,ca_t3_tx_max_ever_dpd,ca_t3_tx_amt_cash_in_total,ca_t3_tx_last_applied_loan_type_bin,ca_t3_tx_cnt_completed_loans,ca_t3_tx_meng_no_of_logins,ca_t3_tx_last_applied_loan_tenor,ca_t3_tx_med_days_bt_cash_out_trans,ca_t3_tx_avg_days_bt_cash_in_trans,trx_score,demo_score,apps_score,credo_score,cic_score,stack_score
0,1897828,40801a80-b17a-4d51-adc9-748240e650ee,721,2025-02-02 15:07:28+00:00,iOS,1,43000,28,salary,1_Repeat Applicant,missing,Married,less_frequent_cat,College Graduate,Sibling,gmail.com,others,1,0,Trench 3,Quick,2025-02-02 15:12:46,,,,,,,,0,708,REJECT,4,721,1,,623709.0,Quick,1,113,6,0.0,3.962025,0.08765,0.465872,,0.092285,0.575668,0.529028
1,2441372,e993fa25-eefe-4863-82cf-472d90bd622e,220,2024-11-01 11:17:47+00:00,iOS,0,25000,33,income from business,1_Repeat Applicant,2.0,Single,umid,College Graduate,Sibling,yahoo.com,cavite,1,0,Trench 3,Quick,2024-11-02 18:39:18,,,,,,,,0,220,APPROVED,0,220,1,,8977.24,SIL,0,23,6,,30.666667,0.088483,0.488606,,0.098789,0.552489,0.528184
2,2226781,34568e85-907c-4639-a859-7bc86932b853,465,2024-12-17 19:11:53+00:00,iOS,1,30000,23,salary,1_Repeat Applicant,1.0,Single,national id,College Graduate,Parent,gmail.com,metro manila,1,1,Trench 3,Quick,2024-12-17 19:58:15,,,,,,,,0,83,REJECT,6,465,1,,51847.0,Quick,1,106,12,1.0,10.777778,0.125995,0.554495,,0.113403,0.397009,0.512387
3,2271357,580c38bc-ec05-4a48-89f4-ec02313afe6b,510,2025-03-06 14:34:26+00:00,iOS,0,50000,30,income from business,1_Repeat Applicant,1.0,Single,driving license,College Graduate,Sibling,yahoo.com,pampanga,1,0,Trench 3,Quick,2025-03-06 14:43:02,,,,,,,,0,154,APPROVED,0,510,1,,33866.52,Quick,0,21,6,76.5,24.2,0.074312,0.501559,,0.0864,0.391259,0.475434
4,2173280,a631ab67-e35e-48f0-9051-f84d9c4df5cb,560,2025-02-15 08:02:26+00:00,iOS,0,59000,45,salary,1_Repeat Applicant,1.0,Married,umid,College Graduate,Child,yahoo.com,negros occidental,1,0,Trench 3,Quick,2025-02-15 08:10:49,,,,,,,,0,276,APPROVED,182,560,1,,35450.0,Quick,1,72,9,182.0,35.75,0.048529,0.343457,,0.154069,0.369945,0.420548


In [115]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_decision',
       'ca_t3_tx_min_age_completed_loans',
       'ca_t3_tx_dob_observation_date',
       'ca_t3_tx_cnt_jira_tickets_created_bin', 'ca_t3_tx_max_ever_dpd',
       'ca_t3_tx_amt_cash_in_total',
       'ca_t3_tx_last_applied_loan_type_bin',
       'ca_t3_tx_cnt_completed_loans', 'ca_t3_tx_meng_no_of_logins',
       'ca_t3_tx_last_applied_loan_tenor',
       'ca_t3_tx_med_days_bt_cash_out_trans',
       'ca_t3_tx_avg_days_bt_cash_in_trans',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
        # Create prediction JSON with the score columns
        prediction = {
            "demo_score": row.get('demo_score', 0),
            "apps_score": row.get('apps_score', 0),
            "credo_score": row.get('credo_score', 0),
            "stack_score": row.get('stack_score', 0),
            "cic_score": row.get('cic_score', 0),
            "trx_score":row.get('trx_score', 0),
        }
        
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": json.dumps(prediction),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench3_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [116]:
transformeddata = f'{cash_alpha_trench3}'
print(transformeddata)

cash_alpha_trench3_applied_loans_backscored_20241001_20250831


In [117]:
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (16631, 13)


In [118]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,1897828,40801a80-b17a-4d51-adc9-748240e650ee,1ec8d0fd-543d-4df1-b762-2e832ecbdab4,"{""demo_score"": 0.4658721040375334, ""apps_score...",2025-09-17T10:35:42.306202,2025-09-17T10:35:42.306202,Cash_alpha_trench3_backscore,v1,trench alpha beta,ff151af4-43ee-46db-9315-d30806bb8620,2025-09-17T10:35:42.306202,{},"{""days_on_book"": 721, ""ln_appln_submit_datetim..."
1,2441372,e993fa25-eefe-4863-82cf-472d90bd622e,c721cb37-c5db-47bc-8c4d-922cdc246e78,"{""demo_score"": 0.48860554783439586, ""apps_scor...",2025-09-17T10:35:42.306202,2025-09-17T10:35:42.306202,Cash_alpha_trench3_backscore,v1,trench alpha beta,759ea5f5-be8a-4c4b-aeb3-89ee4d506f0e,2025-09-17T10:35:42.306202,{},"{""days_on_book"": 220, ""ln_appln_submit_datetim..."
2,2226781,34568e85-907c-4639-a859-7bc86932b853,a4f711a8-f077-4997-93b8-e3c48418e41f,"{""demo_score"": 0.5544949520322454, ""apps_score...",2025-09-17T10:35:42.306202,2025-09-17T10:35:42.306202,Cash_alpha_trench3_backscore,v1,trench alpha beta,6b65da32-701b-4511-8d6c-0463f8b7ee09,2025-09-17T10:35:42.306202,{},"{""days_on_book"": 465, ""ln_appln_submit_datetim..."
3,2271357,580c38bc-ec05-4a48-89f4-ec02313afe6b,92a88cb9-4549-4f05-aad3-5e949ece1cfc,"{""demo_score"": 0.5015593445757445, ""apps_score...",2025-09-17T10:35:42.307201,2025-09-17T10:35:42.307201,Cash_alpha_trench3_backscore,v1,trench alpha beta,13652e5a-c256-44fd-84e4-252a553a008e,2025-09-17T10:35:42.307201,{},"{""days_on_book"": 510, ""ln_appln_submit_datetim..."
4,2173280,a631ab67-e35e-48f0-9051-f84d9c4df5cb,65818a81-d2f3-4d19-9c5d-d8b638ea68d5,"{""demo_score"": 0.3434565250849461, ""apps_score...",2025-09-17T10:35:42.307201,2025-09-17T10:35:42.307201,Cash_alpha_trench3_backscore,v1,trench alpha beta,66fe4137-a7ae-4131-b9b3-281a93e248f1,2025-09-17T10:35:42.307201,{},"{""days_on_book"": 560, ""ln_appln_submit_datetim..."


In [119]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [120]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [121]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench3_applied_loans_backscored_20241001_20250831.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench3_applied_loans_backscored_20241001_20250831.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench3_applied_loans_backscored_20241001_20250831.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_44dbc309ac1b_cash_alpha_trench3_applied_loans_backscored_20241001_20250831.joblib


# Insert the data into a table

In [122]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=d4ff6860-4311-47df-a1d9-84601ab4494e>

In [123]:
sq = """select modelDisplayName, count(distinct digitalLoanAccountId) from dap_ds_poweruser_playground.temp_model_transformed_data group by 1 order by 1;"""

d2 = client.query(sq).to_dataframe()
d2



Unnamed: 0,modelDisplayName,f0_
0,Cash_alpha_trench1_backscore,62044
1,Cash_alpha_trench2_backscore,39651
2,Cash_alpha_trench3_backscore,16631
3,Cash_beta_trench1_backscore,296480
4,Cash_beta_trench2_backscore,111973
5,Cash_beta_trench3_backscore,38621


# Merged final table

In [125]:
schema3 = 'dap_ds_poweruser_playground'
tab1 = 'temp_final_model_transformed_data'

In [126]:
sq = f"""
create or replace table {schema3}.{tab1} as
SELECT cast(customerId as numeric)customerId, digitalLoanAccountId, prediction, start_time, end_time  
, modelDisplayName
, modelVersionId
, calcFeature
, 'ml_model_run_details' source
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details` 
qualify row_number() over(partition by customerId ,digitalLoanAccountId, modelDisplayName order by start_time
desc) = 1

union all 

select customerId, digitalLoanAccountId, prediction, datetime(start_time)start_time, datetime(end_time) end_time 
, modelDisplayName
, modelVersionId
, calcFeature 
, 'Manual_Backscore_tables' source
from dap_ds_poweruser_playground.temp_model_transformed_data

;
"""
job = client.query(sq)
job.result()  # Wait for the job to complete.
time.sleep(5) # Delays for 30 seconds
print(f'Table {schema3}.{tab1} created successfully')



Table dap_ds_poweruser_playground.temp_final_model_transformed_data created successfully


In [128]:
sq = """select modelDisplayName, source, count(distinct digitalLoanAccountId) from dap_ds_poweruser_playground.temp_final_model_transformed_data
group by 1,2
order by 1,2"""

d3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
d3

Job ID 9d45e4a8-6714-44ad-91cf-93c6cd286f8d successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


Unnamed: 0,modelDisplayName,source,f0_
0,Alpha - IncomeEstimationModel,ml_model_run_details,48169
1,Alpha - CIC-SIL-Model,ml_model_run_details,48169
2,Alpha - StackingModel,ml_model_run_details,48169
3,Beta - AppsScoreModel,ml_model_run_details,68042
4,Beta - DemoScoreModel,ml_model_run_details,76893
5,Beta - IncomeEstimationModel,ml_model_run_details,76893
6,Beta - StackScoreModel,ml_model_run_details,76893
7,Cash_alpha_trench1_backscore,Manual_Backscore_tables,62044
8,Cash_alpha_trench2_backscore,Manual_Backscore_tables,39651
9,Cash_alpha_trench3_backscore,Manual_Backscore_tables,16631


In [None]:
# d1.to_csv(fr"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.csv", index = False)
# d1.to_parquet(fr"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.parquet")
# d1.to_pickle(fr"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.pkl")
# joblib.dump(d1, f"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.joblib")