# Define Library

In [127]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.
# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from google.cloud import storage
import os
import tempfile
import time
from datetime import datetime
import uuid
import joblib
import uuid

import gcsfs
import duckdb as dd
import pickle
import joblib
from typing import Union
import io

path = r'C:\Users\Dwaipayan\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')
os.environ["GOOGLE_CLOUD_PROJECT"] = "prj-prod-dataplatform"
# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)
pd.set_option("Display.max_rows", 100)


# Constant

In [128]:
CURRENT_DATE = datetime.now().strftime("%Y%m%d")


# Config

In [129]:
unique_id = str(uuid.uuid4()).replace('-', '')[-12:]
print(f"The unique Id is: {unique_id}")
BUCKETNAME = 'prod-asia-southeast1-tonik-aiml-workspace'
CLOUDPATH = 'DC/Model_Monitoring/Model_Tables'
LOCALPATH = r'D:\OneDrive - Tonik Financial Pte Ltd\MyStuff\Data Engineering\Model_Monitoring\New_Model_Monitoring\Data'
VERSION = 'V1'

The unique Id is: 36afc2cd41f7


# <div align="left" style="color:rgb(51, 250, 250);"> Functions </div>

## <div align="left" style="color:rgb(51, 250, 250);"> Save the data to google clound storage </div>

In [130]:
def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').
    """

    # Create a temporary file
    if file_format == 'csv':
        temp_file = 'temp.csv'
        df.to_csv(temp_file, index=False)
    elif file_format == 'parquet':
        temp_file = 'temp.parquet'
        df.to_parquet(temp_file, index=False)
    else:
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Upload the file to GCS
    storage_client = storage.Client(project="prj-prod-dataplatform")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(temp_file)

    # Remove the temporary file
    import os
    os.remove(temp_file)
    


## <div align="left" style="color:rgb(51, 250, 250);"> Read the Data from Google Cloud Storage </div>

In [131]:
def read_df_from_gcs(bucket_name, source_blob_name, file_format='csv'):
    """Reads a DataFrame from Google Cloud Storage.

    Args:
        bucket_name: The name of the GCS bucket.
        source_blob_name: The name of the blob to read.
        file_format: The file format to read ('csv' or 'parquet').

    Returns:
        pandas.DataFrame: The data loaded from the GCS file.
    """
    # Create a temporary file name
    temp_file = f'temp.{file_format}'
    
    try:
        # Initialize GCS client
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(source_blob_name)

        # Download the file to a temporary location
        blob.download_to_filename(temp_file)

        # Read the file into a DataFrame
        if file_format == 'csv':
            df = pd.read_csv(temp_file, low_memory=False)
        elif file_format == 'parquet':
            df = pd.read_parquet(temp_file)
        else:
            raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

        return df

    finally:
        # Clean up the temporary file
        if os.path.exists(temp_file):
            os.remove(temp_file)

## <div align = "left" style="color:rgb(51, 250, 250);"> Data Quality Report </div>

In [132]:
def data_quality_report(df, target_col='ln_fspd30_flag'):
    # Initialize an empty list to store each row of data
    report_data = []
    # Iterate over each column in the DataFrame to compute metrics
    for col in df.columns:
        # Determine the data type of the column
        data_type = df[col].dtype
       
        # Calculate the number of missing values in the column
        missing_values = df[col].isnull().sum()
       
        # Calculate the percentage of missing values relative to the total number of rows
        missing_percentage = (missing_values / len(df)) * 100
       
        # Calculate the number of unique values in the column
        unique_values = df[col].nunique()
       
        # Calculate the percentage of non-missing values
        non_missing_percentage = ((len(df) - missing_values) / len(df)) * 100
       
        # Check if the column is numeric to compute additional metrics
        if pd.api.types.is_numeric_dtype(df[col]):
            # Compute minimum, maximum, mean, median, mode, mode percentage, standard deviation, and quantiles
            min_value = df[col].min()
            max_value = df[col].max()
            mean_value = df[col].mean()
            median_value = df[col].median()
            mode_value = df[col].mode().iloc[0] if not df[col].mode().empty else None
            mode_percentage = (df[col] == mode_value).sum() / len(df) * 100 if mode_value is not None else None
            std_dev = df[col].std()
            quantile_25 = df[col].quantile(0.25)
            quantile_50 = df[col].quantile(0.50)  # Same as median
            quantile_75 = df[col].quantile(0.75)
            
            # Calculate the Interquartile Range (IQR)
            iqr = quantile_75 - quantile_25
            
            # Calculate Skewness and Kurtosis
            skewness = df[col].skew()
            kurtosis = df[col].kurt()
            
            # Calculate Coefficient of Variation (CV) - standardized measure of dispersion
            cv = (std_dev / mean_value) * 100 if mean_value != 0 else None
            
            # Calculate correlation with target variable if target exists in dataframe
            if target_col in df.columns and col != target_col and pd.api.types.is_numeric_dtype(df[target_col]):
                # Calculate correlation only using rows where both columns have non-null values
                correlation = df[[col, target_col]].dropna().corr().iloc[0, 1]
            else:
                correlation = None
        else:
            # Assign None for non-numeric columns where appropriate
            min_value = None
            max_value = None
            mean_value = None
            median_value = None
            mode_value = df[col].mode().iloc[0] if not df[col].mode().empty else None
            mode_percentage = (df[col] == mode_value).sum() / len(df) * 100 if mode_value is not None else None
            std_dev = None
            quantile_25 = None
            quantile_50 = None
            quantile_75 = None
            iqr = None
            skewness = None
            kurtosis = None
            cv = None
            correlation = None
       
        # Append the computed metrics for the current column to the list
        report_data.append({
            'Column': col,
            'Data Type': data_type,
            'Missing Values': missing_values,
            'Missing Percentage': missing_percentage,
            'Unique Values': unique_values,
            'Min': min_value,
            'Max': max_value,
            'Mean': mean_value,
            'Median': median_value,
            'Mode': mode_value,
            'Mode Percentage': mode_percentage,
            'Std Dev': std_dev,
            'Non-missing Percentage': non_missing_percentage,
            '25% Quantile': quantile_25,
            '50% Quantile': quantile_50,
            '75% Quantile': quantile_75,
            'IQR': iqr,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'CV (%)': cv,
            f'Correlation with {target_col}': correlation
        })
    # Create the DataFrame from the list of dictionaries
    report = pd.DataFrame(report_data)
   
    # Return the complete data quality report DataFrame
    return report

# <div align = "left" style="color:rgb(51,250,250);"> Upload pickle file to Google Cloud Storage Bucke </div>

In [133]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Uploads a file to Google Cloud Storage"""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    blob.upload_from_filename(source_file_path)
    print(f"File {source_file_path} uploaded to {bucket_name}/{destination_blob_name}")

In [134]:
import pickle
import io
from google.cloud import storage
def save_pickle_to_gcs(data, bucket_name, destination_blob_name):
    """
    Save any Python object as a pickle file to Google Cloud Storage
    
    Args:
        data: The Python object to pickle (DataFrame, dict, list, etc.)
        bucket_name: Name of the GCS bucket
        destination_blob_name: Path/filename in the bucket
    """
    # Initialize the GCS client
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    # Serialize the data to pickle format in memory
    pickle_buffer = io.BytesIO()
    pickle.dump(data, pickle_buffer)
    pickle_buffer.seek(0)
    
    # Upload the pickle data to GCS
    blob.upload_from_file(pickle_buffer, content_type='application/octet-stream')
    print(f"Pickle file uploaded to gs://{bucket_name}/{destination_blob_name}")

# save_dataframe_multi_format

In [135]:
def save_dataframe_multi_format(
    dataframe: pd.DataFrame, 
    cloud_path: str, 
    filename: str, 
    client: bigquery.Client = None,
    bucket_name: str = None
) -> dict:
    """
    Save a pandas DataFrame to Google Cloud Storage in multiple formats (CSV, Pickle, Parquet, Joblib).
    
    Args:
        dataframe (pd.DataFrame): The DataFrame to save
        cloud_path (str): The cloud path (e.g., 'DC/Model_Monitoring/cash_beta_trench1_data')
        filename (str): The base filename without extension
        client (bigquery.Client, optional): BigQuery client (for project reference)
        bucket_name (str, optional): GCS bucket name. If None, will try to extract from client
        
    Returns:
        dict: Dictionary with status of each file saved
        
    Example:
        client = bigquery.Client(project='prj-prod-dataplatform')
        CLOUDPATH = 'DC/Model_Monitoring/cash_beta_trench1_data'
        
        results = save_dataframe_multi_format(
            dataframe=d1,
            cloud_path=CLOUDPATH,
            filename='my_data',
            client=client,
            bucket_name='your-bucket-name'  # Replace with your actual bucket name
        )
    """
    
    # Initialize Google Cloud Storage client
    storage_client = storage.Client(project=client.project if client else None)
    
    # You'll need to specify your bucket name here
    # Common bucket names in GCP data platforms might be like:
    # - 'prj-prod-dataplatform-storage'
    # - 'dataplatform-storage'
    # - or similar pattern
    if bucket_name is None:
        # You need to replace this with your actual bucket name
        raise ValueError("Please provide the bucket_name parameter")
    
    bucket = storage_client.bucket(bucket_name)
    
    # Results dictionary to track saves
    results = {}
    
    # Ensure cloud_path doesn't start with '/'
    cloud_path = cloud_path.lstrip('/')
    
    try:
        # 1. Save as CSV
        csv_buffer = io.StringIO()
        dataframe.to_csv(csv_buffer, index=False)
        csv_blob = bucket.blob(f"{cloud_path}/{filename}.csv")
        csv_blob.upload_from_string(csv_buffer.getvalue(), content_type='text/csv')
        results['csv'] = f"gs://{bucket_name}/{cloud_path}/{filename}.csv"
        
        # 2. Save as Pickle
        pickle_buffer = io.BytesIO()
        pickle.dump(dataframe, pickle_buffer)
        pickle_blob = bucket.blob(f"{cloud_path}/{filename}.pkl")
        pickle_blob.upload_from_string(pickle_buffer.getvalue(), content_type='application/octet-stream')
        results['pickle'] = f"gs://{bucket_name}/{cloud_path}/{filename}.pkl"
        
        # 3. Save as Parquet
        parquet_buffer = io.BytesIO()
        dataframe.to_parquet(parquet_buffer, index=False)
        parquet_blob = bucket.blob(f"{cloud_path}/{filename}.parquet")
        parquet_blob.upload_from_string(parquet_buffer.getvalue(), content_type='application/octet-stream')
        results['parquet'] = f"gs://{bucket_name}/{cloud_path}/{filename}.parquet"
        
        # 4. Save as Joblib
        joblib_buffer = io.BytesIO()
        joblib.dump(dataframe, joblib_buffer)
        joblib_blob = bucket.blob(f"{cloud_path}/{filename}.joblib")
        joblib_blob.upload_from_string(joblib_buffer.getvalue(), content_type='application/octet-stream')
        results['joblib'] = f"gs://{bucket_name}/{cloud_path}/{filename}.joblib"
        
        print("All files saved successfully!")
        for format_type, path in results.items():
            print(f"{format_type.upper()}: {path}")
            
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        results['error'] = str(e)
    
    return results

# cash_beta_trench1_applied_loans_backscored_20241001_20250831

# Table

In [136]:
schema1 = 'worktable_data_analysis'
cash_beta_trench1 = f'cash_beta_trench1_applied_loans_backscored_20241001_20250831'

# Query

In [137]:
sq = f"""
select * from worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831;
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench1} table is:\t {d1.shape}")

Job ID ac0744ba-9067-40ed-8c62-ac8f4490e392 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench1_applied_loans_backscored_20241001_20250831 table is:	 (296480, 34)


In [138]:
d1.head()

Unnamed: 0,customer_id,digitalLoanAccountId,days_on_book,ln_appln_submit_datetime,ln_os_type,ln_vas_opted_flag,ln_self_dec_income,ln_age,ln_source_funds_new_bin,ln_loan_level_user_type,ln_industry_new_cat_bin,ln_marital_status,ln_doc_type_rolled,ln_education_level,ln_ref2_type,ln_email_primary_domain,ln_province_bin,ln_mature_fspd30_flag,ln_fspd30_flag,demo_score,trench_category,ln_loan_type,ln_disb_dtime,c_app_cnt_health_and_fitness_ever,c_app_cnt_shopping_ever,c_app_median_time_bw_installed_mins_ever,c_app_avg_time_bw_installed_mins_3d,c_app_cnt_crypto_ever,c_app_cnt_driver_ever,c_app_cnt_payday_180d,c_app_cnt_gambling_180d,apps_score,credo_score,stack_score
0,3461645,6061f751-bb86-49a3-9b6b-0ea44b4306d6,0,2025-05-27 13:22:29+00:00,Android,1,28500,17,salary,2_New Applicant,1.0,Single,national id,College Undergraduate,Friend,gmail.com,others,,,0.624579,Trench 1,Quick,NaT,0.0,2.0,0.033333,,0.0,0.0,0.0,0.0,0.642715,0.240666,0.895453
1,3292475,80468034-cc21-4da8-b2aa-6e35c84ac387,1,2025-02-28 13:24:31+00:00,Android,1,15000,17,income from business,2_New Applicant,1.0,Single,national id,College Graduate,Sibling,gmail.com,others,,,0.680918,Trench 1,Quick,NaT,2.0,0.0,1499.5,1136.783333,0.0,0.0,3.0,0.0,0.532071,0.393578,0.697355
2,3192798,03cf7323-cf3a-44be-9975-ac302d61e7ed,0,2025-01-16 03:58:30+00:00,iOS,1,50000,18,salary,2_New Applicant,2.0,Single,driving license,College Graduate,Sibling,gmail.com,others,,,0.587765,Trench 1,Quick,NaT,,,,,,,,,,0.220727,0.610448
3,3573079,0e938f32-94e3-4473-9beb-9469807b1edf,0,2025-07-23 08:23:10+00:00,iOS,1,6000,18,remittance,2_New Applicant,missing,Single,national id,College Graduate,Sibling,gmail.com,others,,,0.660842,Trench 1,Quick,NaT,,,,,,,,,,0.24371,0.705892
4,3631270,3eb7e8c9-1412-46b2-bdfe-aa71f1ac9a14,0,2025-08-19 18:56:40+00:00,iOS,1,5000,18,income from business,2_New Applicant,1.0,Single,national id,College Undergraduate,Friend,gmail.com,others,,,0.701313,Trench 1,Quick,NaT,,,,,,,,,,0.280322,0.770477


In [139]:
d1.columns.values

array(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'apps_score', 'credo_score',
       'stack_score'], dtype=object)

In [140]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag',  'trench_category', 'ln_loan_type', 'ln_disb_dtime',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('demo_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench1_Demo_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [141]:
transformeddata = f'cash_beta_trench1_demo'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (296480, 13)


In [142]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,3461645,6061f751-bb86-49a3-9b6b-0ea44b4306d6,a2f4bd3f-a72c-4d59-bc5e-5d3cbc9bd766,0.624579,2025-09-17T14:24:31.361514,2025-09-17T14:24:31.361514,Cash_beta_trench1_Demo_backscore,v1,trench alpha beta,55bdd05c-0514-41d0-87a3-378715741d96,2025-09-17T14:24:31.361514,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
1,3292475,80468034-cc21-4da8-b2aa-6e35c84ac387,bd56fce6-39b8-43d3-8ffb-d3116e180e1e,0.680918,2025-09-17T14:24:31.361514,2025-09-17T14:24:31.361514,Cash_beta_trench1_Demo_backscore,v1,trench alpha beta,309e968b-7c45-4c88-8faf-0d545b369f54,2025-09-17T14:24:31.361514,{},"{""days_on_book"": 1, ""ln_appln_submit_datetime""..."
2,3192798,03cf7323-cf3a-44be-9975-ac302d61e7ed,38ff6ea0-6e42-4e5b-af58-e469f91f7c43,0.587765,2025-09-17T14:24:31.361514,2025-09-17T14:24:31.361514,Cash_beta_trench1_Demo_backscore,v1,trench alpha beta,83358dab-02ca-4f23-8831-b1be5a03d9c1,2025-09-17T14:24:31.361514,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
3,3573079,0e938f32-94e3-4473-9beb-9469807b1edf,c860acb0-278a-4f7d-bc01-c181e4e8a15f,0.660842,2025-09-17T14:24:31.361514,2025-09-17T14:24:31.361514,Cash_beta_trench1_Demo_backscore,v1,trench alpha beta,0648d8c1-c928-491f-a2d0-3dce7deb5f24,2025-09-17T14:24:31.361514,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
4,3631270,3eb7e8c9-1412-46b2-bdfe-aa71f1ac9a14,f3d1b9ff-3c73-4d57-806f-7028a55bdc82,0.701313,2025-09-17T14:24:31.361514,2025-09-17T14:24:31.361514,Cash_beta_trench1_Demo_backscore,v1,trench alpha beta,a60d9ef0-2856-4d38-bfeb-18618d833ab3,2025-09-17T14:24:31.361514,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."


In [143]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [144]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [145]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench1_demo
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_demo.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_demo.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_demo.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_demo.joblib


# Insert into a table

In [146]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=39bfafa9-5d5b-4f5b-976c-88dbe02e9a97>

In [147]:
d2 = d1[d1['ln_os_type'] == 'Android']

In [148]:
d2.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'apps_score', 'credo_score', 'stack_score'],
      dtype='object')

In [149]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d2.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d'
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('apps_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench1_appscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [150]:
transformeddata = f'cash_beta_trench1_app'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (201845, 13)


In [151]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,3461645,6061f751-bb86-49a3-9b6b-0ea44b4306d6,93beb599-0052-4cae-985e-62dacf78d591,0.642715,2025-09-17T14:26:13.564442,2025-09-17T14:26:13.564442,Cash_beta_trench1_appscore_backscore,v1,trench alpha beta,bb1fc2db-e801-40e0-b360-dd21a35c7083,2025-09-17T14:26:13.564442,{},"{""c_app_cnt_health_and_fitness_ever"": 0.0, ""c_..."
1,3292475,80468034-cc21-4da8-b2aa-6e35c84ac387,6483a046-d31d-40a1-8901-d5b0b91c1413,0.532071,2025-09-17T14:26:13.564442,2025-09-17T14:26:13.564442,Cash_beta_trench1_appscore_backscore,v1,trench alpha beta,87c69e5f-7ef8-432a-bfc6-62acce9a79ec,2025-09-17T14:26:13.564442,{},"{""c_app_cnt_health_and_fitness_ever"": 2.0, ""c_..."
2,2943376,38939b31-211f-456e-b9c1-9ace4f431b3a,86ebfc86-4c20-4f19-bd1c-80d8959cf4c8,0.732883,2025-09-17T14:26:13.564442,2025-09-17T14:26:13.564442,Cash_beta_trench1_appscore_backscore,v1,trench alpha beta,769287f0-1893-4595-8664-85a4d54e613b,2025-09-17T14:26:13.564442,{},"{""c_app_cnt_health_and_fitness_ever"": 0.0, ""c_..."
3,3076048,c7608431-0327-4448-aaf8-dd1f244aa0ba,bc2b65ff-f0ff-4092-adc7-f3196d383b32,0.648517,2025-09-17T14:26:13.564442,2025-09-17T14:26:13.564442,Cash_beta_trench1_appscore_backscore,v1,trench alpha beta,57f30781-9b04-480a-ae4f-916930c1ed85,2025-09-17T14:26:13.564442,{},"{""c_app_cnt_health_and_fitness_ever"": 0.0, ""c_..."
4,3648485,7af56b05-af84-4460-b83e-e11ab6c77e81,afdfe424-dd27-458f-8a83-6d30e3505b2d,0.538931,2025-09-17T14:26:13.564442,2025-09-17T14:26:13.564442,Cash_beta_trench1_appscore_backscore,v1,trench alpha beta,de75ca71-c22c-4395-9195-7801c46567c2,2025-09-17T14:26:13.564442,{},"{""c_app_cnt_health_and_fitness_ever"": 0.0, ""c_..."


In [152]:
dfd.columns

Index(['customerId', 'digitalLoanAccountId', 'crifApplicationId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
       'subscription_name', 'message_id', 'publish_time', 'attributes',
       'calcFeature'],
      dtype='object')

In [153]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

Found no duplicate digitalLoanAccountId

In [154]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench1_app
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_app.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_app.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_app.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_app.joblib


# Insert into a table

In [155]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=120b7eb1-76ee-4a00-894d-c474a2a735aa>

In [156]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'apps_score', 'credo_score', 'stack_score'],
      dtype='object')

In [157]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'demo_score', 'apps_score', 'credo_score',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('stack_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench1_stackscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [158]:
transformeddata = f'cash_beta_trench1_stackscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (296480, 13)


In [159]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,3461645,6061f751-bb86-49a3-9b6b-0ea44b4306d6,a2c2ab40-a288-4f6a-94da-be017646bb52,0.895453,2025-09-17T14:26:59.349939,2025-09-17T14:26:59.349939,Cash_beta_trench1_stackscore_backscore,v1,trench alpha beta,c117ca42-62ef-4361-80dd-725cf08a651a,2025-09-17T14:26:59.349939,{},"{""demo_score"": 0.6245786265216958, ""apps_score..."
1,3292475,80468034-cc21-4da8-b2aa-6e35c84ac387,9e8f2279-b5a1-44bf-ac8b-23943f91181a,0.697355,2025-09-17T14:26:59.349939,2025-09-17T14:26:59.349939,Cash_beta_trench1_stackscore_backscore,v1,trench alpha beta,8cc2cf8c-ee14-47a9-8566-71cc20718515,2025-09-17T14:26:59.349939,{},"{""demo_score"": 0.6809183034639812, ""apps_score..."
2,3192798,03cf7323-cf3a-44be-9975-ac302d61e7ed,760f786a-de82-45b4-8cf6-257d8f6059f5,0.610448,2025-09-17T14:26:59.349939,2025-09-17T14:26:59.349939,Cash_beta_trench1_stackscore_backscore,v1,trench alpha beta,a71399ad-9bb8-431e-afb4-5dec3ef48bb2,2025-09-17T14:26:59.349939,{},"{""demo_score"": 0.5877648931665984, ""credo_scor..."
3,3573079,0e938f32-94e3-4473-9beb-9469807b1edf,d3b1aafa-f9b3-4b79-baf6-2d6eb1168f4f,0.705892,2025-09-17T14:26:59.349939,2025-09-17T14:26:59.349939,Cash_beta_trench1_stackscore_backscore,v1,trench alpha beta,e348f77f-1e65-4654-876b-6bf6289d2b8d,2025-09-17T14:26:59.349939,{},"{""demo_score"": 0.6608418447966299, ""credo_scor..."
4,3631270,3eb7e8c9-1412-46b2-bdfe-aa71f1ac9a14,b5b78a1a-777a-4baa-b923-72ac9d5b2602,0.770477,2025-09-17T14:26:59.349939,2025-09-17T14:26:59.349939,Cash_beta_trench1_stackscore_backscore,v1,trench alpha beta,b89db43d-def9-425f-8a98-00e46ceb6060,2025-09-17T14:26:59.349939,{},"{""demo_score"": 0.7013128224385257, ""credo_scor..."


In [160]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [161]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench1_stackscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_stackscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_stackscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_stackscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench1_stackscore.joblib


# Insert into a table

In [162]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=35185c81-e874-4178-88dc-9a083e52faf2>

# cash_beta_trench2_applied_loans_backscored_20241001_20250831

# Table

In [163]:
schema1 = 'worktable_data_analysis'
cash_beta_trench2 = f'cash_beta_trench2_applied_loans_backscored_20241001_20250831'

# Query

In [164]:
sq = f"""
select * from {schema1}.{cash_beta_trench2};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench2} table is:\t {d1.shape}")

Job ID 9fcaea96-5b70-47c2-9837-3436e40f4282 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench2_applied_loans_backscored_20241001_20250831 table is:	 (111973, 46)


In [165]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE', 'c_t2_tx_cnt_rejected_loans',
       'c_t2_tx_apps

In [166]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category', 'ln_loan_type', 'ln_disb_dtime',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('demo_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench2_demo_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [167]:
transformeddata = f'cash_beta_trench2_demoscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (111973, 13)


In [168]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2292294,76c329f1-a80c-4f97-905a-cdc50eaefc4f,4763d1f3-9474-405b-9890-a5fda81bd0b9,0.558904,2025-09-17T14:28:47.376391,2025-09-17T14:28:47.376391,Cash_beta_trench2_demo_backscore,v1,trench alpha beta,1d761f03-54f2-48d2-a304-99bc476fea33,2025-09-17T14:28:47.376391,{},"{""days_on_book"": 603, ""ln_appln_submit_datetim..."
1,3109464,2f55c1fd-edde-4d76-804c-9faa6fe8276f,1d9e7f69-37ea-4626-b058-5198a2633c37,0.599236,2025-09-17T14:28:47.376391,2025-09-17T14:28:47.376391,Cash_beta_trench2_demo_backscore,v1,trench alpha beta,44e0a9d9-058e-49ba-876b-70574bccd55f,2025-09-17T14:28:47.376391,{},"{""days_on_book"": 112, ""ln_appln_submit_datetim..."
2,1491167,32ff1b2c-d847-4b05-8636-9c8e4aeba209,2b39be52-4050-4052-a8a7-56baec80121e,0.498566,2025-09-17T14:28:47.376391,2025-09-17T14:28:47.376391,Cash_beta_trench2_demo_backscore,v1,trench alpha beta,913bb37f-4cbc-4061-8c48-1635b8558f2d,2025-09-17T14:28:47.376391,{},"{""days_on_book"": 846, ""ln_appln_submit_datetim..."
3,2305465,9715ee51-7fac-4eda-b675-310d6ab79ac8,979784ce-fb33-437e-82a1-5d454795a1c8,0.346935,2025-09-17T14:28:47.377392,2025-09-17T14:28:47.377392,Cash_beta_trench2_demo_backscore,v1,trench alpha beta,7e2a67b2-26f5-4f0a-9864-c79039ca80ec,2025-09-17T14:28:47.377392,{},"{""days_on_book"": 351, ""ln_appln_submit_datetim..."
4,2080167,567e6cf6-9f0f-4fb8-8ee5-1d0022d34d2a,425fb4de-8c98-4a67-87f8-1875f322366a,0.611919,2025-09-17T14:28:47.377392,2025-09-17T14:28:47.377392,Cash_beta_trench2_demo_backscore,v1,trench alpha beta,f0e544a8-e2c6-4ba9-9c7a-28abfefa349c,2025-09-17T14:28:47.377392,{},"{""days_on_book"": 492, ""ln_appln_submit_datetim..."


In [169]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [170]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench2_demoscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_demoscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_demoscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_demoscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_demoscore.joblib


# Insert into a table

In [171]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=1b7d5dd2-d734-48d7-86ae-508fde1bf51f>

In [172]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE', 'c_t2_tx_cnt_rejected_loans',
       'c_t2_tx_apps

In [173]:
d2 = d1[d1['ln_os_type']=='Android'].copy()
d2['ln_os_type'].value_counts()

ln_os_type
Android    74603
Name: count, dtype: int64

In [174]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d2.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'ln_os_type', 'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('apps_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench2_appscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [175]:
transformeddata = f'cash_beta_trench2_appscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the transformed data is: {dfd.shape}")

The shape of the transformed data is: (74603, 13)


In [176]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2904485,d1983361-f30d-4594-be95-d4b45977a87a,37203037-0b29-45c4-ad96-93cb288dd925,0.462671,2025-09-17T14:29:29.477524,2025-09-17T14:29:29.477524,Cash_beta_trench2_appscore_backscore,v1,trench alpha beta,1b804bd6-7310-4ba2-87f5-8e165d0d3c60,2025-09-17T14:29:29.477524,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_health_an..."
1,2532401,a87a15ae-3400-4967-9842-9473ef8018e4,47e64a9d-b5ac-4578-9c98-7ea296f3acae,0.524205,2025-09-17T14:29:29.477524,2025-09-17T14:29:29.477524,Cash_beta_trench2_appscore_backscore,v1,trench alpha beta,7205e906-b23d-46c9-9600-928c643ed2d9,2025-09-17T14:29:29.477524,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_health_an..."
2,1193953,c8877602-9d57-4926-b92f-99ace007dc69,e9a726d5-626e-4625-9107-f6f01bf57599,0.495497,2025-09-17T14:29:29.477524,2025-09-17T14:29:29.477524,Cash_beta_trench2_appscore_backscore,v1,trench alpha beta,e73437e5-8315-49c2-b405-7d1e015d2cb0,2025-09-17T14:29:29.477524,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_health_an..."
3,3011213,b4c2f0d9-6d75-4f47-abb1-1c5aa03fa6c6,79750e31-a9ef-472d-b663-4011481e743c,0.559838,2025-09-17T14:29:29.477524,2025-09-17T14:29:29.477524,Cash_beta_trench2_appscore_backscore,v1,trench alpha beta,73a5ae73-3fa6-44e2-8960-fcdd66805d23,2025-09-17T14:29:29.477524,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_health_an..."
4,2912260,5e3fac60-ef9c-49e5-9b1b-7680e9f3b711,9a310203-29d8-4380-bdd6-e9e74178d38a,0.601748,2025-09-17T14:29:29.477524,2025-09-17T14:29:29.477524,Cash_beta_trench2_appscore_backscore,v1,trench alpha beta,fc5d7b6b-f6a7-4bce-b445-c24c0a1ee5aa,2025-09-17T14:29:29.477524,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_health_an..."


In [177]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [178]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench2_appscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_appscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_appscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_appscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_appscore.joblib


# Insert into a table

In [179]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=4df5c8c1-b674-4e1e-bdcb-47049d1d497b>

In [180]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE', 'c_t2_tx_cnt_rejected_loans',
       'c_t2_tx_apps

In [181]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE', 'c_t2_tx_cnt_rejected_loans',
       'c_t2_tx_appsflyer_install_to_registration_minutes',
       'c_t2_tx_first_applied_loan_amount', 'c_t2_tx_deposit_accnt_cnt',
       'c_t2_tx_cnt_cash_in_total', 'c_t2_tx_cnt_incomplete_loan_apps',
       'c_t2_tx_amt_cash_in_total', 'c_t2_tx_last_applied_loan_tenor_bin_WOE',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('trx_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench2_transactionscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [182]:
transformeddata = f'cash_beta_trench2_transactioncore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cash_beta_trench2_transactioncore data is: (111973, 13)


In [183]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2292294,76c329f1-a80c-4f97-905a-cdc50eaefc4f,ac06569a-37dd-4c37-a0f1-37950c0d883f,0.550381,2025-09-17T14:29:50.961312,2025-09-17T14:29:50.961312,Cash_beta_trench2_transactionscore_backscore,v1,trench alpha beta,948b06d9-74b7-49d6-86d0-6f7b758b37e7,2025-09-17T14:29:50.961312,{},"{""c_t2_tx_meng_ql_calculator_tot_visit_cnt"": 3..."
1,3109464,2f55c1fd-edde-4d76-804c-9faa6fe8276f,4b87f54f-90dd-4884-8a60-8f5d56fa35e7,0.531987,2025-09-17T14:29:50.961312,2025-09-17T14:29:50.961312,Cash_beta_trench2_transactionscore_backscore,v1,trench alpha beta,584fa444-34be-474c-9dd6-54dda93ddd76,2025-09-17T14:29:50.961312,{},"{""c_t2_tx_meng_ql_calculator_tot_visit_cnt"": 1..."
2,1491167,32ff1b2c-d847-4b05-8636-9c8e4aeba209,a6079baf-91cd-4532-878f-2fbea7848064,0.560202,2025-09-17T14:29:50.961312,2025-09-17T14:29:50.961312,Cash_beta_trench2_transactionscore_backscore,v1,trench alpha beta,96cdbbc5-0843-4ea1-aeb5-8d95386798cd,2025-09-17T14:29:50.961312,{},"{""c_t2_tx_meng_ql_calculator_tot_visit_cnt"": 3..."
3,2305465,9715ee51-7fac-4eda-b675-310d6ab79ac8,1418ebb7-52a0-4196-aefb-8f7c66e1defd,0.515623,2025-09-17T14:29:50.962315,2025-09-17T14:29:50.962315,Cash_beta_trench2_transactionscore_backscore,v1,trench alpha beta,18bcebac-028f-4c04-a076-9a8a404eef69,2025-09-17T14:29:50.962315,{},"{""c_t2_tx_meng_ql_calculator_tot_visit_cnt"": 1..."
4,2080167,567e6cf6-9f0f-4fb8-8ee5-1d0022d34d2a,095ed698-b80d-4091-882b-9b06e3d61587,0.452746,2025-09-17T14:29:50.962315,2025-09-17T14:29:50.962315,Cash_beta_trench2_transactionscore_backscore,v1,trench alpha beta,4db44ea2-f630-4dd9-86e8-ac89e683aa08,2025-09-17T14:29:50.962315,{},"{""c_t2_tx_meng_ql_calculator_tot_visit_cnt"": 1..."


In [184]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [185]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench2_transactioncore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_transactioncore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_transactioncore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_transactioncore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_transactioncore.joblib


# Insert into a table

In [186]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=0673fea7-f8ba-4be8-81f3-cee28675a75e>

In [187]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_health_and_fitness_ever', 'c_app_cnt_shopping_ever',
       'c_app_median_time_bw_installed_mins_ever',
       'c_app_avg_time_bw_installed_mins_3d', 'c_app_cnt_crypto_ever',
       'c_app_cnt_driver_ever', 'c_app_cnt_payday_180d',
       'c_app_cnt_gambling_180d', 'c_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'c_t2_tx_first_product_user_segment_WOE',
       'c_t2_tx_first_applied_loan_type_bin_WOE', 'c_t2_tx_cnt_rejected_loans',
       'c_t2_tx_apps

In [188]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
          'demo_score', 'trx_score', 'apps_score', 'credo_score',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('stack_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench2_stackscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [189]:
transformeddata = f'cash_beta_trench2_stackcore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cash_beta_trench2_stackcore data is: (111973, 13)


In [190]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2292294,76c329f1-a80c-4f97-905a-cdc50eaefc4f,49ddd7aa-2c1a-478c-94c4-a68b00984c65,0.621407,2025-09-17T14:30:29.539807,2025-09-17T14:30:29.539807,Cash_beta_trench2_stackscore_backscore,v1,trench alpha beta,6851f7e9-9869-4323-89c5-50557cfc4cb0,2025-09-17T14:30:29.539807,{},"{""demo_score"": 0.5589038824077822, ""trx_score""..."
1,3109464,2f55c1fd-edde-4d76-804c-9faa6fe8276f,8bf3e333-b2ba-406b-9844-288be4e1b4c3,0.51924,2025-09-17T14:30:29.540807,2025-09-17T14:30:29.540807,Cash_beta_trench2_stackscore_backscore,v1,trench alpha beta,74b92bd7-bb4d-457f-b3c2-8f202759bc48,2025-09-17T14:30:29.540807,{},"{""demo_score"": 0.599235865559278, ""trx_score"":..."
2,1491167,32ff1b2c-d847-4b05-8636-9c8e4aeba209,2c462089-68cf-4212-99e8-beea55084803,0.452883,2025-09-17T14:30:29.540807,2025-09-17T14:30:29.540807,Cash_beta_trench2_stackscore_backscore,v1,trench alpha beta,01b4b621-ce73-4380-bd76-66a0779e694b,2025-09-17T14:30:29.540807,{},"{""demo_score"": 0.49856604913604247, ""trx_score..."
3,2305465,9715ee51-7fac-4eda-b675-310d6ab79ac8,8fb6a490-2ab9-411d-90d0-0a144f6d2d4c,0.25649,2025-09-17T14:30:29.540807,2025-09-17T14:30:29.540807,Cash_beta_trench2_stackscore_backscore,v1,trench alpha beta,0cd84eb5-d45a-4a94-a336-a25d714972e2,2025-09-17T14:30:29.540807,{},"{""demo_score"": 0.34693482397153746, ""trx_score..."
4,2080167,567e6cf6-9f0f-4fb8-8ee5-1d0022d34d2a,0f65c825-e662-4d29-a5e0-62c999bc2d77,0.577553,2025-09-17T14:30:29.540807,2025-09-17T14:30:29.540807,Cash_beta_trench2_stackscore_backscore,v1,trench alpha beta,8da1f94c-5c89-4528-8fb7-334b9e6afbd5,2025-09-17T14:30:29.540807,{},"{""demo_score"": 0.6119191767689968, ""trx_score""..."


In [191]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [192]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench2_stackcore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_stackcore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_stackcore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_stackcore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench2_stackcore.joblib


# Insert into a table

In [193]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=f02aff62-e222-4a15-9965-ac2cd4fb0f67>

# cash_beta_trench3_applied_loans_backscored_20241001_20250831

# Table

In [194]:
schema1 = 'worktable_data_analysis'
cash_beta_trench3 = f'cash_beta_trench3_applied_loans_backscored_20241001_20250831'

# Query

In [195]:
sq = f"""
select * from {schema1}.{cash_beta_trench3};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_beta_trench3} table is:\t {d1.shape}")

Job ID 25862987-b1b5-45cb-b8ee-44623c53f3e8 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_beta_trench3_applied_loans_backscored_20241001_20250831 table is:	 (38621, 48)


In [196]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_ap

In [197]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
           'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category', 'ln_loan_type', 'ln_disb_dtime'
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('demo_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench3_demoscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [198]:
transformeddata = f'cash_beta_trench3_democore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cash_beta_trench3_democore data is: (38621, 13)


In [199]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2688774,96a63ada-7037-4302-be52-c98dc8dd3086,04809466-0269-4560-bd5a-e3c56e2a8f60,0.290677,2025-09-17T14:31:26.679722,2025-09-17T14:31:26.679722,Cash_beta_trench3_demoscore_backscore,v1,trench alpha beta,62c163c2-52c6-4e7b-8035-904aae703a36,2025-09-17T14:31:26.679722,{},"{""days_on_book"": 138, ""ln_appln_submit_datetim..."
1,2834010,aa0f621c-291a-45be-9f41-38f79d0cfbde,7cfdf523-a6ce-4d48-aa0b-88cd1513d633,0.447982,2025-09-17T14:31:26.679722,2025-09-17T14:31:26.679722,Cash_beta_trench3_demoscore_backscore,v1,trench alpha beta,cb04471e-42dc-4a8d-aea8-74b9fae1eccf,2025-09-17T14:31:26.679722,{},"{""days_on_book"": 51, ""ln_appln_submit_datetime..."
2,1996933,86f87afa-0a6a-43ad-ba06-43c53d14bf93,9781cb7f-3693-43fb-b234-1a0aeb57a9c6,0.414556,2025-09-17T14:31:26.680722,2025-09-17T14:31:26.680722,Cash_beta_trench3_demoscore_backscore,v1,trench alpha beta,85eea21b-aad8-4a71-90a5-3f615e0667f6,2025-09-17T14:31:26.680722,{},"{""days_on_book"": 715, ""ln_appln_submit_datetim..."
3,3470262,58d360d3-0b12-4d92-ae3b-2f87e43d521c,b7562837-baee-4e48-b51f-d62eb15c9b31,0.325798,2025-09-17T14:31:26.680722,2025-09-17T14:31:26.680722,Cash_beta_trench3_demoscore_backscore,v1,trench alpha beta,f5868722-b2e4-4958-aa26-95820d01f59c,2025-09-17T14:31:26.680722,{},"{""days_on_book"": 27, ""ln_appln_submit_datetime..."
4,1829519,7d525e35-28e6-4056-8048-e338442242d7,319da3ce-a730-421c-a419-0595a5e15346,0.515046,2025-09-17T14:31:26.680722,2025-09-17T14:31:26.680722,Cash_beta_trench3_demoscore_backscore,v1,trench alpha beta,a6087a4f-f87d-4f1e-b2a6-0cfc9d991a6a,2025-09-17T14:31:26.680722,{},"{""days_on_book"": 879, ""ln_appln_submit_datetim..."


In [200]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [201]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench3_democore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_democore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_democore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_democore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_democore.joblib


# Insert into a table

In [202]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=a37ea5c9-4a76-4bc8-8051-dd3a53e08ba0>

In [203]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_ap

In [204]:
d2 = d1[d1['ln_os_type'] == 'Android'].copy()
d2['ln_os_type'].value_counts()

ln_os_type
Android    26018
Name: count, dtype: int64

In [205]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d2.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'ln_os_type', 'c_app_cnt_absence_tag_365d_binned',
        'c_app_cnt_books_and_reference_ever_binned',
        'c_app_cnt_gaming_180d_binned',
        'c_app_cnt_health_and_fitness_ever_binned',
        'c_app_cnt_productivity_ever_binned',
        'c_app_cnt_rated_for_18plus_ever_binned',
        'c_app_last_payday_install_to_apply_days_binned',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('apps_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench3_appscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [206]:
transformeddata = f'cash_beta_trench3_appcore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cash_beta_trench3_appcore data is: (26018, 13)


In [207]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,1956508,30b44b43-6ef5-4d47-baeb-5168208b84af,bf934ca8-084e-4749-a2ff-f980c87955e1,0.379153,2025-09-17T14:31:51.165876,2025-09-17T14:31:51.165876,Cash_beta_trench3_appscore_backscore,v1,trench alpha beta,67eff030-5406-4627-b79a-8ed620303d3a,2025-09-17T14:31:51.165876,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_absence_t..."
1,2356633,d5e165f7-4e7d-4ed9-b8af-4da1d65bb642,db82a8b5-bb33-4555-95fd-04a827efbfb1,0.380197,2025-09-17T14:31:51.166478,2025-09-17T14:31:51.166478,Cash_beta_trench3_appscore_backscore,v1,trench alpha beta,9ce25f50-6fdc-4bca-a8ed-bca42ad01e13,2025-09-17T14:31:51.166478,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_absence_t..."
2,3225371,1eb35d6a-7f60-4c3f-8171-c981e221b8e6,5eb2974e-7fd3-4081-86e0-561d95ed2d51,0.391904,2025-09-17T14:31:51.166478,2025-09-17T14:31:51.166478,Cash_beta_trench3_appscore_backscore,v1,trench alpha beta,78b5d290-0532-4102-8445-4443e19c32cc,2025-09-17T14:31:51.166478,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_absence_t..."
3,2604750,069a097c-4afb-4c42-af3d-e61476eccfc9,3c32cfce-27c7-4356-81bc-781500148d3e,0.392407,2025-09-17T14:31:51.166478,2025-09-17T14:31:51.166478,Cash_beta_trench3_appscore_backscore,v1,trench alpha beta,2f128f33-95ee-4e63-9ba0-b7d9467c95d9,2025-09-17T14:31:51.166478,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_absence_t..."
4,2971573,2c2c6c99-8d5d-4709-9bc7-ba01ecc65676,4e7c8f47-fc7a-43d3-9019-8c8663fcb5e9,0.392909,2025-09-17T14:31:51.167560,2025-09-17T14:31:51.167560,Cash_beta_trench3_appscore_backscore,v1,trench alpha beta,84a3f931-4925-42fa-b2a0-362d05e66538,2025-09-17T14:31:51.167560,{},"{""ln_os_type"": ""Android"", ""c_app_cnt_absence_t..."


In [208]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [209]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench3_appcore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_appcore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_appcore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_appcore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_appcore.joblib


# Insert into a table

In [210]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=b6acd3c4-6594-4629-988d-8287e61790b6>

In [213]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_ap

In [218]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_applied_loan_decision', 'c_t3_tx_min_age_completed_loans',
       'c_t3_tx_dob_observation_date', 'c_t3_tx_cnt_jira_tickets_created_bin',
       'c_t3_tx_max_ever_dpd', 'c_t3_tx_amt_cash_in_total',
       'c_t3_tx_last_applied_loan_type_bin', 'c_t3_tx_cnt_completed_loans',
       'c_t3_tx_meng_no_of_logins', 'c_t3_tx_last_applied_loan_tenor',
       'c_t3_tx_med_days_bt_cash_out_trans',
       'c_t3_tx_avg_days_bt_cash_in_trans',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('trx_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench3_transactionscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [219]:
transformeddata = f'cash_beta_trench3_transactionscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cash_beta_trench3_transactionscore data is: (38621, 13)


In [220]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2688774,96a63ada-7037-4302-be52-c98dc8dd3086,4adc2bfe-d1a2-4a3c-9d01-9b0496238fe5,0.124384,2025-09-17T14:35:10.337051,2025-09-17T14:35:10.337051,Cash_beta_trench3_transactionscore_backscore,v1,trench alpha beta,f95bee3f-6aaf-4de3-b21e-30244c240451,2025-09-17T14:35:10.337051,{},"{""c_t3_tx_cnt_installments_paid_tot_with_dpd"":..."
1,2834010,aa0f621c-291a-45be-9f41-38f79d0cfbde,7c2fd3d6-9429-4755-9fde-243bdf891d02,0.141644,2025-09-17T14:35:10.337051,2025-09-17T14:35:10.337051,Cash_beta_trench3_transactionscore_backscore,v1,trench alpha beta,cfc9fe84-6dd5-4f06-b891-13eb5df84c98,2025-09-17T14:35:10.337051,{},"{""c_t3_tx_cnt_installments_paid_tot_with_dpd"":..."
2,1996933,86f87afa-0a6a-43ad-ba06-43c53d14bf93,7016ec75-8da6-4b7f-b5b7-5809b7f8f1d9,0.144358,2025-09-17T14:35:10.337051,2025-09-17T14:35:10.337051,Cash_beta_trench3_transactionscore_backscore,v1,trench alpha beta,3ca633e2-e155-4378-9cc8-f8d8b2dd23e4,2025-09-17T14:35:10.337051,{},"{""c_t3_tx_cnt_installments_paid_tot_with_dpd"":..."
3,3470262,58d360d3-0b12-4d92-ae3b-2f87e43d521c,5f90229b-26bb-44ff-b835-2704c8682fcd,0.163753,2025-09-17T14:35:10.338049,2025-09-17T14:35:10.338049,Cash_beta_trench3_transactionscore_backscore,v1,trench alpha beta,da90e47c-1bc3-47d5-8852-f0be9573bbd0,2025-09-17T14:35:10.338049,{},"{""c_t3_tx_cnt_installments_paid_tot_with_dpd"":..."
4,1829519,7d525e35-28e6-4056-8048-e338442242d7,dd8868c3-839e-4472-bc32-f730d223f5bd,0.075721,2025-09-17T14:35:10.338049,2025-09-17T14:35:10.338049,Cash_beta_trench3_transactionscore_backscore,v1,trench alpha beta,5ebccf62-6660-419d-8232-fd3bd3f29fb0,2025-09-17T14:35:10.338049,{},"{""c_t3_tx_cnt_installments_paid_tot_with_dpd"":..."


In [221]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [222]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench3_transactionscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_transactionscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_transactionscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_transactionscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_transactionscore.joblib


# Insert into a table

In [223]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=54a5ab79-7fd4-4d96-a7fa-497dd8826da3>

In [None]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'demo_score',
       'trench_category', 'ln_loan_type', 'ln_disb_dtime',
       'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'c_t3_tx_cnt_installments_paid_tot_with_dpd',
       'c_t3_tx_time_since_last_applied_loan_application_time',
       'c_t3_tx_last_ap

In [225]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'demo_score', 'trx_score', 'apps_score', 'credo_score',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('stack_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_beta_trench3_stackscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [226]:
transformeddata = f'cash_beta_trench3_stackscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cash_beta_trench3_stackscore data is: (38621, 13)


In [227]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2688774,96a63ada-7037-4302-be52-c98dc8dd3086,16b556b8-4c2c-4fef-ab35-1aed7d2cd333,0.435645,2025-09-17T14:36:53.806166,2025-09-17T14:36:53.806166,Cash_beta_trench3_stackscore_backscore,v1,trench alpha beta,ae6335ea-59a6-403c-acdd-8ff09da6ef4f,2025-09-17T14:36:53.806166,{},"{""demo_score"": 0.29067734219401176, ""trx_score..."
1,2834010,aa0f621c-291a-45be-9f41-38f79d0cfbde,c4e5b25e-8fa9-446a-a171-d9f40ad35d02,0.492876,2025-09-17T14:36:53.807167,2025-09-17T14:36:53.807167,Cash_beta_trench3_stackscore_backscore,v1,trench alpha beta,c17490e9-7504-4400-a4c2-8740a378c265,2025-09-17T14:36:53.807167,{},"{""demo_score"": 0.44798231975659975, ""trx_score..."
2,1996933,86f87afa-0a6a-43ad-ba06-43c53d14bf93,851ec9d5-7667-4180-aa7f-18695fe2e94d,0.483476,2025-09-17T14:36:53.807167,2025-09-17T14:36:53.807167,Cash_beta_trench3_stackscore_backscore,v1,trench alpha beta,74e888b2-7da0-45bf-91f3-2134f398a136,2025-09-17T14:36:53.807167,{},"{""demo_score"": 0.4145564083409764, ""trx_score""..."
3,3470262,58d360d3-0b12-4d92-ae3b-2f87e43d521c,e962a002-bba2-45d6-866f-ffeffe6a08eb,0.467202,2025-09-17T14:36:53.807167,2025-09-17T14:36:53.807167,Cash_beta_trench3_stackscore_backscore,v1,trench alpha beta,6788abc5-7809-4e38-b3ad-30a1d824b839,2025-09-17T14:36:53.807167,{},"{""demo_score"": 0.325798331655678, ""trx_score"":..."
4,1829519,7d525e35-28e6-4056-8048-e338442242d7,2604ccfd-dd55-47ba-bf22-d22a312ce91e,0.484917,2025-09-17T14:36:53.807167,2025-09-17T14:36:53.807167,Cash_beta_trench3_stackscore_backscore,v1,trench alpha beta,35da4a5c-59df-4249-a7d5-f696255700bc,2025-09-17T14:36:53.807167,{},"{""demo_score"": 0.515045555322528, ""trx_score"":..."


In [228]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [229]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cash_beta_trench3_stackscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_stackscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_stackscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_stackscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cash_beta_trench3_stackscore.joblib


# Insert into a table

In [230]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=6522d6aa-f270-4ed7-a0b6-10f8d8d207ca>

# cash_alpha_trench1_applied_loans_backscored_20241001_20250831

# Table

In [232]:
schema1 = 'worktable_data_analysis'
cash_alpha_trench1 = f'cash_alpha_trench1_applied_loans_backscored_20241001_20250831'

# Query

In [233]:
sq = f"""
select * from {schema1}.{cash_alpha_trench1};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench1} table is:\t {d1.shape}")

Job ID 955f1d8a-a866-4a78-916a-6a461fccd93b successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench1_applied_loans_backscored_20241001_20250831 table is:	 (62044, 48)


In [235]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
  

In [236]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('demo_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench1_demoscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [237]:
transformeddata = f'Cash_alpha_trench1_demoscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench1_demoscore_backscore data is: (62044, 13)


In [238]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2945653,e7e3e1dc-de0e-4ad8-b374-2cd2ab2bd87a,03af2758-a65b-4041-951a-7af3b40586d8,0.504412,2025-09-17T14:40:41.091760,2025-09-17T14:40:41.091760,Cash_alpha_trench1_demoscore_backscore,v1,trench alpha beta,c505559e-26ec-4f16-8067-f61b5fa1a532,2025-09-17T14:40:41.091760,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
1,3436627,e07c1df0-56cd-4d5f-bfd4-8deb3afdc95e,213579e0-9ef2-4483-aca6-ed0a678f0fd7,0.44743,2025-09-17T14:40:41.091760,2025-09-17T14:40:41.091760,Cash_alpha_trench1_demoscore_backscore,v1,trench alpha beta,4d9d0ff1-80ad-41d6-b136-15a478f4ab04,2025-09-17T14:40:41.091760,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
2,3110665,bac4a185-04bf-4c90-b73e-2ee0d6707df4,d251b9af-2cdd-4b48-9fee-93bc1a1daa9b,0.453835,2025-09-17T14:40:41.091760,2025-09-17T14:40:41.091760,Cash_alpha_trench1_demoscore_backscore,v1,trench alpha beta,99b8dd8e-4431-4eab-8491-406405aacc84,2025-09-17T14:40:41.091760,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
3,3143585,cc03f3f1-4132-4ed7-9e14-8a096139acc6,d9706daf-4d0e-4775-960c-4f4ac348b99b,0.395059,2025-09-17T14:40:41.092757,2025-09-17T14:40:41.092757,Cash_alpha_trench1_demoscore_backscore,v1,trench alpha beta,0dac25f8-064d-44dd-8bda-c041cfea12c4,2025-09-17T14:40:41.092757,{},"{""days_on_book"": 0, ""ln_appln_submit_datetime""..."
4,3616924,f3bd7ba1-a1a9-4868-97ab-ad1a75799f60,12c9e615-9581-4f4e-a3b3-7e7ad6b694e5,0.570916,2025-09-17T14:40:41.092757,2025-09-17T14:40:41.092757,Cash_alpha_trench1_demoscore_backscore,v1,trench alpha beta,f125f7aa-435b-4bc8-a2a5-1adcc299663a,2025-09-17T14:40:41.092757,{},"{""days_on_book"": 6, ""ln_appln_submit_datetime""..."


In [239]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [240]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench1_demoscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_demoscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_demoscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_demoscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_demoscore_backscore.joblib


# Insert into a table

In [241]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=05f3b494-a604-4a8c-9303-e1d201e2deff>

In [243]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
  

In [244]:
d2 = d1[d1['ln_os_type'] == 'Android']
d2['ln_os_type'].value_counts()

ln_os_type
Android    28364
Name: count, dtype: int64

In [245]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d2.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('apps_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench1_appscorescore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [246]:
transformeddata = f'Cash_alpha_trench1_appscorescore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench1_appscorescore_backscore data is: (28364, 13)


In [247]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,3411924,a21d0c56-2495-4bf1-a529-c2344d5583b0,050f4b38-f23a-4a05-bbb1-d44c82d6436e,0.679013,2025-09-17T14:43:42.954242,2025-09-17T14:43:42.954242,Cash_alpha_trench1_appscorescore_backscore,v1,trench alpha beta,6e53036f-33c3-4bb7-9d24-afc4c2e54755,2025-09-17T14:43:42.954242,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."
1,3493655,0b5b65b9-45c3-41b9-8851-57b3a0565450,86f7544d-8e4e-4950-a783-8bc03e8293db,0.660629,2025-09-17T14:43:42.954242,2025-09-17T14:43:42.954242,Cash_alpha_trench1_appscorescore_backscore,v1,trench alpha beta,229fc439-75f7-4a2e-9d25-18c373c3dcc3,2025-09-17T14:43:42.954242,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."
2,3199802,8b7801e5-56dd-4dc1-bcf3-9730aab8c9fc,2997a19f-758b-4a21-a8fe-7bad84691be4,0.495682,2025-09-17T14:43:42.955243,2025-09-17T14:43:42.955243,Cash_alpha_trench1_appscorescore_backscore,v1,trench alpha beta,a6ffca3f-3dd9-4089-8360-fe9290d6b099,2025-09-17T14:43:42.955243,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."
3,3402738,43e5b020-70ec-4839-abe6-2bbd49ac866b,0b8277d2-e0ce-4d84-8e28-58473580c9b4,0.690475,2025-09-17T14:43:42.955243,2025-09-17T14:43:42.955243,Cash_alpha_trench1_appscorescore_backscore,v1,trench alpha beta,da892987-f889-404e-8a3c-c810f9539e79,2025-09-17T14:43:42.955243,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."
4,3474016,2d426e39-2eed-4485-8e8d-17b373f5f950,5958b332-56c3-44ec-82eb-fce9943d18c3,0.708802,2025-09-17T14:43:42.955243,2025-09-17T14:43:42.955243,Cash_alpha_trench1_appscorescore_backscore,v1,trench alpha beta,915bc710-93b3-441b-be05-615e230852bc,2025-09-17T14:43:42.955243,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."


In [248]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [249]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench1_appscorescore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_appscorescore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_appscorescore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_appscorescore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_appscorescore_backscore.joblib


# Insert into a table

In [250]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=422d7d7c-8f50-4de6-9ce0-5dbf154f9ebd>

In [252]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
  

In [253]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
       'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
       'ca_cic_latest_granted_contract_overdue_flag',
       'ca_cic_ratio_closed_over_new_granted_cnt_24M',
       'ca_cic_ratio_risky_contracts_to_granted_contracts',
       'ca_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'ca_cic_flg_zero_non_granted_ever',
       'ca_cic_Personal_Loans_granted_contracts_amt_24M',
       'ca_cic_CreditAvgCreditLimit', 'ca_cic_flg_zero_granted_ever',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('cic_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench1_cicscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [254]:
transformeddata = f'Cash_alpha_trench1_cicscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench1_cicscore_backscore data is: (62044, 13)


In [255]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2945653,e7e3e1dc-de0e-4ad8-b374-2cd2ab2bd87a,0a64dce8-79d8-4436-98b5-c4967e8bf634,0.568481,2025-09-17T14:45:57.128733,2025-09-17T14:45:57.128733,Cash_alpha_trench1_cicscore_backscore,v1,trench alpha beta,46c8d640-829a-4091-bbf5-71de582e9694,2025-09-17T14:45:57.128733,{},"{""ca_cic_max_age_all_contracts_snapshot"": 357...."
1,3436627,e07c1df0-56cd-4d5f-bfd4-8deb3afdc95e,09eb456e-9640-474a-a3a5-7cd427873d86,0.71152,2025-09-17T14:45:57.129728,2025-09-17T14:45:57.129728,Cash_alpha_trench1_cicscore_backscore,v1,trench alpha beta,f5f65df4-2815-4d5b-9ce2-1e772a4f39ad,2025-09-17T14:45:57.129728,{},"{""ca_cic_max_age_all_contracts_snapshot"": 133...."
2,3110665,bac4a185-04bf-4c90-b73e-2ee0d6707df4,49069c39-7f92-4f9b-b8a3-240d2cac3fff,0.583933,2025-09-17T14:45:57.129728,2025-09-17T14:45:57.129728,Cash_alpha_trench1_cicscore_backscore,v1,trench alpha beta,2be590f7-3da6-4bdc-a1fa-d8b901fbd19e,2025-09-17T14:45:57.129728,{},"{""ca_cic_max_age_all_contracts_snapshot"": 1740..."
3,3143585,cc03f3f1-4132-4ed7-9e14-8a096139acc6,dcc3f8d0-6d29-447f-9177-b875d7f3bd6d,0.409816,2025-09-17T14:45:57.129728,2025-09-17T14:45:57.129728,Cash_alpha_trench1_cicscore_backscore,v1,trench alpha beta,c5719ad8-d95b-47f1-a747-7a48fcdb2d92,2025-09-17T14:45:57.129728,{},"{""ca_cic_max_age_all_contracts_snapshot"": 3493..."
4,3616924,f3bd7ba1-a1a9-4868-97ab-ad1a75799f60,dd85254e-961f-4fbc-a6bb-44b11218ab97,0.507959,2025-09-17T14:45:57.129728,2025-09-17T14:45:57.129728,Cash_alpha_trench1_cicscore_backscore,v1,trench alpha beta,78ed8433-67a9-4ee0-8026-ffd8419cf13f,2025-09-17T14:45:57.129728,{},"{""ca_cic_max_age_all_contracts_snapshot"": 1688..."


In [256]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [257]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench1_cicscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_cicscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_cicscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_cicscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_cicscore_backscore.joblib


# Insert into a table

In [258]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=32725198-9869-47f9-94d0-56121bbb2b33>

In [260]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
  

In [261]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'demo_score', 'apps_score', 'credo_score','cic_score',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('stack_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench1_stackscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [262]:
transformeddata = f'Cash_alpha_trench1_stackscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench1_stackscore_backscore data is: (62044, 13)


In [263]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2945653,e7e3e1dc-de0e-4ad8-b374-2cd2ab2bd87a,7e2b935c-e843-45ec-aaee-ae5b95574e05,0.62085,2025-09-17T14:47:42.358569,2025-09-17T14:47:42.358569,Cash_alpha_trench1_stackscore_backscore,v1,trench alpha beta,8fbe578f-9733-4275-b8b7-7883fd8288f8,2025-09-17T14:47:42.358569,{},"{""demo_score"": 0.504411926197625, ""credo_score..."
1,3436627,e07c1df0-56cd-4d5f-bfd4-8deb3afdc95e,253826b7-8401-491b-9255-6666b4090cd4,0.640329,2025-09-17T14:47:42.359606,2025-09-17T14:47:42.359606,Cash_alpha_trench1_stackscore_backscore,v1,trench alpha beta,73670948-a5ee-4a13-9bae-d7c1d97ed84a,2025-09-17T14:47:42.359606,{},"{""demo_score"": 0.44742975606632246, ""credo_sco..."
2,3110665,bac4a185-04bf-4c90-b73e-2ee0d6707df4,1aa3cfe5-90a9-4576-8945-963175a12ab8,0.534009,2025-09-17T14:47:42.359606,2025-09-17T14:47:42.359606,Cash_alpha_trench1_stackscore_backscore,v1,trench alpha beta,c92c4da6-a20c-49e9-9119-6afeb99d8bc7,2025-09-17T14:47:42.359606,{},"{""demo_score"": 0.45383506885111236, ""credo_sco..."
3,3143585,cc03f3f1-4132-4ed7-9e14-8a096139acc6,a9e15222-37f9-4e67-8236-9c42bb7b16d2,0.24343,2025-09-17T14:47:42.359606,2025-09-17T14:47:42.359606,Cash_alpha_trench1_stackscore_backscore,v1,trench alpha beta,14a82631-400a-40ae-9ab3-fe48b74b3ca5,2025-09-17T14:47:42.359606,{},"{""demo_score"": 0.39505892294939876, ""credo_sco..."
4,3616924,f3bd7ba1-a1a9-4868-97ab-ad1a75799f60,9529b1fc-7a5c-4056-9105-c6425fa713fa,0.553493,2025-09-17T14:47:42.359606,2025-09-17T14:47:42.359606,Cash_alpha_trench1_stackscore_backscore,v1,trench alpha beta,85d36813-4625-49b7-835b-3853e05899bd,2025-09-17T14:47:42.359606,{},"{""demo_score"": 0.5709163633237098, ""credo_scor..."


In [264]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [265]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench1_stackscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_stackscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_stackscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_stackscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench1_stackscore_backscore.joblib


# Insert into a table

In [266]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=bc5149cc-cb8a-46ac-9cca-f6769b08b1c2>

# cash_alpha_trench2_applied_loans_backscored_20241001_20250831

# Table

In [268]:
schema1 = 'worktable_data_analysis'
cash_alpha_trench2 = f'cash_alpha_trench2_applied_loans_backscored_20241001_20250831'

# Query

In [269]:
sq = f"""
select * from {schema1}.{cash_alpha_trench2};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench2} table is:\t {d1.shape}")

Job ID 551b216b-e1a4-4b3e-8fc2-d08363aa94a5 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench2_applied_loans_backscored_20241001_20250831 table is:	 (39651, 60)


In [270]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsf

In [271]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
        'trench_category',  'ln_loan_type', 'ln_disb_dtime',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('demo_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench2_demoscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [272]:
transformeddata = f'Cash_alpha_trench2_demoscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench2_demoscore_backscore data is: (39651, 13)


In [273]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2278578,527a7986-f598-4498-8dbe-9f5baa0397ce,b7eded3d-1dad-4caf-8449-b9ebf3912fa5,0.591407,2025-09-17T14:51:05.774280,2025-09-17T14:51:05.774280,Cash_alpha_trench2_demoscore_backscore,v1,trench alpha beta,d33da567-6b47-4981-9c91-74243509ff71,2025-09-17T14:51:05.774280,{},"{""days_on_book"": 421, ""ln_appln_submit_datetim..."
1,2808330,18122fb1-947f-4004-9033-c4bdcbbca5e6,bdba29f7-e781-44db-adad-512f22cadbd6,0.464558,2025-09-17T14:51:05.775279,2025-09-17T14:51:05.775279,Cash_alpha_trench2_demoscore_backscore,v1,trench alpha beta,f5277671-e1e8-4912-9c00-452a4f149968,2025-09-17T14:51:05.775279,{},"{""days_on_book"": 40, ""ln_appln_submit_datetime..."
2,3403996,a087c785-8d47-49d7-83f1-896dfc0b007a,6bb7925e-7476-44ea-b6c3-8a447ccf1077,0.460365,2025-09-17T14:51:05.775279,2025-09-17T14:51:05.775279,Cash_alpha_trench2_demoscore_backscore,v1,trench alpha beta,2aa0d915-1dcd-45d8-bfda-1cc476f3331b,2025-09-17T14:51:05.775279,{},"{""days_on_book"": 415, ""ln_appln_submit_datetim..."
3,3172423,965de26f-3ab6-4992-b678-f7f6f8219736,bd8f89dc-d3ca-4d90-bb5d-7de1179495e3,0.583504,2025-09-17T14:51:05.775279,2025-09-17T14:51:05.775279,Cash_alpha_trench2_demoscore_backscore,v1,trench alpha beta,15ae0196-0a54-4378-8ddd-7f703edb60bc,2025-09-17T14:51:05.775279,{},"{""days_on_book"": 31, ""ln_appln_submit_datetime..."
4,2422495,59483165-7e09-4ab2-a038-c13577f02479,c0c4f4d5-16e6-4b59-a67f-c5dc78ebd4bf,0.433422,2025-09-17T14:51:05.775279,2025-09-17T14:51:05.775279,Cash_alpha_trench2_demoscore_backscore,v1,trench alpha beta,5d5f92fc-43b2-4b34-a1e5-a2e0b4176929,2025-09-17T14:51:05.775279,{},"{""days_on_book"": 241, ""ln_appln_submit_datetim..."


In [274]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [275]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench2_demoscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_demoscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_demoscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_demoscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_demoscore_backscore.joblib


# Insert into a table

In [276]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=9c368ec1-0a8f-4679-9157-24da35fc2f59>

# Trench2 app score

In [278]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsf

In [279]:
d2 = d1[d1['ln_os_type'] == 'Android'].copy()
d2['ln_os_type'].value_counts()

ln_os_type
Android    22572
Name: count, dtype: int64

In [280]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d2.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
          'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('apps_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench2_appscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [281]:
transformeddata = f'Cash_alpha_trench2_appscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench2_appscore_backscore data is: (22572, 13)


In [282]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2899910,7396933c-2a9d-44f5-b733-7a2729d069cc,96aa1f5f-ac04-4075-989a-aca2bc390548,0.47866,2025-09-17T14:53:56.745414,2025-09-17T14:53:56.745414,Cash_alpha_trench2_appscore_backscore,v1,trench alpha beta,67b7c995-ccf2-4662-a104-64aa93c3b779,2025-09-17T14:53:56.745414,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."
1,1837691,417f4154-8bdf-41fc-8599-9da2a02b3ae0,cff6b370-4f68-4d76-99bd-07c879fe6311,0.48773,2025-09-17T14:53:56.745414,2025-09-17T14:53:56.745414,Cash_alpha_trench2_appscore_backscore,v1,trench alpha beta,9a501f5d-ffa3-4ce5-804f-9225536ab50d,2025-09-17T14:53:56.745414,{},"{""ca_app_cnt_health_and_fitness_ever"": 4.0, ""c..."
2,1003738,6c0b5dda-1b78-4960-a912-58be3c0a23a9,e6a77ff0-8609-47d0-85f6-fc960223dc0f,0.377221,2025-09-17T14:53:56.745414,2025-09-17T14:53:56.745414,Cash_alpha_trench2_appscore_backscore,v1,trench alpha beta,41abcfe7-9066-4feb-9e4a-4c58faec4c24,2025-09-17T14:53:56.745414,{},"{""ca_app_cnt_health_and_fitness_ever"": 1.0, ""c..."
3,1762559,5a37b417-d7fc-482e-8a54-0743b93293b8,b0ea00c9-4009-4dc8-bf60-b2d127a76e5f,0.278161,2025-09-17T14:53:56.745414,2025-09-17T14:53:56.745414,Cash_alpha_trench2_appscore_backscore,v1,trench alpha beta,d5c29402-eb68-432c-81dc-92ada6d13276,2025-09-17T14:53:56.745414,{},"{""ca_app_cnt_health_and_fitness_ever"": 2.0, ""c..."
4,3074855,30e08b8f-93fd-47a2-b14b-028292e4d93a,617312fa-619a-466d-85fd-2b4b60bea524,0.637516,2025-09-17T14:53:56.745414,2025-09-17T14:53:56.745414,Cash_alpha_trench2_appscore_backscore,v1,trench alpha beta,b613eda1-b3b9-4168-ad44-0e43e415172f,2025-09-17T14:53:56.745414,{},"{""ca_app_cnt_health_and_fitness_ever"": 0.0, ""c..."


In [283]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [284]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench2_appscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_appscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_appscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_appscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_appscore_backscore.joblib


# Insert into a table

In [285]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=e2967157-d963-4d7c-a554-5aeb6df69784>

# Trench2 transaction score

In [288]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsf

In [289]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
           'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsflyer_install_to_registration_minutes',
       'ca_t2_tx_first_applied_loan_amount', 'ca_t2_tx_deposit_accnt_cnt',
       'ca_t2_tx_cnt_cash_in_total', 'ca_t2_tx_cnt_incomplete_loan_apps',
       'ca_t2_tx_amt_cash_in_total',
       'ca_t2_tx_last_applied_loan_tenor_bin_WOE', 
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('trx_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench2_transactionscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [290]:
transformeddata = f'trx_score'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the trx_score data is: (39651, 13)


In [291]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2278578,527a7986-f598-4498-8dbe-9f5baa0397ce,13f0964a-6869-4b4b-a3fd-930de4339193,0.544192,2025-09-17T14:55:42.054879,2025-09-17T14:55:42.054879,Cash_alpha_trench2_transactionscore_backscore,v1,trench alpha beta,e40333e4-ffe3-42ac-998a-10252b064ec4,2025-09-17T14:55:42.054879,{},"{""ca_t2_tx_meng_ql_calculator_tot_visit_cnt"": ..."
1,2808330,18122fb1-947f-4004-9033-c4bdcbbca5e6,7e08fdec-f841-4a60-9a29-53f854d554e0,0.1471,2025-09-17T14:55:42.054879,2025-09-17T14:55:42.054879,Cash_alpha_trench2_transactionscore_backscore,v1,trench alpha beta,990133d0-f66a-469a-95a4-487e8cd6c10f,2025-09-17T14:55:42.054879,{},"{""ca_t2_tx_meng_ql_calculator_tot_visit_cnt"": ..."
2,3403996,a087c785-8d47-49d7-83f1-896dfc0b007a,34b09b4a-1c1d-4c2f-9f5d-272d57e465f3,0.46388,2025-09-17T14:55:42.054879,2025-09-17T14:55:42.054879,Cash_alpha_trench2_transactionscore_backscore,v1,trench alpha beta,98a346af-395c-436d-bb97-7281193ead22,2025-09-17T14:55:42.054879,{},"{""ca_t2_tx_meng_ql_calculator_tot_visit_cnt"": ..."
3,3172423,965de26f-3ab6-4992-b678-f7f6f8219736,b81ad742-1b10-4c83-9c55-12512204931e,0.509618,2025-09-17T14:55:42.054879,2025-09-17T14:55:42.054879,Cash_alpha_trench2_transactionscore_backscore,v1,trench alpha beta,077524a9-32f8-427c-bf80-1459e81c2675,2025-09-17T14:55:42.054879,{},"{""ca_t2_tx_meng_ql_calculator_tot_visit_cnt"": ..."
4,2422495,59483165-7e09-4ab2-a038-c13577f02479,e8463b1b-8cff-4f0e-8581-0a8bf6340e71,0.432925,2025-09-17T14:55:42.054879,2025-09-17T14:55:42.054879,Cash_alpha_trench2_transactionscore_backscore,v1,trench alpha beta,55286ec1-6b9a-40a9-9457-0b74c72585f5,2025-09-17T14:55:42.054879,{},"{""ca_t2_tx_meng_ql_calculator_tot_visit_cnt"": ..."


In [292]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [293]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_trx_score
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_trx_score.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_trx_score.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_trx_score.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_trx_score.joblib


# Insert into a table

In [294]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=14f445c1-36ca-404e-ab75-20dce0c91e7e>

# Alpha trench2 cic score

In [296]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsf

In [297]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
           'ca_cic_max_age_all_contracts_snapshot',
       'ca_cic_ratio_overdue_contracts_to_granted_contracts',
       'ca_cic_ScoreRange', 'ca_cic_ln_loan_level_user_type',
       'ca_cic_has_ever_been_overdue',
       'ca_cic_latest_granted_contract_overdue_flag',
       'ca_cic_ratio_closed_over_new_granted_cnt_24M',
       'ca_cic_ratio_risky_contracts_to_granted_contracts',
       'ca_cic_Short_and_Term_Loans_granted_contracts_cnt_24M',
       'ca_cic_flg_zero_non_granted_ever',
       'ca_cic_Personal_Loans_granted_contracts_amt_24M',
       'ca_cic_CreditAvgCreditLimit', 'ca_cic_flg_zero_granted_ever',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('cic_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench2_cicscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [298]:
transformeddata = f'cic_score'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the cic_score data is: (39651, 13)


In [299]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2278578,527a7986-f598-4498-8dbe-9f5baa0397ce,b251a633-c0aa-45a8-af0a-eb86acdec954,0.74858,2025-09-17T14:57:43.189850,2025-09-17T14:57:43.189850,Cash_alpha_trench2_cicscore_backscore,v1,trench alpha beta,a53470e3-6523-465d-a8bb-302e997b23eb,2025-09-17T14:57:43.189850,{},"{""ca_cic_max_age_all_contracts_snapshot"": 730...."
1,2808330,18122fb1-947f-4004-9033-c4bdcbbca5e6,afe0ea99-3a76-4d82-9d30-892632a438bd,0.413837,2025-09-17T14:57:43.189850,2025-09-17T14:57:43.189850,Cash_alpha_trench2_cicscore_backscore,v1,trench alpha beta,07dac2de-2cfc-47cb-a41f-f6ae9b8a6b87,2025-09-17T14:57:43.189850,{},"{""ca_cic_max_age_all_contracts_snapshot"": 2646..."
2,3403996,a087c785-8d47-49d7-83f1-896dfc0b007a,81085882-8faf-422d-8a55-e46143a72e8b,0.506584,2025-09-17T14:57:43.189850,2025-09-17T14:57:43.189850,Cash_alpha_trench2_cicscore_backscore,v1,trench alpha beta,b747689c-4294-42ca-8e63-a5b4556f33bf,2025-09-17T14:57:43.189850,{},"{""ca_cic_max_age_all_contracts_snapshot"": 89.0..."
3,3172423,965de26f-3ab6-4992-b678-f7f6f8219736,670661d9-8ce1-4b87-ae9b-a9a6f6364247,0.487934,2025-09-17T14:57:43.189850,2025-09-17T14:57:43.189850,Cash_alpha_trench2_cicscore_backscore,v1,trench alpha beta,7674161c-2f61-4642-8fea-5530d2fcf6fd,2025-09-17T14:57:43.189850,{},"{""ca_cic_max_age_all_contracts_snapshot"": 471...."
4,2422495,59483165-7e09-4ab2-a038-c13577f02479,7fb1e9e2-82a5-4f9d-949e-93ab065b58ea,0.394879,2025-09-17T14:57:43.190841,2025-09-17T14:57:43.190841,Cash_alpha_trench2_cicscore_backscore,v1,trench alpha beta,69738c54-d9ab-4167-a944-8df48bdfff03,2025-09-17T14:57:43.190841,{},"{""ca_cic_max_age_all_contracts_snapshot"": 2085..."


In [300]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [301]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_cic_score
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cic_score.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cic_score.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cic_score.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_cic_score.joblib


# Insert into a table

In [302]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=70e2e58a-0f9d-4af1-b5af-40cbc39260eb>

# Alpha trench2 Stack Score

In [304]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'ca_app_cnt_health_and_fitness_ever',
       'ca_app_cnt_shopping_ever', 'ca_app_median_time_bw_installed_mins_ever',
       'ca_app_avg_time_bw_installed_mins_3d', 'ca_app_cnt_crypto_ever',
       'ca_app_cnt_driver_ever', 'ca_app_cnt_payday_180d',
       'ca_app_cnt_gambling_180d', 'ca_t2_tx_meng_ql_calculator_tot_visit_cnt',
       'ca_t2_tx_first_product_user_segment_WOE',
       'ca_t2_tx_first_applied_loan_type_bin_WOE',
       'ca_t2_tx_cnt_rejected_loans',
       'ca_t2_tx_appsf

In [305]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
           'demo_score', 'apps_score', 'credo_score', 'cic_score',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('stack_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench2_stackscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [306]:
transformeddata = f'Cash_alpha_trench2_stackscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench2_stackscore_backscore data is: (39651, 13)


In [307]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,2278578,527a7986-f598-4498-8dbe-9f5baa0397ce,825ed0bd-a4dd-4873-a310-7e8a94f5520e,0.867653,2025-09-17T15:00:35.056547,2025-09-17T15:00:35.056547,Cash_alpha_trench2_stackscore_backscore,v1,trench alpha beta,da0509d1-838d-4440-aff2-76dc5efb2fb8,2025-09-17T15:00:35.056547,{},"{""demo_score"": 0.5914070310856829, ""credo_scor..."
1,2808330,18122fb1-947f-4004-9033-c4bdcbbca5e6,a5609b89-4f52-4bd8-9f96-70ce47ffa767,0.203759,2025-09-17T15:00:35.056547,2025-09-17T15:00:35.056547,Cash_alpha_trench2_stackscore_backscore,v1,trench alpha beta,7efa9939-cbbd-4738-81d7-0338d36460e0,2025-09-17T15:00:35.056547,{},"{""demo_score"": 0.4645575141164843, ""credo_scor..."
2,3403996,a087c785-8d47-49d7-83f1-896dfc0b007a,073a2b2c-214c-4d31-bebf-cbf4d9a015cb,0.370856,2025-09-17T15:00:35.057561,2025-09-17T15:00:35.057561,Cash_alpha_trench2_stackscore_backscore,v1,trench alpha beta,f37bc94a-f2b8-4f7c-9dd3-91d1ece0a091,2025-09-17T15:00:35.057561,{},"{""demo_score"": 0.4603653023836028, ""credo_scor..."
3,3172423,965de26f-3ab6-4992-b678-f7f6f8219736,916943ba-252a-4463-b396-da90b20b848a,0.54,2025-09-17T15:00:35.057561,2025-09-17T15:00:35.057561,Cash_alpha_trench2_stackscore_backscore,v1,trench alpha beta,a75068a6-d36c-4441-ac06-231743b361c4,2025-09-17T15:00:35.057561,{},"{""demo_score"": 0.5835036923589388, ""credo_scor..."
4,2422495,59483165-7e09-4ab2-a038-c13577f02479,0f821426-b5bd-481f-9046-7f01814611da,0.228603,2025-09-17T15:00:35.057561,2025-09-17T15:00:35.057561,Cash_alpha_trench2_stackscore_backscore,v1,trench alpha beta,3122eac3-7a06-4277-9e5d-a9f10731dc46,2025-09-17T15:00:35.057561,{},"{""demo_score"": 0.43342198257276704, ""credo_sco..."


In [308]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [309]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench2_stackscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_stackscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_stackscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_stackscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench2_stackscore_backscore.joblib


# Insert into a table

In [310]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=707c1006-35b2-415e-bfa1-b5d2e9e68461>

# cash_alpha_trench3_applied_loans_backscored_20241001_20250831

# Table

In [312]:
schema1 = 'worktable_data_analysis'
cash_alpha_trench3 = f'cash_alpha_trench3_applied_loans_backscored_20241001_20250831'

# Query

In [313]:
sq = f"""
select * from {schema1}.{cash_alpha_trench3};
"""
d1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of {schema1}.{cash_alpha_trench3} table is:\t {d1.shape}")

Job ID c9ca6dd4-ffec-4d0a-963d-fd558986700c successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|
The shape of worktable_data_analysis.cash_alpha_trench3_applied_loans_backscored_20241001_20250831 table is:	 (16631, 49)


# Alpha trench3 Demo Score

In [314]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_decisio

In [315]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
           'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'trench_category',  'ln_loan_type', 'ln_disb_dtime',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('demo_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench3_demoscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [316]:
transformeddata = f'Cash_alpha_trench3_demoscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench3_demoscore_backscore data is: (16631, 13)


In [317]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,1897828,40801a80-b17a-4d51-adc9-748240e650ee,5f2e105a-b739-4888-a2d2-3cc6aee390d6,0.465872,2025-09-17T15:03:38.390258,2025-09-17T15:03:38.390258,Cash_alpha_trench3_demoscore_backscore,v1,trench alpha beta,0a0624c7-512e-459f-a6bc-40713d41d1b2,2025-09-17T15:03:38.390258,{},"{""days_on_book"": 721, ""ln_appln_submit_datetim..."
1,2441372,e993fa25-eefe-4863-82cf-472d90bd622e,4c7ad20b-64fd-4de6-8dec-dbe59e8535ba,0.488606,2025-09-17T15:03:38.391774,2025-09-17T15:03:38.391774,Cash_alpha_trench3_demoscore_backscore,v1,trench alpha beta,273a8442-b7f1-457e-8bc1-560b91969d80,2025-09-17T15:03:38.391774,{},"{""days_on_book"": 220, ""ln_appln_submit_datetim..."
2,2226781,34568e85-907c-4639-a859-7bc86932b853,7205d3a1-b1a2-4580-ae40-3bbbc3387d26,0.554495,2025-09-17T15:03:38.391774,2025-09-17T15:03:38.391774,Cash_alpha_trench3_demoscore_backscore,v1,trench alpha beta,3f9e2b82-95b5-411a-a4c8-e76e9988c4c2,2025-09-17T15:03:38.391774,{},"{""days_on_book"": 465, ""ln_appln_submit_datetim..."
3,2271357,580c38bc-ec05-4a48-89f4-ec02313afe6b,64631fb2-3101-42d8-b813-4273c89ed2e0,0.501559,2025-09-17T15:03:38.391774,2025-09-17T15:03:38.391774,Cash_alpha_trench3_demoscore_backscore,v1,trench alpha beta,47354609-3633-4e7c-a179-7c5d57bbccbf,2025-09-17T15:03:38.391774,{},"{""days_on_book"": 510, ""ln_appln_submit_datetim..."
4,2173280,a631ab67-e35e-48f0-9051-f84d9c4df5cb,331ca62c-7082-49c0-aa89-3bcf39a86d18,0.343457,2025-09-17T15:03:38.391774,2025-09-17T15:03:38.391774,Cash_alpha_trench3_demoscore_backscore,v1,trench alpha beta,e3c45100-310d-4dec-b762-9a63aaa9c1a8,2025-09-17T15:03:38.391774,{},"{""days_on_book"": 560, ""ln_appln_submit_datetim..."


In [318]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [319]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench3_demoscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_demoscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_demoscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_demoscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_demoscore_backscore.joblib


# Insert into a table

In [320]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=f7b8be61-aa5c-44af-a325-385301edef06>

# Alpha trench3 App Score

In [322]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_decisio

In [323]:
d2 = d1[d1['ln_os_type']=='Android']
d2['ln_os_type'].value_counts()

ln_os_type
Android    9547
Name: count, dtype: int64

In [325]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d2.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'c_app_cnt_absence_tag_365d_binned',
        'c_app_cnt_books_and_reference_ever_binned',
        'c_app_cnt_gaming_180d_binned',
        'c_app_cnt_health_and_fitness_ever_binned',
        'c_app_cnt_productivity_ever_binned',
        'c_app_cnt_rated_for_18plus_ever_binned',
        'c_app_last_payday_install_to_apply_days_binned',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('apps_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench3_appscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [326]:
transformeddata = f'Cash_alpha_trench3_appscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench3_appscore_backscore data is: (9547, 13)


In [327]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,1430364,a3954f49-ba98-4dc7-b0bd-140e55748c2f,7ed747d6-9681-43d4-82bf-917c0934714b,0.371943,2025-09-17T15:07:02.446906,2025-09-17T15:07:02.446906,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,c4ba8b8d-b2a9-4264-bd05-be151c9390ef,2025-09-17T15:07:02.446906,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."
1,2108297,4109130a-0c9b-4754-80e8-3c45060e14ce,780dbf38-3059-4520-bbcd-4b44d4303449,0.39547,2025-09-17T15:07:02.446906,2025-09-17T15:07:02.446906,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,043de4f1-f96f-4feb-ad0a-cc5ff10709b8,2025-09-17T15:07:02.446906,{},"{""c_app_cnt_absence_tag_365d_binned"": ""> 6.970..."
2,2484864,7b98aca7-c895-4672-a275-4acac0bdac90,b9c0c320-1721-4459-994c-c4542d243d73,0.395863,2025-09-17T15:07:02.446906,2025-09-17T15:07:02.446906,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,28de4eeb-798d-4387-b5d9-bdecd57b7157,2025-09-17T15:07:02.446906,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."
3,2392508,b1fa0959-b911-4f2c-971a-7faef8a48bde,302d7b46-fe26-491a-94bc-eb5dc8bd16ab,0.399264,2025-09-17T15:07:02.448175,2025-09-17T15:07:02.448175,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,45c5d081-40ee-4b47-830b-433eee777344,2025-09-17T15:07:02.448175,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."
4,1965519,4433f45a-ccae-4256-b65b-19a483ce83b0,ab3e0691-1e4d-4021-9eba-94de7e4c76fd,0.401831,2025-09-17T15:07:02.448175,2025-09-17T15:07:02.448175,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,b0227dbe-bba7-4066-b41b-61d227f7143d,2025-09-17T15:07:02.448175,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."


In [328]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [329]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench3_appscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_appscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_appscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_appscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_appscore_backscore.joblib


# Insert into a table

In [330]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=99ca6265-d0ec-4581-b8fe-bbb518d8fbfc>

# Alpha trench3 Transaction Score

In [332]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_decisio

In [335]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
        'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_decision',
       'ca_t3_tx_min_age_completed_loans', 'ca_t3_tx_dob_observation_date',
       'ca_t3_tx_cnt_jira_tickets_created_bin', 'ca_t3_tx_max_ever_dpd',
       'ca_t3_tx_amt_cash_in_total', 'ca_t3_tx_last_applied_loan_type_bin',
       'ca_t3_tx_cnt_completed_loans', 'ca_t3_tx_meng_no_of_logins',
       'ca_t3_tx_last_applied_loan_tenor',
       'ca_t3_tx_med_days_bt_cash_out_trans',
       'ca_t3_tx_avg_days_bt_cash_in_trans',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('trx_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench3_transactionscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [336]:
transformeddata = f'Cash_alpha_trench3_transactionscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench3_transactionscore_backscore data is: (16631, 13)


In [None]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,1430364,a3954f49-ba98-4dc7-b0bd-140e55748c2f,7ed747d6-9681-43d4-82bf-917c0934714b,0.371943,2025-09-17T15:07:02.446906,2025-09-17T15:07:02.446906,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,c4ba8b8d-b2a9-4264-bd05-be151c9390ef,2025-09-17T15:07:02.446906,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."
1,2108297,4109130a-0c9b-4754-80e8-3c45060e14ce,780dbf38-3059-4520-bbcd-4b44d4303449,0.39547,2025-09-17T15:07:02.446906,2025-09-17T15:07:02.446906,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,043de4f1-f96f-4feb-ad0a-cc5ff10709b8,2025-09-17T15:07:02.446906,{},"{""c_app_cnt_absence_tag_365d_binned"": ""> 6.970..."
2,2484864,7b98aca7-c895-4672-a275-4acac0bdac90,b9c0c320-1721-4459-994c-c4542d243d73,0.395863,2025-09-17T15:07:02.446906,2025-09-17T15:07:02.446906,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,28de4eeb-798d-4387-b5d9-bdecd57b7157,2025-09-17T15:07:02.446906,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."
3,2392508,b1fa0959-b911-4f2c-971a-7faef8a48bde,302d7b46-fe26-491a-94bc-eb5dc8bd16ab,0.399264,2025-09-17T15:07:02.448175,2025-09-17T15:07:02.448175,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,45c5d081-40ee-4b47-830b-433eee777344,2025-09-17T15:07:02.448175,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."
4,1965519,4433f45a-ccae-4256-b65b-19a483ce83b0,ab3e0691-1e4d-4021-9eba-94de7e4c76fd,0.401831,2025-09-17T15:07:02.448175,2025-09-17T15:07:02.448175,Cash_alpha_trench3_appscore_backscore,v1,trench alpha beta,b0227dbe-bba7-4066-b41b-61d227f7143d,2025-09-17T15:07:02.448175,{},"{""c_app_cnt_absence_tag_365d_binned"": ""4.182 \..."


In [337]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [338]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench3_transactionscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_transactionscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_transactionscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_transactionscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_transactionscore_backscore.joblib


# Insert into a table

In [339]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=33dfb9e8-7530-43c5-b1f9-4e0027551db9>

# Alpha trench3 Stack Score

In [341]:
d1.columns

Index(['customer_id', 'digitalLoanAccountId', 'days_on_book',
       'ln_appln_submit_datetime', 'ln_os_type', 'ln_vas_opted_flag',
       'ln_self_dec_income', 'ln_age', 'ln_source_funds_new_bin',
       'ln_loan_level_user_type', 'ln_industry_new_cat_bin',
       'ln_marital_status', 'ln_doc_type_rolled', 'ln_education_level',
       'ln_ref2_type', 'ln_email_primary_domain', 'ln_province_bin',
       'ln_mature_fspd30_flag', 'ln_fspd30_flag', 'trench_category',
       'ln_loan_type', 'ln_disb_dtime', 'c_app_cnt_absence_tag_365d_binned',
       'c_app_cnt_books_and_reference_ever_binned',
       'c_app_cnt_gaming_180d_binned',
       'c_app_cnt_health_and_fitness_ever_binned',
       'c_app_cnt_productivity_ever_binned',
       'c_app_cnt_rated_for_18plus_ever_binned',
       'c_app_last_payday_install_to_apply_days_binned',
       'ca_t3_tx_cnt_installments_paid_tot_with_dpd',
       'ca_t3_tx_time_since_last_applied_loan_application_time',
       'ca_t3_tx_last_applied_loan_decisio

In [342]:
import pandas as pd
import json
import uuid
from datetime import datetime

def transform_data(output_file_path):
    # Read the input CSV file
    df = d1.copy()
    
    # Create the output DataFrame with the required structure
    output_data = []
    
    for _, row in df.iterrows():
        # Create the calcFeature JSON with all the feature columns
        feature_columns = [
         'trx_score', 'demo_score',
       'apps_score', 'credo_score', 'cic_score',
        ]
        
        calc_feature = {}
        for col in feature_columns:
            if col in row and pd.notna(row[col]):
                # Convert Timestamp objects to string
                if isinstance(row[col], pd.Timestamp):
                    calc_feature[col] = row[col].isoformat()
                else:
                    calc_feature[col] = row[col]
        
       
        # Get current timestamp
        current_time = datetime.now().isoformat()
        
        # Create the output row
        output_row = {
            "customerId": row['customer_id'],
            "digitalLoanAccountId": row['digitalLoanAccountId'],
            "crifApplicationId": str(uuid.uuid4()),  # Generate random UUID
            "prediction": row.get('stack_score', 0),
            "start_time": current_time,
            "end_time": current_time,
            "modelDisplayName":"Cash_alpha_trench3_stackscore_backscore",
            "modelVersionId":"v1",
            "subscription_name": "trench alpha beta",
            "message_id": str(uuid.uuid4()),  # Generate random UUID
            "publish_time": current_time,
            "attributes": "{}",  # Empty JSON object
            "calcFeature": json.dumps(calc_feature, default=str)  # Use default=str to handle non-serializable objects
            
        }
        
        output_data.append(output_row)
    
    # Create DataFrame from the output data
    output_df = pd.DataFrame(output_data)
    
    # Save to CSV
    output_df.to_csv(output_file_path, index=False)
    return output_df

# Example usage:
# transformeddata = 'cash_beta_trench1_applied_loans_backscored_20241001_20250831'
# transform_data(f'{LOCALPATH}/{transformeddata}.csv')

In [343]:
transformeddata = f'Cash_alpha_trench3_stackscore_backscore'
dfd = transform_data(f'{LOCALPATH}/{transformeddata}.csv')
print(f"The shape of the {transformeddata} data is: {dfd.shape}")

The shape of the Cash_alpha_trench3_stackscore_backscore data is: (16631, 13)


In [344]:
dfd.head()

Unnamed: 0,customerId,digitalLoanAccountId,crifApplicationId,prediction,start_time,end_time,modelDisplayName,modelVersionId,subscription_name,message_id,publish_time,attributes,calcFeature
0,1897828,40801a80-b17a-4d51-adc9-748240e650ee,216c96ac-64c1-4556-8b12-76ffd8cfe2a8,0.529028,2025-09-17T15:56:51.739022,2025-09-17T15:56:51.739022,Cash_alpha_trench3_stackscore_backscore,v1,trench alpha beta,6f00f82e-5b11-4958-804e-c9b2df5215b9,2025-09-17T15:56:51.739022,{},"{""trx_score"": 0.08764952085666868, ""demo_score..."
1,2441372,e993fa25-eefe-4863-82cf-472d90bd622e,ad616f82-5f2a-4b53-b270-6e3f1bdf5a50,0.528184,2025-09-17T15:56:51.739022,2025-09-17T15:56:51.739022,Cash_alpha_trench3_stackscore_backscore,v1,trench alpha beta,edbb1a91-2293-4c1f-ab36-118a3a95449e,2025-09-17T15:56:51.739022,{},"{""trx_score"": 0.08848271844401237, ""demo_score..."
2,2226781,34568e85-907c-4639-a859-7bc86932b853,9bccdef4-b268-4f50-9051-26452784bb2f,0.512387,2025-09-17T15:56:51.739022,2025-09-17T15:56:51.739022,Cash_alpha_trench3_stackscore_backscore,v1,trench alpha beta,35bf5cb0-1355-42c8-a927-af6545438ec7,2025-09-17T15:56:51.739022,{},"{""trx_score"": 0.1259951787689817, ""demo_score""..."
3,2271357,580c38bc-ec05-4a48-89f4-ec02313afe6b,1ccdb9ef-3926-4251-a355-b0dcd2b35a0d,0.475434,2025-09-17T15:56:51.739022,2025-09-17T15:56:51.739022,Cash_alpha_trench3_stackscore_backscore,v1,trench alpha beta,74a6948a-246c-48f3-acf9-8f2d1742e437,2025-09-17T15:56:51.739022,{},"{""trx_score"": 0.07431168569877146, ""demo_score..."
4,2173280,a631ab67-e35e-48f0-9051-f84d9c4df5cb,be1f2d63-2536-4c0a-90ba-778524158042,0.420548,2025-09-17T15:56:51.739022,2025-09-17T15:56:51.739022,Cash_alpha_trench3_stackscore_backscore,v1,trench alpha beta,9aafdc8a-ab06-428c-91b6-157d7ea4a6b4,2025-09-17T15:56:51.739022,{},"{""trx_score"": 0.0485292243181622, ""demo_score""..."


In [345]:
df1 = dfd[['customerId', 'digitalLoanAccountId', 'prediction',
       'start_time', 'end_time', 'modelDisplayName', 'modelVersionId',
        'calcFeature'
       ]].copy()

In [346]:
filenames = f'{CURRENT_DATE}_{unique_id}_{transformeddata}'
print(filenames)

results = save_dataframe_multi_format(
     dataframe=df1,
     cloud_path=CLOUDPATH,
     filename=filenames,
     client=client,
     bucket_name=f'{BUCKETNAME}'
 )

20250917_36afc2cd41f7_Cash_alpha_trench3_stackscore_backscore
All files saved successfully!
CSV: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_stackscore_backscore.csv
PICKLE: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_stackscore_backscore.pkl
PARQUET: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_stackscore_backscore.parquet
JOBLIB: gs://prod-asia-southeast1-tonik-aiml-workspace/DC/Model_Monitoring/Model_Tables/20250917_36afc2cd41f7_Cash_alpha_trench3_stackscore_backscore.joblib


# Insert into a table

In [347]:
# Upload to BigQuery
table_id = f"prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data"
print(table_id)
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_APPEND",  # or "WRITE_APPEND"
)
job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()  # Wait for the job to complete


prj-prod-dataplatform.dap_ds_poweruser_playground.temp_model_transformed_data




LoadJob<project=prj-prod-dataplatform, location=asia-southeast1, id=99458cda-3993-4bfe-8293-85901a8e3091>

In [348]:
sq = """select modelDisplayName, count(distinct digitalLoanAccountId)cnt from dap_ds_poweruser_playground.temp_model_transformed_data group by 1 order by 1;"""

d2 = client.query(sq).to_dataframe()
d2



Unnamed: 0,modelDisplayName,cnt
0,Cash_alpha_trench1_appscorescore_backscore,28364
1,Cash_alpha_trench1_cicscore_backscore,62044
2,Cash_alpha_trench1_demoscore_backscore,62044
3,Cash_alpha_trench1_stackscore_backscore,62044
4,Cash_alpha_trench2_appscore_backscore,22572
5,Cash_alpha_trench2_cicscore_backscore,39651
6,Cash_alpha_trench2_demoscore_backscore,39651
7,Cash_alpha_trench2_stackscore_backscore,39651
8,Cash_alpha_trench2_transactionscore_backscore,39651
9,Cash_alpha_trench3_appscore_backscore,9547


# Merged final table

In [349]:
schema3 = 'dap_ds_poweruser_playground'
tab1 = 'temp_final_model_transformed_data'

In [351]:
sq = f"""
create or replace table {schema3}.{tab1} as
SELECT cast(customerId as numeric)customerId, digitalLoanAccountId, cast(prediction as string)prediction, start_time, end_time  
, modelDisplayName
, modelVersionId
, calcFeature
, 'ml_model_run_details' source
FROM `prj-prod-dataplatform.audit_balance.ml_model_run_details` 
qualify row_number() over(partition by customerId ,digitalLoanAccountId, modelDisplayName order by start_time
desc) = 1

union all 

select customerId, digitalLoanAccountId, cast(prediction as string)prediction, datetime(start_time)start_time, datetime(end_time) end_time 
, modelDisplayName
, modelVersionId
, calcFeature 
, 'Manual_Backscore_tables' source
from dap_ds_poweruser_playground.temp_model_transformed_data
;
"""
job = client.query(sq)
job.result()  # Wait for the job to complete.
time.sleep(5) # Delays for 30 seconds
print(f'Table {schema3}.{tab1} created successfully')



Table dap_ds_poweruser_playground.temp_final_model_transformed_data created successfully


In [352]:
sq = """select modelDisplayName, source, count(distinct digitalLoanAccountId) from dap_ds_poweruser_playground.temp_final_model_transformed_data
group by 1,2
order by 1,2"""

d3 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
d3

Job ID 1c1fcfab-6708-4628-9177-4ee46be49149 successfully executed: 100%|[32m██████████[0m|




Downloading: 100%|[32m██████████[0m|


Unnamed: 0,modelDisplayName,source,f0_
0,Alpha - IncomeEstimationModel,ml_model_run_details,48386
1,Alpha - CIC-SIL-Model,ml_model_run_details,48386
2,Alpha - StackingModel,ml_model_run_details,48386
3,Beta - AppsScoreModel,ml_model_run_details,68324
4,Beta - DemoScoreModel,ml_model_run_details,77223
5,Beta - IncomeEstimationModel,ml_model_run_details,77223
6,Beta - StackScoreModel,ml_model_run_details,77223
7,Cash_alpha_trench1_appscorescore_backscore,Manual_Backscore_tables,28364
8,Cash_alpha_trench1_cicscore_backscore,Manual_Backscore_tables,62044
9,Cash_alpha_trench1_demoscore_backscore,Manual_Backscore_tables,62044


In [None]:
# d1.to_csv(fr"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.csv", index = False)
# d1.to_parquet(fr"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.parquet")
# d1.to_pickle(fr"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.pkl")
# joblib.dump(d1, f"{LOCALPATH}\{CURRENT_DATE}_{unique_id}_{transformeddata}.joblib")