# <div align="center" style="color: #ff5733;">Income Estimation Regression Model (Catboost) Complete Data</div>

# Settings

In [1]:
def set_all_seeds(seed=42):
    """Seed the Python and NumPy RNGs and export determinism-related environment variables.

    Parameters
    ----------
    seed : int, default 42
        Value passed to ``random.seed`` and ``np.random.seed`` and written
        (as a string) to ``PYTHONHASHSEED``.

    Returns
    -------
    None

    Notes
    -----
    * Every environment variable is exported *before* NumPy is imported by
      this function. Thread-count variables are typically read once, when the
      BLAS/OpenMP runtime loads, so exporting them after the import can leave
      them ignored. If NumPy was already imported earlier in the process they
      may still have no effect -- call this before any other import.
    * ``PYTHONHASHSEED`` is read at interpreter start-up; assigning it here
      does not change string hashing in the running kernel, only in child
      processes that inherit the environment.
    """
    import os

    # Reproducibility switches (the TF_* names are TensorFlow settings; they
    # are harmless when TensorFlow is not used).
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    # Force single-thread numeric kernels; must precede the NumPy import below.
    for thread_var in ('OMP_NUM_THREADS', 'MKL_NUM_THREADS', 'OPENBLAS_NUM_THREADS'):
        os.environ[thread_var] = '1'

    # Only now import the libraries whose generators are seeded.
    import random
    import numpy as np

    np.random.seed(seed)
    random.seed(seed)

In [2]:
# Run before the library imports below: seeds `random`/NumPy and exports the
# determinism and single-thread environment variables with seed 42.
set_all_seeds(42)

# Beta2 Model

## Declare Libraries

In [5]:
# Import Libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error, 
    r2_score, 
    mean_absolute_percentage_error
)
from catboost import CatBoostRegressor, Pool
import catboost as cb
from xgboost import XGBRegressor
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
import shap
from statsmodels.stats.outliers_influence import variance_inflation_factor
from typing import Union, List
from scipy.stats import mstats
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
import time

import pickle
from datetime import datetime
import re
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from fuzzywuzzy import fuzz
import joblib
from google.cloud import storage
from google.cloud import bigquery
# Connection to Bigquery
# The client is reused by the query cells further down; the project id is hard-coded here.
client = bigquery.Client(project='prj-prod-dataplatform')

import tempfile

# Suppress warnings
# NOTE(review): 'ignore' silences every warning category for the whole session,
# including deprecation notices from pandas/sklearn.
import warnings
warnings.filterwarnings('ignore')

# Settings in this Notebook
# Render up to 1000 rows and every column when a DataFrame is displayed.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", None)





# Constants

In [6]:
# Constants
# Of these, MODELNAME, DATATYPE, VERSIONNAME and PRODUCT_TYPE are used in the
# log message printed after the main query; the remaining values are presumably
# consumed by later cells -- TODO confirm at the call sites.

# Bucket and object-path prefixes (presumably for the GCS helper functions -- confirm).
BUCKET_NAME = "prod-asia-southeast1-tonik-aiml-workspace"
CLOUDPATH = "Monthly_Income_Estimation/Income_Estimation_Models/Income_Estimation_Notebook/Note_Data_Book"
CLOUDPATH_TARGET = "Monthly_Income_Estimation/Income_Estimation_Models/Income_Estimation_Notebook/Artifacts"
DATATYPE = "Step2"
# NOTE(review): absolute path tied to one machine's home directory.
LOCALPATH = "/home/jupyter/Models/Income_Estimation_Models/Income_Estimation_Model/Income_Estimation_Model_Complete_code/"
MODELNAME = "Beta2WithOutApp"
VERSIONNAME = "2_0"
PRODUCT_TYPE = 'SIL_Quick'
# Date stamp in YYYYMMDD form, fixed at the moment this cell is executed.
CURRENT_DATE = datetime.now().strftime("%Y%m%d")

### Version Details

VERSIONNAME - 2_0

Uses the new column names and the new table that Bala created.

Table name -- To be added later

Date table created -- 

The helper functions below move data to and from Google Cloud Storage (GCS) and support data checks and model-evaluation plots.

### Functions

#### dfdescription

In [11]:
%%writefile dfdescription.py

def dfdescription(df):
    """Print a quick overview of ``df``: shape, per-column dtypes and ``describe()`` output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The three summaries are written to stdout, in that order.
    """
    # Each summary is computed immediately before it is printed.
    frame_shape = df.shape
    print(f"The shape of the data frame is :\t {frame_shape}")

    column_types = df.dtypes
    print(f"The data types of columns in dataframe is: \n{column_types}")

    numeric_summary = df.describe()
    print(f"The description of numerical columns is:\t {numeric_summary}")

Writing dfdescription.py


## add_column_prefix

In [12]:
%%writefile add_column_prefix.py

# The signature annotations are evaluated when the function is defined, so the
# module written by %%writefile must import these names itself.
import pandas as pd
from typing import List, Optional, Union


def add_column_prefix(df: pd.DataFrame,
                      prefix: str,
                      columns: Optional[Union[str, List[str]]] = None):
    """
    Add a prefix to specified columns in a DataFrame.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame whose columns need to be renamed
    prefix : str
        The prefix to be added to selected column names
    columns : str or list of str, optional
        The specific column(s) to add prefix to.
        If None, applies prefix to all columns.

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame with prefixed column names; ``df`` is not modified.

    Raises:
    -------
    ValueError
        If any requested column is not present in ``df``.

    Examples:
    ---------
    >>> data = pd.DataFrame({
    ...     'name': ['Alice', 'Bob'],
    ...     'age': [25, 30],
    ...     'city': ['New York', 'San Francisco']
    ... })
    >>>
    >>> # Add prefix to specific columns
    >>> prefixed_data = add_column_prefix(data, 'user_', ['name', 'age'])
    >>> print(prefixed_data.columns)
    Index(['user_name', 'user_age', 'city'], dtype='object')

    >>> # Add prefix to all columns
    >>> all_prefixed = add_column_prefix(data, 'user_')
    >>> print(all_prefixed.columns)
    Index(['user_name', 'user_age', 'user_city'], dtype='object')
    """
    # Normalise the selection to a list: all columns, one name, or the given names.
    if columns is None:
        selected = df.columns.tolist()
    elif isinstance(columns, str):
        selected = [columns]
    else:
        selected = list(columns)

    # Fail loudly on unknown names instead of silently skipping them.
    invalid_columns = set(selected) - set(df.columns)
    if invalid_columns:
        raise ValueError(f"Columns not found in DataFrame: {invalid_columns}")

    # rename() returns a new frame, so the caller's DataFrame is left untouched.
    return df.rename(columns={col: f"{prefix}{col}" for col in selected})


Writing add_column_prefix.py


## read_csv_from_gcs

In [13]:
%%writefile read_csv_from_gcs.py

def read_csv_from_gcs(project_id, bucket_name, file_path):
    """Load a CSV object stored in a GCS bucket as a pandas DataFrame.

    Args:
        project_id: Google Cloud project ID passed to the storage client.
        bucket_name: Name of the bucket that holds the object.
        file_path: Object path of the CSV inside the bucket.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the object's text stream.
    """
    csv_blob = storage.Client(project=project_id).bucket(bucket_name).blob(file_path)

    # The blob is opened in text mode and parsed directly from the stream.
    with csv_blob.open('r') as stream:
        return pd.read_csv(stream)



Writing read_csv_from_gcs.py


## save_df_to_gcs

In [14]:
%%writefile save_df_to_gcs.py

def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    The frame is serialised to a uniquely named temporary file, uploaded, and
    the temporary file is removed afterwards -- also when writing or uploading
    fails. No fixed ``temp.csv`` / ``temp.parquet`` is created in the working
    directory, so files with those names are never overwritten or deleted.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'.
    """
    import os
    import tempfile

    # Validate before touching the file system or the network.
    if file_format not in ('csv', 'parquet'):
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Keep the extension on the temp file, as the fixed temp.csv/temp.parquet names did.
    handle, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    os.close(handle)  # pandas reopens the path itself

    try:
        if file_format == 'csv':
            df.to_csv(temp_file, index=False)
        else:
            df.to_parquet(temp_file, index=False)

        # Upload the file to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(temp_file)
    finally:
        # Remove the temporary file even if serialisation or upload raised.
        os.remove(temp_file)
    


Writing read_csv_from_gcs


## Upload_blob

In [15]:
%%writefile upload_blob

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a GCS bucket and print a confirmation line.

    Args:
        bucket_name: The name of the bucket.
        source_file_name: The path to the file to upload.
        destination_blob_name: The name of the blob to be created.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    # The confirmation is printed only after the upload call has returned.
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")
    

Writing upload_blob


## upload_model_to_gcs

In [16]:
%%writefile upload_model_to_gcs.py

# Upload the model to GCS
def upload_model_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local model file to a GCS bucket (no console output).

    Args:
        bucket_name: The name of the GCS bucket.
        source_file_name: The path to the local model file.
        destination_blob_name: The name of the blob to be created in GCS.
    """
    model_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    model_blob.upload_from_filename(source_file_name)
    


Writing upload_blob.py


## plot_actual_vs_predicted

In [17]:
%%writefile plot_actual_vs_predicted.py

def plot_actual_vs_predicted(y_true, y_pred, title='Actual vs Predicted Values'):
    """
    Create a scatter plot of actual vs predicted values

    Both inputs are converted to NumPy float arrays first (as plot_residuals
    does), so plain lists and other array-likes without a ``.min()`` method
    are accepted.

    Parameters:
    - y_true: True target values (array-like, convertible to float)
    - y_pred: Predicted target values (array-like, convertible to float)
    - title: Plot title
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    plt.figure(figsize=(10, 6))
    plt.scatter(actual, predicted, alpha=0.5)

    # Reference diagonal (predicted == actual) spanning the range of the actual values.
    lowest, highest = actual.min(), actual.max()
    plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(title)
    plt.tight_layout()
    plt.show()

Writing plot_actual_vs_predicted.py


## plot_residuals

In [18]:
%%writefile plot_residuals.py

def plot_residuals(y_true, y_pred, title='Residual Plot'):
    """
    Scatter the residuals (actual minus predicted) against the predictions.

    Parameters:
    - y_true: True target values
    - y_pred: Predicted target values
    - title: Plot title
    """
    # Float arrays keep the subtraction working for mixed or object-typed inputs.
    actual = np.array(y_true, dtype=float)
    predicted = np.array(y_pred, dtype=float)
    errors = actual - predicted

    plt.figure(figsize=(10, 6))
    plt.scatter(predicted, errors, alpha=0.5)
    # Zero-error reference line across the span of the predictions.
    plt.hlines(
        y=0,
        xmin=predicted.min(),
        xmax=predicted.max(),
        color='r',
        linestyle='--',
    )
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title(title)
    plt.tight_layout()
    plt.show()
    


Writing plot_residuals.py


## plot_residuals_hist

In [19]:
%%writefile plot_residuals_hist.py

def plot_residuals_hist(y_true, y_pred, title='Residual Histogram Plot'):
    """
    Plot a histogram (with KDE overlay) of the residuals, actual minus predicted.

    Parameters:
    - y_true: True target values
    - y_pred: Predicted target values
    - title: Plot title (shown on the figure; previously a fixed title was
      used and this argument was ignored)
    """
    # Convert to numpy float arrays to ensure type compatibility
    y_true_float = np.array(y_true, dtype=float)
    y_pred_float = np.array(y_pred, dtype=float)

    residuals = y_true_float - y_pred_float

    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True)
    plt.title(title)
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.show()
    


Writing plot_residuals_hist


## plot_lift_chart  

In [21]:
%%writefile plot_lift_chart.py

def plot_lift_chart(y_test, y_pred, n_bins=10):
    """
    Plots a lift chart for a regression model.

    Rows are grouped into quantile bins of the predicted value, and the mean
    predicted and mean actual value of each bin are plotted against the
    1-based bin number.

    Parameters:
        y_test (array-like): Actual target values.
        y_pred (array-like): Predicted target values.
        n_bins (int): Number of bins/quantiles to group the data (default: 10).
            When tied predictions produce duplicate quantile edges, the
            duplicate edges are dropped and fewer bins are plotted.

    Returns:
        None: Displays the lift chart.
    """
    # Combine actual and predicted values into a DataFrame
    results = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred
    })

    # Create quantile-based bins; duplicates='drop' keeps tied predictions
    # from raising "Bin edges must be unique".
    results['Decile'] = pd.qcut(
        results['Predicted'], q=n_bins, labels=False, duplicates='drop'
    )

    # Group by decile and calculate mean actual and predicted values
    lift_chart_data = results.groupby('Decile').agg(
        Avg_Predicted=('Predicted', 'mean'),
        Avg_Actual=('Actual', 'mean')
    ).reset_index()

    # qcut codes start at 0; shift so the x-axis matches its "1-n" label.
    decile_numbers = lift_chart_data['Decile'] + 1

    # Plot the lift chart
    plt.figure(figsize=(10, 6))
    plt.plot(decile_numbers, lift_chart_data['Avg_Predicted'], label='Predicted', marker='o')
    plt.plot(decile_numbers, lift_chart_data['Avg_Actual'], label='Actual', marker='s')
    plt.title("Lift Chart")
    plt.xlabel(f"Decile (1-{len(lift_chart_data)})")
    plt.ylabel("Average Value")
    plt.legend()
    plt.grid()
    plt.show()



Writing plot_lift_chart.py


## plot_gain_chart

In [22]:
%%writefile plot_gain_chart.py

def plot_gain_chart(y_test, y_pred, n_bins=10):
    """
    Draw cumulative gain curves for a regression model.

    Rows are ordered from the highest to the lowest prediction; the running
    share of the predicted total and of the actual total is plotted against
    the share of rows covered.

    Parameters:
        y_test (array-like): Actual target values.
        y_pred (array-like): Predicted target values.
        n_bins (int): Accepted for signature parity with the lift chart; it is
            not used in the computation (default: 10).

    Returns:
        None: Displays the gain chart.
    """
    # Pair the values and rank rows by prediction, largest first.
    ranked = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred
    })
    ranked = ranked.sort_values(by='Predicted', ascending=False).reset_index(drop=True)

    # Running totals, then their share of the overall total in percent.
    for source in ('Actual', 'Predicted'):
        ranked[f'Cumulative_{source}'] = ranked[source].cumsum()
    for source in ('Actual', 'Predicted'):
        ranked[f'Cumulative_{source}_Percent'] = (
            ranked[f'Cumulative_{source}'] / ranked[source].sum() * 100
        )

    # Share of rows covered after each row: 1/n, 2/n, ..., 1 (in percent).
    row_count = len(ranked)
    ranked['Percentage_of_Data'] = np.linspace(1 / row_count, 1, row_count) * 100

    coverage = ranked['Percentage_of_Data']
    plt.figure(figsize=(10, 6))
    plt.plot(coverage, ranked['Cumulative_Predicted_Percent'], label='Predicted', marker='o')
    plt.plot(coverage, ranked['Cumulative_Actual_Percent'], label='Actual', marker='s')
    plt.title("Gain Chart")
    plt.xlabel("Percentage of Data")
    plt.ylabel("Cumulative Percentage")
    plt.legend()
    plt.grid()
    plt.show()



Overwriting plot_lift_chart.py


## save_df_to_gcs

In [23]:
%%writefile save_df_to_gcs.py

def save_df_to_gcs(df, bucket_name, destination_blob_name, file_format='csv'):
    """Saves a pandas DataFrame to Google Cloud Storage.

    The frame is serialised to a uniquely named temporary file, uploaded, and
    the temporary file is removed afterwards -- also when writing or uploading
    fails. No fixed ``temp.csv`` / ``temp.parquet`` is created in the working
    directory, so files with those names are never overwritten or deleted.

    Args:
        df: The pandas DataFrame to save.
        bucket_name: The name of the GCS bucket.
        destination_blob_name: The name of the blob to be created.
        file_format: The file format to save the DataFrame in ('csv' or 'parquet').

    Raises:
        ValueError: If ``file_format`` is neither 'csv' nor 'parquet'.
    """
    import os
    import tempfile

    # Validate before touching the file system or the network.
    if file_format not in ('csv', 'parquet'):
        raise ValueError("Invalid file format. Please choose 'csv' or 'parquet'.")

    # Keep the extension on the temp file, as the fixed temp.csv/temp.parquet names did.
    handle, temp_file = tempfile.mkstemp(suffix=f'.{file_format}')
    os.close(handle)  # pandas reopens the path itself

    try:
        if file_format == 'csv':
            df.to_csv(temp_file, index=False)
        else:
            df.to_parquet(temp_file, index=False)

        # Upload the file to GCS
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_filename(temp_file)
    finally:
        # Remove the temporary file even if serialisation or upload raised.
        os.remove(temp_file)
    


Writing save_df_to_gcs.py


In [24]:
%%writefile check_categorical_columns.py

## check_categorical_columns

def check_categorical_columns(df, categorical_cols):
    """
    Print a diagnostic for each column expected to be categorical.

    For every column the function reports one of: the column is not of dtype
    'object' (with its dtype and unique values), it contains NaN values, or
    it seems fine. It then separately reports any int/float values stored in
    the column. NaN is a float but marks a missing value, so it is excluded
    from that numeric report.

    Parameters:
    - df: DataFrame to inspect
    - categorical_cols: iterable of column names to check (each must exist in df)

    Returns:
    - None; findings are printed to stdout
    """
    import math

    def _is_numeric_value(value):
        # Missing values are reported by the NaN branch, not as numeric content.
        if isinstance(value, float) and math.isnan(value):
            return False
        return isinstance(value, (int, float))

    for col in categorical_cols:
        column = df[col]

        if column.dtype != 'object':  # Check if the column is not of type 'object'
            print(f"Column {col} is not of type 'object'. It has type: {column.dtype}")
            print(f"Unique values in {col}: {column.unique()}")
        elif column.isnull().any():  # Check if the column contains NaN values
            print(f"Column {col} contains NaN values.")
        else:
            print(f"Column {col} seems fine.")

        # Check for numerical data in categorical columns
        numerical_data = column[column.apply(_is_numeric_value)]
        if not numerical_data.empty:
            print(f"Column {col} contains numerical data: {numerical_data.unique()}")

# # List of categorical columns
# categorical_cols = ['de_gender', 'de_maritalStatus', 'de_city', 'de_barangay', 'de_province',
#                     'de_dependentsCount', 'de_subIndustryDescription', 'de_Education_type',
#                     'deviceType', 'osversion_v2', 'brand', 'app_first_app_cat',
#                     'app_last_app_cat', 'de_natureofwork_grouped']


Writing check_categorical_columns.py


## load_pickle_from_gcs

In [25]:
%%writefile load_pickle_from_gcs.py

def load_pickle_from_gcs(bucket_name, blob_path):
    """
    Load pickle file from Google Cloud Storage

    The object is downloaded fully into memory and then unpickled.

    SECURITY: unpickling can execute arbitrary code embedded in the payload.
    Only call this for objects written by a trusted pipeline into a bucket
    with restricted write access.

    Parameters:
    bucket_name: Name of the GCS bucket
    blob_path: Path to the blob in the bucket

    Returns:
    Unpickled data
    """
    from google.cloud import storage
    import pickle

    # Initialize GCS client and resolve the object
    storage_client = storage.Client()
    blob = storage_client.bucket(bucket_name).blob(blob_path)

    # Download blob content into memory
    content = blob.download_as_bytes()

    # Load pickle data from memory (see the security note in the docstring)
    return pickle.loads(content)


Writing load_pickle_from_gcs.py


# Data preparation and preprocessing

## Query for Other Features

In [20]:
# sq = """
# with 
# educate as 
# (select distinct edu.digitalLoanAccountId, edu.education_id, edu1.description
# from `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_purpose` edu
# inner join (select id, description from dl_loans_db_raw.tdbk_loan_lov_mtb where module = 'Education') edu1 on edu.education_id = edu1.id
# ),
# educate2 as 
# (select *, row_number() over(partition by digitalLoanAccountId order by education_id desc) rnk from educate),
# educate3 as 
# (select * from educate2 where rnk = 1),
# base as 
# (select 
# b.*,
# CAST(
#             CASE 
#                 WHEN LOWER(b.osversion_v2) LIKE 'android%' THEN 
#                     -- Extract just the first number for android
#                     CAST(SPLIT(REGEXP_EXTRACT(LOWER(b.osversion_v2), r'android(.+)'), '.')[OFFSET(0)] AS FLOAT64)
#                 WHEN LOWER(b.osversion_v2) LIKE 'ios%' THEN
#                     -- Extract just the first number for ios
#                     CAST(SPLIT(REGEXP_EXTRACT(LOWER(b.osversion_v2), r'ios(.+)'), '.')[OFFSET(0)] AS FLOAT64)
#                 ELSE 
#                     CAST(SPLIT(b.osversion_v2, '.')[OFFSET(0)] AS FLOAT64)
#             END AS FLOAT64
#         ) as clean_version,
#   CASE 
#     WHEN DATE_TRUNC(b.decision_date, DAY) BETWEEN '2023-07-01' AND '2024-07-31' THEN 'Train'
#     WHEN DATE_TRUNC(b.decision_date, DAY) BETWEEN '2024-08-01' AND '2024-08-31' THEN 'Test'
#     WHEN DATE_TRUNC(b.decision_date, DAY) BETWEEN '2024-09-01' AND '2024-09-30' THEN 'OOT_SEP_24'
#     WHEN DATE_TRUNC(b.decision_date, DAY) BETWEEN '2024-10-01' AND '2024-10-31' THEN 'OOT_OCT_24'
#     WHEN DATE_TRUNC(b.decision_date, DAY) BETWEEN '2024-11-01' AND '2024-11-30' THEN 'OOT_NOV_24'
#     WHEN DATE_TRUNC(b.decision_date, DAY) BETWEEN '2024-12-01' AND '2024-12-31' THEN 'OOT_DEC_24'
# END AS Dataselection,
# lmt.loanAccountNumber,
# lmt.maritalStatus,
# lmt.dependentsCount,
# lmt.new_loan_type,
# educate3.description Education_type,
# from worktable_data_analysis.beta2_loan_details_jan2023_dec2024 b
# inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountid = b.digitalLoanAccountId
# left join educate3 on educate3.digitalLoanAccountId = b.digitalLoanAccountId
# where b.digitalLoanAccountId is not null
# and coalesce(lmt.Max_Ever_DPD, 0) < 10
# AND lmt.new_loan_type like'%SIL%'
# AND DATE_TRUNC(lmt.termsAndConditionsSubmitDateTime, DAY) >= '2023-07-01'
# AND DATE(lmt.thirdDueDate) <= CURRENT_DATE()
# AND lmt.flagDisbursement = 1
# AND b.user_type in ('2_New Applicant', '1_Repeat Applicant')
# )
# select 
# base.cust_id,
# base.digitalLoanAccountId,
# base.onboarding_datetime,
# base.first_name,
# base.middle_name,
# base.last_name,
# base.age,
# base.gender,
# base.email,
# base.onb_mobile_no,
# base.onb_city,
# base.onb_province,
# base.onb_postalcode,
# base.onb_barangay,
# base.place_of_birth,
# base.source_funds_new source_funds,
# base.employment_type_new employment_type,
# base.nature_of_work_new nature_of_work,
# base.industry_description_new  industry_description,
# base.onb_document_type,
# base.loan_company_name,
# base.onb_latitude,
# base.onb_longitude,
# base.loan_type,
# base.user_type,
# base.loan_geolocation,
# base.loan_docType,
# base.loan_docNumber,
# base.loan_province,
# base.loan_city,
# base.loan_postalcode,
# base.osversion_v2,
# base.decision_date,
# base.disbursementDateTime,
# date_diff(disbursementDateTime, onboarding_datetime, day) daystoapply,
# base.clean_version,
# base.Dataselection,
# base.loanAccountNumber,
# base.Brand,
# base.maritalStatus,
# base.dependentsCount,
# base.Education_type,
# case when cast(base.loan_monthly_income as numeric) > 300000 then 300000 else cast(base.loan_monthly_income as numeric) end as loan_monthly_income,
# base.loan_monthly_income monthlyIncome
# from base 
# ;
# """
# data = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
# print(f"The shape of the {MODELNAME}_{DATATYPE}_{VERSIONNAME}_{PRODUCT_TYPE} data is:\t{data.shape}")

In [26]:
# Schema peek: fetch 10 rows of the loan-details table only to list its
# current column names (the cell's displayed value is the column-name array).
sq = """select * from  worktable_data_analysis.beta2_loan_details_jan2023_dec2024 limit 10;"""
dummydf = client.query(sq).to_dataframe()
dummydf.columns.values

array(['customerId', 'onb_email', 'onb_email_verified_flag', 'first_name',
       'middle_name', 'last_name', 'onb_city', 'onb_province',
       'onb_postalcode', 'onb_barangay', 'onb_country',
       'onb_place_of_birth', 'ln_age', 'ln_gender', 'onb_latitude',
       'onb_longitude', 'onb_cnt_ongoing_loans',
       'onb_tot_ongoing_loans_emi', 'digitalLoanAccountId',
       'loanAccountNumber', 'onb_tsa_onboarding_datetime',
       'onb_mobile_no', 'kyc_status', 'kyc_status_upgrade_datetime',
       'ln_appln_submit_datetime', 'ln_mobile_no',
       'ln_alternate_mobile_no', 'ln_purpose', 'ln_disb_dtime',
       'ln_source_funds', 'ln_source_funds_new', 'ln_employment_type',
       'ln_employment_type_new', 'ln_nature_of_work',
       'ln_nature_of_work_new', 'ln_industry', 'ln_industry_new',
       'ln_company_name', 'ln_marital_status', 'ln_dependents_count',
       'ln_education_level', 'ln_ref1_type', 'ln_ref2_type',
       'ln_address_line', 'ln_province', 'ln_city', 'ln_barangay

In [22]:
sq = """
with 
educate as 
(select distinct edu.digitalLoanAccountId, edu.education_id, edu1.description
from `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_purpose` edu
inner join (select id, description from dl_loans_db_raw.tdbk_loan_lov_mtb where module = 'Education') edu1 on edu.education_id = edu1.id
),
educate2 as 
(select *, row_number() over(partition by digitalLoanAccountId order by education_id desc) rnk from educate),
educate3 as 
(select * from educate2 where rnk = 1),
base as 
(select 
b.customerId, b.onb_email, b.onb_email_verified_flag,
       b.onb_place_of_birth, b.age, b.onb_latitude, b.onb_longitude,
       b.onb_cnt_ongoing_loans, b.onb_tot_ongoing_loans_emi,
       b.digitalLoanAccountId, b.loanAccountNumber, b.onboarding_datetime,
       b.onb_mobile_no, b.loan_mobile_no, b.loan_alternate_mobile_no,
       b.loan_purpose, b.loan_disbursementDateTime, b.loan_source_funds,
       b.loan_source_funds_new, b.loan_employment_type,
       b.loan_employment_type_new, b.loan_nature_of_work,
       b.loan_nature_of_work_new, b.loan_industry_description,
       b.loan_industry_description_new, b.loan_companyName,
       b.loan_marital_status, b.loan_dependents_count,
       b.loan_education_level, b.loan_ref_type1, b.loan_ref_type2,
       b.loan_addressline, b.loan_province, b.loan_city, b.loan_barangay,
       b.loan_postalcode, b.loan_geolocation, b.loan_docType,
       b.loan_docNumber, b.loan_type, b.loan_product_type,
       b.loan_osversion_v2, b.loan_brand, b.loan_self_dec_income,
       b.loan_salary_scaled_income, b.loan_vas_opted_flag
,
CAST(
            CASE 
                WHEN LOWER(b.loan_osversion_v2) LIKE 'android%' THEN 
                    -- Extract just the first number for android
                    CAST(SPLIT(REGEXP_EXTRACT(LOWER(b.loan_osversion_v2), r'android(.+)'), '.')[OFFSET(0)] AS FLOAT64)
                WHEN LOWER(b.loan_osversion_v2) LIKE 'ios%' THEN
                    -- Extract just the first number for ios
                    CAST(SPLIT(REGEXP_EXTRACT(LOWER(b.loan_osversion_v2), r'ios(.+)'), '.')[OFFSET(0)] AS FLOAT64)
                ELSE 
                    CAST(SPLIT(b.loan_osversion_v2, '.')[OFFSET(0)] AS FLOAT64)
            END AS FLOAT64
        ) as clean_version,
  CASE 
    WHEN DATE_TRUNC(lmt.decision_date, DAY) BETWEEN '2023-07-01' AND '2024-07-31' THEN 'Train'
    WHEN DATE_TRUNC(lmt.decision_date, DAY) BETWEEN '2024-08-01' AND '2024-08-31' THEN 'Test'
    WHEN DATE_TRUNC(lmt.decision_date, DAY) BETWEEN '2024-09-01' AND '2024-09-30' THEN 'OOT_SEP_24'
    WHEN DATE_TRUNC(lmt.decision_date, DAY) BETWEEN '2024-10-01' AND '2024-10-31' THEN 'OOT_OCT_24'
    WHEN DATE_TRUNC(lmt.decision_date, DAY) BETWEEN '2024-11-01' AND '2024-11-30' THEN 'OOT_NOV_24'
    WHEN DATE_TRUNC(lmt.decision_date, DAY) BETWEEN '2024-12-01' AND '2024-12-31' THEN 'OOT_DEC_24'
END AS Dataselection,
lmt.disbursementDateTime,
lmt.new_loan_type,
lmt.Gender,
from worktable_data_analysis.beta2_loan_details_jan2023_dec2024 b
inner join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountid = b.digitalLoanAccountId
left join educate3 on educate3.digitalLoanAccountId = b.digitalLoanAccountId
where b.digitalLoanAccountId is not null
and coalesce(lmt.Max_Ever_DPD, 0) < 10
AND (upper(lmt.new_loan_type) like '%SIL%' or upper(lmt.new_loan_type) like '%QUICK%')
AND DATE_TRUNC(lmt.termsAndConditionsSubmitDateTime, DAY) >= '2023-07-01'
AND DATE(lmt.thirdDueDate) <= CURRENT_DATE()
AND lmt.flagDisbursement = 1
-- AND b.user_type in ('2_New Applicant', '1_Repeat Applicant')
)
select 
base.customerId cust_id,
base.digitalLoanAccountId,
base.loanAccountNumber,
base.onboarding_datetime,
base.age,
base.Gender,
base.onb_email email,
base.onb_mobile_no onb_mobile_no,
base.loan_mobile_no,
Case when coalesce(base.onb_mobile_no, '0') = coalesce(base.loan_mobile_no, '0') then 0 else 1 end onb_mobile_Not_match_loan_mobile,
case when loan_alternate_mobile_no is null then 1 else 0 end flag_alternate_mobile_provided,
base.loan_purpose,
base.loan_source_funds_new source_funds,
base.loan_employment_type_new employment_type,
base.loan_nature_of_work_new nature_of_work,
base.loan_industry_description_new industry_description,
base.loan_companyName loan_company_name,
base.loan_marital_status maritalStatus,
base.loan_dependents_count dependentsCount,
base.loan_education_level,
base.loan_ref_type1,
base.loan_ref_type2,
base.loan_province, 
base.loan_city,
base.loan_barangay,
base.loan_postalcode,
base.loan_geolocation,
base.loan_product_type,
base.loan_osversion_v2 osversion_v2,
base.clean_version,
base.loan_brand,
base.loan_self_dec_income monthlyIncome,
case when cast(base.loan_self_dec_income as numeric) > 300000 then 300000 else cast(base.loan_self_dec_income as numeric) end as loan_monthly_income,
base.loan_vas_opted_flag,
base.Dataselection,
base.onb_place_of_birth place_of_birth,
base.onb_latitude,
base.onb_longitude,
base.loan_type,
base.loan_docType,
base.loan_docNumber,
date_diff(disbursementDateTime, onboarding_datetime, day) daystoapply,
from base 
;
"""
# Run the query held in `sq` on BigQuery, load the result into `data`, and log its shape.
# NOTE(review) on the query text above -- confirm before relying on these columns:
#   * `flag_alternate_mobile_provided` is 1 when loan_alternate_mobile_no IS NULL,
#     which looks inverted relative to the column name.
#   * `educate3` is left-joined but none of its columns is selected.
#   * The schema-peek cell lists columns such as `ln_age` / `ln_gender`, while this
#     query selects `b.age`, `b.onboarding_datetime`, `b.loan_*`; the table may have
#     been renamed since this cell last ran (its saved output shows version "1_0").
data = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The shape of the {MODELNAME}_{DATATYPE}_{VERSIONNAME}_{PRODUCT_TYPE} data is:\t{data.shape}")

Job ID 078face0-368b-45bc-8e28-5530efeb449d successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
The shape of the Beta2WithOutApp_Step2_1_0_SIL_Quick data is:	(103926, 42)


In [23]:
# Independent copy holding the loan id plus the two income columns selected by
# the query (`loan_monthly_income` is capped at 300000 there; `monthlyIncome` is as declared).
lmi_df = data.loc[
    :, ['digitalLoanAccountId', 'loan_monthly_income', 'monthlyIncome']
].copy()
print(f"The shape of the dataframe after selecting only digitalLoanAccountId and loan_monthly_income is:\t {lmi_df.shape}")

The shape of the dataframe after selecting only digitalLoanAccountId and loan_monthly_income is:	 (103926, 3)
