In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

In [2]:
sq = """select * from prj-prod-dataplatform.risk_credit_cic_data.employment_data limit 100;"""

test = client.query(sq).to_dataframe()
test.columns

Index(['digitalLoanAccountId', 'crifApplicationId', 'customerId',
       'processEngineGuid', 'requestGuid', 'LastUpdateDate',
       'AnnualMonthlyIndicator', 'Currency', 'DateHiredFrom', 'DateHiredTo',
       'GrossIncome', 'Occupation', 'OccupationStatus', 'PhoneNumber', 'PSIC',
       'TIN', 'CurrencyDesc', 'OccupationDesc', 'OccupationStatusDesc',
       'PSICDesc', 'TradeName'],
      dtype='object')

In [3]:
test['PSICDesc'].unique()

array([None, '479 - Retail trade not in stores, stalls or markets',
       '56103 - Cafeterias', '4220 - Construction of utility projects',
       '55101 - Hotels and motels', '651 - Insurance',
       '64199 - Banking activities, n.e.c.',
       '7911 - Travel agency activities',
       '822 - Call centers and other related activities',
       '8560 - Educational support services',
       '84113 - Public administration, local government',
       '61201 - Wireless landline services',
       '842 - Provision of services to the community as a whole'],
      dtype=object)

In [4]:
pd.set_option("display.max_rows", None)

sq = """SELECT 
  digitalLoanAccountId, 
  crifApplicationId, 
  customerId,   
  AnnualMonthlyIndicator, 
  Currency, 
  DateHiredFrom, 
  DateHiredTo, 
  GrossIncome,
  CAST(
    CASE 
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'M' THEN CAST(COALESCE(GrossIncome, '0') AS NUMERIC)
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'Y' THEN ROUND(CAST(COALESCE(GrossIncome, '0') AS NUMERIC)/12, 0)
      ELSE 0 
    END AS INT64
  ) AS MonthlyIncome,
  CAST(
    CASE 
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'M' THEN ROUND(CAST(COALESCE(GrossIncome, '0') AS NUMERIC)*12, 0)
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'Y' THEN CAST(COALESCE(GrossIncome, '0') AS NUMERIC)
      ELSE 0 
    END AS INT64
  ) AS AnnualIncome,    
  OccupationDesc,
  OccupationStatusDesc,
  PSIC, 
  REGEXP_REPLACE(PSICDesc, r'^\d+\s*-\s*', '') AS PSICDesc 
FROM prj-prod-dataplatform.risk_credit_cic_data.employment_data;"""

employmentdata = client.query(sq).to_dataframe(progress_bar_type='tqdm')
employmentdata.columns

Job ID 02ac5958-b07f-43b9-ae73-011dea2dd882 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Index(['digitalLoanAccountId', 'crifApplicationId', 'customerId',
       'AnnualMonthlyIndicator', 'Currency', 'DateHiredFrom', 'DateHiredTo',
       'GrossIncome', 'MonthlyIncome', 'AnnualIncome', 'OccupationDesc',
       'OccupationStatusDesc', 'PSIC', 'PSICDesc'],
      dtype='object')

In [5]:
employmentdata.sample(10)

Unnamed: 0,digitalLoanAccountId,crifApplicationId,customerId,AnnualMonthlyIndicator,Currency,DateHiredFrom,DateHiredTo,GrossIncome,MonthlyIncome,AnnualIncome,OccupationDesc,OccupationStatusDesc,PSIC,PSICDesc
95403,96fc5822-f231-4ac7-81f9-35a81004a8b7,tonik-tul-320680,2188184,,,,,,0,0,,,,
139494,c6fc3dca-118c-453a-8eba-616151088088,tonik-tul-387712,2306620,,,,,,0,0,,,,
144496,f433982f-650f-4788-a469-4d7cf9b522e8,tonik-tul-334265,1230517,,,,,,0,0,,,,
57746,bea4c771-84d6-481c-a966-16c8eee4346d,tonik-bnpl-514672,2325351,,,,,,0,0,,,,
23277,fbc50091-95fd-48d3-a9a8-8d58cd8853ff,taran-2693871-4559226,2693871,,,,,,0,0,,,,
173767,e317db8e-4087-4d88-be08-e07a83711ed6,tonik-bnpl-413274,2345534,,,,,,0,0,,Permanent Job (Private sector),,
170601,5cba493e-78e2-468f-ba2e-35ee1601483b,tonik-bnpl-534447,2537870,,,,,,0,0,,,,
235973,ca201ac9-ce62-4c51-9533-651ab1ff9dc0,taran-2634767-5820157,2634767,,,,,,0,0,,,,
212143,a48f981c-3f65-4d2d-b9eb-06471e67c073,tonik-bnpl-397984,2322869,,,,,,0,0,,,,
173183,f6c60c46-a1ec-402d-baed-fcca29e39199,tonik-bnpl-414083,2346479,,,,,,0,0,,,,


In [6]:
# Convert 'GrossIncome' to numeric, coercing errors to NaN
employmentdata['GrossIncome'] = pd.to_numeric(employmentdata['GrossIncome'], errors='coerce')

# Fill NA with 0
employmentdata['GrossIncome'] = employmentdata['GrossIncome'].fillna(0)

# Convert to int64
employmentdata['GrossIncome'] = employmentdata['GrossIncome'].astype(np.int64)

# Filter and describe
result = employmentdata['GrossIncome'][employmentdata['AnnualMonthlyIndicator'] == 'M'].describe()

# Format the output to display whole numbers
result_formatted = result.apply(lambda x: f"{x:.0f}")

result_formatted

count         18085
mean         540144
std        68301309
min               0
25%           12000
50%           20000
75%           36000
max      9185214474
Name: GrossIncome, dtype: object

In [7]:
# Convert 'GrossIncome' to numeric, coercing errors to NaN
employmentdata['GrossIncome'] = pd.to_numeric(employmentdata['GrossIncome'], errors='coerce')

# Fill NA with 0
employmentdata['GrossIncome'] = employmentdata['GrossIncome'].fillna(0)

# Convert to int64
employmentdata['GrossIncome'] = employmentdata['GrossIncome'].astype(np.int64)

# Filter and describe
result = employmentdata['GrossIncome'][employmentdata['AnnualMonthlyIndicator'] == 'Y'].describe()

# Format the output to display whole numbers
result_formatted = result.apply(lambda x: f"{x:.0f}")

result_formatted

count       10470
mean       349254
std        463915
min             0
25%        205869
50%        279000
75%        390000
max      17280000
Name: GrossIncome, dtype: object

In [2]:
sq = """ 

WITH
  CICBaseTable AS ----To create combine CIC raw data combining Granted and Non Granted table
  ( -- Query FOR dfgranted
  SELECT
    digitalLoanAccountId,
    crifApplicationId,
    customerId,
    processEngineGuid,
    requestGuid,
    ContractHistoryType,
    CBContractCode,
    ContractEndDate,
    ContractPhase,
    ContractPhaseDesc,
    ContractStartDate,
    ContractStatus,
    ContractStatusDesc,
    ContractType,
    ContractTypeDesc,
    Currency,
    CurrencyDesc,
    LastUpdateDate,
    OriginalCurrency,
    OriginalCurrencyDesc,
    ProviderCodeEncrypted,
    ProviderContractNo,
    ReferenceNo,
    Role,
    RoleDesc,
    BilledAmount,
    BoardResolutionFlag,
    BoardResolutionFlagDesc,
    CancellationDate,
    CardReferenceCode,
    ChargedAmount,
    CreditLimit,
    CreditPurpose,
    CreditPurposeDesc,
    FinancedAmount,
    FirstPaymentDate,
    FlagCardUsed,
    HolderLiability,
    HolderLiabilityDesc,
    InstallmentType,
    InstallmentTypeDesc,
    InstallmentsNumber,
    LastChargeDate,
    LastPaymentAmount,
    LastPaymentDate,
    MinPaymentIndicator,
    MinPaymentIndicatorDesc,
    MinPaymentPercentage,
    MonthlyPaymentAmount,
    NextPayment,
    NextPaymentDate,
    OutstandingBalance,
    OutstandingBalanceUnbilled,
    OutstandingPaymentsNumber,
    OverallCreditLimit,
    OverdueDays,
    OverdueDaysDesc,
    OverduePaymentsAmount,
    OverduePaymentsNumber,
    PaymentMethod,
    PaymentMethodDesc,
    PaymentPeriodicity,
    PaymentPeriodicityDesc,
    PremiumCard,
    PremiumCardDesc,
    ReorganizedCreditCode,
    ReorganizedCreditCodeDesc,
    ServicesLinesNo,
    TimesCardUsed,
    TransactionType,
    TransactionTypeDesc,
    Utilization,
    LinkedSubject_CBSubjectCode,
    LinkedSubject_Name,
    LinkedSubject_Role,
    LinkedSubject_RoleDesc,
    Note_TypeDesc,
    Note_Text,
    Note_Type,
    run_date,
    NULL AS ContractRequestDate,
    'granted' AS SOURCE
  FROM
    prj-prod-dataplatform.risk_credit_cic_data.granted_contracts
  UNION ALL
    -- Query FOR dfnongranted
  SELECT
    digitalLoanAccountId,
    crifApplicationId,
    customerId,
    processEngineGuid,
    requestGuid,
    NULL AS ContractHistoryType,
    CBContractCode,
    NULL AS ContractEndDate,
    ContractPhase,
    ContractPhaseDesc,
    NULL AS ContractStartDate,
    NULL AS ContractStatus,
    NULL AS ContractStatusDesc,
    ContractType,
    ContractTypeDesc,
    NULL AS Currency,
    NULL AS CurrencyDesc,
    LastUpdateDate,
    NULL AS OriginalCurrency,
    NULL AS OriginalCurrencyDesc,
    ProviderCodeEncrypted,
    ProviderContractNo,
    ReferenceNo,
    Role,
    RoleDesc,
    NULL AS BilledAmount,
    NULL AS BoardResolutionFlag,
    NULL AS BoardResolutionFlagDesc,
    NULL AS CancellationDate,
    NULL AS CardReferenceCode,
    NULL AS ChargedAmount,
    CreditLimit,
    NULL AS CreditPurpose,
    NULL AS CreditPurposeDesc,
    FinancedAmount,
    NULL AS FirstPaymentDate,
    NULL AS FlagCardUsed,
    NULL AS HolderLiability,
    NULL AS HolderLiabilityDesc,
    NULL AS InstallmentType,
    NULL AS InstallmentTypeDesc,
    InstallmentsNumber,
    NULL AS LastChargeDate,
    NULL AS LastPaymentAmount,
    NULL AS LastPaymentDate,
    NULL AS MinPaymentIndicator,
    NULL AS MinPaymentIndicatorDesc,
    NULL AS MinPaymentPercentage,
    MonthlyPaymentAmount,
    NULL AS NextPayment,
    NULL AS NextPaymentDate,
    NULL AS OutstandingBalance,
    NULL AS OutstandingBalanceUnbilled,
    NULL AS OutstandingPaymentsNumber,
    NULL AS OverallCreditLimit,
    NULL AS OverdueDays,
    NULL AS OverdueDaysDesc,
    NULL AS OverduePaymentsAmount,
    NULL AS OverduePaymentsNumber,
    NULL AS PaymentMethod,
    NULL AS PaymentMethodDesc,
    PaymentPeriodicity,
    PaymentPeriodicityDesc,
    NULL AS PremiumCard,
    NULL AS PremiumCardDesc,
    NULL AS ReorganizedCreditCode,
    NULL AS ReorganizedCreditCodeDesc,
    NULL AS ServicesLinesNo,
    NULL AS TimesCardUsed,
    NULL AS TransactionType,
    NULL AS TransactionTypeDesc,
    NULL AS Utilization,
    LinkedSubject_CBSubjectCode,
    LinkedSubject_Name,
    LinkedSubject_Role,
    LinkedSubject_RoleDesc,
    Note_TypeDesc,
    Note_Text,
    Note_Type,
    run_date,
    ContractRequestDate,
    'nongranted' AS SOURCE
  FROM
    prj-prod-dataplatform.risk_credit_cic_data.notgranted_contracts )
    -- select * from CICBaseTable where digitalLoanAccountId = 'c7948327-e6a6-46ee-96fc-66c1b3b56f93'
-- SELECT  ContractHistoryType, RoleDesc, count(digitalLoanAccountId) cnt FROM  CICBaseTable where COALESCE(ContractHistoryType, 'NA') in ('Installments', 'CreditCards', 'NA') group by 1,2 order by 3 desc;
,
employementdata as
(SELECT distinct
  digitalLoanAccountId, 
  crifApplicationId, 
  customerId,   
  AnnualMonthlyIndicator, 
  Currency, 
  DateHiredFrom, 
  DateHiredTo, 
  GrossIncome,
  CAST(
    CASE 
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'M' THEN CAST(COALESCE(GrossIncome, '0') AS NUMERIC)
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'Y' THEN ROUND(CAST(COALESCE(GrossIncome, '0') AS NUMERIC)/12, 0)
      ELSE 0 
    END AS INT64
  ) AS MonthlyIncome,
  CAST(
    CASE 
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'M' THEN ROUND(CAST(COALESCE(GrossIncome, '0') AS NUMERIC)*12, 0)
      WHEN COALESCE(AnnualMonthlyIndicator, 'NA') LIKE 'Y' THEN CAST(COALESCE(GrossIncome, '0') AS NUMERIC)
      ELSE 0 
    END AS INT64
  ) AS AnnualIncome,    
  OccupationDesc,
  OccupationStatusDesc,
  PSIC, 
  REGEXP_REPLACE(PSICDesc, r'^\d+\s*-\s*', '') AS PSICDesc ,
  row_number() over (partition by digitalLoanAccountId order by digitalLoanAccountId ) as rnk
FROM prj-prod-dataplatform.risk_credit_cic_data.employment_data),
CICBase2Table as 
(SELECT digitalLoanAccountId, crifApplicationId, customerId,
       processEngineGuid, requestGuid, ContractHistoryType,
       CBContractCode, ContractEndDate, ContractPhase,
       ContractPhaseDesc, ContractStartDate, ContractStatus,
       ContractStatusDesc, ContractType, ContractTypeDesc,
       Currency, CurrencyDesc, LastUpdateDate, OriginalCurrency,
       OriginalCurrencyDesc, ProviderCodeEncrypted,
       ProviderContractNo, ReferenceNo, Role, RoleDesc,
       BilledAmount, BoardResolutionFlag, BoardResolutionFlagDesc,
       CancellationDate, CardReferenceCode, ChargedAmount,
       CreditLimit, CreditPurpose, CreditPurposeDesc,
       FinancedAmount, FirstPaymentDate, FlagCardUsed,
       HolderLiability, HolderLiabilityDesc, InstallmentType,
       InstallmentTypeDesc, InstallmentsNumber, LastChargeDate,
       LastPaymentAmount, LastPaymentDate, MinPaymentIndicator,
       MinPaymentIndicatorDesc, MinPaymentPercentage,
       MonthlyPaymentAmount, NextPayment, NextPaymentDate,
       OutstandingBalance, OutstandingBalanceUnbilled,
       OutstandingPaymentsNumber, OverallCreditLimit, OverdueDays,
       OverdueDaysDesc, OverduePaymentsAmount,
       OverduePaymentsNumber, PaymentMethod, PaymentMethodDesc,
       PaymentPeriodicity, PaymentPeriodicityDesc, PremiumCard,
       PremiumCardDesc, ReorganizedCreditCode,
       ReorganizedCreditCodeDesc, ServicesLinesNo, TimesCardUsed,
       TransactionType, TransactionTypeDesc, Utilization,
       LinkedSubject_CBSubjectCode, LinkedSubject_Name,
       LinkedSubject_Role, LinkedSubject_RoleDesc, Note_TypeDesc,
       Note_Text, Note_Type, run_date, ContractRequestDate,  SOURCE
,
  CASE
    WHEN ContractPhaseDesc = 'Active' AND ContractStatusDesc = '' THEN 'Neutral'
    WHEN ContractPhaseDesc = 'Active' AND ContractStatusDesc is null THEN 'Neutral'
    WHEN ContractPhaseDesc = 'Active' AND ContractStatusDesc = 'Pre-Activated' THEN 'Good'
    WHEN ContractPhaseDesc = 'Active' AND ContractStatusDesc = 'Foreclosure' THEN 'Good'
    WHEN ContractPhaseDesc = 'Closed' AND ContractStatusDesc = '' THEN 'Good'
    WHEN ContractPhaseDesc = 'Closed' AND ContractStatusDesc is null THEN 'Good'
    WHEN ContractPhaseDesc = 'Closed in advance' AND ContractStatusDesc = '' THEN 'Good'
    WHEN ContractPhaseDesc = 'Closed in advance' AND ContractStatusDesc is null THEN 'Good'
    WHEN ContractPhaseDesc = 'Closed in advance' AND ContractStatusDesc = 'Foreclosure' THEN 'Good'
    WHEN ContractStatusDesc IN ('Debt Assumption', 'Repossessed') THEN 'Neutral'
    WHEN ContractStatusDesc IN (
      'Write-off (BLW)', 'Past Due', 'Blocked by the Bank due to Credit Reasons',
      'Under dispute / non performing', 'Under litigation / Delinquent',
      'Blocked or Closed voluntary by the Customer', 'Blocked or Closed due to Restructuring',
      'There are unpaid amounts, Negotiated Settlement', 'Previous delinquency settled',
      'Write-off and Credit transferred to third party / Collection',
      'Write-off and Fully Settled', 'Blocked by the Bank due to card lost/stolen',
      'Blocked by the Bank due to fraud', 'Dispute / Litigation contested'
    ) THEN 'Bad'
    ELSE 'Unknown'
  END AS Repaymentcategory,
CASE
    WHEN ContractTypeDesc IN ('Salary loan', 'Personal Loan', 'Unsecured loan', 'Vehicle Loan', 'Mortgage/Real Estate', 'Time Loan', 'Short Term Loan', 'Benefit Loan', 'Home equity loan', 'Agricultural Loan', 'Student Loan', 'Vehicle leasing', 'Credit Card', 'Credit Card - Shared Limit', 'Credit Card - MultiCurrency', 'Revolving Credit', 'Trust Loan', 'Credit Line') 
      OR (ContractTypeDesc = 'Term Loan' AND CreditPurposeDesc NOT LIKE 'Small and Medium Enterprise Loans%')
      OR (ContractTypeDesc = 'Loan Line' AND CreditPurposeDesc NOT LIKE 'Small and Medium Enterprise Loans%')
      OR (CreditPurposeDesc LIKE 'Loans to Individual%' AND ContractTypeDesc != 'Business Loan')
      OR (CreditPurposeDesc LIKE 'Microfinance Loans' AND ContractTypeDesc != 'Business Loan')
      OR (CreditPurposeDesc LIKE 'Other Agricultural Credit' AND ContractTypeDesc != 'Business Loan')
      OR (ContractHistoryType LIKE 'Installments' AND ContractTypeDesc = 'Term Loan' and CreditPurposeDesc is null)
      OR (ContractHistoryType is null AND ContractTypeDesc = 'Term Loan' and CreditPurposeDesc is null)
      OR CreditPurposeDesc IN ('Agrarian Reform', 'Development Loan Incentives - Socialized Low Cost Housing (Loans to individuals for housing purposes )')
      OR ContractHistoryType = 'CreditCards'
    THEN 'B2C'
    
    WHEN ContractTypeDesc IN ('Business Loan', 'Real estate leasing', 'Equipment leasing')
      OR CreditPurposeDesc IN ('Development Loan Incentives - Cooperatives', 'Development Loan Incentives - Educational Inst.', 'Loan to Government - GOCCs (Other Financial)', 'Loan to Government - GOCCs (Social Security Institutions)', 'Loan to Government - LGUs', 'Loan to Government - National Government', 'Loans to Private Corporation (Financial)', 'Loans to Private Corporation (Non-Financial)', 'Small and Medium Enterprise Loans (Medium Scale Enterprise)', 'Small and Medium Enterprise Loans (Small Scale Enterprise)')
      OR (ContractTypeDesc = 'Vehicle Loan' AND CreditPurposeDesc NOT LIKE 'Loans to Individual%')
      OR (ContractTypeDesc = 'Loan Line' AND CreditPurposeDesc LIKE 'Small and Medium Enterprise Loans%')
      OR (ContractTypeDesc = 'Term Loan' AND CreditPurposeDesc LIKE 'Small and Medium Enterprise Loans%')
    THEN 'B2B'
    
    ELSE 'Unknown'
  END AS BusinessType,
 CASE
    WHEN ContractTypeDesc = 'Time Loan' THEN 'Time Loans'
    WHEN ContractTypeDesc IN ('Short Term Loan', 'Term Loan') THEN 'Short and Term Loans'
    WHEN ContractTypeDesc = 'Home equity loan' THEN 'Home Equity Loans'
    WHEN ContractTypeDesc IN ('Credit Card', 'Credit Card - MultiCurrency', 'Credit Card - Shared Limit') THEN 'Credit Cards'
    WHEN ContractTypeDesc IN ('Loan Line', 'Credit Line') THEN 'Credit Lines'
    WHEN ContractTypeDesc IN ('Mortgage/Real Estate', 'Real estate leasing') THEN 'Real Estate Loans'
    WHEN ContractTypeDesc = 'Trust Loan' THEN 'Trust Loans'
    WHEN ContractTypeDesc = 'Personal Loan' THEN 'Personal Loans'
    ELSE 'Other Loans'
  END AS loan_segment
from CICBaseTable 
  where COALESCE(ContractHistoryType, 'NA') in ('Installments', 'CreditCards', 'NA')
  and COALESCE(RoleDesc, 'NA') in ('Borrower', 'Co-Borrower', 'NA')
),
CICBase3Table as
(select distinct * FROM  CICBase2Table where BusinessType in ('B2C', 'Unknown')
)
-- select distinct * from CICBase3Table where (digitalLoanAccountid||crifApplicationId||run_date||CBContractCode) = 'c7948327-e6a6-46ee-96fc-66c1b3b56f93taran-2629742-80860572024-07-05302970230';
,
custtname as (SELECT distinct  cast(custId as numeric) custid, firstName, middleName, LastName FROM `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details` 
),
stepAtablebase as
(
select 
(a.digitalLoanAccountid||b.crifApplicationId||b.run_date||b.CBContractCode) uniquekey,
a.digitalLoanAccountId,
a.customerId, cn.Firstname, cn.middleName, cn.LastName,
a.loanAccountNumber,
a.flagDisbursement,
a.disbursementDateTime,
a.termsAndConditionsSubmitDateTime,
a.natureofwork,
a.subIndustryDescription,
a.industryDescription,
case when a.reloan_flag = 1 and a.loantype not like 'FLEXUP'then 'Reloan'
      when a.loantype = 'FLEXUP' and a.new_loan_type = 'Flex-up' and a.reloan_flag = 0 and a.flagDisbursement = 1 then 'Flex-up' 
              else a.new_loan_type end as LoanProduct, b.crifApplicationId, 
       processEngineGuid, requestGuid, ContractHistoryType,
       CBContractCode, ContractEndDate, ContractPhase,
       ContractPhaseDesc, ContractStartDate, ContractStatus,
       ContractStatusDesc, ContractType, ContractTypeDesc,
       b.Currency, CurrencyDesc, LastUpdateDate, OriginalCurrency,
       OriginalCurrencyDesc, ProviderCodeEncrypted,
       ProviderContractNo, ReferenceNo, Role, RoleDesc,
       BilledAmount, BoardResolutionFlag, BoardResolutionFlagDesc,
       CancellationDate, CardReferenceCode, ChargedAmount,
       CreditLimit, CreditPurpose, CreditPurposeDesc,
       FinancedAmount, FirstPaymentDate, FlagCardUsed,
       HolderLiability, HolderLiabilityDesc, InstallmentType,
       InstallmentTypeDesc, InstallmentsNumber, LastChargeDate,
       LastPaymentAmount, LastPaymentDate, MinPaymentIndicator,
       MinPaymentIndicatorDesc, MinPaymentPercentage,
       MonthlyPaymentAmount, NextPayment, NextPaymentDate,
       b.OutstandingBalance, OutstandingBalanceUnbilled,
       OutstandingPaymentsNumber, OverallCreditLimit, OverdueDays,
       OverdueDaysDesc, OverduePaymentsAmount,
       OverduePaymentsNumber, PaymentMethod, PaymentMethodDesc,
       PaymentPeriodicity, PaymentPeriodicityDesc, PremiumCard,
       PremiumCardDesc, ReorganizedCreditCode,
       ReorganizedCreditCodeDesc, ServicesLinesNo, TimesCardUsed,
       TransactionType, TransactionTypeDesc, Utilization,
       LinkedSubject_CBSubjectCode, LinkedSubject_Name,
       LinkedSubject_Role, LinkedSubject_RoleDesc, Note_TypeDesc,
       Note_Text, Note_Type, run_date, ContractRequestDate,  SOURCE, Repaymentcategory, BusinessType, loan_segment
       , ed.AnnualMonthlyIndicator, ed.Currency, ed.DateHiredFrom, ed.DateHiredTo, ed.GrossIncome, ed.MonthlyIncome, ed.AnnualIncome, ed.OccupationDesc, ed.OccupationStatusDesc, ed.PSIC, ed.PSICDesc
FROM `risk_credit_mis.loan_master_table` a 
inner join CICBase3Table b
ON a.digitalLoanAccountId = b.digitalLoanAccountId
    AND a.crifApplicationId = b.crifApplicationId
left join (select * from employementdata where rnk = 1) ed on ed.digitalLoanAccountId = a.digitalLoanAccountId
Left join custtname cn on cn.custid = a.customerId
where a.disbursementDateTime is not null
and date_trunc(a.disbursementDateTime, day) >= '2022-10-11'
and date_trunc(a.disbursementDateTime, day) < current_date()
),
stepAtable2base as 
(select *, row_number() over(partition by uniquekey order by uniquekey) rnk from stepAtablebase)
,
-- select * from stepAtable2base where rnk > 1;
-- select * from stepAtablebase where  (digitalLoanAccountid||crifApplicationId||run_date||CBContractCode) = 'c7948327-e6a6-46ee-96fc-66c1b3b56f93taran-2629742-80860572024-07-05302970230'
-- select (digitalLoanAccountid||crifApplicationId||run_date||CBContractCode), count((digitalLoanAccountid||crifApplicationId||run_date||CBContractCode)) from stepAtable2base group by 1 having count((digitalLoanAccountid||crifApplicationId||run_date||CBContractCode)) > 1;
base as 
(select a.*, d.obsFSPD30, d.defFSPD30 
, case when date_trunc(a.disbursementDateTime, day) <= '2024-03-31' then 'Train_Validation' 
         when date_trunc(a.disbursementDateTime, day) >'2024-03-31' and date_trunc(a.disbursementDateTime, day) <= '2024-04-30' then 'Test' else 'Other' end targetdataselectiontype
from stepAtable2base a 
inner join 
(
    SELECT
        loanAccountNumber
        , SUM(CASE WHEN obs_min_inst_def30 >= 1 THEN 1 ELSE 0 END) as obsFPD30
        , SUM(CASE WHEN min_inst_def30 = 1 THEN 1 else 0 END) as defFPD30
        , sum(case when obs_min_inst_def30>=1 then (select max(disbursedloanamount) from `risk_credit_mis.loan_master_table` where loanAccountNumber = a1.loanAccountNumber) else 0 end) as obs_fpd30_vol
        , sum(case when min_inst_def30=1 then (select max(disbursedloanamount) from `risk_credit_mis.loan_master_table` where loanAccountNumber = a1.loanAccountNumber) else 0 end) as def_fpd30_vol
        , SUM(CASE WHEN obs_min_inst_def30 >= 2 THEN 1 ELSE 0 END) as obsFSPD30
        , SUM(CASE WHEN obs_min_inst_def30 >= 2 AND (min_inst_def30 = 2 or min_inst_def30 = 1) THEN 1 else 0 END) as defFSPD30
        , SUM(CASE WHEN obs_min_inst_def30 >= 2 THEN (select max(disbursedloanamount) from `risk_credit_mis.loan_master_table` where loanAccountNumber = a1.loanAccountNumber) ELSE 0 END) as obsFSPD30_vol
        , SUM(CASE WHEN obs_min_inst_def30 >= 2 AND (min_inst_def30 = 2 or min_inst_def30 = 1) THEN (select max(disbursedloanamount) from `risk_credit_mis.loan_master_table` where loanAccountNumber = a1.loanAccountNumber) else 0 END) as defFSPD30_vol
        , SUM(CASE WHEN obs_min_inst_def30 >= 3 THEN 1 ELSE 0 END) as obsFSTPD30
        , SUM(CASE WHEN obs_min_inst_def30 >= 3 AND (min_inst_def30 = 3 or min_inst_def30 = 2 or min_inst_def30 = 1) THEN 1 else 0 END) as defFSTPD30
        , SUM(CASE WHEN obs_min_inst_def30 >= 3 THEN (select max(disbursedloanamount) from `risk_credit_mis.loan_master_table` where loanAccountNumber = a1.loanAccountNumber) ELSE 0 END) as obsFSTPD30_vol
        , SUM(CASE WHEN obs_min_inst_def30 >= 3 AND (min_inst_def30 = 3 or min_inst_def30 = 2 or min_inst_def30 = 1) THEN (select max(disbursedloanamount) from `risk_credit_mis.loan_master_table` where loanAccountNumber = a1.loanAccountNumber) else 0 END) as defFSTPD30_vol
    FROM `risk_credit_mis.loan_deliquency_data` a1 
    GROUP BY 1
  ) d
ON a.loanAccountNumber = d.loanAccountNumber
where 
a.rnk = 1
and a.LoanProduct in ('Quick', 'SIL-Instore')
)
select * from base 
--  where digitalLoanAccountId = '42c268b9-1fe7-445c-b459-8c66d0483884'
-- select digitalLoanAccountId, count(digitalLoanAccountId) cnt from base group by 1 having count(digitalLoanAccountId) > 1
-- select uniquekey, count(uniquekey) from base group by 1 having count(uniquekey) > 1
-- and 
-- uniquekey = 'c7948327-e6a6-46ee-96fc-66c1b3b56f93taran-2629742-80860572024-07-05302970230'
;
"""
df = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
df.to_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\CIC Data Analysis\CICModel\DataPreparation\Data\cicfinaldataset20240807.csv", index = False)
print(f"The rows and columns in cic final dataset are:\t {df.shape}")
pd.set_option("display.max_columns", None)
df.head(10)


Job ID 126be3ff-e6ec-4d12-98e5-c8c918debb11 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
The rows and columns in cic final dataset are:	 (245224, 112)


Unnamed: 0,uniquekey,digitalLoanAccountId,customerId,Firstname,middleName,LastName,loanAccountNumber,flagDisbursement,disbursementDateTime,termsAndConditionsSubmitDateTime,natureofwork,subIndustryDescription,industryDescription,LoanProduct,crifApplicationId,processEngineGuid,requestGuid,ContractHistoryType,CBContractCode,ContractEndDate,ContractPhase,ContractPhaseDesc,ContractStartDate,ContractStatus,ContractStatusDesc,ContractType,ContractTypeDesc,Currency,CurrencyDesc,LastUpdateDate,OriginalCurrency,OriginalCurrencyDesc,ProviderCodeEncrypted,ProviderContractNo,ReferenceNo,Role,RoleDesc,BilledAmount,BoardResolutionFlag,BoardResolutionFlagDesc,CancellationDate,CardReferenceCode,ChargedAmount,CreditLimit,CreditPurpose,CreditPurposeDesc,FinancedAmount,FirstPaymentDate,FlagCardUsed,HolderLiability,HolderLiabilityDesc,InstallmentType,InstallmentTypeDesc,InstallmentsNumber,LastChargeDate,LastPaymentAmount,LastPaymentDate,MinPaymentIndicator,MinPaymentIndicatorDesc,MinPaymentPercentage,MonthlyPaymentAmount,NextPayment,NextPaymentDate,OutstandingBalance,OutstandingBalanceUnbilled,OutstandingPaymentsNumber,OverallCreditLimit,OverdueDays,OverdueDaysDesc,OverduePaymentsAmount,OverduePaymentsNumber,PaymentMethod,PaymentMethodDesc,PaymentPeriodicity,PaymentPeriodicityDesc,PremiumCard,PremiumCardDesc,ReorganizedCreditCode,ReorganizedCreditCodeDesc,ServicesLinesNo,TimesCardUsed,TransactionType,TransactionTypeDesc,Utilization,LinkedSubject_CBSubjectCode,LinkedSubject_Name,LinkedSubject_Role,LinkedSubject_RoleDesc,Note_TypeDesc,Note_Text,Note_Type,run_date,ContractRequestDate,SOURCE,Repaymentcategory,BusinessType,loan_segment,AnnualMonthlyIndicator,Currency_1,DateHiredFrom,DateHiredTo,GrossIncome,MonthlyIncome,AnnualIncome,OccupationDesc,OccupationStatusDesc,PSIC,PSICDesc,rnk,obsFSPD30,defFSPD30,targetdataselectiontype
0,000d5b99-ffd3-45ad-b650-32fe43a95dc7taran-2614...,000d5b99-ffd3-45ad-b650-32fe43a95dc7,2614354,NENITA,UMALI,CAIDIC,60826143540016,1,2024-06-27 16:59:48,2024-06-27 16:55:44,,,,SIL-Instore,taran-2614354-9950345,9be21cf75100ec9fe03b3f154c42f1fe0bfc7364,16b6e2be-fd2d-4b5f-87cb-01e72ffbb5ab,Installments,H04833725,2024-06-14,AC,Active,2023-06-14,,,12,Personal Loan,PHP,Philippine peso,2024-01-31,PHP,Philippine peso,PF002,,1,B,Borrower,,,,,,,,,,24959.0,,,,,,,12,,2481.0,,,,,2481,2481.0,,12405.0,,5.0,,0,Paid as agreed / Current,0.0,0.0,,,M,monthly installments-30 days,,,,,,,,NOT APPLICABLE,,,,,,,,,2024-06-27,,granted,Neutral,B2C,Personal Loans,Y,PHP,,,360000.0,30000.0,360000.0,,Self Employed,,,1,0,0,Other
1,0049b50f-a05e-48da-b8f2-42477d6301aetonik-tul-...,0049b50f-a05e-48da-b8f2-42477d6301ae,1770003,CLARA MAE,YUNTING,SALONGA,60817700030027,1,2023-11-06 10:54:28,2023-11-03 12:02:34,Call Center Agent/Tele Marketer,Finance Company / Consumer Finance Company,Financial Services,Quick,tonik-tul-378938,d28a4d70-79fd-11ee-b07b-0242ace6000f,d26b53c0-79fd-11ee-82ea-0242ace60004,,I05121737,,RQ,Requested,,,,12,Personal Loan,,,2023-08-03,,,PF002,,1,B,Borrower,,,,,,,,,,,,,,,,,4,,,,,,,2500,,,,,,,,,,,,,F,fortnight installments-15 days,,,,,,,,,,,,,,,,,2023-11-03,2023-08-03,nongranted,Unknown,B2C,Personal Loans,,,,,,0.0,0.0,,,,,1,1,0,Train_Validation
2,0065031d-463d-43ec-802b-082390a75bfftonik-bnpl...,0065031d-463d-43ec-802b-082390a75bff,2097298,NARILYN,VEROCIL,REYES,60820972980011,1,2023-06-17 16:02:08,2023-06-17 15:59:23,Other Professional services,"Taxi Service; Transport Aggregator (Grab, Uber...",Services,SIL-Instore,tonik-bnpl-281450,e14a3e00-0ce4-11ee-8088-0242ace60005,e129bdb0-0ce4-11ee-801b-0242ace60015,Installments,L03345314,2021-08-01,CL,Closed,2021-07-19,,,18,Unsecured loan,PHP,Philippine peso,2021-08-31,PHP,Philippine peso,PF003,,1,B,Borrower,,,,,,,,,,200.0,,,,,,,7,,29.0,,,,,200,,,,,,,0,Paid as agreed / Current,0.0,0.0,,,D,Daily installments-1 day,,,,,,,,NOT APPLICABLE,,,,,,,,,2023-06-17,,granted,Good,B2C,Other Loans,,,,,,,,,,,,1,1,0,Train_Validation
3,0065031d-463d-43ec-802b-082390a75bfftonik-bnpl...,0065031d-463d-43ec-802b-082390a75bff,2097298,NARILYN,VEROCIL,REYES,60820972980011,1,2023-06-17 16:02:08,2023-06-17 15:59:23,Other Professional services,"Taxi Service; Transport Aggregator (Grab, Uber...",Services,SIL-Instore,tonik-bnpl-281450,e14a3e00-0ce4-11ee-8088-0242ace60005,e129bdb0-0ce4-11ee-801b-0242ace60015,Installments,L03691149,2022-08-24,CL,Closed,2021-12-13,,,18,Unsecured loan,PHP,Philippine peso,2022-08-31,PHP,Philippine peso,PF003,,1,B,Borrower,,,,,,,,,,15051.0,,,,,,,180,,1923.0,,,,,2140,,,,,,,0,Paid as agreed / Current,0.0,0.0,,,D,Daily installments-1 day,,,,,,,,NOT APPLICABLE,,,,,,,,,2023-06-17,,granted,Good,B2C,Other Loans,,,,,,,,,,,,1,1,0,Train_Validation
4,006bc029-d965-4c7a-b7d8-bb537cf53ff9tonik-bnpl...,006bc029-d965-4c7a-b7d8-bb537cf53ff9,2466893,MIRA FLOR,OROLFO,CARACAS,60824668930019,1,2024-04-16 13:59:09,2024-04-16 10:20:06,,,,SIL-Instore,tonik-bnpl-489416,d8d49cf0-fb97-11ee-b07b-0242ace6000f,d8b750f0-fb97-11ee-989e-0242ace60004,Installments,904647317,2023-07-07,AC,Active,2023-01-07,,,12,Personal Loan,PHP,Philippine peso,2023-07-01,PHP,Philippine peso,PF003,,1,B,Borrower,,,,,,,,,,5599.0,2023-02-07,,,,,,6,,1346.0,,,,,1346,,,1263.0,,1.0,,0,Paid as agreed / Current,0.0,0.0,,,M,monthly installments-30 days,,,,,,,,NOT APPLICABLE,,,,,,,,,2024-04-16,,granted,Neutral,B2C,Personal Loans,,,,,,0.0,0.0,,,,,1,1,0,Test
5,008c1696-18a0-4e06-a824-18353b335e8ctonik-tul-...,008c1696-18a0-4e06-a824-18353b335e8c,2264208,ZYRA,DE DIOS,FELICIANO,60822642080018,1,2023-10-08 09:00:36,2023-10-08 01:40:04,Call Center Agent/Tele Marketer,Others,Financial Services,Quick,tonik-tul-361131,8d669bb0-6538-11ee-a41d-0242ace60015,8d3b1ee0-6538-11ee-be29-0242ace60008,Installments,204220185,2020-06-29,AC,Active,2020-03-10,,,16,Short Term Loan,PHP,Philippine peso,2022-06-30,PHP,Philippine peso,PF002,,1,B,Borrower,,,,,,,,,,11000.0,,,,,,,1,,1000.0,2020-10-10,,,,12760,,,12781.0,,1.0,,N,Too new to be rated / Not Available,12781.0,1.0,,,I,irregular installments,,,,,,,DL,DEMAND LOAN,,,,,,,,,2023-10-08,,granted,Neutral,B2C,Short and Term Loans,,,,,,0.0,0.0,,,,,1,1,0,Train_Validation
6,00ab7cff-a9bf-4461-b7b5-3d0b26e1cdb6tonik-bnpl...,00ab7cff-a9bf-4461-b7b5-3d0b26e1cdb6,2352368,LYKA,TIGLAO,QUINTO,60823523680016,1,2023-12-29 14:02:17,2023-12-29 13:56:29,Accountant,Call Centre / BPO,Services,SIL-Instore,tonik-bnpl-418120,038ebb30-a60f-11ee-b07b-0242ace6000f,036fe890-a60f-11ee-82ea-0242ace60004,Installments,M05064903,2022-04-14,CL,Closed,2022-03-04,,,12,Personal Loan,PHP,Philippine peso,2022-04-30,PHP,Philippine peso,PF002,,1,B,Borrower,,,,,,,,31.0,Loans to Individual for Consumption Purposes -...,358.0,2022-03-14,,,,,,2,,193.0,2022-04-14,,,,193,,,0.0,,0.0,,0,Paid as agreed / Current,0.0,0.0,OTH,Other,M,monthly installments-30 days,,,,,,,,NOT APPLICABLE,,,,,,,,,2023-12-29,,granted,Good,B2C,Personal Loans,,,,,,,,,,,,1,1,0,Train_Validation
7,00b5fefc-9f6e-445b-9bf1-cd3d54e918betonik-tul-...,00b5fefc-9f6e-445b-9bf1-cd3d54e918be,2255660,DARYL JON,SILAGAN,PORILLO,60822556600039,1,2024-05-18 09:13:51,2024-05-17 23:33:11,IT Professional,IT / Tech Consultancy / Projects / AI ML Solut...,Technology,Quick,tonik-tul-523875,c6a86550-1462-11ef-b07b-0242ace6000f,c6863750-1462-11ef-b6ea-0242ace60004,Installments,205362181,2023-11-29,CA,Closed in advance,2023-10-15,FC,Foreclosure,18,Unsecured loan,PHP,Philippine peso,2023-11-30,PHP,Philippine peso,OT001,60822556600013.0,1,B,Borrower,,,,,,,,32.0,Loans to Individual for other purposes,20000.0,2023-11-15,,,,,,12,,22036.0,2023-11-29,,,,2518,,,,,,,0,Paid as agreed / Current,0.0,0.0,CAD,Current Account Debit,M,monthly installments-30 days,,,,,,,,NOT APPLICABLE,,,,,,,,,2024-05-17,,granted,Good,B2C,Other Loans,M,PHP,,,40000.0,40000.0,480000.0,,Permanent Job (Private sector),822.0,Call centers and other related activities,1,0,0,Other
8,00c19184-0d0d-4a6c-bb4a-9280cfd5cbcetonik-bnpl...,00c19184-0d0d-4a6c-bb4a-9280cfd5cbce,2252874,JOENARDO,MANALANG,TACLAS,60822528740013,1,2023-09-29 19:00:12,2023-09-29 18:54:24,,,,SIL-Instore,tonik-bnpl-354715,8ea4d510-5eb6-11ee-a41d-0242ace60015,8e7d4fe0-5eb6-11ee-be29-0242ace60008,Installments,702814369,2021-01-02,CL,Closed,2020-02-29,,,12,Personal Loan,PHP,Philippine peso,2021-01-31,PHP,Philippine peso,PF002,,1,B,Borrower,,0.0,No,,,,,,,12680.0,,,,,,,9,,1405.0,2021-01-01,,,,1409,0.0,,0.0,,0.0,,0,Paid as agreed / Current,0.0,0.0,,,M,monthly installments-30 days,,,0.0,Credit is not re-organized,,,,NOT APPLICABLE,,,,,,,,,2023-09-29,,granted,Good,B2C,Personal Loans,,,,,,,,,,,,1,1,0,Train_Validation
9,00c709d7-37a8-45b0-93ad-c16a5a13200ctonik-bnpl...,00c709d7-37a8-45b0-93ad-c16a5a13200c,2278666,DANILO,MANALO,ALMENDRAS,60822786660029,1,2024-05-11 14:33:26,2024-05-11 14:28:27,,,,SIL-Instore,tonik-bnpl-517049,ae2f9b40-0f5f-11ef-a41d-0242ace60015,ae0704a0-0f5f-11ef-a4cc-0242ace60008,Installments,A04071094,2022-08-22,CL,Closed,2022-01-06,,,12,Personal Loan,PHP,Philippine peso,2022-08-31,PHP,Philippine peso,PF002,,1,B,Borrower,,0.0,No,,,,,,,17806.0,,,,,,,9,,2028.0,2022-08-22,,,,2028,0.0,,0.0,,0.0,,0,Paid as agreed / Current,0.0,0.0,,,M,monthly installments-30 days,,,0.0,Credit is not re-organized,,,,NOT APPLICABLE,,,,,,,,,2024-05-11,,granted,Good,B2C,Personal Loans,,,,,,,,,,,,1,0,0,Other


In [39]:
# Assuming df is your original DataFrame
# Group the data and calculate the sum
grouped_df = df.groupby('targetdataselectiontype')[['defFSPD30', 'obsFSPD30']].sum().reset_index()

# Create the pivot table with the calculated field
pivot_table = pd.pivot_table(
    grouped_df,
    values=['defFSPD30', 'obsFSPD30'],
    index=['targetdataselectiontype'],
    aggfunc='sum',
    margins=True,
    margins_name='Total'
)

# Add the calculated field
pivot_table['FSPD30_ratio'] = pivot_table['defFSPD30'] / pivot_table['obsFSPD30']

# Format the ratio as a percentage
pivot_table['FSPD30_percentage'] = pivot_table['FSPD30_ratio'].apply(lambda x: f"{x:.2%}")

# Reorder columns if desired
pivot_table = pivot_table[['defFSPD30', 'obsFSPD30', 'FSPD30_ratio', 'FSPD30_percentage']]


In [41]:
pivot_table['fspd30share'] = pivot_table['defFSPD30']/(pivot_table['defFSPD30']+pivot_table['obsFSPD30'])
pivot_table['obsfspd30share'] = pivot_table['obsFSPD30']/(pivot_table['defFSPD30']+pivot_table['obsFSPD30'])
pivot_table

Unnamed: 0_level_0,defFSPD30,obsFSPD30,FSPD30_ratio,FSPD30_percentage,fspd30share,obsfspd30share
targetdataselectiontype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Other,276.0,4396.0,0.062784,6.28%,0.059075,0.940925
Test,2177.0,21170.0,0.102834,10.28%,0.093245,0.906755
Train_Validation,17918.0,133606.0,0.134111,13.41%,0.118252,0.881748
Total,20371.0,159172.0,0.127981,12.80%,0.11346,0.88654


In [48]:


# Assuming df is your original DataFrame
# First, we'll keep only the first occurrence of each digitalLoanAccountId
df_unique = df.drop_duplicates(subset='digitalLoanAccountId', keep='first')

# Now group the data and calculate the sum
grouped_df = df_unique.groupby('targetdataselectiontype')[['defFSPD30', 'obsFSPD30']].sum().reset_index()

# Create the pivot table with the calculated field
pivot_table = pd.pivot_table(
    grouped_df,
    values=['defFSPD30', 'obsFSPD30'],
    index=['targetdataselectiontype'],
    aggfunc='sum',
    margins=True,
    margins_name='Total'
)

# Add the calculated field
pivot_table['FSPD30_ratio'] = pivot_table['defFSPD30'] / pivot_table['obsFSPD30']

# Format the ratio as a percentage
pivot_table['FSPD30_percentage'] = pivot_table['FSPD30_ratio'].apply(lambda x: f"{x:.2%}")

# Reorder columns if desired
pivot_table = pivot_table[['defFSPD30', 'obsFSPD30', 'FSPD30_ratio', 'FSPD30_percentage']]

# Display the pivot table
pivot_table

Unnamed: 0_level_0,defFSPD30,obsFSPD30,FSPD30_ratio,FSPD30_percentage
targetdataselectiontype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Other,89.0,889.0,0.100112,10.01%
Test,494.0,4123.0,0.119816,11.98%
Train_Validation,3456.0,25241.0,0.13692,13.69%
Total,4039.0,30253.0,0.133507,13.35%


In [49]:
df['digitalLoanAccountId'].nunique()

45625

In [43]:
df['natureofwork'].value_counts()

natureofwork
Call Center Agent/Tele Marketer        32296
Other Professional services            22820
Other Non Professional Services        20283
Govt Employee                          19877
Admin/Secretarial                      19366
IT Professional                        14133
Bank Employee                          12967
Sales/Marketing Professional            9927
Teacher/Instructor/Coach                9484
Accountant                              9249
AIR FORCE                               6623
Engineer                                6227
Doctor/Dentist/Medical Professional     5307
Store/Service Manager                   4840
Sales Marketing Professional            3794
Cashier/Food Server/Waiter              3582
Owner                                   2679
Consultant                              2612
Utility Personnel/Household Help        1381
Insurance Agent/Financial Advisor       1007
Architect/Interior Decorator             587
NAVY                                     4

In [44]:
df['PSICDesc'].value_counts()

PSICDesc
Call centers and other related activities                                                                                                   2503
Banking activities, n.e.c.                                                                                                                  1355
Retail trade not in stores, stalls or markets                                                                                                919
Other information technology and computer service activities                                                                                 826
Educational support services                                                                                                                 628
Public administration, local government                                                                                                      573
Business support service activities, n.e.c.                                                                              

In [45]:
df.columns.values

array(['uniquekey', 'digitalLoanAccountId', 'customerId', 'Firstname',
       'middleName', 'LastName', 'loanAccountNumber', 'flagDisbursement',
       'disbursementDateTime', 'termsAndConditionsSubmitDateTime',
       'natureofwork', 'subIndustryDescription', 'industryDescription',
       'LoanProduct', 'crifApplicationId', 'processEngineGuid',
       'requestGuid', 'ContractHistoryType', 'CBContractCode',
       'ContractEndDate', 'ContractPhase', 'ContractPhaseDesc',
       'ContractStartDate', 'ContractStatus', 'ContractStatusDesc',
       'ContractType', 'ContractTypeDesc', 'Currency', 'CurrencyDesc',
       'LastUpdateDate', 'OriginalCurrency', 'OriginalCurrencyDesc',
       'ProviderCodeEncrypted', 'ProviderContractNo', 'ReferenceNo',
       'Role', 'RoleDesc', 'BilledAmount', 'BoardResolutionFlag',
       'BoardResolutionFlagDesc', 'CancellationDate', 'CardReferenceCode',
       'ChargedAmount', 'CreditLimit', 'CreditPurpose',
       'CreditPurposeDesc', 'FinancedAmount', 'Firs

In [50]:
import pandas as pd

# Assuming df is your original DataFrame

# Calculate duplicate counts
duplicate_counts = df.groupby('targetdataselectiontype')['digitalLoanAccountId'].apply(lambda x: x.duplicated().sum()).reset_index(name='duplicate_count')

# Count total unique digitalLoanAccountId
unique_counts = df.groupby('targetdataselectiontype')['digitalLoanAccountId'].nunique().reset_index(name='unique_count')

# Count where defFSPD30 = 1 and obsFSPD30 = 1
def_count = df[df['defFSPD30'] == 1].groupby('targetdataselectiontype')['digitalLoanAccountId'].nunique().reset_index(name='defFSPD30_count')
obs_count = df[df['obsFSPD30'] == 1].groupby('targetdataselectiontype')['digitalLoanAccountId'].nunique().reset_index(name='obsFSPD30_count')

# Sum defFSPD30 and obsFSPD30
sum_values = df.groupby('targetdataselectiontype')[['defFSPD30', 'obsFSPD30']].sum().reset_index()

# Merge all the dataframes
merged_df = pd.merge(unique_counts, duplicate_counts, on='targetdataselectiontype')
merged_df = pd.merge(merged_df, def_count, on='targetdataselectiontype', how='left')
merged_df = pd.merge(merged_df, obs_count, on='targetdataselectiontype', how='left')
merged_df = pd.merge(merged_df, sum_values, on='targetdataselectiontype', how='left')

# Fill NaN values with 0 for count columns
merged_df[['defFSPD30_count', 'obsFSPD30_count']] = merged_df[['defFSPD30_count', 'obsFSPD30_count']].fillna(0)

# Calculate ratios
merged_df['FSPD30_ratio'] = merged_df['defFSPD30'] / merged_df['obsFSPD30']
merged_df['FSPD30_percentage'] = merged_df['FSPD30_ratio'].apply(lambda x: f"{x:.2%}")

# Create the pivot table
pivot_table = pd.pivot_table(
    merged_df,
    values=['unique_count', 'duplicate_count', 'defFSPD30_count', 'obsFSPD30_count', 'defFSPD30', 'obsFSPD30', 'FSPD30_ratio'],
    index=['targetdataselectiontype'],
    aggfunc='sum',
    margins=True,
    margins_name='Total'
)

# Format the percentage column
pivot_table['FSPD30_percentage'] = pivot_table['FSPD30_ratio'].apply(lambda x: f"{x:.2%}")

# Reorder columns if desired
pivot_table = pivot_table[['unique_count', 'duplicate_count', 'defFSPD30_count', 'obsFSPD30_count', 'defFSPD30', 'obsFSPD30', 'FSPD30_ratio', 'FSPD30_percentage']]

# Display the pivot table
pivot_table

Unnamed: 0_level_0,unique_count,duplicate_count,defFSPD30_count,obsFSPD30_count,defFSPD30,obsFSPD30,FSPD30_ratio,FSPD30_percentage
targetdataselectiontype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Other,16261,74187,89,889,276,4396,0.062784,6.28%
Test,4123,17047,494,4123,2177,21170,0.102834,10.28%
Train_Validation,25241,108365,3456,25241,17918,133606,0.134111,13.41%
Total,45625,199599,4039,30253,20371,159172,0.299729,29.97%
