# <div align="center" style="color: #ff5733;">CIC Data Analysis</div>

In [2]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
# Credentials and BigQuery client.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already configured outside the
# notebook (shell profile, .env, CI) takes precedence; the local gcloud
# credentials file below is only used as a fallback, so running this cell no
# longer clobbers externally supplied credentials.
# `path` always ends up holding the credentials file that is actually in effect.
# NOTE(review): the fallback is a machine-specific absolute path that embeds a
# user account name - consider dropping it once every user exports the variable.
path = os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json',
)
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Display settings: never truncate columns of wide frames, cap printed rows at 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Delinquency FSTPD30 Query with Loan Master

In [21]:
# BigQuery SQL producing one row per (digitalLoanAccountId, customerId,
# loanAccountNumber).
#
# CTE `delq`: LEFT JOINs the loan master table (lmt) to the delinquency table
#   (ldd) on loanAccountNumber and derives two per-row markers:
#     fstpd30def - ldd.loanAccountNumber when obs_min_inst_def30 >= 3 AND
#                  min_inst_def30 is 1, 2 or 3; NULL otherwise
#     obsfstpd30 - ldd.loanAccountNumber when obs_min_inst_def30 >= 3;
#                  NULL otherwise
#   The selected loanAccountNumber comes from the delinquency side (ldd), so it
#   is NULL for master rows without a delinquency match.
# CTE `base`: groups by the first three select columns, takes MAX() of the
#   descriptive attributes, and COUNT(DISTINCT ...) of the two markers. Because
#   each marker is either the group's own loanAccountNumber or NULL, the
#   resulting fstpd30 / obsfstpd30 columns are 0 or 1 per row.
# NOTE(review): `loan_master_table` is referenced without a project prefix, so
#   it resolves against the client's default project - confirm that is intended.
sq = """WITH
  delq AS (
  SELECT
    lmt.digitalLoanAccountId,
    lmt.customerId,
    lmt.new_loan_type,
    lmt.applicationStatus,
    lmt.loanPaidStatus,
    ldd.loanAccountNumber,
    lmt.flagApproval,
    lmt.flagRejection,
    DATE(FORMAT_DATE('%Y-%m-%d', DATE_TRUNC(lmt.startApplyDateTime, day))) Application_date,
    CASE
      WHEN ldd.obs_min_inst_def30 >=3 AND ldd.min_inst_def30 IN (1, 2, 3) THEN ldd.loanAccountNumber
  END
    fstpd30def,
    CASE
      WHEN ldd.obs_min_inst_def30 >=3 THEN ldd.loanAccountNumber
  END
    obsfstpd30
  FROM
    `risk_credit_mis.loan_master_table` lmt
  LEFT JOIN
    prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data ldd
  ON
    ldd.loanAccountNumber = lmt.loanAccountNumber ),
  base AS (
  SELECT
    digitalLoanAccountId,
    customerId,
    loanAccountNumber,
    MAX(new_loan_type) loan_type,
    MAX(Application_date) Application_date,
    MAX(loanPaidStatus) loanPaidStatus,
    MAX(applicationStatus) applicationStatus,
    max(flagApproval) FlagApproved,
    max(flagRejection) FlagRejected,
    COUNT(DISTINCT fstpd30def) fstpd30,
    COUNT(DISTINCT obsfstpd30) obsfstpd30
  FROM
    delq
  GROUP BY
    1,
    2,
    3 )select * from base;"""
    
# Submit the query, then stream the result set into a DataFrame (tqdm progress bar).
query_job = client.query(sq)
delqdf = query_job.to_dataframe(progress_bar_type='tqdm')
print(f"The rows and columns in this delqdf are\t: {delqdf.shape}")

Job ID f20b24c5-2a4b-483d-a8fa-a615063b29cc successfully executed: |[32m          [0m|
Downloading: 100%|[32m██████████[0m|
The rows and columns in this delqdf are	: (1510513, 11)


In [22]:
# Print the frame's column dtypes, non-null counts and memory footprint.
delqdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1510513 entries, 0 to 1510512
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   digitalLoanAccountId  1510513 non-null  object
 1   customerId            1509704 non-null  Int64 
 2   loanAccountNumber     140094 non-null   object
 3   loan_type             1510513 non-null  object
 4   Application_date      1236455 non-null  dbdate
 5   loanPaidStatus        156077 non-null   object
 6   applicationStatus     1510513 non-null  object
 7   FlagApproved          1510513 non-null  Int64 
 8   FlagRejected          1510513 non-null  Int64 
 9   fstpd30               1510513 non-null  Int64 
 10  obsfstpd30            1510513 non-null  Int64 
dtypes: Int64(5), dbdate(1), object(5)
memory usage: 134.0+ MB


In [23]:
# Count the missing values in every column of delqdf and print the tally.
nan_counts = delqdf.isnull().sum()
print("NaN counts per column:", nan_counts, sep="\n")

NaN counts per column:
digitalLoanAccountId          0
customerId                  809
loanAccountNumber       1370419
loan_type                     0
Application_date         274058
loanPaidStatus          1354436
applicationStatus             0
FlagApproved                  0
FlagRejected                  0
fstpd30                       0
obsfstpd30                    0
dtype: int64


In [24]:
# Replace missing identifiers with explicit sentinel values:
# -1 for customerId and the string 'NA' for loanAccountNumber.
_id_sentinels = {'customerId': -1, 'loanAccountNumber': 'NA'}
for _column, _sentinel in _id_sentinels.items():
    delqdf[_column] = delqdf[_column].fillna(_sentinel)

In [25]:
# Parse 'Application_date' into pandas datetimes and replace missing values
# (NaT) with the earliest timestamp pandas can represent.
# NOTE(review): this sentinel shows up as 1677-09-21 in the data and is
# indistinguishable from a real date in downstream profiling - confirm that
# masking the missing dates this way is intended.
replace_date = pd.Timestamp.min
delqdf['Application_date'] = pd.to_datetime(delqdf['Application_date']).fillna(replace_date)

In [26]:
# Eyeball ten random rows; the fixed seed keeps the displayed sample identical
# across Restart & Run All instead of changing on every execution.
delqdf.sample(10, random_state=42)

Unnamed: 0,digitalLoanAccountId,customerId,loanAccountNumber,loan_type,Application_date,loanPaidStatus,applicationStatus,FlagApproved,FlagRejected,fstpd30,obsfstpd30
349144,c6923de8-575b-4f5a-a218-f5daa3359b7a,1383023,,Quick,2024-05-14 00:00:00.000000000,,REJECT,0,1,0,0
1491527,6cf59733-6c2b-47f0-a979-71e923241b79,1972434,,Quick,1677-09-21 00:12:43.145224193,,SKYCINITIATE,0,0,0,0
1307131,ba89e81b-f68a-4af1-9fc3-e7dfce3af685,2150282,,Flex,2023-07-22 00:00:00.000000000,,CANCELLED,0,0,0,0
1472392,8f55e68b-2c54-410c-8e82-5f1b72ee87c6,2461781,,Quick,2024-04-12 00:00:00.000000000,,CANCELLED,0,0,0,0
1308488,ebb0b6d5-8d23-43d8-9fb8-4edcb0b6d388,1678962,,Flex,1677-09-21 00:12:43.145224193,,CANCELLED,0,0,0,0
358193,a7c6a833-26a9-45fe-8959-c8b9bcefe347,1738658,,Quick,1677-09-21 00:12:43.145224193,,EXEMPT,0,0,0,0
977211,221a43f7-740a-4af2-a751-95efcdc85588,2343100,,Quick,2023-12-22 00:00:00.000000000,,EXPIRED,0,1,0,0
905788,8fa9cda2-779e-417a-af6e-b4aed6c1bb03,1787583,,Flex,2022-11-21 00:00:00.000000000,,EXPIRED,0,1,0,0
628094,018882c5-e839-4e8f-8e03-f9fb4372613e,2056714,,Quick,2023-05-22 00:00:00.000000000,,REJECT,0,1,0,0
11852,9b9736a5-4e6f-4e1f-a1f1-fc23cc18b01f,1978306,,Quick,2023-04-05 00:00:00.000000000,,RESET,0,0,0,0


In [27]:
# Number of rows per loan product, most frequent first.
loan_type_counts = delqdf['loan_type'].value_counts()
loan_type_counts

loan_type
Quick                       1045316
Flex                         294186
SIL-Instore                  144102
SIL-Online(discontinued)      13039
Flex-up                       11565
Big Loan                       2175
ACL TSA                         130
Name: count, dtype: int64

In [28]:
# Profile delqdf with Sweetviz, using the fstpd30 column as the target feature,
# and write the report to an HTML file.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import sweetviz as sv

profile_target = 'fstpd30'
profile_path = "delqloan.html"
report = sv.analyze(delqdf, target_feat=profile_target)
report.show_html(profile_path)


                                             |          | [  0%]   00:00 -> (? left)

Report delqloan.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
