# <div align="center" style="color: #ff5733;">CIC Data Analysis</div>

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

# granted_contract table count of contract counts of customer in cic data

In [2]:
sq = """select customerId, count(distinct digitalLoanAccountId) cntloans 
, count(distinct CBContractCode) loanincb
, sum(case when cast(ContractStartDate as date) < current_date() and ContractPhaseDesc = 'Closed' then 1 else 0 end) closedcontract
, sum(case when cast(ContractStartDate as date) < current_date() and  ContractPhaseDesc = 'Closed in advance' then 1 else 0 end) closedinadvancecontract
, sum(case when cast(ContractStartDate as date) < current_date() and ContractPhaseDesc = 'Active' then 1 else 0 end) Activecontract
from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts
group by 1
;
"""

dfgrantedcontractcnt = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
dfgrantedcontractcnt.head()

Job ID 05cf8ccb-9235-4d2e-9f6e-e7fb53fffa9a successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,cntloans,loanincb,closedcontract,closedinadvancecontract,Activecontract
0,1159365,1,27,11,13,3
1,1767883,2,46,24,48,19
2,1631238,1,26,5,14,7
3,1864415,1,26,25,0,1
4,1680482,1,30,9,16,5


# Getting the base query ready with CustomerId the common column

`CRITERIA`<BR>
Only for quick loans and for customers with disbursed loan after 2023-01-01 and obsTPD30 = 1

In [3]:
sq = """with customerbase as 
(select customerId, loanAccountNumber, disbursementDateTime, 
row_number() over (partition by customerId order by disbursementDateTime desc) rnk
from `risk_credit_mis.loan_master_table` where flagDisbursement = 1 and date_trunc(disbursementDateTime, day) >= '2023-01-01'
and upper(new_loan_type) = 'QUICK'),
fstpd as
(select lmt.customerId, lmt.loanAccountNumber, lmt.new_loan_type,
sum(case when obsTPD30 = 1
     then case when defFPD30 = 1 then 1
     when defSPD30 = 1 then 1
     when defTPD30 = 1 then 1
     else 0 end else 0 end) FPSTPD30 ,
sum(obsTPD30) obsTPD30,
max(cb.rnk) rnk
 from `risk_credit_mis.loan_master_table` lmt
 inner join customerbase cb on cb.customerId = lmt.customerId and lmt.loanAccountNumber = cb.loanAccountNumber
 WHERE 
  date_trunc(lmt.disbursementDateTime, day) >= '2023-01-01' and upper(lmt.new_loan_type) = 'QUICK'
 group by 1,2,3
),
modelpopulationbase as
(select customerId,new_loan_type,count(distinct loanAccountNumber) cnt_loans
, sum(obsTPD30) obs
, sum(FPSTPD30)FSTPD30 
from fstpd where obsTPD30 = 1 group by 1,2
) 
select customerId,new_loan_type,
max(case when obs > 0 then 1 else 0 end) obs ,
max(case when FSTPD30 > 0 then 1 else 0 end) fstpd30
from modelpopulationbase 
group by 1, 2
;
"""

In [4]:
base = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The rows and columns downloaded after running the above query are:\n{base.shape}")

Job ID e677c620-6e76-40c7-a459-758853cc6bdd successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
The rows and columns downloaded after running the above query are:
(17613, 4)


In [5]:
base.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30
0,2169925,Quick,1,0
1,2025886,Quick,1,0
2,2224518,Quick,1,0
3,2029675,Quick,1,0
4,2041621,Quick,1,0


In [6]:
dfgrantedcontractcnt['customerId'] = dfgrantedcontractcnt['customerId'].astype(np.int64)

In [7]:
df = base.merge(dfgrantedcontractcnt, left_on='customerId', right_on = 'customerId', how='left')
df.columns

Index(['customerId', 'new_loan_type', 'obs', 'fstpd30', 'cntloans', 'loanincb',
       'closedcontract', 'closedinadvancecontract', 'Activecontract'],
      dtype='object')

In [8]:
print(f"The shape of dataframe dfgrantedcontractcnt are:/t {dfgrantedcontractcnt.shape}")
print(f"The shape of dataframe base are:/t {base.shape}")
print(f"The shape of dataframe df are:/t {df.shape}")

The shape of dataframe dfgrantedcontractcnt are:/t (84289, 6)
The shape of dataframe base are:/t (17613, 4)
The shape of dataframe df are:/t (17613, 9)


In [9]:
df.columns

Index(['customerId', 'new_loan_type', 'obs', 'fstpd30', 'cntloans', 'loanincb',
       'closedcontract', 'closedinadvancecontract', 'Activecontract'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17613 entries, 0 to 17612
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   customerId               17613 non-null  Int64 
 1   new_loan_type            17613 non-null  object
 2   obs                      17613 non-null  Int64 
 3   fstpd30                  17613 non-null  Int64 
 4   cntloans                 8995 non-null   Int64 
 5   loanincb                 8995 non-null   Int64 
 6   closedcontract           8995 non-null   Int64 
 7   closedinadvancecontract  8995 non-null   Int64 
 8   Activecontract           8995 non-null   Int64 
dtypes: Int64(8), object(1)
memory usage: 1.5+ MB


In [11]:
# delete the dataframe dfgrantedcontractcnt
dfgrantedcontractcnt.to_csv("dfgrantedcontractcnt.csv")
del(dfgrantedcontractcnt)

In [12]:
df.rename(columns = {'cntloans':'tonikdigitalloanid'}, inplace = True)
df['tonikdigitalloanid'] = df['tonikdigitalloanid'].fillna(0)
df['loanincb'] = df['loanincb'].fillna(0)
df['closedcontract'] = df['closedcontract'].fillna(0)
df['closedinadvancecontract'] = df['closedinadvancecontract'].fillna(0)
df['Activecontract'] = df['Activecontract'].fillna(0)
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract
0,2169925,Quick,1,0,1,3,2,0,1
1,2025886,Quick,1,0,0,0,0,0,0
2,2224518,Quick,1,0,0,0,0,0,0
3,2029675,Quick,1,0,0,0,0,0,0
4,2041621,Quick,1,0,1,10,2,1,7


# Weight of Evidence Function and Information Value

In [13]:
def woe_iv(df, feature, target):
    """
    Calculate the Weight of Evidence (WOE) and Information Value (IV) for a given feature and target variable in a pandas DataFrame.
   
    Parameters:
    df (pandas.DataFrame): The DataFrame containing the feature and target variables.
    feature (str): The name of the feature variable.
    target (str): The name of the target variable.
   
    Returns:
    pandas.DataFrame: A DataFrame containing the WOE and IV for each unique value of the feature variable.
    """
   
    # Calculate the total number of events and non-events in the target variable
    total_events = df[target].sum()
    total_non_events = df[target].count() - total_events
   
    # Group the DataFrame by unique values of the feature variable
    grouped = df.groupby(feature)
   
    # Calculate the number of events and non-events in each group
    events = grouped[target].sum()
    non_events = grouped[target].count() - events
   
    # Calculate the percentage of events and non-events in each group
    event_pct = events / total_events
    non_event_pct = non_events / total_non_events
   
    # Calculate the WOE for each group
    woe = np.log(non_event_pct / event_pct)
   
    # Calculate the IV for each group
    iv = (non_event_pct - event_pct) * woe
   
    # Combine the results into a DataFrame
    result = pd.DataFrame({
        'event': events,
        'non_event': non_events,
        'event_pct': event_pct,
        'non_event_pct': non_event_pct,
        'woe': woe,
        'iv': iv
    })
   
    return result




# Granted Table Contract Status

In [14]:
sq = """select customerId,ContractStatus, ContractStatusDesc from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts;"""
dfcs = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
dfcs["ContractStatus"] = dfcs["ContractStatus"].fillna('NA')
dfcs["ContractStatusDesc"] = dfcs["ContractStatusDesc"].fillna('NA') 

dfcs.head()

Job ID 705d50e2-bcab-403b-a6d9-6c73e1d3aeff successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,ContractStatus,ContractStatusDesc
0,1825461,,
1,1879689,PD,Past Due
2,1879702,,
3,1108397,,
4,1072595,,


In [15]:
df1 = df.copy()
dfcs['customerId'] = dfcs['customerId'].astype(np.int64)
df1 = df1.merge(dfcs, left_on='customerId', right_on='customerId', how='left')
df1.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus,ContractStatusDesc
0,2169925,Quick,1,0,1,3,2,0,1,,
1,2169925,Quick,1,0,1,3,2,0,1,CV,Blocked or Closed voluntary by the Customer
2,2169925,Quick,1,0,1,3,2,0,1,CV,Blocked or Closed voluntary by the Customer
3,2025886,Quick,1,0,0,0,0,0,0,,
4,2224518,Quick,1,0,0,0,0,0,0,,


In [16]:
gd = pd.get_dummies(dfcs["ContractStatus"], prefix='ContractStatus', prefix_sep="_")
dfcs = pd.concat([dfcs, gd], axis = 1)


In [17]:
dfcs.drop(columns = 'ContractStatus', inplace = True)

In [18]:
dfcs.drop(columns = 'ContractStatusDesc', inplace = True)

In [19]:
dfcs.columns

Index(['customerId', 'ContractStatus_BC', 'ContractStatus_BF',
       'ContractStatus_BL', 'ContractStatus_CR', 'ContractStatus_CV',
       'ContractStatus_DA', 'ContractStatus_DI', 'ContractStatus_DS',
       'ContractStatus_FC', 'ContractStatus_LT', 'ContractStatus_NA',
       'ContractStatus_NP', 'ContractStatus_NS', 'ContractStatus_PA',
       'ContractStatus_PD', 'ContractStatus_RP', 'ContractStatus_WC',
       'ContractStatus_WF', 'ContractStatus_WO'],
      dtype='object')

In [20]:
dfcs = dfcs.groupby('customerId')[['ContractStatus_BC', 'ContractStatus_BF',
       'ContractStatus_BL', 'ContractStatus_CR', 'ContractStatus_CV',
       'ContractStatus_DA', 'ContractStatus_DI', 'ContractStatus_DS',
       'ContractStatus_FC', 'ContractStatus_LT', 'ContractStatus_NA',
       'ContractStatus_NP', 'ContractStatus_NS', 'ContractStatus_PA',
       'ContractStatus_PD', 'ContractStatus_RP', 'ContractStatus_WC',
       'ContractStatus_WF', 'ContractStatus_WO']].sum().reset_index()
dfcs.head()

Unnamed: 0,customerId,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO
0,1000001,3,0,6,0,3,0,0,0,0,0,9,0,0,0,3,0,0,0,0
1,1000085,0,0,0,0,4,0,0,0,0,0,22,0,0,0,0,0,0,0,0
2,1000192,0,0,0,0,2,0,0,0,2,0,13,0,0,0,0,0,0,0,4
3,1000243,0,0,0,0,1,0,0,0,0,0,28,0,0,0,0,0,0,0,0
4,1000278,0,0,0,0,0,0,0,0,0,0,103,0,0,0,1,0,3,0,5


In [21]:
# Check for duplicate digitalLoanAccountId
has_duplicates = dfcs['customerId'].duplicated()

# Get the actual duplicate loan ids
duplicate_ids = dfcs[has_duplicates]['customerId']

# Print the duplicate loan ids
if duplicate_ids.empty:
    print("There are no duplicate digitalLoanAccountId in the dataframe.")
else:
    print("The following digitalLoanAccountId are duplicates:")
    print(duplicate_ids.tolist())

There are no duplicate digitalLoanAccountId in the dataframe.


# Get the unique values for customer Contract Status

In [22]:
sq = """select distinct ContractStatus, ContractStatusDesc from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts;"""
dfd = client.query(sq).to_dataframe()
dfd.to_excel("ContractStatus_Codedescription.xlsx", index = False)

In [23]:
# base['fstpd30_per'] = base["fstpd30"]/base['obs']*100

# consolidated_fstpd30 = base['fstpd30_per'].mean()

# # Print the consolidated FSTPD30%
# print(f"The consolidated FSTPD30% is: {consolidated_fstpd30:.2f}%")


# Merge the dfcs with df

In [24]:
dfcs['customerId'] = dfcs['customerId'].astype(np.int64)
df = df.merge(dfcs, left_on= 'customerId', right_on = 'customerId', how = 'left')
df.columns

Index(['customerId', 'new_loan_type', 'obs', 'fstpd30', 'tonikdigitalloanid',
       'loanincb', 'closedcontract', 'closedinadvancecontract',
       'Activecontract', 'ContractStatus_BC', 'ContractStatus_BF',
       'ContractStatus_BL', 'ContractStatus_CR', 'ContractStatus_CV',
       'ContractStatus_DA', 'ContractStatus_DI', 'ContractStatus_DS',
       'ContractStatus_FC', 'ContractStatus_LT', 'ContractStatus_NA',
       'ContractStatus_NP', 'ContractStatus_NS', 'ContractStatus_PA',
       'ContractStatus_PD', 'ContractStatus_RP', 'ContractStatus_WC',
       'ContractStatus_WF', 'ContractStatus_WO'],
      dtype='object')

In [25]:
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,...,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO
0,2169925,Quick,1,0,1,3,2,0,1,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,,...,,,,,,,,,,
2,2224518,Quick,1,0,0,0,0,0,0,,...,,,,,,,,,,
3,2029675,Quick,1,0,0,0,0,0,0,,...,,,,,,,,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [26]:
df1.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus,ContractStatusDesc
0,2169925,Quick,1,0,1,3,2,0,1,,
1,2169925,Quick,1,0,1,3,2,0,1,CV,Blocked or Closed voluntary by the Customer
2,2169925,Quick,1,0,1,3,2,0,1,CV,Blocked or Closed voluntary by the Customer
3,2025886,Quick,1,0,0,0,0,0,0,,
4,2224518,Quick,1,0,0,0,0,0,0,,


In [27]:
result = woe_iv(df1, 'ContractStatus', 'fstpd30')
print(result)

                event  non_event  event_pct  non_event_pct       woe        iv
ContractStatus                                                                
BC                136        758   0.007506       0.009496  0.235216  0.000468
BF                  7         87   0.000386        0.00109  1.037185   0.00073
BL                 45        230   0.002484       0.002881  0.148604  0.000059
CR                 17        120   0.000938       0.001503  0.471465  0.000266
CV                139        902   0.007672         0.0113  0.387328  0.001406
DS                  3         55   0.000166       0.000689  1.425908  0.000746
FC                112       1748   0.006181       0.021899  1.264916  0.019882
LT                  0          5        0.0       0.000063       inf       inf
NA              13841      66654   0.763894       0.835054  0.089067  0.006338
NP                 43        100   0.002373       0.001253 -0.638843  0.000716
NS                 14        104   0.000773       0.

In [28]:
df1.to_csv("df1.csv")
df.to_csv("df.csv")

In [29]:
# delete dataframe dfcs

dfcs.to_csv("dfcs.csv")
del(dfcs)


# Contract History Type

In [30]:
sq = """select  customerId , ContractHistoryType from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts ;"""
dfd = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
dfd.head()



Job ID 8775bca2-0301-47a4-a416-7f2644fd3ba8 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,ContractHistoryType
0,2366859,Installments
1,2366859,Installments
2,2371916,Installments
3,2371916,Installments
4,2371916,Installments


In [31]:
gd = pd.get_dummies(dfd["ContractHistoryType"], prefix='ContractHistoryType', prefix_sep="_")
dfd = pd.concat([dfd, gd], axis = 1)

In [32]:
dfd.drop(columns = 'ContractHistoryType', inplace = True)
dfd.columns

Index(['customerId', 'ContractHistoryType_CreditCards',
       'ContractHistoryType_Installments',
       'ContractHistoryType_NonInstallments', 'ContractHistoryType_Utilities'],
      dtype='object')

In [33]:
dfd = dfd.groupby("customerId")[['ContractHistoryType_CreditCards',
       'ContractHistoryType_Installments',
       'ContractHistoryType_NonInstallments', 'ContractHistoryType_Utilities']].sum().reset_index()
dfd.head()

Unnamed: 0,customerId,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities
0,1000001,24,0,0,0
1,1000085,26,0,0,0
2,1000192,10,9,2,0
3,1000243,17,11,1,0
4,1000278,12,94,6,0


In [34]:
dfd['customerId'] = dfd['customerId'].astype(np.int64)
df = df.merge(dfd, left_on = 'customerId', right_on = "customerId", how = 'left')
pd.set_option("display.max_columns", None)
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,
2,2224518,Quick,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,
3,2029675,Quick,1,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0


In [35]:
# select only numerical columns
num_cols = df.select_dtypes(include=['int', 'float']).columns

# fill NaN values with 0 in numerical columns
df[num_cols] = df[num_cols].fillna(0)


In [36]:
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0


In [37]:
df.to_csv("df.csv", index = False)
dfd.to_csv("ContractHistoryType.csv")
del(dfd)

# ContractTypeDesc

In [38]:
sq = """select customerId, ContractTypeDesc from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts"""
dfd = client.query(sq).to_dataframe()
gd = pd.get_dummies(dfd["ContractTypeDesc"], prefix='ContractTypeDesc', prefix_sep="_")
dfd = pd.concat([dfd, gd], axis = 1)
dfd.drop(columns = 'ContractTypeDesc', inplace = True)

dfd.columns

Index(['customerId', 'ContractTypeDesc_Agricultural Loan',
       'ContractTypeDesc_Benefit Loan', 'ContractTypeDesc_Business Loan',
       'ContractTypeDesc_Credit Card',
       'ContractTypeDesc_Credit Card - MultiCurrency',
       'ContractTypeDesc_Credit Card - Shared Limit',
       'ContractTypeDesc_Credit Line', 'ContractTypeDesc_Home equity loan',
       'ContractTypeDesc_Insurance - Life Insurance', 'ContractTypeDesc_L/C',
       'ContractTypeDesc_Loan Line', 'ContractTypeDesc_Mortgage/Real Estate',
       'ContractTypeDesc_Omnibus Line', 'ContractTypeDesc_Personal Loan',
       'ContractTypeDesc_Provident Loan',
       'ContractTypeDesc_Real estate leasing',
       'ContractTypeDesc_Revolving Credit', 'ContractTypeDesc_SWAP Loan',
       'ContractTypeDesc_Salary loan', 'ContractTypeDesc_Short Term Loan',
       'ContractTypeDesc_Student Loan', 'ContractTypeDesc_Term Loan',
       'ContractTypeDesc_Time Loan', 'ContractTypeDesc_Trust Loan',
       'ContractTypeDesc_Unsecured lo

In [39]:
dfd = dfd.groupby('customerId')[['ContractTypeDesc_Agricultural Loan',
       'ContractTypeDesc_Benefit Loan', 'ContractTypeDesc_Business Loan',
       'ContractTypeDesc_Credit Card',
       'ContractTypeDesc_Credit Card - MultiCurrency',
       'ContractTypeDesc_Credit Card - Shared Limit',
       'ContractTypeDesc_Credit Line', 'ContractTypeDesc_Home equity loan',
       'ContractTypeDesc_Insurance - Life Insurance', 'ContractTypeDesc_L/C',
       'ContractTypeDesc_Loan Line', 'ContractTypeDesc_Mortgage/Real Estate',
       'ContractTypeDesc_Omnibus Line', 'ContractTypeDesc_Personal Loan',
       'ContractTypeDesc_Provident Loan',
       'ContractTypeDesc_Real estate leasing',
       'ContractTypeDesc_Revolving Credit', 'ContractTypeDesc_SWAP Loan',
       'ContractTypeDesc_Salary loan', 'ContractTypeDesc_Short Term Loan',
       'ContractTypeDesc_Student Loan', 'ContractTypeDesc_Term Loan',
       'ContractTypeDesc_Time Loan', 'ContractTypeDesc_Trust Loan',
       'ContractTypeDesc_Unsecured loan', 'ContractTypeDesc_Vehicle Loan',
       'ContractTypeDesc_Vehicle leasing']].sum().reset_index()
dfd.head()

Unnamed: 0,customerId,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing
0,1000001,0,0,0,6,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1000085,0,0,0,10,6,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1000192,0,0,0,8,0,2,2,0,0,0,0,0,0,4,0,0,0,0,3,0,0,0,0,0,2,0,0
3,1000243,0,0,0,11,0,5,1,0,0,0,0,0,0,9,0,0,1,0,1,0,0,0,0,0,1,0,0
4,1000278,0,0,0,12,0,0,6,0,0,0,0,9,0,39,0,0,0,0,1,42,0,0,0,0,3,0,0


In [40]:
dfd["customerId"] = dfd["customerId"].astype(np.int64)
df = df.merge(dfd, left_on='customerId', right_on='customerId', how = 'left')
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [41]:
dfd.to_csv("ContractTypeDesc.csv")
del(dfd)
df.to_csv("df.csv", index = False)

In [42]:
# select only numerical columns
num_cols = df.select_dtypes(include=['int', 'float']).columns

# fill NaN values with 0 in numerical columns
df[num_cols] = df[num_cols].fillna(0)
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Role Description

In [43]:
sq = """select customerId, RoleDesc from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts;"""
dfd = client.query(sq).to_dataframe()
gd = pd.get_dummies(dfd["RoleDesc"], prefix='RoleDesc', prefix_sep="_")
dfd = pd.concat([dfd, gd], axis = 1)
dfd.drop(columns = 'RoleDesc', inplace = True)
dfd.columns

Index(['customerId', 'RoleDesc_Borrower', 'RoleDesc_Co-Borrower',
       'RoleDesc_Guarantor/Surety'],
      dtype='object')

In [44]:
dfd.rename(columns = {"RoleDesc_Guarantor/Surety":"RoleDesc_Guarantor_Surety"}, inplace = True)
dfd = dfd.groupby("customerId")[['RoleDesc_Borrower', 'RoleDesc_Co-Borrower','RoleDesc_Guarantor_Surety']].sum().reset_index()
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84289 entries, 0 to 84288
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   customerId                 84289 non-null  object
 1   RoleDesc_Borrower          84289 non-null  uint64
 2   RoleDesc_Co-Borrower       84289 non-null  uint64
 3   RoleDesc_Guarantor_Surety  84289 non-null  uint64
dtypes: object(1), uint64(3)
memory usage: 2.6+ MB


In [45]:
dfd['customerId'] = dfd['customerId'].astype(np.int64)
df = df.merge(dfd, left_on='customerId', right_on='customerId', how = 'left')
dfd.to_csv("RoleDesc.csv")
df.to_csv("df.csv", index = False)
del(dfd)

In [46]:
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing,RoleDesc_Borrower,RoleDesc_Co-Borrower,RoleDesc_Guarantor_Surety
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0


# Credit Limit - Only for Credit Card loan type

In [47]:
sq = """select customerId, min(coalesce(cast(CreditLimit as int64), 0)) mincreditlimit, max(coalesce(cast(CreditLimit as int64), 0)) maxcreditlimit 
from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts
group by 1;"""

dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
dfd['customerId'] = dfd["customerId"].astype(np.int64)
df = df.merge(dfd, left_on='customerId', right_on='customerId', how = 'left')
dfd.to_csv("creditlimit.csv")
df.to_csv("df.csv", index = False)
del(dfd)
df.head()


Job ID de56b21d-8da2-48a7-8a77-e2c7d349ac83 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing,RoleDesc_Borrower,RoleDesc_Co-Borrower,RoleDesc_Guarantor_Surety,mincreditlimit,maxcreditlimit
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,9000.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,180000.0


In [48]:
# select only numerical columns
num_cols = df.select_dtypes(include=['int', 'float']).columns

# fill NaN values with 0 in numerical columns
df[num_cols] = df[num_cols].fillna(0)
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing,RoleDesc_Borrower,RoleDesc_Co-Borrower,RoleDesc_Guarantor_Surety,mincreditlimit,maxcreditlimit
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0,9000
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0,180000


# Overdue days

In [53]:
sq = """select customerId, OverdueDays, OverdueDaysDesc from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts ;"""
dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
gd = pd.get_dummies(dfd["OverdueDays"], prefix='OverdueDays', prefix_sep="_")
dfd = pd.concat([dfd, gd], axis = 1)
dfd.to_csv("Overduedays.csv")
dfd.drop(columns = ['OverdueDays', 'OverdueDaysDesc'], inplace = True)

dfd.columns

Job ID a3a5e314-634b-4718-88a6-aa46f8cefb8c successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Index(['customerId', 'OverdueDays_0', 'OverdueDays_1', 'OverdueDays_2',
       'OverdueDays_3', 'OverdueDays_4', 'OverdueDays_5', 'OverdueDays_6',
       'OverdueDays_N'],
      dtype='object')

In [54]:
dfd = dfd.groupby("customerId")[['OverdueDays_0', 'OverdueDays_1', 'OverdueDays_2',
       'OverdueDays_3', 'OverdueDays_4', 'OverdueDays_5', 'OverdueDays_6',
       'OverdueDays_N']].sum().reset_index()
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84289 entries, 0 to 84288
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customerId     84289 non-null  object
 1   OverdueDays_0  84289 non-null  uint8 
 2   OverdueDays_1  84289 non-null  uint8 
 3   OverdueDays_2  84289 non-null  uint8 
 4   OverdueDays_3  84289 non-null  uint8 
 5   OverdueDays_4  84289 non-null  uint8 
 6   OverdueDays_5  84289 non-null  uint8 
 7   OverdueDays_6  84289 non-null  uint8 
 8   OverdueDays_N  84289 non-null  uint8 
dtypes: object(1), uint8(8)
memory usage: 1.3+ MB


In [55]:
dfd['customerId'] = dfd['customerId'].astype(np.int64)
df = df.merge(dfd, left_on='customerId', right_on='customerId', how = 'left')
dfd.to_csv("overduedata.csv")
df.to_csv("df.csv", index = False)
del(dfd)
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing,RoleDesc_Borrower,RoleDesc_Co-Borrower,RoleDesc_Guarantor_Surety,mincreditlimit,maxcreditlimit,OverdueDays_0,OverdueDays_1,OverdueDays_2,OverdueDays_3,OverdueDays_4,OverdueDays_5,OverdueDays_6,OverdueDays_N
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0,9000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,,,,,,
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,,,,,,
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,,,,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0,180000,5.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


# Credit Purpose

In [57]:
sq = """select customerId, CreditPurpose, CreditPurposeDesc from prj-prod-dataplatform.risk_credit_cic_data.granted_contracts;"""
dfd = client.query(sq).to_dataframe(progress_bar_type='tqdm')
gd = pd.get_dummies(dfd["CreditPurpose"], prefix='CreditPurpose', prefix_sep="_")
dfd = pd.concat([dfd, gd], axis = 1)
dfd.to_csv("CreditPurpose.csv")
dfd.drop(columns = ['CreditPurpose', 'CreditPurposeDesc'], inplace = True)
dfd.head()


Job ID 7a3d72dd-2589-4556-b412-c2f74b4e1eaf successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,CreditPurpose_10,CreditPurpose_12,CreditPurpose_13,CreditPurpose_15,CreditPurpose_16,CreditPurpose_18,CreditPurpose_21,CreditPurpose_22,CreditPurpose_23,CreditPurpose_24,CreditPurpose_25,CreditPurpose_26,CreditPurpose_27,CreditPurpose_28,CreditPurpose_30,CreditPurpose_31,CreditPurpose_32
0,2000926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2000891,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1812709,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1853096,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2001151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
dfd.columns

Index(['customerId', 'CreditPurpose_10', 'CreditPurpose_12',
       'CreditPurpose_13', 'CreditPurpose_15', 'CreditPurpose_16',
       'CreditPurpose_18', 'CreditPurpose_21', 'CreditPurpose_22',
       'CreditPurpose_23', 'CreditPurpose_24', 'CreditPurpose_25',
       'CreditPurpose_26', 'CreditPurpose_27', 'CreditPurpose_28',
       'CreditPurpose_30', 'CreditPurpose_31', 'CreditPurpose_32'],
      dtype='object')

In [59]:
dfd = dfd.groupby("customerId")[['CreditPurpose_10', 'CreditPurpose_12',
       'CreditPurpose_13', 'CreditPurpose_15', 'CreditPurpose_16',
       'CreditPurpose_18', 'CreditPurpose_21', 'CreditPurpose_22',
       'CreditPurpose_23', 'CreditPurpose_24', 'CreditPurpose_25',
       'CreditPurpose_26', 'CreditPurpose_27', 'CreditPurpose_28',
       'CreditPurpose_30', 'CreditPurpose_31', 'CreditPurpose_32']].sum().reset_index()
dfd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84289 entries, 0 to 84288
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customerId        84289 non-null  object
 1   CreditPurpose_10  84289 non-null  uint8 
 2   CreditPurpose_12  84289 non-null  uint8 
 3   CreditPurpose_13  84289 non-null  uint8 
 4   CreditPurpose_15  84289 non-null  uint8 
 5   CreditPurpose_16  84289 non-null  uint8 
 6   CreditPurpose_18  84289 non-null  uint8 
 7   CreditPurpose_21  84289 non-null  uint8 
 8   CreditPurpose_22  84289 non-null  uint8 
 9   CreditPurpose_23  84289 non-null  uint8 
 10  CreditPurpose_24  84289 non-null  uint8 
 11  CreditPurpose_25  84289 non-null  uint8 
 12  CreditPurpose_26  84289 non-null  uint8 
 13  CreditPurpose_27  84289 non-null  uint8 
 14  CreditPurpose_28  84289 non-null  uint8 
 15  CreditPurpose_30  84289 non-null  uint8 
 16  CreditPurpose_31  84289 non-null  uint8 
 17  CreditPurpos

In [60]:
dfd['customerId'] = dfd['customerId'].astype(np.int64)
df = df.merge(dfd, left_on='customerId', right_on='customerId', how = 'left')
dfd.to_csv("creditpurpose.csv")
df.to_csv("df.csv", index = False)
del(dfd)
df.head()

Unnamed: 0,customerId,new_loan_type,obs,fstpd30,tonikdigitalloanid,loanincb,closedcontract,closedinadvancecontract,Activecontract,ContractStatus_BC,ContractStatus_BF,ContractStatus_BL,ContractStatus_CR,ContractStatus_CV,ContractStatus_DA,ContractStatus_DI,ContractStatus_DS,ContractStatus_FC,ContractStatus_LT,ContractStatus_NA,ContractStatus_NP,ContractStatus_NS,ContractStatus_PA,ContractStatus_PD,ContractStatus_RP,ContractStatus_WC,ContractStatus_WF,ContractStatus_WO,ContractHistoryType_CreditCards,ContractHistoryType_Installments,ContractHistoryType_NonInstallments,ContractHistoryType_Utilities,ContractTypeDesc_Agricultural Loan,ContractTypeDesc_Benefit Loan,ContractTypeDesc_Business Loan,ContractTypeDesc_Credit Card,ContractTypeDesc_Credit Card - MultiCurrency,ContractTypeDesc_Credit Card - Shared Limit,ContractTypeDesc_Credit Line,ContractTypeDesc_Home equity loan,ContractTypeDesc_Insurance - Life Insurance,ContractTypeDesc_L/C,ContractTypeDesc_Loan Line,ContractTypeDesc_Mortgage/Real Estate,ContractTypeDesc_Omnibus Line,ContractTypeDesc_Personal Loan,ContractTypeDesc_Provident Loan,ContractTypeDesc_Real estate leasing,ContractTypeDesc_Revolving Credit,ContractTypeDesc_SWAP Loan,ContractTypeDesc_Salary loan,ContractTypeDesc_Short Term Loan,ContractTypeDesc_Student Loan,ContractTypeDesc_Term Loan,ContractTypeDesc_Time Loan,ContractTypeDesc_Trust Loan,ContractTypeDesc_Unsecured loan,ContractTypeDesc_Vehicle Loan,ContractTypeDesc_Vehicle leasing,RoleDesc_Borrower,RoleDesc_Co-Borrower,RoleDesc_Guarantor_Surety,mincreditlimit,maxcreditlimit,OverdueDays_0,OverdueDays_1,OverdueDays_2,OverdueDays_3,OverdueDays_4,OverdueDays_5,OverdueDays_6,OverdueDays_N,CreditPurpose_10,CreditPurpose_12,CreditPurpose_13,CreditPurpose_15,CreditPurpose_16,CreditPurpose_18,CreditPurpose_21,CreditPurpose_22,CreditPurpose_23,CreditPurpose_24,CreditPurpose_25,CreditPurpose_26,CreditPurpose_27,CreditPurpose_28,CreditPurpose_30,CreditPurpose_31,CreditPurpose_32
0,2169925,Quick,1,0,1,3,2,0,1,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0,9000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025886,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,
2,2224518,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,
3,2029675,Quick,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,
4,2041621,Quick,1,0,1,10,2,1,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0,180000,5.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0


# CIC Summary Table EDA

## First let us find the duplicate entries

In [49]:
sq = """
with customerbase as 
(select customerId, loanAccountNumber, disbursementDateTime, 
row_number() over (partition by customerId order by disbursementDateTime desc) rnk
from `risk_credit_mis.loan_master_table` where flagDisbursement = 1 and date_trunc(disbursementDateTime, day) >= '2023-01-01'
and upper(new_loan_type) = 'QUICK'),
fstpd as
(select lmt.customerId, lmt.loanAccountNumber, lmt.new_loan_type,
sum(case when obsTPD30 = 1
     then case when defFPD30 = 1 then 1
     when defSPD30 = 1 then 1
     when defTPD30 = 1 then 1
     else 0 end else 0 end) FPSTPD30 ,
sum(obsTPD30) obsTPD30,
max(cb.rnk) rnk
 from `risk_credit_mis.loan_master_table` lmt
 inner join customerbase cb on cb.customerId = lmt.customerId and lmt.loanAccountNumber = cb.loanAccountNumber
 WHERE 
  date_trunc(lmt.disbursementDateTime, day) >= '2023-01-01' and upper(lmt.new_loan_type) = 'QUICK'
 group by 1,2,3
),
modelpopulationbase as
(select customerId,new_loan_type,count(distinct loanAccountNumber) cnt_loans
, sum(obsTPD30) obs
, sum(FPSTPD30)FSTPD30 
from fstpd where obsTPD30 = 1 group by 1,2
) 
,a as 
(select customerId,new_loan_type,
max(case when obs > 0 then 1 else 0 end) obs ,
max(case when FSTPD30 > 0 then 1 else 0 end) fstpd30
from modelpopulationbase 
group by 1, 2
)
select cs.customerId, count(cs.customerId) cnt 
from prj-prod-dataplatform.risk_credit_cic_data.cic_summary  cs 
inner join a on cast(a.customerId as string) = cs.customerId
group by 1 having count(cs.customerId) > 1
order by 2 desc
;
"""
cicdup = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
cicdup.sample(10)

Job ID 3f586bd6-2c66-44e1-8655-57861d7f4f9f successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,cnt
55,2037365,4
1830,1974712,2
3545,2219125,2
804,1915735,3
3750,1960319,2
3421,1586477,2
3639,2146664,2
888,1920545,3
3316,2011084,2
1003,1903614,2


In [50]:
cicdup.to_csv("Cic_duplicate_customerid.csv", index = False)

## CIC Table Columns Understanding

In [51]:
sq = """
with customerbase as 
(select customerId, loanAccountNumber, disbursementDateTime, 
row_number() over (partition by customerId order by disbursementDateTime desc) rnk
from `risk_credit_mis.loan_master_table` where flagDisbursement = 1 and date_trunc(disbursementDateTime, day) >= '2023-01-01'
and upper(new_loan_type) = 'QUICK'),
fstpd as
(select lmt.customerId, lmt.loanAccountNumber, lmt.new_loan_type,
sum(case when obsTPD30 = 1
     then case when defFPD30 = 1 then 1
     when defSPD30 = 1 then 1
     when defTPD30 = 1 then 1
     else 0 end else 0 end) FPSTPD30 ,
sum(obsTPD30) obsTPD30,
max(cb.rnk) rnk
 from `risk_credit_mis.loan_master_table` lmt
 inner join customerbase cb on cb.customerId = lmt.customerId and lmt.loanAccountNumber = cb.loanAccountNumber
 WHERE 
  date_trunc(lmt.disbursementDateTime, day) >= '2023-01-01' and upper(lmt.new_loan_type) = 'QUICK'
 group by 1,2,3
),
modelpopulationbase as
(select customerId,new_loan_type,count(distinct loanAccountNumber) cnt_loans
, sum(obsTPD30) obs
, sum(FPSTPD30)FSTPD30 
from fstpd where obsTPD30 = 1 group by 1,2
) 
,a as 
(select customerId,new_loan_type,
max(case when obs > 0 then 1 else 0 end) obs ,
max(case when FSTPD30 > 0 then 1 else 0 end) fstpd30
from modelpopulationbase 
group by 1, 2
)
select distinct cs.digitalLoanAccountId
, lmt.startApplyDateTime
, cs.customerId,
       DescisionValue, DescisionDescription,
       ExclusionRuleCode, exclusionRuleLabelValue,
       exclusionRuleLabelDescription, ScoreRaw, ScoreRange,
       Accounts30Days, Accounts30DaysOther, Accounts60Days,
       Accounts60DaysOther, Accounts90Days, Accounts90DaysOther,
       AgeAtLoanTermination, AgeAtLoanTerminationMax,
       CreditAvgCreditLimit, CreditHighestCreditLimit,
       CreditMaxOutstandingBalance, CreditMaxOutstandingBalanceDate,
       CreditMaxOverdueAmount, CreditMaxOverdueAmountDate,
       CreditMaxOverdueDays, CreditMaxOverdueDaysDate,
       CreditNumberOfContracts, CreditTotalCreditLimit,
       CreditTotalOutstandingBalanceAmount, CreditTotalOverdueAmount,
       DataFromTest, InstMaxOverdueAmount, InstMaxOverdueAmountDate,
       InstMaxOverdueDays, InstMaxOverdueDaysDate, InstNumberOfContracts,
       InstTotalMonthlyPaymentsAmount, InstTotalOutstandingBalanceAmount,
       InstTotalOverdueAmount, LoanToValue, MonthsOfBooks,
       MonthsOfBooksOther, NonInstAvgCreditLimit,
       NonInstHighestCreditLimit, NonInstNumberOfContracts,
       NonInstTotalCreditLimit, NonInstTotalOverdraftAmount,
       NonInstTotalUtilization, NumberOfContract, NumberOfContractAC,
       NumberOfContractACOther, NumberOfContractCLCA,
       NumberOfContractCLCAOther, NumberOfContractOther,
       NumberOfContractRF, NumberOfContractRFOther, NumberOfContractRN,
       NumberOfContractRNOther, NumberOfContractRQ,
       NumberOfContractRQOther, OverlimitFlag, OverlimitFlagOther,
       ReportingProvidersNumber, ReportingProvidersNumberOther,
    --    SubjectEventDate, SubjectInfoTypeCode, SubjectInfoTypeCodeDesc,
       TotalOutstanding, TotalOutstandingOther, UtilityMaxBilledAmount,
       UtilityMaxBilledAmountDate, UtilityMaxOutstandingBalance,
       UtilityMaxOutstandingBalanceDate, UtilityMaxOverdueAmount,
       UtilityMaxOverdueAmountDate, UtilityMaxOverdueDays,
       UtilityMaxOverdueDaysDate, UtilityNumberOfContracts,
       UtilityTotalBilledAmount, UtilityTotalOutstandingBalanceAmount,
       UtilityTotalOverdueAmount, UtilizationRateCreditCard,
       UtilizationRateCreditCardOther, UtilizationRateNonInstallment,
       UtilizationRateNonInstallmentOther,scoreLabelDesc,
       scoreLabelValue, a.obs, a.fstpd30
from prj-prod-dataplatform.risk_credit_cic_data.cic_summary  cs 
inner join a on cast(a.customerId as string) = cs.customerId
left join `risk_credit_mis.loan_master_table` lmt on lmt.digitalLoanAccountId = cs.digitalLoanAccountId
;

"""

cic = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
print(f"The data extracted from the above query are: {cic.shape}")

Job ID 606fee07-a408-4ccc-8b9d-ddcaba4c76f1 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
The data extracted from the above query are: (21930, 86)


### Columns in this table

In [52]:
cic.columns

Index(['digitalLoanAccountId', 'startApplyDateTime', 'customerId',
       'DescisionValue', 'DescisionDescription', 'ExclusionRuleCode',
       'exclusionRuleLabelValue', 'exclusionRuleLabelDescription', 'ScoreRaw',
       'ScoreRange', 'Accounts30Days', 'Accounts30DaysOther', 'Accounts60Days',
       'Accounts60DaysOther', 'Accounts90Days', 'Accounts90DaysOther',
       'AgeAtLoanTermination', 'AgeAtLoanTerminationMax',
       'CreditAvgCreditLimit', 'CreditHighestCreditLimit',
       'CreditMaxOutstandingBalance', 'CreditMaxOutstandingBalanceDate',
       'CreditMaxOverdueAmount', 'CreditMaxOverdueAmountDate',
       'CreditMaxOverdueDays', 'CreditMaxOverdueDaysDate',
       'CreditNumberOfContracts', 'CreditTotalCreditLimit',
       'CreditTotalOutstandingBalanceAmount', 'CreditTotalOverdueAmount',
       'DataFromTest', 'InstMaxOverdueAmount', 'InstMaxOverdueAmountDate',
       'InstMaxOverdueDays', 'InstMaxOverdueDaysDate', 'InstNumberOfContracts',
       'InstTotalMonthlyPayments

In [None]:
cic.info()

In [None]:
# Check for duplicate digitalLoanAccountId
has_duplicates = cic['digitalLoanAccountId'].duplicated()

# Get the actual duplicate loan ids
duplicate_ids = cic[has_duplicates]['digitalLoanAccountId']

# Print the duplicate loan ids
if duplicate_ids.empty:
    print("There are no duplicate digitalLoanAccountId in the dataframe.")
else:
    print("The following digitalLoanAccountId are duplicates:")
    print(duplicate_ids.tolist())

In [None]:
pd.set_option("Display.max_columns", None)

In [None]:
cic[cic.digitalLoanAccountId == 'd5aa07b7-3e36-4ca8-8d06-01b1bfb867fe']

In [None]:
# Check for duplicate customerId

has_duplicates = cic['customerId'].duplicated()

# Get the actual duplicate customer ids

duplicate_ids = cic[has_duplicates]['customerId']

# Print the duplicate customer ids

if duplicate_ids.empty:
    print("There are no duplicate customerId in the dataframe.")
else:
    print("The following customerId are duplicates:")
    print(duplicate_ids.tolist())

In [None]:
cic[cic.customerId == '1909097']

In [None]:
cic["DescisionValue"] = cic["DescisionValue"].fillna('NA')
cic.DescisionValue.value_counts()

In [None]:
dv = pd.get_dummies(cic["DescisionValue"], prefix='DecisionValue', prefix_sep="_")

In [None]:
cic = pd.concat([cic, dv], axis = 1)

In [None]:
cic.drop(columns="DescisionValue", inplace = True)

In [None]:
cic["DescisionDescription"] = cic["DescisionDescription"].fillna('NA')
cic.DescisionDescription.value_counts()

In [None]:
cic.drop(columns = "DescisionDescription", inplace = True)

In [None]:
cic["ExclusionRuleCode"] = cic["ExclusionRuleCode"].fillna('NA')
cic.ExclusionRuleCode.value_counts()

In [None]:
erc = pd.get_dummies(cic["ExclusionRuleCode"], prefix='ExclusionRuleCode', prefix_sep="_")
cic = pd.concat([cic, erc], axis = 1)
cic.drop(columns = 'ExclusionRuleCode', inplace = True)

In [None]:
cic.columns

In [None]:
cic["exclusionRuleLabelValue"] = cic["exclusionRuleLabelValue"].fillna('NA')
cic.exclusionRuleLabelValue.value_counts()

In [None]:
cic["exclusionRuleLabelDescription"] = cic["exclusionRuleLabelDescription"].fillna('NA')
cic["exclusionRuleLabelDescription"].value_counts()

In [None]:
cic[["exclusionRuleLabelValue", "exclusionRuleLabelDescription"]].value_counts()

This makes sense now. 

In [None]:

cic.ScoreRaw.value_counts()

In [None]:
cic["ScoreRange"] = cic.ScoreRange.fillna("NA")
cic.ScoreRange.value_counts()

In [None]:
sr = pd.get_dummies(cic['ScoreRange'], prefix="ScoreRange", prefix_sep="_")
cic = pd.concat([cic, sr], axis = 1)
cic.drop(columns="ScoreRange", inplace = True)


In [None]:
pd.set_option("Display.max_rows", None)
cic.columns.values

In [None]:
cic["ScoreRaw"] = cic["ScoreRaw"].fillna(0)

In [None]:
# Group by ScoreRange and get the min and max values of ScoreRaw
grouped_df = cic.groupby('ScoreRange')['ScoreRaw'].agg(['min', 'max'])

# Print the grouped DataFrame
print(f"The Min and Max Score raw for each scoreRange are: /n")
grouped_df.reset_index()


In [None]:
# sr = [[0, 150, '0-150'], [151, 343, '151-343'], [346, 397, '346-397'], [399, 428, '399-397'], [430, 450, '430-450'], [451, 466, '451-466'], [467, 475, '467-475'], [477, 485, '477-485'], [486, 494, '486-494'], [497, 536, '497-536']]

# bins = [x[0] for x in sr[:-1]] + [sr[-1][0]]  # Creating bins excluding NA values
# numbers = {k: int(k.split("-")[0]) for k in [x[2] for x in sr]}  # Mapping string labels to numbers
# score_bin_labels = [numbers[l] if l in numbers else None for l in score_bin_labels]  # Replace old labels with numbers

# score_binned = pd.cut(cic['ScoreRaw'], bins=bins, right=False, labels=score_bin_labels)
# cic["scorebingroup"] = score_binned

In [None]:
cic["Accounts30Days"].value_counts()

In [None]:
cic[['digitalLoanAccountId', 'startApplyDateTime', 'customerId','Accounts30Days', 'Accounts30DaysOther', 'Accounts60Days',
       'Accounts60DaysOther', 'Accounts90Days', 'Accounts90DaysOther']][cic.customerId == '1909097']