In [2]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

In [3]:
sq = """with allcust as 
(select cust_id from prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb where cust_id is not null),
  custrejectlist --- List of all customer id where were ever rejected
  AS (
  SELECT
    customerId, min(startApplyDateTime) firstapplicationdate, max(startApplyDateTime) lastapplicationdate, max(applicationStatus) applicationStatus
  FROM
    `risk_credit_mis.loan_master_table`
  WHERE
    applicationStatus LIKE 'REJECT' 
  GROUP BY 
    customerId),
  Activecustlist ---- List of customer id with active loan
  AS (
  SELECT
    customerId,
    digitalLoanAccountId,
    loanPaidStatus
  FROM
    `risk_credit_mis.loan_master_table`
  WHERE
    COALESCE(loanPaidStatus, 'NA') IN ('Normal',
      'In Arrears')),
  RejectcustwithnoActiveLoan --- List of customer id who have a rejected loan earlier and no active current loan
    AS (
    SELECT
      customerId,
      max(applicationStatus) applicationstatus
    FROM
      custrejectlist
    WHERE
      customerId NOT IN (
      SELECT
        customerId
      FROM
        Activecustlist) group by 1),
  RejectcustwithActiveLoan --- List of customer id who have a rejected loan earlier and active current loan
    AS (
    SELECT
      customerId,
      max(applicationStatus) applicationstatus
    FROM
      custrejectlist
    WHERE
      customerId IN (
      SELECT
        customerId
      FROM
        Activecustlist) group by 1),
  TSA_Account as 
  (select distinct customer_id FROM `prj-prod-dataplatform.risk_mart.customer_transactions` WHERE account_type = 'Tonik Account'),
b as 
(select a1.cust_id,
  a2.customerId customerwithrejectloans,
  a3.customerId customerwithactiveloans,
  a4.customerId custwithrejectloanandnotactiveloan,
  a6.customerId custwithrejectloanandactiveloan,
  a5.customer_id Tsaaccountcustomer
  from allcust a1
  left join custrejectlist a2 on a2.customerId = cast(a1.cust_id as numeric)
  left join Activecustlist a3 on a3.customerId = cast(a1.cust_id as numeric)
  left join RejectcustwithnoActiveLoan a4 on a4.customerId = cast(a1.cust_id as numeric)
  left join TSA_Account a5 on a5.customer_id = a1.cust_id
  left join RejectcustwithActiveLoan a6 on a6.customerId = cast(a1.cust_id as numeric)
)
# select 
# count(distinct cust_id) allcustcnt, 
# count(distinct customerwithrejectloans) customerwithrejectloans_cnt,
# count(distinct customerwithactiveloans) customerwithactiveloans_cnt,
# count(distinct custwithrejectloanandnotactiveloan) custwithrejectloanandnotactiveloan_cnt,
# count(distinct Tsaaccountcustomer) Tsaaccountcustomer_cnt,
# count(distinct case when customerwithrejectloans is not null and Tsaaccountcustomer is null then customerwithrejectloans end) cust_with_rejectloan_No_tsa_account,
# count(distinct case when Tsaaccountcustomer is not null and customerwithrejectloans is null then Tsaaccountcustomer end) tsacustomerwithnotrejectloan,
# count(distinct case when Tsaaccountcustomer is not null and customerwithactiveloans is null then Tsaaccountcustomer end) tsacustmerwithactiveloans,
# count(distinct case when customerwithactiveloans is not null and Tsaaccountcustomer is null then customerwithactiveloans end) activeloanwithnoTSA,
# count(distinct case when Tsaaccountcustomer is not null and custwithrejectloanandnotactiveloan is null then Tsaaccountcustomer end) Tsacustwithnorejectloanandnoactiveloan,
# count(distinct case when custwithrejectloanandnotactiveloan is not null and Tsaaccountcustomer is null then custwithrejectloanandnotactiveloan end) custwithrejectandnoactiveloanbutnotsa,
# count(distinct case when custwithrejectloanandactiveloan is not null and Tsaaccountcustomer is null then custwithrejectloanandactiveloan end) custwithrejectandactiveloanbutnotsa
# from b
select * from b
"""

In [11]:
df = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')

Job ID 594afbf0-1451-4644-91e8-f7ac2c717106 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [13]:
df.shape

(1465006, 6)

In [18]:
d = df[(df['cust_id'].notna()) & (df['Tsaaccountcustomer'].notna()) & (df['custwithrejectloanandactiveloan'].notna())]

In [19]:
len(d)

1

In [20]:
d.head()

Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer
66013,1070155,1070155,1070155,,1070155,1070155


In [21]:
df1 = d.reset_index().copy()
df1

Unnamed: 0,index,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer
0,66013,1070155,1070155,1070155,,1070155,1070155


In [22]:
df1['desc'] = 'TSA Customer with rejected loan and current active loan'
df1

Unnamed: 0,index,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
0,66013,1070155,1070155,1070155,,1070155,1070155,TSA Customer with rejected loan and current ac...


In [37]:
d = df[(df['cust_id'].notna()) & (df['Tsaaccountcustomer'].notna()) & (df['customerwithrejectloans'].notna())]
print(f"The count of the rows are {d.shape}" )
d.head()

The count of the rows are (35473, 6)


Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer
1,1235197,1235197,,1235197,,1235197
17,1235056,1235056,,1235056,,1235056
19,1234163,1234163,,1234163,,1234163
24,1234538,1234538,,1234538,,1234538
28,1234978,1234978,,1234978,,1234978


In [38]:
d['desc'] = "TSA Account Customer with at least on rejected loans"
d.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['desc'] = "TSA Account Customer with at least on rejected loans"


Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
1,1235197,1235197,,1235197,,1235197,TSA Account Customer with at least on rejected...
17,1235056,1235056,,1235056,,1235056,TSA Account Customer with at least on rejected...
19,1234163,1234163,,1234163,,1234163,TSA Account Customer with at least on rejected...
24,1234538,1234538,,1234538,,1234538,TSA Account Customer with at least on rejected...
28,1234978,1234978,,1234978,,1234978,TSA Account Customer with at least on rejected...


In [39]:
d.head()

Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
1,1235197,1235197,,1235197,,1235197,TSA Account Customer with at least on rejected...
17,1235056,1235056,,1235056,,1235056,TSA Account Customer with at least on rejected...
19,1234163,1234163,,1234163,,1234163,TSA Account Customer with at least on rejected...
24,1234538,1234538,,1234538,,1234538,TSA Account Customer with at least on rejected...
28,1234978,1234978,,1234978,,1234978,TSA Account Customer with at least on rejected...


In [41]:
import pandas as pd
import numpy as np

def fill_empty_columns(main_df, small_df):
    extra_cols = set(small_df.columns) - set(main_df.columns)
    for col in extra_cols:
        main_df[col] = np.nan
    return main_df

df1 = fill_empty_columns(df1, d)
merged_df = pd.concat([df1, d], ignore_index=False)
print(f"Merged DataFrame Shape: {merged_df.shape}")
merged_df.head()

Merged DataFrame Shape: (35474, 8)


Unnamed: 0,index,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
0,66013.0,1070155,1070155,1070155.0,,1070155.0,1070155,TSA Customer with rejected loan and current ac...
1,,1235197,1235197,,1235197.0,,1235197,TSA Account Customer with at least on rejected...
17,,1235056,1235056,,1235056.0,,1235056,TSA Account Customer with at least on rejected...
19,,1234163,1234163,,1234163.0,,1234163,TSA Account Customer with at least on rejected...
24,,1234538,1234538,,1234538.0,,1234538,TSA Account Customer with at least on rejected...


In [42]:
df1 = merged_df.copy()

In [47]:
d = df[(df['cust_id'].notna()) & (df['Tsaaccountcustomer'].isnull())]
print(f"The shape of the dataframe is:\t{d.shape}")

The shape of the dataframe is:	(1124661, 6)


In [48]:
d.head()

Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer
2,1233825,1233825.0,,1233825.0,,
3,1234118,1234118.0,,1234118.0,,
4,1234967,,,,,
5,1234562,1234562.0,,1234562.0,,
6,1234222,1234222.0,,1234222.0,,


In [49]:
d['desc'] = "Customer MTB Account present but TSA Account Not Present"
d.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['desc'] = "Customer MTB Account present but TSA Account Not Present"


Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
2,1233825,1233825.0,,1233825.0,,,Customer MTB Account present but TSA Account N...
3,1234118,1234118.0,,1234118.0,,,Customer MTB Account present but TSA Account N...
4,1234967,,,,,,Customer MTB Account present but TSA Account N...
5,1234562,1234562.0,,1234562.0,,,Customer MTB Account present but TSA Account N...
6,1234222,1234222.0,,1234222.0,,,Customer MTB Account present but TSA Account N...


In [50]:
import pandas as pd
import numpy as np

def fill_empty_columns(main_df, small_df):
    extra_cols = set(small_df.columns) - set(main_df.columns)
    for col in extra_cols:
        main_df[col] = np.nan
    return main_df

df1 = fill_empty_columns(df1, d)
merged_df = pd.concat([df1, d], ignore_index=False)
print(f"Merged DataFrame Shape: {merged_df.shape}")
merged_df.head()

Merged DataFrame Shape: (1160135, 8)


Unnamed: 0,index,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
0,66013.0,1070155,1070155,1070155.0,,1070155.0,1070155,TSA Customer with rejected loan and current ac...
1,,1235197,1235197,,1235197.0,,1235197,TSA Account Customer with at least on rejected...
17,,1235056,1235056,,1235056.0,,1235056,TSA Account Customer with at least on rejected...
19,,1234163,1234163,,1234163.0,,1234163,TSA Account Customer with at least on rejected...
24,,1234538,1234538,,1234538.0,,1234538,TSA Account Customer with at least on rejected...


In [51]:
merged_df['desc'].value_counts()

desc
Customer MTB Account present but TSA Account Not Present    1124661
TSA Account Customer with at least on rejected loans          35473
TSA Customer with rejected loan and current active loan           1
Name: count, dtype: int64

In [53]:
d = df[(df['cust_id'].notna()) & (df['Tsaaccountcustomer'].isnull())&(df['custwithrejectloanandnotactiveloan'].notna())]
print(f"The shape of the dataframe is:\t{d.shape}")

The shape of the dataframe is:	(227896, 6)


In [54]:
d['desc'] = "Customer MTB Account present, TSA Account Not Present and customer at least one reject loan and not current active loan"
d.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['desc'] = "Customer MTB Account present, TSA Account Not Present and customer at least one reject loan and not current active loan"


Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
2,1233825,1233825,,1233825,,,"Customer MTB Account present, TSA Account Not ..."
3,1234118,1234118,,1234118,,,"Customer MTB Account present, TSA Account Not ..."
5,1234562,1234562,,1234562,,,"Customer MTB Account present, TSA Account Not ..."
6,1234222,1234222,,1234222,,,"Customer MTB Account present, TSA Account Not ..."
8,1234618,1234618,,1234618,,,"Customer MTB Account present, TSA Account Not ..."


In [55]:
df1 = merged_df.copy()

In [56]:
import pandas as pd
import numpy as np

def fill_empty_columns(main_df, small_df):
    extra_cols = set(small_df.columns) - set(main_df.columns)
    for col in extra_cols:
        main_df[col] = np.nan
    return main_df

df1 = fill_empty_columns(df1, d)
merged_df = pd.concat([df1, d], ignore_index=False)
print(f"Merged DataFrame Shape: {merged_df.shape}")
merged_df.head()

Merged DataFrame Shape: (1388031, 8)


Unnamed: 0,index,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
0,66013.0,1070155,1070155,1070155.0,,1070155.0,1070155,TSA Customer with rejected loan and current ac...
1,,1235197,1235197,,1235197.0,,1235197,TSA Account Customer with at least on rejected...
17,,1235056,1235056,,1235056.0,,1235056,TSA Account Customer with at least on rejected...
19,,1234163,1234163,,1234163.0,,1234163,TSA Account Customer with at least on rejected...
24,,1234538,1234538,,1234538.0,,1234538,TSA Account Customer with at least on rejected...


In [57]:
merged_df['desc'].value_counts()

desc
Customer MTB Account present but TSA Account Not Present                                                                   1124661
Customer MTB Account present, TSA Account Not Present and customer at least one reject loan and not current active loan     227896
TSA Account Customer with at least on rejected loans                                                                         35473
TSA Customer with rejected loan and current active loan                                                                          1
Name: count, dtype: int64

In [58]:
df1 = merged_df.copy()

In [59]:
df.columns

Index(['cust_id', 'customerwithrejectloans', 'customerwithactiveloans',
       'custwithrejectloanandnotactiveloan', 'custwithrejectloanandactiveloan',
       'Tsaaccountcustomer'],
      dtype='object')

In [60]:
d = df[(df['cust_id'].notna()) & (df['Tsaaccountcustomer'].isnull())&(df['customerwithactiveloans'].notna())]
print(f"The shape of the dataframe is:\t{d.shape}")

The shape of the dataframe is:	(5714, 6)


In [63]:
d['desc'] = "Customer MTB Account present, TSA Account Not Present and customer with current active loan"
d.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['desc'] = "Customer MTB Account present, TSA Account Not Present and customer with current active loan"


Unnamed: 0,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
3624,2261288,,2261288,,,,"Customer MTB Account present, TSA Account Not ..."
3630,2261016,,2261016,,,,"Customer MTB Account present, TSA Account Not ..."
4260,2419019,,2419019,,,,"Customer MTB Account present, TSA Account Not ..."
4262,2418876,,2418876,,,,"Customer MTB Account present, TSA Account Not ..."
4427,2418646,,2418646,,,,"Customer MTB Account present, TSA Account Not ..."


In [64]:
import pandas as pd
import numpy as np

def fill_empty_columns(main_df, small_df):
    extra_cols = set(small_df.columns) - set(main_df.columns)
    for col in extra_cols:
        main_df[col] = np.nan
    return main_df

df1 = fill_empty_columns(df1, d)
merged_df = pd.concat([df1, d], ignore_index=False)
print(f"Merged DataFrame Shape: {merged_df.shape}")
merged_df.head()

Merged DataFrame Shape: (1393745, 8)


Unnamed: 0,index,cust_id,customerwithrejectloans,customerwithactiveloans,custwithrejectloanandnotactiveloan,custwithrejectloanandactiveloan,Tsaaccountcustomer,desc
0,66013.0,1070155,1070155,1070155.0,,1070155.0,1070155,TSA Customer with rejected loan and current ac...
1,,1235197,1235197,,1235197.0,,1235197,TSA Account Customer with at least on rejected...
17,,1235056,1235056,,1235056.0,,1235056,TSA Account Customer with at least on rejected...
19,,1234163,1234163,,1234163.0,,1234163,TSA Account Customer with at least on rejected...
24,,1234538,1234538,,1234538.0,,1234538,TSA Account Customer with at least on rejected...


In [65]:
merged_df['desc'].value_counts()

desc
Customer MTB Account present but TSA Account Not Present                                                                   1124661
Customer MTB Account present, TSA Account Not Present and customer at least one reject loan and not current active loan     227896
TSA Account Customer with at least on rejected loans                                                                         35473
Customer MTB Account present, TSA Account Not Present and customer with current active loan                                   5714
TSA Customer with rejected loan and current active loan                                                                          1
Name: count, dtype: int64