# <div align="center" style="color: #ff5733;">7 11 Data Analysis</div>

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
# Point the Google client libraries at application-default credentials.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already exported in the environment
# takes precedence, so the notebook can run on another machine without editing this
# cell; the hard-coded path below is only the fallback for the original workstation.
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', path)
# BigQuery client bound to the 'prj-prod-dataplatform' project.
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

In [2]:
# BigQuery SQL that links the 7-Eleven payment transactions to internal customers and
# their first-payment-default flags. CTEs, in order:
#   mb     - one row per (customer_id, mobile) taken from both new_mobile_number and
#            old_mobile_number, keeping only the right-most 10 characters.
#   mb1    - de-duplicated (customer_id, mobile) pairs.
#   mb2    - mb1 plus custrank, a row_number per customer_id. The window orders by the
#            partition key itself, so the numbering within a customer is arbitrary.
#            custrank is never filtered on in this query.
#   delqb  - one row per delinquency record joined to the loan master table;
#            obsfstpd30 carries the customerId when obs_min_inst_def30 >= 3 and
#            fstpd30dev carries it when min_inst_def30 is 1, 2 or 3.
#   delqb1 - one row per customerId that has a non-null obsfstpd30. Because each group is
#            a single customerId, the two distinct counts act as 0/1 flags
#            (fstpd30def, obsfpd30). The WHERE clause already removes rows with a null
#            obsfstpd30, so the `else 0` branch of obsfpd30 is never reached.
#   base   - transactions LEFT JOINed to mb2 on the numeric mobile number, then to delqb1
#            on customer id; unmatched transactions keep NULLs in the joined columns.
# NOTE(review): a mobile linked to more than one customer_id in mb2 would repeat the
# transaction row once per customer - confirm whether that duplication is intended.
sq = """with mb as 
(select customer_id, right(new_mobile_number, 10) mobile from `risk_credit_mis.customer_contact_details`
union all 
select customer_id, right(old_mobile_number, 10) mobile from `risk_credit_mis.customer_contact_details` where old_mobile_number is not null
),
mb1 
as
(select distinct customer_id, mobile from mb),
mb2
as
(select *, row_number() over(partition by customer_id order by customer_id) custrank from mb1),
delqb as
(Select lmt.customerId, case when a1.obs_min_inst_def30 >= 3 then lmt.customerId end obsfstpd30 
, case when a1.min_inst_def30 in (1,2,3) then lmt.customerId end fstpd30dev
from prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data a1
left join `risk_credit_mis.loan_master_table` lmt on lmt.loanAccountNumber = a1.loanAccountNumber
),delqb1 
as
(select customerId, 
count(distinct case when obsfstpd30 is not null and fstpd30dev is not null then customerId end) fstpd30def,
count(distinct case when obsfstpd30 is not null then customerId else 0 end) obsfpd30
from delqb where obsfstpd30 is not null
group by 1
),
base as
(select * from 
`prj-prod-dataplatform.manual_source_extracts.partner_711_payment_transcation_backtest_20240125` a1
left join mb2 on cast(mb2.mobile as numeric) = cast(a1.mobile_num as numeric)
left join delqb1 on delqb1.customerId = cast(mb2.customer_id as numeric)
)
select * from base"""
# Run the query and download the full result into a pandas DataFrame, showing a tqdm
# progress bar while the job runs and the rows download.
df = client.query(sq).to_dataframe(progress_bar_type='tqdm')

Job ID 187453c2-b1c7-4e41-9864-b8db3830d98a successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [3]:
# Report the (rows, columns) dimensions of the downloaded result.
dataset_shape = df.shape
print(f"The shape of the dataset is:\t {dataset_shape}")

The shape of the dataset is:	 (1196007, 13)


In [4]:
# Display the column names of the query result.
df.columns

Index(['mobile_num', 'txn_id', 'amount', 'sector', 'merchant_name',
       'txn_datetime', 'store_id', 'customer_id', 'mobile', 'custrank',
       'customerId', 'fstpd30def', 'obsfpd30'],
      dtype='object')

In [5]:
# Print each column's dtype and non-null count to see how many rows found a match in the joins.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1196007 entries, 0 to 1196006
Data columns (total 13 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   mobile_num     1196007 non-null  Int64              
 1   txn_id         1196007 non-null  Int64              
 2   amount         1196007 non-null  float64            
 3   sector         1196007 non-null  object             
 4   merchant_name  1196007 non-null  object             
 5   txn_datetime   1196007 non-null  datetime64[us, UTC]
 6   store_id       1196007 non-null  Int64              
 7   customer_id    1157189 non-null  Int64              
 8   mobile         1157189 non-null  object             
 9   custrank       1157189 non-null  Int64              
 10  customerId     518297 non-null   Int64              
 11  fstpd30def     518297 non-null   Int64              
 12  obsfpd30       518297 non-null   Int64              
dtypes: Int64(8),

In [6]:
# mobile_num arrives as a nullable integer while `mobile` is a string column, so cast
# mobile_num to str to make the two directly comparable (e.g. with isin) later on.
# Re-running this cell is harmless: the values are already strings after the first run.
df['mobile_num'] = df['mobile_num'].astype(str)

In [7]:
# Number of transaction rows per sector (value_counts sorts by count, largest first).
df['sector'].value_counts()

sector
EMI                       831125
Loans                     149071
Bank                       76768
Prepaid Load               65593
Postpaid Land Lines        12963
Transpo or Toll            11468
Government                 10260
Utility-Water               8992
Internet                    7733
Remittance                  3897
Utility-Power               3036
Credit Card                 2908
Cable TV                    2823
Postpaid Mobile             2634
E-Commerce                  1864
E-Wallet                    1238
Airlines                     983
Gaming                       916
Collections                  498
Retailer Load                474
Insurance                    353
Tuition                      304
Other Prepaid Services       106
Name: count, dtype: int64

In [8]:
# Row-level match rate between the 7-Eleven mobile numbers (mobile_num) and the mobiles
# found in our contact data (mobile).
# NOTE: the printed labels say "lp_id", but the values counted are mobile_num rows; the
# labels are left unchanged so the output stays comparable with earlier runs.

# Build the membership mask once and reuse it, instead of re-scanning every row per count.
mobile_found = df['mobile_num'].isin(df['mobile'])

# Total number of transaction rows
total_lp_id = len(df)

# Rows whose mobile_num is present / absent in the mobile column
matching_count = int(mobile_found.sum())
missing_count = total_lp_id - matching_count

# Share of rows with a match; NaN instead of a ZeroDivisionError when the frame is empty
share_in_mobile = (matching_count / total_lp_id) * 100 if total_lp_id else float('nan')

print(f"Number of lp_id matching with mobile: {matching_count}")
print(f"Number of lp_id missing in mobile: {missing_count}")
print(f"Share of lp_id in mobile: {share_in_mobile:.2f}%")


Number of lp_id matching with mobile: 1157189
Number of lp_id missing in mobile: 38818
Share of lp_id in mobile: 96.75%


In [9]:
# Export the transaction rows whose mobile_num appears in the mobile column.
matched_rows = df.loc[df['mobile_num'].isin(df['mobile'])]
matched_rows.to_csv("Matchingmobilenumberwithoutdatabase.csv", index=False)

In [10]:
# Export the transaction rows whose mobile_num does NOT appear in the mobile column.
unmatched_rows = df.loc[~df['mobile_num'].isin(df['mobile'])]
unmatched_rows.to_csv("Missingmobilenumberfromourdatabasewhenmatchingwith711data.csv", index=False)

In [11]:
# df[~df['mobile_num'].isin(df['mobile'])]

In [12]:
# Match rate at the level of distinct numbers rather than transaction rows.
# NOTE: the printed labels say "lp_id", but the values are distinct mobile_num entries;
# the labels are left unchanged so the output stays comparable with earlier runs.

# Distinct 7-Eleven mobile numbers and distinct mobiles from our contact data
unique_lp_ids = df['mobile_num'].unique()
unique_mobiles = df['mobile'].unique()

# Use a set for membership: `x in ndarray` scans the whole array for every x, which is
# quadratic over the two sets of distinct numbers. A missing `mobile` entry is just
# another set element and never equals a mobile_num string.
known_mobiles = set(unique_mobiles)
matching_count = sum(lp_id in known_mobiles for lp_id in unique_lp_ids)

# Total number of distinct mobile_num values
total_unique_lp_id = len(unique_lp_ids)

# Share of distinct mobile_num found among the distinct mobiles (NaN if there are none)
share_in_mobile = (matching_count / total_unique_lp_id) * 100 if total_unique_lp_id else float('nan')

print(f"Number of unique lp_id matching with unique mobile: {matching_count}")
print(f"Total number of unique lp_id: {total_unique_lp_id}")
print(f"Share of unique lp_id in unique mobile: {share_in_mobile:.2f}%")

Number of unique lp_id matching with unique mobile: 58521
Total number of unique lp_id: 60686
Share of unique lp_id in unique mobile: 96.43%


In [13]:
# Display the DataFrame's column names again.
df.columns

Index(['mobile_num', 'txn_id', 'amount', 'sector', 'merchant_name',
       'txn_datetime', 'store_id', 'customer_id', 'mobile', 'custrank',
       'customerId', 'fstpd30def', 'obsfpd30'],
      dtype='object')

In [19]:

# Sector-level summary: distinct-customer counts and the first-payment default rate.
# One groupby object is shared by the three all-row counts.
sector_groups = df.groupby('sector')
unique_customers_count = sector_groups['customer_id'].nunique()
unique_711mobile_count = sector_groups['mobile_num'].nunique()
# customerId comes from the delinquency join, so it is null for rows without a match;
# nunique() ignores those nulls.
unique_obsfstpd30_customer_count = sector_groups['customerId'].nunique()
# Numerator: distinct customers flagged fstpd30def == 1
unique_fstpd30_customer_count = df[df['fstpd30def']==1].groupby('sector')['customerId'].nunique()

# Combine results into a DataFrame. The 'Sector' column is taken from the index of the
# grouped counts computed above (previously it referenced `sector_default_rate`, which is
# not defined anywhere in this notebook and raised NameError on a fresh kernel).
result_df = pd.DataFrame({
    'Sector': unique_customers_count.index,
    'Unique Customers Count': unique_customers_count,
    'unique 711mobile count': unique_711mobile_count,
    'Observed fstpd30 customer count': unique_obsfstpd30_customer_count,
    'unique_fstpd30_customer_count': unique_fstpd30_customer_count,
})

# Default rate = flagged customers / observed customers, as a percentage.
# Sectors with no flagged customer have a NaN numerator and therefore a NaN rate.
result_df['Default Rate (%)'] = round(result_df['unique_fstpd30_customer_count']/result_df['Observed fstpd30 customer count']*100, 3)

# Sort by Default Rate in descending order
result_df_sorted = result_df.sort_values(by='Default Rate (%)', ascending=False)

print("Sector-wise Default Rate and Unique Customers Count (sorted from highest to lowest default rate):")
# Keep a copy under a dedicated name: result_df / result_df_sorted are reassigned by later cells.
sector_result = result_df_sorted.copy()
sector_result

Sector-wise Default Rate and Unique Customers Count (sorted from highest to lowest default rate):


Unnamed: 0_level_0,Sector,Unique Customers Count,unique 711mobile count,Observed fstpd30 customer count,unique_fstpd30_customer_count,Default Rate (%)
sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Other Prepaid Services,Other Prepaid Services,49,50,27,13,48.148
Tuition,Tuition,119,121,57,23,40.351
Prepaid Load,Prepaid Load,15558,16077,7179,2686,37.415
Utility-Power,Utility-Power,889,923,377,135,35.809
Bank,Bank,4041,4234,2044,700,34.247
Gaming,Gaming,98,100,44,15,34.091
Cable TV,Cable TV,771,798,413,139,33.656
Postpaid Land Lines,Postpaid Land Lines,3273,3373,1628,538,33.047
E-Wallet,E-Wallet,382,393,154,50,32.468
Loans,Loans,15353,15997,7865,2500,31.786


In [20]:
# Preview the first rows of the unsorted summary table currently held in result_df.
result_df.head()

Unnamed: 0_level_0,Sector,Unique Customers Count,unique 711mobile count,Observed fstpd30 customer count,unique_fstpd30_customer_count,Default Rate (%)
sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Airlines,Airlines,625,656,336,87,25.893
Bank,Bank,4041,4234,2044,700,34.247
Cable TV,Cable TV,771,798,413,139,33.656
Collections,Collections,82,84,33,8,24.242
Credit Card,Credit Card,657,681,381,88,23.097


In [24]:
# Sector x merchant summary. The grouped counts are computed inside this cell so that it
# runs top-to-bottom on a fresh kernel: it previously assumed two-level series that are
# only created by a later cell, and get_level_values(1) fails on sector-only series.
merchant_groups = df.groupby(['sector', 'merchant_name'])
unique_customers_count = merchant_groups['customer_id'].nunique()
unique_711mobile_count = merchant_groups['mobile_num'].nunique()
unique_obsfstpd30_customer_count = merchant_groups['customerId'].nunique()
unique_fstpd30_customer_count = df[df['fstpd30def']==1].groupby(['sector', 'merchant_name'])['customerId'].nunique()

# Combine results into a DataFrame
result_df = pd.DataFrame({
    'Sector': unique_customers_count.index.get_level_values(0),
    'Merchant': unique_customers_count.index.get_level_values(1),
    'Unique Customers Count': unique_customers_count,
    'unique 711mobile count': unique_711mobile_count,
    'Observed fstpd30 customer count': unique_obsfstpd30_customer_count,
    'fstpd30 Customer Count': unique_fstpd30_customer_count,
})

# Calculate default rate (NaN where a sector/merchant pair has no flagged customer)
result_df['sector_merchant_default_rate'] = round(result_df['fstpd30 Customer Count'] / result_df['Observed fstpd30 customer count'] * 100, 3)

# Sort by Default Rate in descending order
result_df_sorted = result_df.sort_values(by='sector_merchant_default_rate', ascending=False)

# Show every row of the table; note this changes the display option for the whole session.
pd.set_option('display.max_rows', None)
print("Sector-wise and Merchant-wise Default Rate and Unique Customers Count (sorted from highest to lowest default rate):")
result_df_sorted


Sector-wise and Merchant-wise Default Rate and Unique Customers Count (sorted from highest to lowest default rate):


Unnamed: 0_level_0,Unnamed: 1_level_0,Sector,Merchant,Unique Customers Count,unique 711mobile count,Observed fstpd30 customer count,fstpd30 Customer Count,sector_merchant_default_rate
sector,merchant_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Loans,SURECYCLE FINANCE,Loans,SURECYCLE FINANCE,9,9,2,2.0,100.0
Airlines,PAL (OTP Booking),Airlines,PAL (OTP Booking),4,4,1,1.0,100.0
Credit Card,CHINABANK CREDITCARD,Credit Card,CHINABANK CREDITCARD,3,3,2,2.0,100.0
Utility-Water,CARCAR WATER DISTRICT,Utility-Water,CARCAR WATER DISTRICT,2,2,1,1.0,100.0
Collections,VIVAMAX INC,Collections,VIVAMAX INC,3,3,1,1.0,100.0
Utility-Water,METRO LIPA WATER DISTRICT,Utility-Water,METRO LIPA WATER DISTRICT,6,6,2,2.0,100.0
Prepaid Load,GLOBE GO400,Prepaid Load,GLOBE GO400,5,5,2,2.0,100.0
Collections,JOHNDORF,Collections,JOHNDORF,3,3,1,1.0,100.0
Loans,U PESO LENDING,Loans,U PESO LENDING,1,1,1,1.0,100.0
Loans,JUAN CASH,Loans,JUAN CASH,1,2,1,1.0,100.0


In [25]:

# Sector x merchant summary with the default rate computed up front.
# One groupby object is shared by the three all-row counts.
merchant_groups = df.groupby(['sector', 'merchant_name'])
unique_customers_count = merchant_groups['customer_id'].nunique()
unique_711mobile_count = merchant_groups['mobile_num'].nunique()
unique_obsfstpd30_customer_count = merchant_groups['customerId'].nunique()
unique_fstpd30_customer_count = df[df['fstpd30def']==1].groupby(['sector', 'merchant_name'])['customerId'].nunique()
# Default rate (%) = flagged customers / observed customers. Reuses the two series above
# instead of running the same two groupbys a second time; pairs with no flagged customer
# are absent from the numerator and come out as NaN.
sector_merchant_default_rate = round(unique_fstpd30_customer_count / unique_obsfstpd30_customer_count*100,3)

# Combine results into a DataFrame
result_df = pd.DataFrame({
    'Sector': sector_merchant_default_rate.index.get_level_values(0),
    'Merchant': sector_merchant_default_rate.index.get_level_values(1),
    'sector_merchant_default_rate':sector_merchant_default_rate,
    'Unique Customers Count': unique_customers_count,
    'unique 711mobile count': unique_711mobile_count,
    'Observed fstpd30 customer count': unique_obsfstpd30_customer_count,
    'fstpd30 Customer Count': unique_fstpd30_customer_count,
})

# Sort by Default Rate in descending order
result_df_sorted = result_df.sort_values(by='sector_merchant_default_rate', ascending=False)

# Show every row of the table; note this changes the display option for the whole session.
pd.set_option('display.max_rows', None)
print("Sector-wise and Merchant-wise Default Rate and Unique Customers Count (sorted from highest to lowest default rate):")
# Keep a copy under a dedicated name for the Excel export.
sector_merchant_result = result_df_sorted.copy()
sector_merchant_result

Sector-wise and Merchant-wise Default Rate and Unique Customers Count (sorted from highest to lowest default rate):


Unnamed: 0_level_0,Unnamed: 1_level_0,Sector,Merchant,sector_merchant_default_rate,Unique Customers Count,unique 711mobile count,Observed fstpd30 customer count,fstpd30 Customer Count
sector,merchant_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Loans,SURECYCLE FINANCE,Loans,SURECYCLE FINANCE,100.0,9,9,2,2.0
Airlines,PAL (OTP Booking),Airlines,PAL (OTP Booking),100.0,4,4,1,1.0
Credit Card,CHINABANK CREDITCARD,Credit Card,CHINABANK CREDITCARD,100.0,3,3,2,2.0
Utility-Water,CARCAR WATER DISTRICT,Utility-Water,CARCAR WATER DISTRICT,100.0,2,2,1,1.0
Collections,VIVAMAX INC,Collections,VIVAMAX INC,100.0,3,3,1,1.0
Utility-Water,METRO LIPA WATER DISTRICT,Utility-Water,METRO LIPA WATER DISTRICT,100.0,6,6,2,2.0
Prepaid Load,GLOBE GO400,Prepaid Load,GLOBE GO400,100.0,5,5,2,2.0
Collections,JOHNDORF,Collections,JOHNDORF,100.0,3,3,1,1.0
Loans,U PESO LENDING,Loans,U PESO LENDING,100.0,1,1,1,1.0
Loans,JUAN CASH,Loans,JUAN CASH,100.0,1,2,1,1.0


In [26]:
# Save both summary tables into a single workbook, one sheet per table.
excel_file = r'C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Personal\research\711_Analysis\Data\sector_analysis.xlsx'
sheets = {
    'Sector Result': sector_result,
    'Sector Merchant Result': sector_merchant_result,
}
with pd.ExcelWriter(excel_file) as workbook:
    for sheet_title, table in sheets.items():
        # index=False: the row index is not written to the sheet.
        table.to_excel(workbook, sheet_name=sheet_title, index=False)

print(f"Excel file '{excel_file}' has been created with two sheets.")

Excel file 'C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Personal\research\711_Analysis\Data\sector_analysis.xlsx' has been created with two sheets.
