# <div align="center" style="color: #ff5733;">App Categorization</div>

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
# Fallback credentials file on the original author's Windows machine.
# NOTE(review): this is a user-specific absolute path; prefer configuring
# GOOGLE_APPLICATION_CREDENTIALS outside the notebook so the notebook stays portable.
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
# Only point the environment at the fallback file when nothing is configured yet and
# the file actually exists. An already-set GOOGLE_APPLICATION_CREDENTIALS is respected,
# and a missing file no longer leaves the variable pointing at a non-existent path.
if not os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') and os.path.isfile(path):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
# BigQuery client bound to project 'prj-prod-dataplatform'; used to run the query
# defined later in the notebook.
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
# Example: pd.set_option('display.max_columns', None)

In [18]:
# BigQuery SQL: number of distinct customers per app package name.
#
# CTEs, in order:
#   b    - loanAccountNumber, min_inst_def30 and obs_min_inst_def30 from
#          loan_deliquency_data, restricted to obs_min_inst_def30 >= 2.
#   lmt  - loan_master_table LEFT JOINed to b on loanAccountNumber, adding
#          FSPD30_loancnt (the account number when min_inst_def30 is 1 or 2,
#          otherwise NULL) and obsFSPD30_loancnt (the account number when a b row
#          matched, otherwise NULL).
#   base - DISTINCT rows of lmt joined to the digital loan application table, the
#          credolab track table, the credolab device struct table and the unnested
#          `Application` array (package_name, install/update times, version), plus
#          the PH_Tonikbank_Application_Temp lookup (ptat) matched on package name
#          after removing spaces, dots and underscores. Only applications with
#          startApplyDateTime on or after 2023-07-01 are kept.
#
# Final SELECT: one row per package_name with
#   cntcust   - distinct customerId values in base for that package,
#   totalcust - distinct customerId values in base overall (same on every row),
# ordered by cntcust descending.
#
# NOTE(review): the INNER JOIN to core_raw.loan_accounts (alias `loan`) contributes no
# selected columns; it only restricts/multiplies rows by customerId - confirm intended.
# NOTE(review): the final SELECT reads only package_name and customerId from base; the
# other base columns are presumably kept for reuse in other analyses - confirm.
sq = """WITH
  b AS (
  SELECT
    loanAccountNumber,
    min_inst_def30,
    obs_min_inst_def30
  FROM
    prj-prod-dataplatform.risk_credit_mis.loan_deliquency_data
  WHERE
    obs_min_inst_def30 >= 2),
lmt as
(SELECT
  lmt.loanAccountNumber,
  lmt.customerId,
  lmt.digitalLoanAccountId,
  lmt.tsa_onboarding_time,
  lmt.startApplyDateTime,
  lmt.termsAndConditionsSubmitDateTime,
  lmt.isTermsAndConditionsAccepted,
  lmt.disbursementDateTime,
  lmt.flagDisbursement,
  lmt.loanPaidStatus,
  case when b.obs_min_inst_def30 >=2 and b.min_inst_def30 in (1,2) then lmt.loanAccountNumber end FSPD30_loancnt,
  case when b.obs_min_inst_def30 >=2 then lmt.loanAccountNumber end obsFSPD30_loancnt
FROM
  `risk_credit_mis.loan_master_table` lmt
left JOIN
  b
ON
  lmt.loanAccountNumber = b.loanAccountNumber 
),
base as
(select 
distinct
  lmt.customerId,
  lmt.digitalLoanAccountId,
  lmt.loanAccountNumber,
  lmt.tsa_onboarding_time,
  lmt.startApplyDateTime,
  lmt.termsAndConditionsSubmitDateTime,
  lmt.isTermsAndConditionsAccepted,
  lmt.disbursementDateTime,
  lmt.flagDisbursement,
  lmt.loanPaidStatus,
  t3.creditScoreUpdated   ,
  t3.fraudScore   ,	
  t3.fraudScoreUpdated    ,
  t3.calculateddate   ,
  t4.run_date ,
  ca.package_name ,
  ca.first_install_time    ,
  ca.last_update_time      ,
  ca.version_name, 
  ca.version_code,
  t4.GeneralInfo.brand     ,
  t4.Hardware.device__brand   ,
  t4.Hardware.device__manufacturer   ,
  t4.Hardware.device__model,
  t4.GeneralData.telephony_info__network_operator_name,
  t4.GeneralData.telephony_info__network_operator,
  t4.GeneralData.sim_operator_name,
  ptat.Category,
  -- ptat.Rating,
  case when ptat.Rating = 'rated for 3+' then 1 else 0 end rated_for_3_plus,
  case when ptat.Rating = 'rated for 7+' then 1 else 0 end rated_for_7_plus,
  case when ptat.Rating = 'rated for 12+' then 1 else 0 end rated_for_12_plus,
  case when ptat.Rating = 'rated for 16+' then 1 else 0 end rated_for_16_plus,
  case when ptat.Rating = 'rated for 18+' then 1 else 0 end rated_for_18_plus,
  case when ptat.Rating = 'undefined' then 1 else 0 end undefined,
  case when ptat.Rating = 'unrated' then 1 else 0 end unrated,
  case when ptat.Rating is null then 1 else 0 end Rating_Not_Available,
  ptat.Is_Paid,

  lmt.FSPD30_loancnt,     ---- FSPD30 = 1 when this value is not null(provided this as there were be duplicate rows in this dataset because of package name)
  lmt.obsFSPD30_loancnt   ---- obsFSPD30 = 1 when this value is not null (provided this as there were be duplicate rows in this dataset because of package name)
from lmt
LEFT JOIN
`prj-prod-dataplatform.dl_loans_db_raw.tdbk_digital_loan_application` t2
ON lmt.digitalLoanAccountId = t2.digitalLoanAccountId
LEFT JOIN
`prj-prod-dataplatform.dl_loans_db_raw.tdbk_credolab_track` t3
ON t2.credolabRefNumber = t3.refno
LEFT JOIN
`prj-prod-dataplatform.credolab_raw.android_credolab_datasets_struct_columns` t4
ON t3.refno = t4.deviceId
inner join
`prj-prod-dataplatform.core_raw.loan_accounts` loan
on loan.CUSTOMERID = lmt.customerId
 INNER JOIN
(select deviceId, af.package_name as package_name, af.first_install_time as first_install_time , af.last_update_time as last_update_time 
, version_name, version_code
from `prj-prod-dataplatform.credolab_raw.android_credolab_Application`  ,
unnest(Application) as af) ca
ON ca.deviceId = t3.refno
LEFT JOIN prj-prod-dataplatform.dap_ds_poweruser_playground.PH_Tonikbank_Application_Temp ptat
ON REGEXP_REPLACE(ca.package_name, r'[ ._]', '') = REGEXP_REPLACE(ptat.Package_Name, r'[ ._]', '')
where date(lmt.startApplyDateTime) >='2023-07-01'   ---- Please change the date as per your requirement. This is Loan Application Apply Date
-- and lmt.FSPD30_loancnt is not null
)
select package_name, count(distinct customerId) cntcust 
, (select count(distinct customerId) from base) totalcust
from base group by 1 order by 2 desc

;"""

In [19]:
# Submit the SQL to BigQuery, then pull the complete result set into a DataFrame,
# showing tqdm progress bars while the job runs and the rows download.
query_job = client.query(sq)
dfpackages = query_job.to_dataframe(progress_bar_type='tqdm')

Job ID 295dda74-ad15-419b-ae59-cdd741808518 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [21]:
# Share of all customers (in percent, 4 decimal places) that have each package.
dfpackages['share'] = (
    dfpackages['cntcust']
    .div(dfpackages['totalcust'])
    .mul(100)
    .round(4)
)

In [22]:
# Display the packages ranked from most to fewest customers (returns a sorted copy;
# dfpackages itself is left in its original order).
dfpackages.sort_values('cntcust', ascending=False)

Unnamed: 0,package_name,cntcust,totalcust,share
0,com.tonik.mobile,117181,117181,100.0
1,android,117020,117181,99.8626
7,com.android.proxyhandler,117019,117181,99.8618
6,com.android.settings,117019,117181,99.8618
17,com.android.sharedstoragebackup,117019,117181,99.8618
...,...,...,...,...
137174,org.chromium.webapk.a671e5db0d16b219b_v2,1,117181,0.0009
137175,org.chromium.webapk.ab110c25b17cd5f0e_v2,1,117181,0.0009
137176,com.lightcurvesoftware.filipinotoenglishtransl...,1,117181,0.0009
137177,net.eurekare.user1.e_care,1,117181,0.0009


In [23]:
# Persist the per-package counts to a CSV in the working directory, without the
# DataFrame index column.
dfpackages.to_csv(
    path_or_buf="Distinctpackagenameavailableincredolabdata.csv",
    index=False,
)

In [24]:
# Number of distinct package names in the result (missing values are not counted).
dfpackages['package_name'].nunique(dropna=True)

137202