# <div align="center" style="color: #ff5733;">Daily Credo Lab Data Snapshot</div>

In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)

# Function to get the date range

In [2]:
import datetime
from google.cloud import bigquery

def get_date_range(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + datetime.timedelta(n)

# Function to create the 12th day date and 150th day date

In [3]:
def calculate_dates(onboarding_date):
    # Convert the input date to datetime format
    onboarding_date = pd.to_datetime(onboarding_date)
    
    # Calculate the 120th and 150th days from the onboarding date
    day_120 = onboarding_date + pd.Timedelta(days=120)
    day_150 = onboarding_date + pd.Timedelta(days=150)
    
    return day_120, day_150

# Function to identify the customer and the onboarding date on a particular date

In [4]:
def CustomerIdentify(date):
    sq = f"""select distinct cust_id, created_dt onboardingDate FROM `dl_customers_db_raw.tdbk_customer_mtb` WHERE 1=1 and date(created_dt) = "{date}" and cust_id is not null; """
    df = client.query(sq).to_dataframe()
    return df

In [5]:
dfapp = pd.read_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\SupportingData\PH_TonikBank_applications_20240807.csv")

In [6]:
# Function to run the Credolab Data preparation query

# cusmtb - Customer Basic information from credo lab

In [7]:
sq = """     
with cusmtb as
(select distinct cust_id, date(created_dt) onboardingDate 
, date_add(date(created_dt), INTERVAL 120 day) day_120
, date_add(date(created_dt), INTERVAL 150 day) day_150
FROM `dl_customers_db_raw.tdbk_customer_mtb` 
WHERE 1=1 and date(created_dt) >= '2023-01-01' and cust_id is not null
),
base as 
(select 
b.cust_id ,
        b.onboardingDate,
        b.day_120,
        b.day_150,
        tlcd.kycStatus      ,
        t3.creditScoreUpdated   ,
        t3.fraudScore   ,	
        t3.fraudScoreUpdated    ,
        t3.calculateddate   ,
        t4.run_date ,
        ca.package_name ,
        ca.first_install_time    ,
        t4.GeneralInfo.brand     ,
        t4.Hardware.device__brand   ,
        t4.Hardware.device__manufacturer   ,
        t4.Hardware.device__model,
        t4.GeneralData.telephony_info__network_operator_name,
        t4.GeneralData.telephony_info__network_operator,
        t4.GeneralData.sim_operator_name,
    from cusmtb b
    inner join prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details tlcd on tlcd.custId = b.cust_id  
    inner JOIN `prj-prod-dataplatform.dl_loans_db_raw.tdbk_credolab_track` t3 ON cast(tlcd.credolabRefNumber as string) = cast(t3.refno as string) and date(t3.createdOn) <= b.day_120
    left JOIN `prj-prod-dataplatform.credolab_raw.android_credolab_datasets_struct_columns` t4 ON t3.refno = t4.deviceId and date(run_date) <= b.day_120
    INNER JOIN
    (select deviceId, af.package_name as package_name, af.first_install_time as first_install_time from `prj-prod-dataplatform.credolab_raw.android_credolab_Application`  ,
    unnest(Application) as af) ca ON ca.deviceId = t3.refno and date(run_date) <= b.day_120
),
base2 as
(select distinct cust_id, onboardingDate, day_120, day_150, kycStatus, creditScoreUpdated, fraudScore , fraudScoreUpdated, run_date 
, brand, device__brand, device__manufacturer, device__model
, telephony_info__network_operator_name, telephony_info__network_operator, sim_operator_name 
from base
)
select * from base2;
"""

cusmtb = client.query(sq).to_dataframe(progress_bar_type='tqdm')
print(f"The shape of cusmtb is :\t{cusmtb.shape}")
cusmtb.to_csv(r"cusmtb.csv", index = False)

Job ID 12ad8087-287b-4395-9d36-868513ec1edf successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
The shape of cusmtb is :	(523193, 16)


# cusfirsttimeinstall  - Min first install time

In [8]:
sq = """with cusmtb as
(select distinct cust_id, date(created_dt) onboardingDate 
, date_add(date(created_dt), INTERVAL 120 day) day_120
, date_add(date(created_dt), INTERVAL 150 day) day_150
FROM `dl_customers_db_raw.tdbk_customer_mtb` 
WHERE 1=1 and date(created_dt) >= '2023-01-01' and cust_id is not null
),
base as 
(select 
b.cust_id ,
        b.onboardingDate,
        b.day_120,
        b.day_150,
        tlcd.kycStatus      ,
        t3.creditScoreUpdated   ,
        t3.fraudScore   ,	
        t3.fraudScoreUpdated    ,
        t3.calculateddate   ,
        t4.run_date ,
        ca.package_name ,
        ca.first_install_time    ,
        t4.GeneralInfo.brand     ,
        t4.Hardware.device__brand   ,
        t4.Hardware.device__manufacturer   ,
        t4.Hardware.device__model,
        t4.GeneralData.telephony_info__network_operator_name,
        t4.GeneralData.telephony_info__network_operator,
        t4.GeneralData.sim_operator_name,
    from cusmtb b
    inner join prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details tlcd on tlcd.custId = b.cust_id  
    inner JOIN `prj-prod-dataplatform.dl_loans_db_raw.tdbk_credolab_track` t3 ON cast(tlcd.credolabRefNumber as string) = cast(t3.refno as string) and date(t3.createdOn) <= b.day_120
    left JOIN `prj-prod-dataplatform.credolab_raw.android_credolab_datasets_struct_columns` t4 ON t3.refno = t4.deviceId and date(run_date) <= b.day_120
    INNER JOIN
    (select deviceId, af.package_name as package_name, af.first_install_time as first_install_time from `prj-prod-dataplatform.credolab_raw.android_credolab_Application`  ,
    unnest(Application) as af) ca ON ca.deviceId = t3.refno and date(run_date) <= b.day_120
),
base2 as
(select distinct cust_id, onboardingDate, day_120, day_150, kycStatus, creditScoreUpdated, fraudScore , fraudScoreUpdated, run_date 
, brand, device__brand, device__manufacturer, device__model
, telephony_info__network_operator_name, telephony_info__network_operator, sim_operator_name 
from base
)
-- select * from base2 where cust_id = '2242663';
-- select cust_id, count(cust_id) from base2 group by 1 having count(cust_id) > 1;
----There is no duplicate in base2
, base3 as 
(select cust_id, min(first_install_time) first_install_time from base group by 1)
select * from base3;"""
cusfirsttimeinstall = client.query(sq).to_dataframe(progress_bar_type='tqdm')


Job ID ba47d21e-0bda-4806-951d-aec20070802f successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [9]:
print(f"The shape of cusfirsttimeinstall is :\t{cusfirsttimeinstall.shape}")
cusfirsttimeinstall.to_csv(r"cusfirsttimeinstall.csv", index = False)

The shape of cusfirsttimeinstall is :	(523193, 2)


# results - For data related to package details for customers

I have to run this in bigquery notebook and download the csv file there and then download it back to my laptop for further processing

In [None]:
# sq = """with cusmtb as
# (select distinct cust_id, date(created_dt) onboardingDate 
# , date_add(date(created_dt), INTERVAL 120 day) day_120
# , date_add(date(created_dt), INTERVAL 150 day) day_150
# FROM `dl_customers_db_raw.tdbk_customer_mtb` 
# WHERE 1=1 and date(created_dt) >= '2023-01-01' and date(created_dt) <= '2024-03-31' and cust_id is not null
# ),
# base as 
# (select 
# b.cust_id ,
#         b.onboardingDate,
#         b.day_120,
#         b.day_150,
#         tlcd.kycStatus      ,
#         t3.creditScoreUpdated   ,
#         t3.fraudScore   ,	
#         t3.fraudScoreUpdated    ,
#         t3.calculateddate   ,
#         t4.run_date ,
#         ca.package_name ,
#         ca.first_install_time    ,
#         t4.GeneralInfo.brand     ,
#         t4.Hardware.device__brand   ,
#         t4.Hardware.device__manufacturer   ,
#         t4.Hardware.device__model,
#         t4.GeneralData.telephony_info__network_operator_name,
#         t4.GeneralData.telephony_info__network_operator,
#         t4.GeneralData.sim_operator_name,
#     from cusmtb b
#     inner join prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details tlcd on tlcd.custId = b.cust_id  
#     inner JOIN `prj-prod-dataplatform.dl_loans_db_raw.tdbk_credolab_track` t3 ON cast(tlcd.credolabRefNumber as string) = cast(t3.refno as string) and date(t3.createdOn) <= b.day_120
#     left JOIN `prj-prod-dataplatform.credolab_raw.android_credolab_datasets_struct_columns` t4 ON t3.refno = t4.deviceId and date(run_date) <= b.day_120
#     INNER JOIN
#     (select deviceId, af.package_name as package_name, af.first_install_time as first_install_time from `prj-prod-dataplatform.credolab_raw.android_credolab_Application`  ,
#     unnest(Application) as af) ca ON ca.deviceId = t3.refno and date(run_date) <= b.day_120
# ),
# base2 as
# (select distinct cust_id, onboardingDate, day_120, day_150, kycStatus, creditScoreUpdated, fraudScore , fraudScoreUpdated, run_date 
# , brand, device__brand, device__manufacturer, device__model
# , telephony_info__network_operator_name, telephony_info__network_operator, sim_operator_name 
# from base
# )
# , base3 as 
# (select cust_id, min(first_install_time) first_install_time from base group by 1),
# base4 as 
# (select distinct cust_id, package_name from base )
# select * from base4;
# """
# cuspackage = client.query(sq).to_dataframe(progress_bar_type='tqdm')
# print(f"The shape of cuspackage is :\t{cuspackage.shape}")
# cuspackage.to_csv(r"cuspackage.csv", index = False)

# Read the result csv file

In [10]:
results = pd.read_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\SupportingData\results.csv")

# Read the package information csv file provided by credolab for package categorization

In [11]:
 dfapp = pd.read_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\SupportingData\PH_TonikBank_applications_20240807.csv")

In [12]:
dfapp.head()

Unnamed: 0,Package Name,# of datasets,# of datasets where app is considered as user installed,Category,Rating,Rating Description,Is Paid
0,com.tonik.mobile,685556,685553,finance,rated for 3+,undefined,free
1,android,684929,0,,,,
2,com.android.providers.settings,684922,0,,,,
3,com.android.externalstorage,684920,0,,,,
4,com.android.settings,684920,0,,,,


In [13]:
dfapp['Category'].unique()

array(['finance', nan, 'tools', 'business', 'communication',
       'video players & editors', 'travel & local', 'social',
       'productivity', 'photography', 'auto & vehicles', 'shopping',
       'music & audio', 'entertainment', 'action', 'news & magazines',
       'food & drink', 'lifestyle', 'personalization', 'art & design',
       'maps & navigation', 'adventure', 'weather', 'health & fitness',
       'casual', 'education', 'sports', 'strategy', 'casino', 'puzzle',
       'books & reference', 'word', 'arcade', 'educational', 'medical',
       'card', 'role playing', 'dating', 'comics', 'board', 'music',
       'simulation', 'racing', 'trivia', 'parenting', 'events', 'beauty',
       'house & home', 'libraries & demo', 'undefined'], dtype=object)

In [14]:
import re

def categorize_package(package_name):
    if isinstance(package_name, str):
        pattern = r'linkedin|jobstreet|glint'
        if re.search(pattern, package_name, re.IGNORECASE):
            return 'Job'
        pattern1 = r'office|gdrive|word|powerpoint|excel|'
        if re.search(pattern, package_name, re.IGNORECASE):
            return 'Professional'
    return 'Other'

dfapp['CustomCategory'] = dfapp['Package Name'].apply(categorize_package)
dfapp.head()

Unnamed: 0,Package Name,# of datasets,# of datasets where app is considered as user installed,Category,Rating,Rating Description,Is Paid,CustomCategory
0,com.tonik.mobile,685556,685553,finance,rated for 3+,undefined,free,Other
1,android,684929,0,,,,,Other
2,com.android.providers.settings,684922,0,,,,,Other
3,com.android.externalstorage,684920,0,,,,,Other
4,com.android.settings,684920,0,,,,,Other


In [15]:
dfapp['CombinedCategory'] = np.where((dfapp['CustomCategory'].notna()) & (dfapp['CustomCategory'] != 'Other'),
                                     dfapp['CustomCategory'],
                                     dfapp['Category'])

In [16]:
dfapp.drop(columns=['Category', 'CustomCategory'], inplace=True)
dfapp.rename(columns={'CombinedCategory':'Category'}, inplace = True)
dfapp.head()

Unnamed: 0,Package Name,# of datasets,# of datasets where app is considered as user installed,Rating,Rating Description,Is Paid,Category
0,com.tonik.mobile,685556,685553,rated for 3+,undefined,free,finance
1,android,684929,0,,,,
2,com.android.providers.settings,684922,0,,,,
3,com.android.externalstorage,684920,0,,,,
4,com.android.settings,684920,0,,,,


In [17]:
# For df_risk_table_2
results['package'] = results['package_name'].str.replace(r'[ ._]', '', regex=True)

# For dfapp
dfapp['package'] = dfapp['Package Name'].str.replace(r'[ ._]', '', regex=True)


In [18]:
dfapp.columns

Index(['Package Name', '# of datasets',
       '# of datasets where app is considered as user installed', 'Rating',
       'Rating Description', 'Is Paid', 'Category', 'package'],
      dtype='object')

In [19]:
dfa = dfapp[['package', 'Category', 'Rating','Rating Description', 'Is Paid']].copy()

In [20]:
Credodata = results.merge(dfa, on='package', how = 'outer')

In [21]:
Credodata.head()

Unnamed: 0,cust_id,package_name,package,Category,Rating,Rating Description,Is Paid
0,,,ACENAPK,,,,
1,2201933.0,AM.callernameannouncer.callernamespeaker,AMcallernameannouncercallernamespeaker,,,,
2,,,AO3ArchiveofOurOwnhwawi,,,,
3,,,ARRulerBirdmanStudionet,,,,
4,1907498.0,AcrylicNails.huawei,AcrylicNailshuawei,,,,


# Check the category wise unique count of customer id

In [22]:
Credodata.groupby('Category')['cust_id'].nunique().sort_values(ascending = False)

Category
tools                      301891
communication              282929
finance                    268781
productivity               263906
video players & editors    232980
social                     196271
music & audio              181230
business                   177089
shopping                   174701
photography                166829
entertainment              166793
travel & local             157398
auto & vehicles             92462
lifestyle                   83014
casual                      75665
action                      74058
maps & navigation           72704
personalization             68502
health & fitness            58894
food & drink                58109
news & magazines            53699
books & reference           46150
education                   45502
puzzle                      44238
Job                         38633
arcade                      34101
simulation                  28339
strategy                    24806
adventure                   24251
casin

# Check the rating wise unique count of customer id

In [23]:
Credodata.groupby(['Rating'])['cust_id'].nunique().sort_values(ascending = False)

Rating
rated for 3+     304238
rated for 12+    299217
rated for 18+     92870
rated for 7+      57719
rated for 16+     39868
undefined            33
unrated               3
Name: cust_id, dtype: int64

In [24]:
Credodata.columns

Index(['cust_id', 'package_name', 'package', 'Category', 'Rating',
       'Rating Description', 'Is Paid'],
      dtype='object')

In [25]:
Credodata['Category'] = Credodata['Category'].fillna('Others')

In [26]:
Credodata[Credodata['cust_id'].isnull() == True]

Unnamed: 0,cust_id,package_name,package,Category,Rating,Rating Description,Is Paid
0,,,ACENAPK,Others,,,
2,,,AO3ArchiveofOurOwnhwawi,Others,,,
3,,,ARRulerBirdmanStudionet,Others,,,
9,,,AutoLoanCalculatorPRODPsoftwareorg,Others,,,
12,,,BIMobileWombat,Others,,,
...,...,...,...,...,...,...,...
31758063,,,zygonstreamcomapp,music & audio,rated for 3+,undefined,free
31758064,,,zyxdolindachat,Others,,,
31758065,,,zyxxyballbox,Others,,,
31758067,,,zzmdgclelectroniccigarette,tools,rated for 3+,undefined,free


# Drop all rows where cust_id is null

In [27]:
Credodata = Credodata.dropna(subset=['cust_id'])

In [28]:
Credodata.shape

(31491899, 7)

In [29]:
# Find all rows with duplicates (including the first occurrence)
all_duplicate_rows = Credodata[Credodata.duplicated(keep=False)]

# Display all duplicate rows
all_duplicate_rows.head()

Unnamed: 0,cust_id,package_name,package,Category,Rating,Rating Description,Is Paid
446480,1854806.0,business.ideas,businessideas,business,rated for 3+,undefined,free
446481,1854806.0,business.ideas,businessideas,business,rated for 3+,undefined,free
446482,1959976.0,business.ideas,businessideas,business,rated for 3+,undefined,free
446483,1959976.0,business.ideas,businessideas,business,rated for 3+,undefined,free
463751,2132428.0,cn.oneplus.oemtcma,cnoneplusoemtcma,Others,,,


# Create each row for each unique package found for customer id with the number of unique different packages found in each category

# dfapp1

In [30]:
dfapp1 = Credodata.pivot_table(index = 'cust_id', columns='Category', values='package', aggfunc='nunique').reset_index()
dfapp1.head()

Category,cust_id,Job,Others,action,adventure,arcade,art & design,auto & vehicles,beauty,board,books & reference,business,card,casino,casual,comics,communication,dating,education,educational,entertainment,events,finance,food & drink,health & fitness,house & home,libraries & demo,lifestyle,maps & navigation,medical,music,music & audio,news & magazines,parenting,personalization,photography,productivity,puzzle,racing,role playing,shopping,simulation,social,sports,strategy,tools,travel & local,trivia,undefined,video players & editors,weather,word
0,1845909.0,,73.0,,,,,1.0,,,1.0,1.0,,1.0,1.0,,4.0,,1.0,,1.0,,7.0,,1.0,,,1.0,,,,3.0,,,1.0,1.0,3.0,,,,1.0,,1.0,,,10.0,2.0,,,2.0,,
1,1845911.0,,58.0,,,,,,,,1.0,,,,,,3.0,,,,3.0,,2.0,1.0,,,,,,,,1.0,,,,,3.0,,,,,,1.0,,,6.0,,,,1.0,,
2,1845913.0,,41.0,,,,,,,,,1.0,,,,,1.0,,,,,,5.0,,,,,1.0,,,,,1.0,,,1.0,2.0,,,,,,,,,7.0,,,,2.0,,
3,1845915.0,,45.0,,,,,,,,1.0,,,,,,2.0,,,,,,2.0,,,,,,,,,,,,,,1.0,,,,1.0,,1.0,,,6.0,1.0,,,4.0,,
4,1845916.0,,65.0,1.0,,,,,,,,1.0,,,1.0,,2.0,,,,1.0,,2.0,,,,,,,,,2.0,,,,,,,,,1.0,,,,,1.0,,,,2.0,,


In [31]:
dfapp1 = dfapp1.fillna(0.0)

In [32]:
dfapp1['cust_id'] = dfapp1['cust_id'].astype(str)

In [33]:
dfapp1.columns

Index(['cust_id', 'Job', 'Others', 'action', 'adventure', 'arcade',
       'art & design', 'auto & vehicles', 'beauty', 'board',
       'books & reference', 'business', 'card', 'casino', 'casual', 'comics',
       'communication', 'dating', 'education', 'educational', 'entertainment',
       'events', 'finance', 'food & drink', 'health & fitness', 'house & home',
       'libraries & demo', 'lifestyle', 'maps & navigation', 'medical',
       'music', 'music & audio', 'news & magazines', 'parenting',
       'personalization', 'photography', 'productivity', 'puzzle', 'racing',
       'role playing', 'shopping', 'simulation', 'social', 'sports',
       'strategy', 'tools', 'travel & local', 'trivia', 'undefined',
       'video players & editors', 'weather', 'word'],
      dtype='object', name='Category')

In [34]:
dfapp1 = dfapp1[['cust_id', 'Others', 'action', 'adventure', 'arcade', 'art & design',
       'auto & vehicles', 'beauty', 'board', 'books & reference', 'business',
       'card', 'casino', 'casual', 'comics', 'communication', 'dating',
       'education', 'educational', 'entertainment', 'finance', 'food & drink',
       'health & fitness', 'lifestyle', 'maps & navigation', 'medical',
       'music', 'music & audio', 'news & magazines', 'personalization',
       'photography', 'productivity', 'puzzle', 'racing', 'role playing',
       'shopping', 'simulation', 'social', 'sports', 'strategy', 'tools',
       'travel & local', 'trivia', 'video players & editors', 'weather',
       'word']].copy()

# dfrating

In [35]:
dfrating = Credodata.pivot_table(index = 'cust_id', columns='Rating', values='package', aggfunc='nunique').reset_index()
dfrating.head()

Rating,cust_id,rated for 12+,rated for 16+,rated for 18+,rated for 3+,rated for 7+,undefined,unrated
0,1845909.0,9.0,,,35.0,,,
1,1845911.0,4.0,,,18.0,,,
2,1845913.0,1.0,,1.0,19.0,,,
3,1845915.0,7.0,,,12.0,,,
4,1845916.0,6.0,1.0,,7.0,,,


In [36]:
dfrating1 = dfrating[['cust_id', 'rated for 12+', 'rated for 16+', 'rated for 18+']].copy()

In [37]:
dfrating1['cust_id'] = dfrating1['cust_id'].astype(str)

# dfpaid

In [38]:
dfpaid = Credodata.pivot_table(index = 'cust_id', columns='Is Paid', values='package', aggfunc='nunique').reset_index()
dfpaid = dfpaid.fillna(0)

In [39]:
dfpaid['cust_id'] = dfpaid['cust_id'].astype(str)

In [40]:
print(f"The shape of cusmtb is :\t {cusmtb.shape}")
print(f"The shape of cusfirsttimeinstall is:\t {cusfirsttimeinstall.shape}")
print(f"The shape of dfapp1 is:\t{dfapp1.shape}")
print(f"The shape of dfrating1 is:\t {dfrating1.shape}")
print(f"The shape of dfpaid is:\t{dfpaid.shape}")

The shape of cusmtb is :	 (523193, 16)
The shape of cusfirsttimeinstall is:	 (523193, 2)
The shape of dfapp1 is:	(304331, 46)
The shape of dfrating1 is:	 (304289, 4)
The shape of dfpaid is:	(304289, 4)


In [42]:
finaldf  = cusmtb.merge(cusfirsttimeinstall, on='cust_id', how = 'left')
finaldf = finaldf.merge(dfapp1, on='cust_id', how = 'left')
finaldf = finaldf.merge(dfrating1, on='cust_id', how = 'left')
finaldf = finaldf.merge(dfpaid , on='cust_id', how ='left')

In [43]:
finaldf.head()

Unnamed: 0,cust_id,onboardingDate,day_120,day_150,kycStatus,creditScoreUpdated,fraudScore,fraudScoreUpdated,run_date,brand,device__brand,device__manufacturer,device__model,telephony_info__network_operator_name,telephony_info__network_operator,sim_operator_name,first_install_time,Others,action,adventure,arcade,art & design,auto & vehicles,beauty,board,books & reference,business,card,casino,casual,comics,communication,dating,education,educational,entertainment,finance,food & drink,health & fitness,lifestyle,maps & navigation,medical,music,music & audio,news & magazines,personalization,photography,productivity,puzzle,racing,role playing,shopping,simulation,social,sports,strategy,tools,travel & local,trivia,video players & editors,weather,word,rated for 12+,rated for 16+,rated for 18+,free,pay,undefined
0,2404576,2024-02-18,2024-06-17,2024-07-17,SKYC,,944.0,,2024-02-18,samsung,samsung,,SM-A235F,Zain KW,41902,SMART,1970-01-01 08:00:00+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2107532,2023-06-25,2023-10-23,2023-11-22,BKYC,,944.0,,2023-06-25,samsung,samsung,,SM-A326B,etisalat,42402,Etisalat,1970-01-01 08:00:00+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2088176,2023-06-11,2023-10-09,2023-11-08,SKYC,Y,944.0,Y,2023-06-11,samsung,samsung,,SM-A207F,Smart,51503,,2008-12-31 23:00:00+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1935020,2023-03-08,2023-07-06,2023-08-05,SKYC,Y,944.0,Y,2023-03-08,xiaomi,xiaomi,,Redmi Note 8,515 66,51566,DITO,1989-04-01 11:53:20+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1913762,2023-02-23,2023-06-23,2023-07-23,SKYC,Y,824.0,Y,2023-03-26,samsung,samsung,,SM-M625F,UMNIAH,41603,GLOBE,1970-01-01 08:00:00+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Select only SKYC customer

In [44]:
finaldf['kycStatus'].value_counts()

kycStatus
SKYC    489036
BKYC     34156
FKYC         1
Name: count, dtype: int64

In [46]:
finaldf = finaldf[finaldf['kycStatus'] != 'BKYC']
finaldf['kycStatus'].value_counts()

kycStatus
SKYC    489036
FKYC         1
Name: count, dtype: int64

In [47]:
finaldf.to_csv("Credolabdata20240911.csv", index = False)

# Rename column name

In [49]:
# Read the Excel file (replace 'your_file.xlsx' with the actual file name)
excel_data = pd.read_csv(r'C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\SupportingData\Credolabdatacolumnrename.csv')
# excel_data.columns
# Create a dictionary to map current column names to updated column names
rename_dict = dict(zip(excel_data['Columnname'], excel_data['Updated_Name']))

rename_dict

{'cust_id': 'cust_id',
 'onboardingDate': 'onboardingDate',
 'day_120': 'cl_day_120',
 'day_150': 'cl_day_150',
 'kycStatus': 'cl_kycStatus',
 'creditScoreUpdated': 'cl_creditScoreUpdated',
 'fraudScore': 'cl_fraudScore',
 'fraudScoreUpdated': 'cl_fraudScoreUpdated',
 'run_date': 'cl_run_date',
 'brand': 'cl_brand',
 'device__brand': 'cl_device__brand',
 'device__manufacturer': 'cl_device__manufacturer',
 'device__model': 'cl_device__model',
 'telephony_info__network_operator_name': 'cl_telephony_info__network_operator_name',
 'telephony_info__network_operator': 'cl_telephony_info__network_operator',
 'sim_operator_name': 'cl_sim_operator_name',
 'first_install_time': 'cl_first_install_time',
 'Others': 'cl_pkg_Others',
 'action': 'cl_pkg_action',
 'adventure': 'cl_pkg_adventure',
 'arcade': 'cl_pkg_arcade',
 'art & design': 'cl_pkg_art _ design',
 'auto & vehicles': 'cl_pkg_auto _ vehicles',
 'beauty': 'cl_pkg_beauty',
 'board': 'cl_pkg_board',
 'books & reference': 'cl_pkg_books _ re

# Rename the dataframe column

In [50]:
# Rename columns in the DataFrame 'res'
finaldf.rename(columns=rename_dict, inplace=True)
finaldf.columns

Index(['cust_id', 'onboardingDate', 'cl_day_120', 'cl_day_150', 'cl_kycStatus',
       'cl_creditScoreUpdated', 'cl_fraudScore', 'cl_fraudScoreUpdated',
       'cl_run_date', 'cl_brand', 'cl_device__brand',
       'cl_device__manufacturer', 'cl_device__model',
       'cl_telephony_info__network_operator_name',
       'cl_telephony_info__network_operator', 'cl_sim_operator_name',
       'cl_first_install_time', 'cl_pkg_Others', 'cl_pkg_action',
       'cl_pkg_adventure', 'cl_pkg_arcade', 'cl_pkg_art _ design',
       'cl_pkg_auto _ vehicles', 'cl_pkg_beauty', 'cl_pkg_board',
       'cl_pkg_books _ reference', 'cl_pkg_business', 'cl_pkg_card',
       'cl_pkg_casino', 'cl_pkg_casual', 'cl_pkg_comics',
       'cl_pkg_communication', 'cl_pkg_dating', 'cl_pkg_education',
       'cl_pkg_educational', 'cl_pkg_entertainment', 'cl_pkg_finance',
       'cl_pkg_food _ drink', 'cl_pkg_health _ fitness', 'cl_pkg_lifestyle',
       'cl_pkg_maps _ navigation', 'cl_pkg_medical', 'cl_pkg_music',
       

In [51]:
finaldf.to_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\FinalData\Credolabdatawithrenamedcolumns20240911.csv", index = False)

In [53]:
finaldf[finaldf['cust_id'].isnull()]

Unnamed: 0,cust_id,onboardingDate,cl_day_120,cl_day_150,cl_kycStatus,cl_creditScoreUpdated,cl_fraudScore,cl_fraudScoreUpdated,cl_run_date,cl_brand,cl_device__brand,cl_device__manufacturer,cl_device__model,cl_telephony_info__network_operator_name,cl_telephony_info__network_operator,cl_sim_operator_name,cl_first_install_time,cl_pkg_Others,cl_pkg_action,cl_pkg_adventure,cl_pkg_arcade,cl_pkg_art _ design,cl_pkg_auto _ vehicles,cl_pkg_beauty,cl_pkg_board,cl_pkg_books _ reference,cl_pkg_business,cl_pkg_card,cl_pkg_casino,cl_pkg_casual,cl_pkg_comics,cl_pkg_communication,cl_pkg_dating,cl_pkg_education,cl_pkg_educational,cl_pkg_entertainment,cl_pkg_finance,cl_pkg_food _ drink,cl_pkg_health _ fitness,cl_pkg_lifestyle,cl_pkg_maps _ navigation,cl_pkg_medical,cl_pkg_music,cl_pkg_music _ audio,cl_pkg_news _ magazines,cl_pkg_personalization,cl_pkg_photography,cl_pkg_productivity,cl_pkg_puzzle,cl_pkg_racing,cl_pkg_roleplaying,cl_pkg_shopping,cl_pkg_simulation,cl_pkg_social,cl_pkg_sports,cl_pkg_strategy,cl_pkg_tools,cl_pkg_travel _ local,cl_pkg_trivia,cl_pkg_videoplayers _ editors,cl_pkg_weather,cl_pkg_word,cl_pkg_rated for 12_plus,cl_pkg_rated for 16_plus,cl_pkg_rated for 18_plus,cl_pkg_free,cl_pkg_pay,cl_pkg_undefined
