In [1]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Set options or configurations as needed
pd.set_option('display.max_columns', None)

# Read the customer id from Raj CSV file

In [2]:
rajdf = pd.read_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\customerid_onboarded_1stJan2024.csv")

In [3]:
a = tuple(rajdf['customer_id'])
a[0:5]

(2355677, 2355517, 2355698, 2355673, 2355268)

In [4]:
rajdf['customer_id'] = rajdf['customer_id'].astype(str)

In [5]:
b = tuple(rajdf['customer_id'])
b[0:5]

('2355677', '2355517', '2355698', '2355673', '2355268')

# Education Type

In [6]:
sq = f"""
with
educate as 
(select distinct edu.digitalLoanAccountId, edu.education_id, edu1.description
from `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_purpose` edu
inner join (select id, description from dl_loans_db_raw.tdbk_loan_lov_mtb where module = 'Education') edu1 on edu.education_id = edu1.id
),
educate2 as 
(select *, row_number() over(partition by digitalLoanAccountId order by education_id desc) rnk from educate),
educate3 as 
(select * from educate2 where rnk = 1)
select lmt.customerId,
educate3.education_id,
educate3.description Education_Type, 
educate3.rnk educationrnk
from educate3 
inner join prj-prod-dataplatform.risk_credit_mis.loan_master_table lmt on lmt.digitalLoanAccountId = educate3.digitalLoanAccountId
where lmt.customerId in {a}
"""
educationdf = client.query(sq).to_dataframe(progress_bar_type='tqdm')
educationdf


Job ID ac2b6faf-c7b1-4c74-9914-a2dc26c13aa4 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customerId,education_id,Education_Type,educationrnk
0,2355259,14,College Graduate,1
1,2355229,14,College Graduate,1
2,2355604,11,High School Graduate,1
3,2355554,13,College Undergraduate,1
4,2355712,14,College Graduate,1
...,...,...,...,...
91,2355443,11,High School Graduate,1
92,2355454,14,College Graduate,1
93,2355293,14,College Graduate,1
94,2355351,13,College Undergraduate,1


# Telco Provider

In [7]:
sq = f"""
WITH ac_created AS 
(
  ## Base data of onboarded users
  SELECT DISTINCT 
  product,
  CASE 
    WHEN product = 'LOAN1.0' THEN 'Borrow'
    WHEN product = 'TSA1.0' THEN 'Save'
    WHEN product = 'TSAE1.0' THEN 'Explore'
    ELSE product END type,
  cust_id,
  user_id,
  device_id,
  created_dt,
  gender,
  mobile_no,
  birthplace,
  FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb
  WHERE 1=1
  AND cast(cust_id as numeric) in {a}
  AND created_dt >= '2023-06-01'
  ORDER BY 4
)
select 
    ac.cust_id,
    ac.user_id,
    ac.device_id,
    ac.created_dt,
    ac.gender,
    ac.birthplace,
    ac.mobile_no,
    ac.product,
    ac.type,
    t4.GeneralData.telephony_info__network_operator_name telcoProvider
from ac_created ac
inner join prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details tlcd on tlcd.custId = ac.cust_id  
inner JOIN `prj-prod-dataplatform.dl_loans_db_raw.tdbk_credolab_track` t3 ON cast(tlcd.credolabRefNumber as string) = cast(t3.refno as string)
left JOIN `prj-prod-dataplatform.credolab_raw.android_credolab_datasets_struct_columns` t4 ON t3.refno = t4.deviceId
where cast(ac.cust_id as numeric) in {a}
"""
clientdemodf1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')

clientdemodf1.head()



Job ID 0e293eae-473d-4775-8a68-6cfc08d88138 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,cust_id,user_id,device_id,created_dt,gender,birthplace,mobile_no,product,type,telcoProvider
0,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE
1,2355563,c8fb5e994d6440439f55cee2572eb499,53ab4af425194c0f84206373fff6a72ellIf1SB5,2024-01-01 09:40:26+00:00,F,QUEZON PROVINCE,639051919921,LOAN1.0,Borrow,SMART
2,2355344,51ccb77b9d35417b8d2f661be5ffd6b9,a248cce6047e4cdc84b726411b437d5deLLkqpJx,2024-01-01 05:10:06+00:00,M,AKLAN,639852820980,LOAN1.0,Borrow,TNT
3,2355333,a969317f7fa443dfb2385310a0d23ee4,15a005a97e084320ad2e707e83b2cc62jDaAlyG8,2024-01-01 04:58:36+00:00,M,Zamboanga Del Sur,639301120138,LOAN1.0,Borrow,GLOBE
4,2355487,af40aaebdd97435598b3007805e01800,ef306eef6a52432e817b8e1035662b0ebPUPc1YM,2024-01-01 08:18:08+00:00,F,Manila,639531327834,TSAE1.0,Explore,TM


# Merge Client Demo 1 with educationdf 

In [8]:
clientdemodf1['cust_id'] = clientdemodf1['cust_id'].astype(np.int64)

In [9]:
df = clientdemodf1.merge(educationdf, left_on='cust_id', right_on='customerId', how = 'left')
df.drop(columns=['customerId', 'educationrnk'], inplace = True)
df.head()

Unnamed: 0,cust_id,user_id,device_id,created_dt,gender,birthplace,mobile_no,product,type,telcoProvider,education_id,Education_Type
0,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate
1,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate
2,2355563,c8fb5e994d6440439f55cee2572eb499,53ab4af425194c0f84206373fff6a72ellIf1SB5,2024-01-01 09:40:26+00:00,F,QUEZON PROVINCE,639051919921,LOAN1.0,Borrow,SMART,11,High School Graduate
3,2355344,51ccb77b9d35417b8d2f661be5ffd6b9,a248cce6047e4cdc84b726411b437d5deLLkqpJx,2024-01-01 05:10:06+00:00,M,AKLAN,639852820980,LOAN1.0,Borrow,TNT,11,High School Graduate
4,2355333,a969317f7fa443dfb2385310a0d23ee4,15a005a97e084320ad2e707e83b2cc62jDaAlyG8,2024-01-01 04:58:36+00:00,M,Zamboanga Del Sur,639301120138,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate


# Customer Details

In [10]:
sq = f"""    
SELECT 
  custid, dateOfBirth, 
  signUpAccNo, 
  kycStatus,
  addressline1, 
  city, 
  province, 
  barangay, 
  provinceCode, 
  postalcode, 
  mailingCity, 
  mailingPostalCode, 
  mailingProvince,
  natureofwork, 
  employmentstatus, 
  mobileOs, 
  docType, 
  onboardingDate, 
  created_dt, 
  ABS(TIMESTAMP_DIFF(
    PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%E6S', onboardingDate),
    CAST(created_dt AS TIMESTAMP),
    MINUTE
  )) AS Minutestakentoonboard,
  companyName, 
  industry, 
  device_dtl,
FROM 
  `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details`
WHERE 
  CAST(custId AS NUMERIC) IN {a}
"""
cddf = client.query(sq).to_dataframe(progress_bar_type='tqdm')
cddf.head()


Job ID 8183d1c1-2f6d-4781-a29c-c021afeb28df successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,custid,dateOfBirth,signUpAccNo,kycStatus,addressline1,city,province,barangay,provinceCode,postalcode,mailingCity,mailingPostalCode,mailingProvince,natureofwork,employmentstatus,mobileOs,docType,onboardingDate,created_dt,Minutestakentoonboard,companyName,industry,device_dtl
0,2355694,21 Apr 1979,60823556940001,SKYC,UNIT 3339 TOWER 2 FAME RESIDENCES EDSA AND MAY...,CITY OF MANDALUYONG,METRO MANILA,HIGHWAY HILLS,,1550,,,,28,1,,Philippines - ePassport,2024-01-01 14:21:34.000000,2024-01-14 07:16:46,18295,,38,
1,2355664,07 Feb 2001,60823556640005,SKYC,480 P Narciso Street,CITY OF SAN JUAN,METRO MANILA,CORAZON DE JESUS,,1500,,,,39,1,IOS,Philippines - ePassport,2024-01-01 12:55:41.000000,2024-04-22 09:32:31,161076,FCB CB INC.,37,IOS-17.4.1
2,2355677,28 Dec 1981,60823556770001,SKYC,PUROK 2AAMPAYON BUTUAN CITY AGUSAN DEL NORTE P...,BUTUAN CITY,Agusan del Norte,AMPAYON,,8600,CITY OF PASIG,1600.0,METRO MANILA,28,1,Android,Philippines - UMID Card,2024-01-01 13:41:54.000000,2024-01-01 13:49:20,7,"National Economic & Development Authority""",2,
3,2355427,19 Aug 2000,60823554270004,SKYC,DOUBLE R RESORT SAN MIGUEL CITY OF TARLAC T...,SAN MIGUEL,Tarlac,SAN MIGUEL,,2301,TARLAC CITY,2300.0,Tarlac,6,1,Android,Philippines - Id Card (2020),2024-01-01 06:46:10.000000,2024-01-01 06:48:35,2,Cloudworkers,38,
4,2355776,15 Aug 1996,60823557760002,SKYC,BLK 32 LOT 2 CELINA HOMES 2 BRGY 168 CALOOCAN...,CITY OF CALOOCAN,METRO MANILA,KAYBIGA DEPARO,,1420,CITY OF CALOOCAN,1420.0,METRO MANILA,41,1,,Philippines - UMID Card (2016),2024-01-01 22:10:05.000000,2024-01-01 22:19:37,9,,7,


In [11]:
cddf['custid'] = cddf['custid'].astype(np.int64)
cddf.rename(columns={'custid':'cust_id'}, inplace=True)
cddf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   cust_id                84 non-null     int64         
 1   dateOfBirth            84 non-null     object        
 2   signUpAccNo            80 non-null     object        
 3   kycStatus              84 non-null     object        
 4   addressline1           84 non-null     object        
 5   city                   84 non-null     object        
 6   province               84 non-null     object        
 7   barangay               84 non-null     object        
 8   provinceCode           0 non-null      object        
 9   postalcode             84 non-null     object        
 10  mailingCity            6 non-null      object        
 11  mailingPostalCode      6 non-null      object        
 12  mailingProvince        6 non-null      object        
 13  natureo

In [12]:
df = df.merge(cddf, on='cust_id', how = 'left')
df.columns

Index(['cust_id', 'user_id', 'device_id', 'created_dt_x', 'gender',
       'birthplace', 'mobile_no', 'product', 'type', 'telcoProvider',
       'education_id', 'Education_Type', 'dateOfBirth', 'signUpAccNo',
       'kycStatus', 'addressline1', 'city', 'province', 'barangay',
       'provinceCode', 'postalcode', 'mailingCity', 'mailingPostalCode',
       'mailingProvince', 'natureofwork', 'employmentstatus', 'mobileOs',
       'docType', 'onboardingDate', 'created_dt_y', 'Minutestakentoonboard',
       'companyName', 'industry', 'device_dtl'],
      dtype='object')

In [13]:
df.drop(columns=['created_dt_y'], inplace=True)
df.rename(columns={'created_dt_x':'created_dt'}, inplace = True)


![image.png](attachment:image.png)

In [14]:
df.columns

Index(['cust_id', 'user_id', 'device_id', 'created_dt', 'gender', 'birthplace',
       'mobile_no', 'product', 'type', 'telcoProvider', 'education_id',
       'Education_Type', 'dateOfBirth', 'signUpAccNo', 'kycStatus',
       'addressline1', 'city', 'province', 'barangay', 'provinceCode',
       'postalcode', 'mailingCity', 'mailingPostalCode', 'mailingProvince',
       'natureofwork', 'employmentstatus', 'mobileOs', 'docType',
       'onboardingDate', 'Minutestakentoonboard', 'companyName', 'industry',
       'device_dtl'],
      dtype='object')

In [15]:
%%writefile age_calculator.py
from datetime import datetime
import pandas as pd

def calculate_age_in_days(date_of_birth):
    # Parse the date of birth
    dob = datetime.strptime(date_of_birth, "%d %b %Y")
    
    # Get the current date
    current_date = datetime.now()
    
    # Calculate the difference
    age_days = (current_date - dob).days
    
    return age_days

def add_age_column(df, date_column='dateOfBirth', new_column='AgeInDays'):
    # Apply the calculate_age_in_days function to the date column
    df[new_column] = df[date_column].apply(calculate_age_in_days)
    return df

Overwriting age_calculator.py


In [16]:
from age_calculator import calculate_age_in_days

# Add AgeInYears column (approximate)
df['Age'] = df['dateOfBirth'].apply(calculate_age_in_days)
df['Age'] = df['Age'] //365

# Region

In [17]:
sq = f"""
Select distinct 
  a.user_id,
  cast(a.cust_Id as numeric) cust_id,
  hm_postalcode,
  hm_barangay,
  hm_city,
  hm_province,
    CASE
    WHEN hm_province = 'KALINGA' THEN 'CAR – Cordillera Administrative Region'
    WHEN hm_province = 'OCCIDENTAL MINDORO' THEN 'MIMAROPA Region'
    WHEN hm_province = 'ORIENTAL MINDORO' THEN  'MIMAROPA Region'
    WHEN hm_province = 'ORODNIM' THEN 'MIMAROPA Region'
    WHEN hm_province = 'SAMAR (WESTERN SAMAR)' THEN 'Region VIII – Eastern Visayas'
    WHEN hm_province = 'ZAMBOANGA SIBUGAY' THEN 'Region IX – Zamboanga Peninsula'
    ELSE Region_name END Region_Name 
FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb a
LEFT JOIN prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_add_mtb f ON a.user_id = f.user_id
LEFT JOIN prj-prod-dataplatform.dap_ds_poweruser_playground.region_mappings h ON LOWER(f.hm_province) = LOWER(h.province)
where cast(a.cust_Id as numeric) in {a}
;
"""

regiondf = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')

Job ID 674c730e-c748-493b-9b26-6f87104ef367 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


In [18]:
regiondf = regiondf[['cust_id', 'hm_postalcode', 'hm_barangay', 'hm_city',
       'hm_province', 'Region_Name']].copy()

In [19]:
regiondf['cust_id'] = regiondf['cust_id'].astype(np.int64)
regiondf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   cust_id        119 non-null    int64 
 1   hm_postalcode  119 non-null    object
 2   hm_barangay    119 non-null    object
 3   hm_city        119 non-null    object
 4   hm_province    119 non-null    object
 5   Region_Name    119 non-null    object
dtypes: int64(1), object(5)
memory usage: 5.7+ KB


In [20]:
df = df.merge(regiondf, on='cust_id', how = 'left')

In [21]:
# List of columns to check and their corresponding 'hm_' columns
columns_to_check = [
    ('city', 'hm_city'),
    ('province', 'hm_province'),
    ('barangay', 'hm_barangay'),
    ('postalcode', 'hm_postalcode')
]

# Loop through the columns and replace null values
for col, hm_col in columns_to_check:
    df[col] = df[col].fillna(df[hm_col])

In [22]:
df.drop(columns = ['hm_postalcode', 'hm_barangay', 'hm_city',
       'hm_province'], inplace = True)

In [23]:
df.columns

Index(['cust_id', 'user_id', 'device_id', 'created_dt', 'gender', 'birthplace',
       'mobile_no', 'product', 'type', 'telcoProvider', 'education_id',
       'Education_Type', 'dateOfBirth', 'signUpAccNo', 'kycStatus',
       'addressline1', 'city', 'province', 'barangay', 'provinceCode',
       'postalcode', 'mailingCity', 'mailingPostalCode', 'mailingProvince',
       'natureofwork', 'employmentstatus', 'mobileOs', 'docType',
       'onboardingDate', 'Minutestakentoonboard', 'companyName', 'industry',
       'device_dtl', 'Age', 'Region_Name'],
      dtype='object')

# Device Location at Onboarding

In [24]:
sq =f"""SELECT 
  customer_id, 
  CASE WHEN event_description = 'Onboarding' THEN latitude END AS Onboarding_latitude,
  CASE WHEN event_description = 'Onboarding' THEN longitude END AS Onboarding_longitude,
  CASE WHEN event_description = 'Loan Journey' THEN latitude END AS loanjourney_latitude,
  CASE WHEN event_description = 'Loan Journey' THEN longitude END AS loanjouney_longitude,
  CASE WHEN event_description = 'Apigee Logs' THEN latitude END AS ApigeeLogs_latitude,
  CASE WHEN event_description = 'Apigee Logs' THEN longitude END AS ApigeeLogs_longitude
FROM `prj-prod-dataplatform.risk_mart.customer_gps_location`
where cast(customer_id as numeric) in {a}
QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY gps_collection_date DESC) = 1
;
"""

locationdf = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
locationdf.head()

Job ID 704b7b7e-bf29-444d-b25a-6ddf3accd8b7 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customer_id,Onboarding_latitude,Onboarding_longitude,loanjourney_latitude,loanjouney_longitude,ApigeeLogs_latitude,ApigeeLogs_longitude
0,2355350,,,,,14.2560747,121.4162694
1,2355361,,,,,undefined,undefined
2,2355591,,,,,undefined,undefined
3,2355698,,,,,undefined,undefined
4,2355236,,,,,15.1764224,120.5209046


In [25]:
locationdf = locationdf.rename(columns={'customer_id': 'cust_id'}).astype({'cust_id': np.int64})

locationdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   cust_id               119 non-null    int64 
 1   Onboarding_latitude   6 non-null      object
 2   Onboarding_longitude  6 non-null      object
 3   loanjourney_latitude  2 non-null      object
 4   loanjouney_longitude  2 non-null      object
 5   ApigeeLogs_latitude   111 non-null    object
 6   ApigeeLogs_longitude  111 non-null    object
dtypes: int64(1), object(6)
memory usage: 6.6+ KB


In [26]:
df = df.merge(locationdf, on = 'cust_id', how = 'left')
df.head()

Unnamed: 0,cust_id,user_id,device_id,created_dt,gender,birthplace,mobile_no,product,type,telcoProvider,education_id,Education_Type,dateOfBirth,signUpAccNo,kycStatus,addressline1,city,province,barangay,provinceCode,postalcode,mailingCity,mailingPostalCode,mailingProvince,natureofwork,employmentstatus,mobileOs,docType,onboardingDate,Minutestakentoonboard,companyName,industry,device_dtl,Age,Region_Name,Onboarding_latitude,Onboarding_longitude,loanjourney_latitude,loanjouney_longitude,ApigeeLogs_latitude,ApigeeLogs_longitude
0,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate,04 Jul 1992,60823555810003,SKYC,Sitio ibabaw,TERESA,Rizal,DULUMBAYAN,,1880,,,,40,1,Android,Philippines - Driving License (2022),2024-01-01 09:59:55.000000,7,,10,,32,Region IV-A – CALABARZON,,,,,14.5406754,121.1969619
1,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate,04 Jul 1992,60823555810003,SKYC,Sitio ibabaw,TERESA,Rizal,DULUMBAYAN,,1880,,,,40,1,Android,Philippines - Driving License (2022),2024-01-01 09:59:55.000000,7,,10,,32,Region IV-A – CALABARZON,,,,,14.5406754,121.1969619
2,2355563,c8fb5e994d6440439f55cee2572eb499,53ab4af425194c0f84206373fff6a72ellIf1SB5,2024-01-01 09:40:26+00:00,F,QUEZON PROVINCE,639051919921,LOAN1.0,Borrow,SMART,11,High School Graduate,07 Dec 1977,60823555630001,SKYC,426 Tramo rd Amaya 1,TANZA,Cavite,AMAYA I,,4108,,,,0,10,Android,Philippines - Postal Id Card (2016),2024-01-01 09:40:26.000000,1,,0,,46,Region IV-A – CALABARZON,,,,,undefined,undefined
3,2355344,51ccb77b9d35417b8d2f661be5ffd6b9,a248cce6047e4cdc84b726411b437d5deLLkqpJx,2024-01-01 05:10:06+00:00,M,AKLAN,639852820980,LOAN1.0,Borrow,TNT,11,High School Graduate,02 Dec 1977,60823553440002,SKYC,PUROK 4 BRGY HUGO PEREZ TRECE MARTIRES CITY CA...,TRECE MARTIRES CITY,Cavite,PEREZ LUCBANAN,,4109,,,,41,1,Android,Philippines - Driving License (2022),2024-01-01 05:10:06.000000,3,,17,,46,Region IV-A – CALABARZON,,,,,14.2776998,120.8907359
4,2355333,a969317f7fa443dfb2385310a0d23ee4,15a005a97e084320ad2e707e83b2cc62jDaAlyG8,2024-01-01 04:58:36+00:00,M,Zamboanga Del Sur,639301120138,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate,01 Apr 1987,60823553330009,SKYC,N 953 PNR SITE FTI COMPOUND WESTERN BICUTAN TA...,CITY OF TAGUIG,METRO MANILA,WESTERN BICUTAN,,1630,,,,41,1,Android,Philippines - Driving License (2022),2024-01-01 04:58:36.000000,2,,36,,37,NCR – National Capital Region,14.5073303,121.0533762,,,,


# Marital Status, No. of Dependents, income

In [27]:
sq = f"""  
select 
customerId,
maritalStatus,
dependentsCount, 
monthlyIncome,
startApplyDateTime,
from
prj-prod-dataplatform.risk_credit_mis.loan_master_table
where (maritalStatus is not null or dependentsCount is not null or monthlyIncome is not null)
and customerId in {a}
qualify row_number() over (partition by customerId order by startApplyDateTime desc) = 1
;
"""
loandf = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
loandf.rename(columns={'customerId':'cust_id'}, inplace = True)
print(loandf.shape)
loandf.columns


Job ID 1bdc4928-bf1a-4d01-a72d-fad052a84369 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
(80, 5)


Index(['cust_id', 'maritalStatus', 'dependentsCount', 'monthlyIncome',
       'startApplyDateTime'],
      dtype='object')

In [28]:
df = df.merge(loandf, on = 'cust_id', how = 'left')

# Prepaid and PostPaid Flag

In [29]:
sq = f"""    
WITH ac_created AS 
(
  ## Base data of onboarded users
  SELECT DISTINCT 
  product,
  CASE 
    WHEN product = 'LOAN1.0' THEN 'Borrow'
    WHEN product = 'TSA1.0' THEN 'Save'
    WHEN product = 'TSAE1.0' THEN 'Explore'
    ELSE product END type,
  cust_id,
  user_id,
  device_id,
  created_dt,
  gender,
  mobile_no,
  FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb
  WHERE 1=1
  AND cast(cust_id as numeric) in {a}
  AND created_dt >= '2023-06-01'
  ORDER BY 4
)

, f_loan AS 
(
  ## First applied loan of a customer 
  SELECT DISTINCT customerId, new_loan_type, applicationStatus, disbursementdatetime, startApplydatetime
  FROM `risk_credit_mis.loan_master_table`
  WHERE 1=1
  AND customerId in {a}
  AND startApplyDatetime >= '2023-06-01'
  QUALIFY ROW_NUMBER() OVER (PARTITION BY customerId ORDER BY startApplyDatetime ASC) =1
)


, deposit_balance AS 
(
  ## Gets the balance of a customer on their 90th day upon account opening
    ## For the 1st condition of the balance of a customer upon the observation date
  SELECT DISTINCT a.ofdateopened,a.ofcustomerid, a.ofstandardaccountid,account_type,balanceDateAsOf, clearedbalance,
  FROM `prj-prod-dataplatform.core_raw.customer_accounts` a
  JOIN `risk_mart.customer_balance` b ON a.ofcustomerid = b.client_id AND a.ofstandardaccountid = b.accountid
  WHERE 1=1
  AND cast(a.ofcustomerid as numeric) in {a}
  -- AND clearedbalance >= 100
  -- AND ofcustomerid IN ('1514439','1252865','1248952')
  QUALIFY ROW_NUMBER() OVER (PARTITION BY ofcustomerid, ofstandardaccountid ORDER BY balancedateasof ASC) = 90
)

, af_link AS
(
  ## To get the AF ID and Customer ID Link (using the first install of a customer)
  SELECT DISTINCT customer_user_id, appsflyer_id, media_source, partner, campaign
  FROM `appsflyer_raw.organic_in_app_events_report`
  WHERE 1=1
  and customer_user_id in {b}
  AND customer_user_id IS NOT NULL
  QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_user_id ORDER BY install_time ASC) = 1
  
  UNION ALL
  
  SELECT DISTINCT customer_user_id, appsflyer_id, media_source, partner, campaign
  FROM `appsflyer_raw.in_app_events_report`
  WHERE 1=1
  and customer_user_id in {b}
  AND customer_user_id IS NOT NULL
  QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_user_id ORDER BY install_time ASC) = 1
)


, demog_details AS 
(
  ## Demographic Details
  SELECT DISTINCT
  a.user_id,
  a.cust_Id,
  a.device_id,
  b.id,
  c.description source_of_funds,
  d.description employment_status,
  monthly_income,
  hm_postalcode,
  hm_barangay,
  hm_city,
  hm_province,
  CASE
    WHEN hm_province = 'KALINGA' THEN 'CAR – Cordillera Administrative Region'
    WHEN hm_province = 'OCCIDENTAL MINDORO' THEN 'MIMAROPA Region'
    WHEN hm_province = 'ORIENTAL MINDORO' THEN  'MIMAROPA Region'
    WHEN hm_province = 'ORODNIM' THEN 'MIMAROPA Region'
    WHEN hm_province = 'SAMAR (WESTERN SAMAR)' THEN 'Region VIII – Eastern Visayas'
    WHEN hm_province = 'ZAMBOANGA SIBUGAY' THEN 'Region IX – Zamboanga Peninsula'
    ELSE Region_name END Region_Name
  FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb a
  LEFT JOIN prj-prod-dataplatform.dl_dynamo_db_raw.tdbk_regfinancial_profile1 b ON a.device_id = b.deviceId
  LEFT JOIN prj-prod-dataplatform.dl_dynamo_db_raw.tdbk_source_of_funds_mtb c ON b.sourceOfFundKey = c.id  
  LEFT JOIN prj-prod-dataplatform.dl_dynamo_db_raw.tdbk_employment_status_mtb d ON b.employmentStatusKey = d.id
  LEFT JOIN prj-prod-dataplatform.dl_customers_db_raw.tdbk_cust_profile_mtb e ON a.user_id = e.user_id
  LEFT JOIN prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_add_mtb f ON a.user_id = f.user_id
  LEFT JOIN prj-prod-dataplatform.dap_ds_poweruser_playground.region_mappings h ON LOWER(f.hm_province) = LOWER(h.province)
  where cast(a.cust_Id as numeric) in {a}
)


## Base query used to connect to other features 
SELECT DISTINCT
  a.created_dt registration_date,
  -- startApplyDateTime,
  -- ofdateopened,
  -- balancedateasof,
  a.cust_id,
  a.user_id,
  a.product,
  a.type,
  -- account_type,
  -- ofstandardaccountid,
  -- clearedbalance, 
  a.gender,
  customer_age,
  Region_Name,
  hm_province,
  hm_city,
  hm_barangay,
  hm_postalcode,
  source_of_funds,
  employment_status,
  CAST(monthly_income AS FLOAT64)*12.5 self_declared_annual_income,
  -- add the salary scale annual income
  -- double check if i can switch networks given the same number

  mobile_no,
  IFNULL(h.network_group,g.network_group) network_group,
  CASE
    WHEN g.network_group = 'Globe' AND h.network IS NULL THEN 'Prepaid'
    WHEN g.network_group = 'Globe' AND h.network IS NOT NULL THEN 'Postpaid'
    WHEN g.network_group = 'Smart' AND (LEFT(mobile_no,5) = '63920' OR LEFT(mobile_no,5) = '63918') THEN 'Postpaid'
    WHEN g.network_group NOT IN ('Globe','Smart') THEN NULL
    ELSE 'Prepaid' END network,
CASE 
  ### social media ###
  -- anything from social
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%social%' THEN 'Social'
  ### direct ###
  -- fwb / referral
  WHEN COALESCE(Media_Source,Partner,Campaign) = 'af_app_invites' THEN 'FWB'
  WHEN (Media_Source = 'invalid_media_source_name' OR Media_Source IS NULL) AND LOWER(Campaign) LIKE '%refer%' THEN 'FWB'
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%refer%' THEN 'FWB'
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%moengage%' THEN 'FWB'
  -- apple search
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%apple search ads%' THEN 'AppleSearch'
  -- facebook / meta
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%Facebook Ads%' THEN 'Ads_Meta'
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%meta%' THEN 'Ads_Meta'
  WHEN COALESCE(Media_Source,Partner,Campaign) = 'facebook' THEN 'Ads_Meta'
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) = 'restricted' THEN 'Ads_Meta'
  -- google
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%google%' THEN 'Ads_Google'
    -- tiktok
  WHEN COALESCE(Media_Source,Partner,Campaign) like '%bytedanceglobal%' THEN 'TikTok_Paid'
  ### affiliates ###
  -- pokkt source
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%pokkt%' THEN 'Aff_Pokkt'
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%Tonik_CPA%' THEN 'Aff_Pokkt'
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%campaign_name%' THEN 'Aff_Pokkt'
  -- tyr ads
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) IN ('tyrads_int','tjzymob_int','tyrads','ta_tonik_aos_ph') THEN 'Aff_TyrAds'
  -- sales doubler
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%salesdoubler%' THEN 'Aff_SalesDoubler'
  
  -- imoney
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%imoney%' THEN 'Aff_iMoney'
  -- moneymax
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%moneymax%' THEN 'Aff_MoneyMax'
  -- jeff
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%jeff%' THEN 'Aff_Jeff'
  -- shareit
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%shareit%' THEN 'Aff_ShareIt'
  
  -- appnext
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%appnext%' THEN 'Aff_AppNext'
  -- mediadonuts / entravision
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%mediadonuts%' THEN 'Aff_Mediadonuts'
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%entravision%' THEN 'Aff_Mediadonuts'
  -- shopback
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%shopback_int%' THEN 'Aff_Shopback'
  -- deepsea
  WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%deepsea%' THEN 'Aff_DeepSea'
  -- avow
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%avow%' THEN 'Aff_AvowTech'
  -- avow
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%vivo%' THEN 'Aff_Vivo'
  -- 711
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%SevenEleven%' THEN 'SevenEleven'
  ## organic ##
  WHEN COALESCE(Media_Source,Partner,Campaign) IN ('af_banner') THEN 'af_Banner'
  WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%sendgrid%' THEN 'Organic'
  WHEN COALESCE(Media_Source,Partner,Campaign) IN ('Website') THEN 'Website'
  
  ## owned media ##
  WHEN media_source IN ('PRFlexLoans_WebAndroidPressRelease') THEN 'Owned Media'
  
  WHEN media_source IN ('invalid_media_source_name') THEN 'Invalid Media Source'
  WHEN media_source IN ('None') THEN 'Broken OneLink'
  WHEN media_source IS NULL AND partner IS NULL AND campaign IS NULL THEN 'Organic'
  ELSE COALESCE(media_source,partner,campaign) END as Source,
  media_source,
  partner,
  campaign,
  
FROM ac_created a
LEFT JOIN f_loan b ON a.cust_id  = CAST(b.customerId AS STRING)
LEFT JOIN deposit_balance c ON a.cust_id = c.ofcustomerid
JOIN af_link d ON a.cust_id = d.customer_user_id
JOIN prj-prod-dataplatform.dl_customers_db_derived.Tdbk_customer_mtb_age_derived e ON a.cust_id = e.cust_id
LEFT JOIN demog_details f ON a.cust_id = f.cust_id
LEFT JOIN `prj-prod-dataplatform.manual_source_extracts.mobile_carrier_mapping` g
  ON LEFT(RIGHT(mobile_no,LENGTH(mobile_no)-2),3) = CAST(g.number_prefix AS STRING)
LEFT JOIN `prj-prod-dataplatform.manual_source_extracts.mobile_carrier_mapping` h 
  ON LEFT(RIGHT(mobile_no,LENGTH(mobile_no)-2),4) = CAST(h.number_prefix AS STRING)
WHERE 1=1
## The deposit account of a customer upon the 90th day is greater than 100 
AND clearedbalance >= 100

## The customer did not initiate any loan process within 90 days from account creation
AND (DATE_DIFF(DATE(startApplyDateTime),DATE(created_dt),DAY)>=90 OR b.customerid IS NULL)
AND c.ofcustomerid IS NOT NULL
AND cast( a.cust_id as numeric) in {a}
"""

demogdf2 = client.query(sq).to_dataframe(progress_bar_type = 'tqdm')
demogdf2.head()



Job ID 7f867152-2f6f-47aa-933c-02ff58856fa1 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,registration_date,cust_id,user_id,product,type,gender,customer_age,Region_Name,hm_province,hm_city,hm_barangay,hm_postalcode,source_of_funds,employment_status,self_declared_annual_income,mobile_no,network_group,network,Source,media_source,partner,campaign
0,2024-01-01 19:45:56+00:00,2355763,53b04751a1684f90a552232fca84dd2a,TSAE1.0,Explore,F,24,Region VII – Central Visayas,CEBU,MANDAUE CITY,CASUNTINGAN,6014,Salary,Employed,250000.0,639062687567,Globe,Prepaid,Organic,,,
1,2024-01-01 14:11:04+00:00,2355688,d51b23e3e6e1438dbf96a14e11c9cad6,TSAE1.0,Explore,F,35,Region IV-A – CALABARZON,CAVITE,BACOOR,REAL I,4102,Salary,Employed,437500.0,639165968842,Globe,Prepaid,Organic,,,
2,2024-01-01 23:58:20+00:00,2355794,38491888b57c4a1bbcf5390d375745e9,TSAE1.0,Explore,M,24,NCR – National Capital Region,METRO MANILA,QUEZON CITY,PASONG TAMO,1107,Salary,Employed,312500.0,639067351005,Globe,Prepaid,Organic,,,
3,2024-01-01 05:07:14+00:00,2355339,7cfee2279d7d4fc28611db1893ce7a70,TSAE1.0,Explore,F,27,NCR – National Capital Region,METRO MANILA,CITY OF MAKATI,POBLACION,1210,Salary,Employed,325000.0,639054555046,Globe,Prepaid,Organic,,,
4,2024-01-01 03:33:55+00:00,2355280,e534457c3c364817a58a1918d63445e1,TSA1.0,Save,M,38,NCR – National Capital Region,METRO MANILA,CITY OF MANILA,BARANGAY 713,1004,Salary,Employed,500000.0,639215507324,Smart,Prepaid,Aff_Pokkt,,pokktmkt,POKKT_CPA


In [30]:
demogdf2.columns

demogdf2 = demogdf2[['cust_id', 'network_group', 'Source',	'media_source',	'partner','campaign']].copy()

In [31]:
demogdf2['cust_id'] = demogdf2['cust_id'].astype(np.int64)
demogdf2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   cust_id        33 non-null     int64 
 1   network_group  33 non-null     object
 2   Source         33 non-null     object
 3   media_source   0 non-null      object
 4   partner        3 non-null      object
 5   campaign       3 non-null      object
dtypes: int64(1), object(5)
memory usage: 1.7+ KB


In [32]:
duplicate_ids = demogdf2[demogdf2['cust_id'].duplicated()]['cust_id']
duplicate_ids

27    2355366
30    2355298
32    2355673
Name: cust_id, dtype: int64

In [33]:
demogdf2[demogdf2['cust_id'] == 2355366]

Unnamed: 0,cust_id,network_group,Source,media_source,partner,campaign
14,2355366,Smart,Organic,,,
27,2355366,Globe,Organic,,,


In [34]:
demogdf2 = demogdf2.drop_duplicates(subset='cust_id', keep='first')

In [35]:
df = df.merge(demogdf2, on = 'cust_id', how='left')

# MOB

In [36]:
df.columns

Index(['cust_id', 'user_id', 'device_id', 'created_dt', 'gender', 'birthplace',
       'mobile_no', 'product', 'type', 'telcoProvider', 'education_id',
       'Education_Type', 'dateOfBirth', 'signUpAccNo', 'kycStatus',
       'addressline1', 'city', 'province', 'barangay', 'provinceCode',
       'postalcode', 'mailingCity', 'mailingPostalCode', 'mailingProvince',
       'natureofwork', 'employmentstatus', 'mobileOs', 'docType',
       'onboardingDate', 'Minutestakentoonboard', 'companyName', 'industry',
       'device_dtl', 'Age', 'Region_Name', 'Onboarding_latitude',
       'Onboarding_longitude', 'loanjourney_latitude', 'loanjouney_longitude',
       'ApigeeLogs_latitude', 'ApigeeLogs_longitude', 'maritalStatus',
       'dependentsCount', 'monthlyIncome', 'startApplyDateTime',
       'network_group', 'Source', 'media_source', 'partner', 'campaign'],
      dtype='object')

In [37]:
# Step 1: Convert columns to datetime datatype
df['onboardingDate'] = pd.to_datetime(df['onboardingDate'])
df['startApplyDateTime'] = pd.to_datetime(df['startApplyDateTime'])

# Step 2: Find the earliest date between onboardingDate and startApplyDateTime
df['earliest_date'] = df[['onboardingDate', 'startApplyDateTime']].min(axis=1)

# Step 3: Calculate the difference in months from the earliest date to current date
current_date = pd.Timestamp.now()
df['months_difference'] = (current_date - df['earliest_date']).dt.days / 30.44  # Average days in a month

# Round the result to 2 decimal places
df['months_difference'] = df['months_difference'].round(2)



In [38]:
df['DataSet'] = 'Demographic Data'

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   cust_id                93 non-null     int64              
 1   user_id                93 non-null     object             
 2   device_id              93 non-null     object             
 3   created_dt             93 non-null     datetime64[us, UTC]
 4   gender                 93 non-null     object             
 5   birthplace             93 non-null     object             
 6   mobile_no              93 non-null     object             
 7   product                93 non-null     object             
 8   type                   93 non-null     object             
 9   telcoProvider          72 non-null     object             
 10  education_id           93 non-null     object             
 11  Education_Type         93 non-null     object             
 

In [40]:
df.head()

Unnamed: 0,cust_id,user_id,device_id,created_dt,gender,birthplace,mobile_no,product,type,telcoProvider,education_id,Education_Type,dateOfBirth,signUpAccNo,kycStatus,addressline1,city,province,barangay,provinceCode,postalcode,mailingCity,mailingPostalCode,mailingProvince,natureofwork,employmentstatus,mobileOs,docType,onboardingDate,Minutestakentoonboard,companyName,industry,device_dtl,Age,Region_Name,Onboarding_latitude,Onboarding_longitude,loanjourney_latitude,loanjouney_longitude,ApigeeLogs_latitude,ApigeeLogs_longitude,maritalStatus,dependentsCount,monthlyIncome,startApplyDateTime,network_group,Source,media_source,partner,campaign,earliest_date,months_difference,DataSet
0,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate,04 Jul 1992,60823555810003,SKYC,Sitio ibabaw,TERESA,Rizal,DULUMBAYAN,,1880,,,,40,1,Android,Philippines - Driving License (2022),2024-01-01 09:59:55,7,,10,,32,Region IV-A – CALABARZON,,,,,14.5406754,121.1969619,Married,4,18000,2024-07-19 12:35:53,,,,,,2024-01-01 09:59:55,8.18,Demographic Data
1,2355581,3304835027e14ac395cc56cbef285431,fabb5cf5919342c881d3ef86399c9a11w82Cd6g7,2024-01-01 09:59:55+00:00,M,Sta cruz manila,639669537748,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate,04 Jul 1992,60823555810003,SKYC,Sitio ibabaw,TERESA,Rizal,DULUMBAYAN,,1880,,,,40,1,Android,Philippines - Driving License (2022),2024-01-01 09:59:55,7,,10,,32,Region IV-A – CALABARZON,,,,,14.5406754,121.1969619,Married,4,18000,2024-07-19 12:35:53,,,,,,2024-01-01 09:59:55,8.18,Demographic Data
2,2355563,c8fb5e994d6440439f55cee2572eb499,53ab4af425194c0f84206373fff6a72ellIf1SB5,2024-01-01 09:40:26+00:00,F,QUEZON PROVINCE,639051919921,LOAN1.0,Borrow,SMART,11,High School Graduate,07 Dec 1977,60823555630001,SKYC,426 Tramo rd Amaya 1,TANZA,Cavite,AMAYA I,,4108,,,,0,10,Android,Philippines - Postal Id Card (2016),2024-01-01 09:40:26,1,,0,,46,Region IV-A – CALABARZON,,,,,undefined,undefined,Single,No dependents,30000,2024-01-01 17:41:35,,,,,,2024-01-01 09:40:26,8.18,Demographic Data
3,2355344,51ccb77b9d35417b8d2f661be5ffd6b9,a248cce6047e4cdc84b726411b437d5deLLkqpJx,2024-01-01 05:10:06+00:00,M,AKLAN,639852820980,LOAN1.0,Borrow,TNT,11,High School Graduate,02 Dec 1977,60823553440002,SKYC,PUROK 4 BRGY HUGO PEREZ TRECE MARTIRES CITY CA...,TRECE MARTIRES CITY,Cavite,PEREZ LUCBANAN,,4109,,,,41,1,Android,Philippines - Driving License (2022),2024-01-01 05:10:06,3,,17,,46,Region IV-A – CALABARZON,,,,,14.2776998,120.8907359,Single,1,25000,2024-01-01 13:13:41,,,,,,2024-01-01 05:10:06,8.18,Demographic Data
4,2355333,a969317f7fa443dfb2385310a0d23ee4,15a005a97e084320ad2e707e83b2cc62jDaAlyG8,2024-01-01 04:58:36+00:00,M,Zamboanga Del Sur,639301120138,LOAN1.0,Borrow,GLOBE,12,Technical/Vocational Graduate,01 Apr 1987,60823553330009,SKYC,N 953 PNR SITE FTI COMPOUND WESTERN BICUTAN TA...,CITY OF TAGUIG,METRO MANILA,WESTERN BICUTAN,,1630,,,,41,1,Android,Philippines - Driving License (2022),2024-01-01 04:58:36,2,,36,,37,NCR – National Capital Region,14.5073303,121.0533762,,,,,Live-in Partner,2,30000,2024-01-01 13:02:07,,,,,,2024-01-01 04:58:36,8.18,Demographic Data


In [41]:
df.to_csv("B_Demographic_data.csv", index = False)

1.  **Age**                                 -   <span style="color: green;">Done</span>
2.  **Gender**                              -   <span style="color: green;">Done</span>
3.  **Education Type**                      -   <span style="color: green;">Done</span>
4.  **Region**                              -   <span style="color: green;">Done</span>
5.  **Province**                            -   <span style="color: green;">Done</span>
6.  **City**                                -   <span style="color: green;">Done</span>
7.  **Device Location at Onboarding**       -   <span style="color: green;">Done</span>
8.  **Marital Status**                      -   <span style="color: green;">Done</span>
9.  **Number of Dependents**                -   <span style="color: green;">Done</span>
10. **Income**                              -   <span style="color: green;">Done</span>
11. **MOB**                                 -   <span style="color: red;">Pending -- Calculate from Oboarding or StartApplyDateTime whichever is first</span>
12. **Type of ID**                          -   <span style="color: green;">Done</span>
13. **Telco Provider**                      -   <span style="color: green;">Done</span>
14. **Prepaid or Postpaid Flag**            -   <span style="color: green;">Done</span>

Aquisition Channel:

1.  **Referral vs Non-Referral Flag**               -   <span style="color: red;">Pending</span>  -- Need to check with Ivannah
2.  **Organic vs Non-Organic Channels**             -   <span style="color: green;">Done</span>
3.  **Marketing campaign the user came from if any**-   <span style="color: green;">Done</span>


In [42]:
df['campaign'].unique()

array([nan, None], dtype=object)

In [43]:
sq = """ 
with
af_link AS
(
  ## To get the AF ID and Customer ID Link (using the first install of a customer)
  SELECT DISTINCT customer_user_id, appsflyer_id, media_source, partner, campaign
  FROM `appsflyer_raw.organic_in_app_events_report`
  WHERE 1=1
  
  AND customer_user_id IS NOT NULL
  QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_user_id ORDER BY install_time ASC) = 1
  
  UNION ALL
  
  SELECT DISTINCT customer_user_id, appsflyer_id, media_source, partner, campaign
  FROM `appsflyer_raw.in_app_events_report`
  WHERE 1=1
  AND customer_user_id IS NOT NULL
  QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_user_id ORDER BY install_time ASC) = 1
)
select * from af_link;
"""

acquisitioncampaign = client.query(sq).to_dataframe(progress_bar_type='tqdm')

acquisitioncampaign.head()


Job ID 66c10d8b-ce40-4ba4-bb20-7a113d5ae04e successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|


Unnamed: 0,customer_user_id,appsflyer_id,media_source,partner,campaign
0,1175537,1689226005563-4728231645735341211,,mediadonuts,md_tonik_cpe_ph
1,1626654,1663346716784-5392692476232280203,Website,,Tonikapp_GP
2,1644861,1664848212219-3063112039600898488,bytedanceglobal_int,,restricted
3,1651599,1664187906016-3992186911412117562,googleadwords_int,acesaatchi,LOANS QL | APP CAMPAIGN | LOANS_QL_OFFER | AU...
4,1684973,1665237269018-2977316731260079329,Facebook Ads,acesaatchi,acesaatchi_Loans | October 2022 | App Ads UPDA...


In [44]:
acquisitioncampaign.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1236478 entries, 0 to 1236477
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   customer_user_id  1236478 non-null  object
 1   appsflyer_id      1236478 non-null  object
 2   media_source      240873 non-null   object
 3   partner           249438 non-null   object
 4   campaign          440276 non-null   object
dtypes: object(5)
memory usage: 47.2+ MB


In [45]:
acquisitioncampaign[acquisitioncampaign['customer_user_id']=='ad9ac875-6eaa-42f4-9d8d-dfa2c4f086cd']

Unnamed: 0,customer_user_id,appsflyer_id,media_source,partner,campaign
2302,ad9ac875-6eaa-42f4-9d8d-dfa2c4f086cd,1692770903181-17584051823765840604,,mediadonuts,o_0ccca8d461


In [47]:
acquisitioncampaign['media_source'].value_counts(dropna = False)

media_source
None                         995605
googleadwords_int             87629
shareit_int                   57692
af_app_invites                19985
Facebook Ads                  16529
Website                       10536
af_banner                      9429
bytedanceglobal_int            7603
restricted                     6545
salesdoubler_int               5932
jeffapp_int                    5897
imoneyph                       3540
moneymax                       2841
appnext_int                    1875
google                         1270
Social_facebook                1200
Apple Search Ads               1101
None                            298
facebook                        279
meta                            275
tjzymob_int                     161
Social_instagram                 52
SevenEleven                      39
Social media                     29
Social_youtube                   27
vivoglobal_int                   20
social                           17
invalid_media_s

In [48]:
counts = acquisitioncampaign['media_source'].value_counts(dropna=False)
percentages = counts / len(acquisitioncampaign) * 100

result = pd.concat([counts, percentages], axis=1, keys=['Counts', 'Percentage'])
result

Unnamed: 0_level_0,Counts,Percentage
media_source,Unnamed: 1_level_1,Unnamed: 2_level_1
,995605,80.519427
googleadwords_int,87629,7.086984
shareit_int,57692,4.665833
af_app_invites,19985,1.616284
Facebook Ads,16529,1.336781
Website,10536,0.852098
af_banner,9429,0.762569
bytedanceglobal_int,7603,0.614892
restricted,6545,0.529326
salesdoubler_int,5932,0.47975


In [49]:
acquisitioncampaign['partner'].value_counts(dropna = False)

partner
None            987040
pokktmkt        134319
mediadonuts      62795
acesaatchi       44029
tyrads            7512
entravision        674
salesdoubler        77
avowtech            32
Name: count, dtype: int64

In [50]:
counts = acquisitioncampaign['partner'].value_counts(dropna=False)
percentages = counts / len(acquisitioncampaign) * 100

result = pd.concat([counts, percentages], axis=1, keys=['Counts', 'Percentage'])
result

Unnamed: 0_level_0,Counts,Percentage
partner,Unnamed: 1_level_1,Unnamed: 2_level_1
,987040,79.826734
pokktmkt,134319,10.863032
mediadonuts,62795,5.078538
acesaatchi,44029,3.56084
tyrads,7512,0.607532
entravision,674,0.05451
salesdoubler,77,0.006227
avowtech,32,0.002588


In [51]:
pd.set_option("Display.max_rows", None)
acquisitioncampaign['campaign'].value_counts(dropna = False)

campaign
None                                                                       796202
None                                                                       108573
Tonik_CPA                                                                   75744
Tonik_ACL-Acquisition-CreditBuilder_UAC                                     49102
POKKT                                                                       26656
LOANS QL  | APP CAMPAIGN | ONBOARDING_SUCCESS |  Q4 2022                    16933
pokkt_tonik_ph_andriod_cpe                                                  14748
Refer & Earn                                                                13734
LOANS QL | APP CAMPAIGN | QL_LOANS_OFFER_ ACCEPTED | JULY 2023 (25,600)      9695
TA_Tonik_AOS_PH                                                              7512
Tonik_Cash | 2024 | App Download                                             7160
POKKT_CPA                                                                    7002
Anymind

In [52]:
counts = acquisitioncampaign['campaign'].value_counts(dropna=False)
percentages = counts / len(acquisitioncampaign) * 100

result = pd.concat([counts, percentages], axis=1, keys=['Counts', 'Percentage'])
result

Unnamed: 0_level_0,Counts,Percentage
campaign,Unnamed: 1_level_1,Unnamed: 2_level_1
,796202,64.392735
,108573,8.780827
Tonik_CPA,75744,6.125786
Tonik_ACL-Acquisition-CreditBuilder_UAC,49102,3.971118
POKKT,26656,2.155801
LOANS QL | APP CAMPAIGN | ONBOARDING_SUCCESS | Q4 2022,16933,1.369454
pokkt_tonik_ph_andriod_cpe,14748,1.192743
Refer & Earn,13734,1.110735
"LOANS QL | APP CAMPAIGN | QL_LOANS_OFFER_ ACCEPTED | JULY 2023 (25,600)",9695,0.784082
TA_Tonik_AOS_PH,7512,0.607532
