# <div align="center" style="color: #ff5733;">Demographic Data</div>

# Demographic Data

In [19]:
# %% [markdown]
# # Jupyter Notebook Loading Header
#
# This is a custom loading header for Jupyter Notebooks in Visual Studio Code.
# It includes common imports and settings to get you started quickly.

# %% [markdown]
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
import os
# Fallback location of the local gcloud credentials file. The path is specific to
# one workstation, so it is only applied when GOOGLE_APPLICATION_CREDENTIALS has
# not already been set in the environment (an explicit setting takes precedence).
path = r'C:\Users\DwaipayanChakroborti\AppData\Roaming\gcloud\legacy_credentials\dchakroborti@tonikbank.com\adc.json'
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', path)
# Shared BigQuery client used by every query function in this notebook.
client = bigquery.Client(project='prj-prod-dataplatform')

# %% [markdown]
## Configure Settings
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)


# Function to generate each date in a date range

In [2]:
import datetime
from google.cloud import bigquery

def get_date_range(start_date, end_date, inclusive=False):
    """Yield consecutive dates starting at ``start_date``, one day apart.

    Parameters
    ----------
    start_date, end_date : datetime.date or datetime.datetime
        Bounds of the range. Only the whole-day difference between them is used.
    inclusive : bool, default False
        When False (the original behaviour) ``end_date`` itself is NOT yielded:
        the last value is the day before ``end_date``. Pass True to also yield
        ``end_date``.

    Yields
    ------
    Same type as ``start_date``
        ``start_date``, ``start_date + 1 day``, ... Nothing is yielded when
        ``end_date`` lies before ``start_date``.
    """
    span_days = int((end_date - start_date).days)
    if inclusive:
        span_days += 1
    for offset in range(span_days):
        yield start_date + datetime.timedelta(offset)

# Function to identify the customers onboarded on a particular date and their onboarding dates

In [3]:
def CustomerIdentify(date):
    """Fetch the customers whose record was created on ``date``.

    The query selects distinct ``cust_id`` / ``created_dt`` pairs (the latter
    aliased to ``onboardingDate``) from ``tdbk_customer_mtb`` where the date part
    of ``created_dt`` equals ``date`` and ``cust_id`` is not null. ``date`` is
    interpolated into the SQL text as-is.

    Returns the query result as a pandas DataFrame.
    """
    onboarding_sql = f"""select distinct cust_id, created_dt onboardingDate FROM `dl_customers_db_raw.tdbk_customer_mtb` WHERE 1=1 and date(created_dt) = "{date}" and cust_id is not null; """
    return client.query(onboarding_sql).to_dataframe()

# Functions to calculate the age of a customer

In [4]:
def calculate_age_in_days(date_of_birth, as_of=None):
    """Return the number of whole days between a date of birth and a reference moment.

    Parameters
    ----------
    date_of_birth : str
        Date of birth formatted as ``%d %b %Y`` (for example ``"05 Mar 1990"``).
        ``None`` or a float NaN is treated as a missing value.
    as_of : datetime.datetime, optional
        Moment the age is measured at. Defaults to the current local time
        (``datetime.now()``), which makes the result depend on when it is run;
        pass a fixed value for reproducible output.

    Returns
    -------
    int or float
        Whole days elapsed, or ``nan`` when ``date_of_birth`` is missing
        (previously a missing value raised ``TypeError``).

    Raises
    ------
    ValueError
        If ``date_of_birth`` is a string that does not match ``%d %b %Y``.
    """
    # Imported locally so the class name does not clash with a module-level
    # `import datetime` elsewhere in the notebook.
    from datetime import datetime

    # A float NaN is the only float that is not equal to itself.
    is_nan_float = isinstance(date_of_birth, float) and date_of_birth != date_of_birth
    if date_of_birth is None or is_nan_float:
        return float("nan")

    # Parse the date of birth
    dob = datetime.strptime(date_of_birth, "%d %b %Y")

    # Reference moment: explicit `as_of` when given, otherwise now
    reference = datetime.now() if as_of is None else as_of

    # timedelta.days is the whole-day part of the difference
    return (reference - dob).days

def add_age_column(df, date_column='dateOfBirth', new_column='AgeInDays'):
    """Add an age-in-days column to ``df`` and return the same frame.

    Each value of ``df[date_column]`` is passed through
    ``calculate_age_in_days`` and the result is stored in ``df[new_column]``.
    The frame is modified in place; the returned object is ``df`` itself.
    """
    birth_dates = df[date_column]
    ages_in_days = birth_dates.apply(calculate_age_in_days)
    df[new_column] = ages_in_days
    return df

# Function to run the demographic data preparation queries

In [5]:
def _sql_in_list(values):
    """Render ``values`` as a parenthesised, comma-separated SQL value list.

    For two or more values the text is identical to formatting a Python tuple
    into an f-string. For a single value it avoids the tuple repr ``(1,)``,
    whose trailing comma is not valid in a standard SQL value list.
    """
    return "(" + ", ".join(repr(value) for value in values) + ")"


def demographicdata(a, b, df):
    """Build the demographic feature frame for one batch of customers.

    Seven BigQuery queries are run through the module-level ``client``
    (education, telco provider / base customer data, customer details, region,
    GPS location, latest loan application, network / acquisition source), each
    restricted to the given customer ids. The results are left-joined on
    ``cust_id``.

    Parameters
    ----------
    a : tuple of int
        Customer ids, interpolated into the numeric ``IN`` filters.
    b : tuple of str
        The same customer ids as strings, matched against ``customer_user_id``
        in the AppsFlyer tables.
    df : pandas.DataFrame
        Frame with a ``cust_id`` column. Its ``cust_id`` is cast to int64 in
        place (kept from the original implementation); the frame does not
        contribute rows or columns to the result.

    Returns
    -------
    pandas.DataFrame
        Rows come from the telco-provider query, i.e. customers that have a
        matching row in ``tdbk_loan_customer_details`` and
        ``tdbk_credolab_track`` (inner joins); every later merge is a left join
        onto that frame. An empty, column-less frame is returned when ``a`` is
        empty.
    """
    # An empty batch would render as `in ()` in every query; skip the round trips.
    if len(a) == 0:
        return pd.DataFrame()

    cust_ids_sql = _sql_in_list(a)
    cust_id_strs_sql = _sql_in_list(b)

    # Education Type
    # print("Education Type")
    sq = f"""
    with
    educate as 
    (select distinct edu.digitalLoanAccountId, edu.education_id, edu1.description
    from `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_purpose` edu
    inner join (select id, description from dl_loans_db_raw.tdbk_loan_lov_mtb where module = 'Education') edu1 on edu.education_id = edu1.id
    ),
    educate2 as 
    (select *, row_number() over(partition by digitalLoanAccountId order by education_id desc) rnk from educate),
    educate3 as 
    (select * from educate2 where rnk = 1)
    select lmt.customerId,
    educate3.education_id,
    educate3.description Education_Type, 
    educate3.rnk educationrnk
    from educate3 
    inner join prj-prod-dataplatform.risk_credit_mis.loan_master_table lmt on lmt.digitalLoanAccountId = educate3.digitalLoanAccountId
    where lmt.customerId in {cust_ids_sql}
    """
    educationdf = client.query(sq).to_dataframe(progress_bar_type='tqdm')
    # Telco Provider
    # print("Telco Provider")
    sq = f"""
    WITH ac_created AS 
    (
    ## Base data of onboarded users
    SELECT DISTINCT 
    product,
    CASE 
        WHEN product = 'LOAN1.0' THEN 'Borrow'
        WHEN product = 'TSA1.0' THEN 'Save'
        WHEN product = 'TSAE1.0' THEN 'Explore'
        ELSE product END type,
    cust_id,
    user_id,
    device_id,
    created_dt,
    gender,
    mobile_no,
    birthplace,
    FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb
    WHERE 1=1
    AND cast(cust_id as numeric) in {cust_ids_sql}
    --AND created_dt >= '2023-06-01'
    ORDER BY 4
    )
    select 
        ac.cust_id,
        ac.user_id,
        ac.device_id,
        ac.created_dt,
        ac.gender,
        ac.birthplace,
        ac.mobile_no,
        ac.product,
        ac.type,
        t4.GeneralData.telephony_info__network_operator_name telcoProvider
    from ac_created ac
    inner join prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details tlcd on tlcd.custId = ac.cust_id  
    inner JOIN `prj-prod-dataplatform.dl_loans_db_raw.tdbk_credolab_track` t3 ON cast(tlcd.credolabRefNumber as string) = cast(t3.refno as string)
    left JOIN `prj-prod-dataplatform.credolab_raw.android_credolab_datasets_struct_columns` t4 ON t3.refno = t4.deviceId
    where cast(ac.cust_id as numeric) in {cust_ids_sql}
    """
    clientdemodf1 = client.query(sq).to_dataframe(progress_bar_type='tqdm')
    # Merge Client Demo 1 with educationdf 
    print("Merge Client Demo 1 with educationdf ")
    # Side effect kept for backward compatibility: the caller's frame gets an int64 cust_id.
    df['cust_id'] = df['cust_id'].astype(np.int64)
    clientdemodf1['cust_id'] = clientdemodf1['cust_id'].astype(np.int64)
    # NOTE(review): the original also computed `df.merge(clientdemodf1, ...)` here and
    # immediately overwrote it with the merge below, so that result was never used and
    # has been removed. Starting from `df` instead would keep customers without a
    # credolab match, but would also duplicate `onboardingDate` in the later
    # customer-details merge — confirm which population is intended before changing it.
    dfmain = clientdemodf1.merge(educationdf, left_on='cust_id', right_on='customerId', how='left')
    dfmain = dfmain.drop(columns=['customerId', 'educationrnk'])
    # Customer Details
    # print("Customer Details")
    sq = f"""    
    SELECT 
    custid, dateOfBirth, 
    signUpAccNo, 
    kycStatus,
    addressline1, 
    city, 
    province, 
    barangay, 
    provinceCode, 
    postalcode, 
    mailingCity, 
    mailingPostalCode, 
    mailingProvince,
    natureofwork, 
    employmentstatus, 
    mobileOs, 
    docType, 
    onboardingDate, 
    created_dt, 
    ABS(TIMESTAMP_DIFF(
        PARSE_TIMESTAMP('%Y-%m-%d %H:%M:%E6S', onboardingDate),
        CAST(created_dt AS TIMESTAMP),
        MINUTE
    )) AS Minutestakentoonboard,
    companyName, 
    industry, 
    device_dtl,
    FROM 
    `prj-prod-dataplatform.dl_loans_db_raw.tdbk_loan_customer_details`
    WHERE 
    CAST(custId AS NUMERIC) IN {cust_ids_sql}
    """
    cddf = client.query(sq).to_dataframe(progress_bar_type='tqdm')
    cddf['custid'] = cddf['custid'].astype(np.int64)
    cddf = cddf.rename(columns={'custid': 'cust_id'})
    dfmain = dfmain.merge(cddf, on='cust_id', how='left')
    # Both frames carry `created_dt`; keep the one from the base customer query (_x).
    dfmain = dfmain.drop(columns=['created_dt_y'])
    dfmain = dfmain.rename(columns={'created_dt_x': 'created_dt'})

    # Add AgeInYears column (approximate)
    # print("Add AgeInYears column (approximate)")
    # Age is measured at run time (not at onboarding) and a year is taken as 365 days.
    dfmain['Age'] = dfmain['dateOfBirth'].apply(calculate_age_in_days)
    dfmain['Age'] = dfmain['Age'] // 365

    # Region
    # print("Region")

    sq = f"""
    Select distinct 
    a.user_id,
    cast(a.cust_Id as numeric) cust_id,
    hm_postalcode,
    hm_barangay,
    hm_city,
    hm_province,
        CASE
        WHEN hm_province = 'KALINGA' THEN 'CAR – Cordillera Administrative Region'
        WHEN hm_province = 'OCCIDENTAL MINDORO' THEN 'MIMAROPA Region'
        WHEN hm_province = 'ORIENTAL MINDORO' THEN  'MIMAROPA Region'
        WHEN hm_province = 'ORODNIM' THEN 'MIMAROPA Region'
        WHEN hm_province = 'SAMAR (WESTERN SAMAR)' THEN 'Region VIII – Eastern Visayas'
        WHEN hm_province = 'ZAMBOANGA SIBUGAY' THEN 'Region IX – Zamboanga Peninsula'
        ELSE Region_name END Region_Name 
    FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb a
    LEFT JOIN prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_add_mtb f ON a.user_id = f.user_id
    LEFT JOIN prj-prod-dataplatform.dap_ds_poweruser_playground.region_mappings h ON LOWER(f.hm_province) = LOWER(h.province)
    where cast(a.cust_Id as numeric) in {cust_ids_sql}
    ;
    """

    regiondf = client.query(sq).to_dataframe(progress_bar_type='tqdm')

    regiondf = regiondf[['cust_id', 'hm_postalcode', 'hm_barangay', 'hm_city',
                         'hm_province', 'Region_Name']].copy()

    regiondf['cust_id'] = regiondf['cust_id'].astype(np.int64)
    dfmain = dfmain.merge(regiondf, on='cust_id', how='left')
    # List of columns to check and their corresponding 'hm_' columns
    # print("List of columns to check and their corresponding 'hm_' columns")
    columns_to_check = [
        ('city', 'hm_city'),
        ('province', 'hm_province'),
        ('barangay', 'hm_barangay'),
        ('postalcode', 'hm_postalcode')
    ]

    # Loop through the columns and replace null values
    # print("Loop through the columns and replace null values")
    # The home-address (`hm_`) value is only a fallback for a missing loan-application value.
    for col, hm_col in columns_to_check:
        dfmain[col] = dfmain[col].fillna(dfmain[hm_col])

    dfmain = dfmain.drop(columns=['hm_postalcode', 'hm_barangay', 'hm_city',
                                  'hm_province'])

    # Device Location at Onboarding
    # print("Device Location at Onboarding")
    # QUALIFY keeps only the most recent GPS row per customer, and each CASE reads that
    # single row, so at most one of the three latitude/longitude pairs is populated.
    sq = f"""SELECT 
    customer_id, 
    CASE WHEN event_description = 'Onboarding' THEN latitude END AS Onboarding_latitude,
    CASE WHEN event_description = 'Onboarding' THEN longitude END AS Onboarding_longitude,
    CASE WHEN event_description = 'Loan Journey' THEN latitude END AS loanjourney_latitude,
    CASE WHEN event_description = 'Loan Journey' THEN longitude END AS loanjouney_longitude,
    CASE WHEN event_description = 'Apigee Logs' THEN latitude END AS ApigeeLogs_latitude,
    CASE WHEN event_description = 'Apigee Logs' THEN longitude END AS ApigeeLogs_longitude
    FROM `prj-prod-dataplatform.risk_mart.customer_gps_location`
    where cast(customer_id as numeric) in {cust_ids_sql}
    QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY gps_collection_date DESC) = 1
    ;
    """

    locationdf = client.query(sq).to_dataframe(progress_bar_type='tqdm')

    locationdf = locationdf.rename(columns={'customer_id': 'cust_id'}).astype({'cust_id': np.int64})

    dfmain = dfmain.merge(locationdf, on='cust_id', how='left')

    # Marital Status, No. of Dependents, income
    # print("Marital Status, No. of Dependents, income")
    # Latest loan application per customer that has at least one of the three fields filled.
    sq = f"""  
    select 
    customerId,
    maritalStatus,
    dependentsCount, 
    monthlyIncome,
    startApplyDateTime,
    from
    prj-prod-dataplatform.risk_credit_mis.loan_master_table
    where (maritalStatus is not null or dependentsCount is not null or monthlyIncome is not null)
    and customerId in {cust_ids_sql}
    qualify row_number() over (partition by customerId order by startApplyDateTime desc) = 1
    ;
    """
    loandf = client.query(sq).to_dataframe(progress_bar_type='tqdm')
    loandf = loandf.rename(columns={'customerId': 'cust_id'})
    dfmain = dfmain.merge(loandf, on='cust_id', how='left')

    # Prepaid and PostPaid Flag

    # print("Prepaid and PostPaid Flag")

    sq = f"""    
    WITH ac_created AS 
    (
    ## Base data of onboarded users
    SELECT DISTINCT 
    product,
    CASE 
        WHEN product = 'LOAN1.0' THEN 'Borrow'
        WHEN product = 'TSA1.0' THEN 'Save'
        WHEN product = 'TSAE1.0' THEN 'Explore'
        ELSE product END type,
    cust_id,
    user_id,
    device_id,
    created_dt,
    gender,
    mobile_no,
    FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb
    WHERE 1=1
    AND cast(cust_id as numeric) in {cust_ids_sql}
    AND created_dt >= '2023-06-01'
    ORDER BY 4
    )

    , f_loan AS 
    (
    ## First applied loan of a customer 
    SELECT DISTINCT customerId, new_loan_type, applicationStatus, disbursementdatetime, startApplydatetime
    FROM `risk_credit_mis.loan_master_table`
    WHERE 1=1
    AND customerId in {cust_ids_sql}
    AND startApplyDatetime >= '2023-06-01'
    QUALIFY ROW_NUMBER() OVER (PARTITION BY customerId ORDER BY startApplyDatetime ASC) =1
    )


    , deposit_balance AS 
    (
    ## Gets the balance of a customer on their 90th day upon account opening
        ## For the 1st condition of the balance of a customer upon the observation date
    SELECT DISTINCT a.ofdateopened,a.ofcustomerid, a.ofstandardaccountid,account_type,balanceDateAsOf, clearedbalance,
    FROM `prj-prod-dataplatform.core_raw.customer_accounts` a
    JOIN `risk_mart.customer_balance` b ON a.ofcustomerid = b.client_id AND a.ofstandardaccountid = b.accountid
    WHERE 1=1
    AND cast(a.ofcustomerid as numeric) in {cust_ids_sql}
    -- AND clearedbalance >= 100
    -- AND ofcustomerid IN ('1514439','1252865','1248952')
    QUALIFY ROW_NUMBER() OVER (PARTITION BY ofcustomerid, ofstandardaccountid ORDER BY balancedateasof ASC) = 90
    )

    , af_link AS
    (
    ## To get the AF ID and Customer ID Link (using the first install of a customer)
    SELECT DISTINCT customer_user_id, appsflyer_id, media_source, partner, campaign
    FROM `appsflyer_raw.organic_in_app_events_report`
    WHERE 1=1
    and customer_user_id in {cust_id_strs_sql}
    AND customer_user_id IS NOT NULL
    QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_user_id ORDER BY install_time ASC) = 1
    
    UNION ALL
    
    SELECT DISTINCT customer_user_id, appsflyer_id, media_source, partner, campaign
    FROM `appsflyer_raw.in_app_events_report`
    WHERE 1=1
    and customer_user_id in {cust_id_strs_sql}
    AND customer_user_id IS NOT NULL
    QUALIFY ROW_NUMBER() OVER (PARTITION BY customer_user_id ORDER BY install_time ASC) = 1
    )


    , demog_details AS 
    (
    ## Demographic Details
    SELECT DISTINCT
    a.user_id,
    a.cust_Id,
    a.device_id,
    b.id,
    c.description source_of_funds,
    d.description employment_status,
    monthly_income,
    hm_postalcode,
    hm_barangay,
    hm_city,
    hm_province,
    CASE
        WHEN hm_province = 'KALINGA' THEN 'CAR – Cordillera Administrative Region'
        WHEN hm_province = 'OCCIDENTAL MINDORO' THEN 'MIMAROPA Region'
        WHEN hm_province = 'ORIENTAL MINDORO' THEN  'MIMAROPA Region'
        WHEN hm_province = 'ORODNIM' THEN 'MIMAROPA Region'
        WHEN hm_province = 'SAMAR (WESTERN SAMAR)' THEN 'Region VIII – Eastern Visayas'
        WHEN hm_province = 'ZAMBOANGA SIBUGAY' THEN 'Region IX – Zamboanga Peninsula'
        ELSE Region_name END Region_Name
    FROM prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_mtb a
    LEFT JOIN prj-prod-dataplatform.dl_dynamo_db_raw.tdbk_regfinancial_profile1 b ON a.device_id = b.deviceId
    LEFT JOIN prj-prod-dataplatform.dl_dynamo_db_raw.tdbk_source_of_funds_mtb c ON b.sourceOfFundKey = c.id  
    LEFT JOIN prj-prod-dataplatform.dl_dynamo_db_raw.tdbk_employment_status_mtb d ON b.employmentStatusKey = d.id
    LEFT JOIN prj-prod-dataplatform.dl_customers_db_raw.tdbk_cust_profile_mtb e ON a.user_id = e.user_id
    LEFT JOIN prj-prod-dataplatform.dl_customers_db_raw.tdbk_customer_add_mtb f ON a.user_id = f.user_id
    LEFT JOIN prj-prod-dataplatform.dap_ds_poweruser_playground.region_mappings h ON LOWER(f.hm_province) = LOWER(h.province)
    where cast(a.cust_Id as numeric) in {cust_ids_sql}
    )


    ## Base query used to connect to other features 
    SELECT DISTINCT
    a.created_dt registration_date,
    -- startApplyDateTime,
    -- ofdateopened,
    -- balancedateasof,
    a.cust_id,
    a.user_id,
    a.product,
    a.type,
    -- account_type,
    -- ofstandardaccountid,
    -- clearedbalance, 
    a.gender,
    customer_age,
    Region_Name,
    hm_province,
    hm_city,
    hm_barangay,
    hm_postalcode,
    source_of_funds,
    employment_status,
    CAST(monthly_income AS FLOAT64)*12.5 self_declared_annual_income,
    -- add the salary scale annual income
    -- double check if i can switch networks given the same number

    mobile_no,
    IFNULL(h.network_group,g.network_group) network_group,
    CASE
        WHEN g.network_group = 'Globe' AND h.network IS NULL THEN 'Prepaid'
        WHEN g.network_group = 'Globe' AND h.network IS NOT NULL THEN 'Postpaid'
        WHEN g.network_group = 'Smart' AND (LEFT(mobile_no,5) = '63920' OR LEFT(mobile_no,5) = '63918') THEN 'Postpaid'
        WHEN g.network_group NOT IN ('Globe','Smart') THEN NULL
        ELSE 'Prepaid' END network,
    CASE 
    ### social media ###
    -- anything from social
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%social%' THEN 'Social'
    ### direct ###
    -- fwb / referral
    WHEN COALESCE(Media_Source,Partner,Campaign) = 'af_app_invites' THEN 'FWB'
    WHEN (Media_Source = 'invalid_media_source_name' OR Media_Source IS NULL) AND LOWER(Campaign) LIKE '%refer%' THEN 'FWB'
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%refer%' THEN 'FWB'
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%moengage%' THEN 'FWB'
    -- apple search
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%apple search ads%' THEN 'AppleSearch'
    -- facebook / meta
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%Facebook Ads%' THEN 'Ads_Meta'
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%meta%' THEN 'Ads_Meta'
    WHEN COALESCE(Media_Source,Partner,Campaign) = 'facebook' THEN 'Ads_Meta'
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) = 'restricted' THEN 'Ads_Meta'
    -- google
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%google%' THEN 'Ads_Google'
        -- tiktok
    WHEN COALESCE(Media_Source,Partner,Campaign) like '%bytedanceglobal%' THEN 'TikTok_Paid'
    ### affiliates ###
    -- pokkt source
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%pokkt%' THEN 'Aff_Pokkt'
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%Tonik_CPA%' THEN 'Aff_Pokkt'
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%campaign_name%' THEN 'Aff_Pokkt'
    -- tyr ads
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) IN ('tyrads_int','tjzymob_int','tyrads','ta_tonik_aos_ph') THEN 'Aff_TyrAds'
    -- sales doubler
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%salesdoubler%' THEN 'Aff_SalesDoubler'
    
    -- imoney
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%imoney%' THEN 'Aff_iMoney'
    -- moneymax
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%moneymax%' THEN 'Aff_MoneyMax'
    -- jeff
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%jeff%' THEN 'Aff_Jeff'
    -- shareit
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%shareit%' THEN 'Aff_ShareIt'
    
    -- appnext
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%appnext%' THEN 'Aff_AppNext'
    -- mediadonuts / entravision
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%mediadonuts%' THEN 'Aff_Mediadonuts'
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%entravision%' THEN 'Aff_Mediadonuts'
    -- shopback
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%shopback_int%' THEN 'Aff_Shopback'
    -- deepsea
    WHEN LOWER(COALESCE(Media_Source,Partner,Campaign)) LIKE '%deepsea%' THEN 'Aff_DeepSea'
    -- avow
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%avow%' THEN 'Aff_AvowTech'
    -- avow
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%vivo%' THEN 'Aff_Vivo'
    -- 711
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%SevenEleven%' THEN 'SevenEleven'
    ## organic ##
    WHEN COALESCE(Media_Source,Partner,Campaign) IN ('af_banner') THEN 'af_Banner'
    WHEN COALESCE(Media_Source,Partner,Campaign) LIKE '%sendgrid%' THEN 'Organic'
    WHEN COALESCE(Media_Source,Partner,Campaign) IN ('Website') THEN 'Website'
    
    ## owned media ##
    WHEN media_source IN ('PRFlexLoans_WebAndroidPressRelease') THEN 'Owned Media'
    
    WHEN media_source IN ('invalid_media_source_name') THEN 'Invalid Media Source'
    WHEN media_source IN ('None') THEN 'Broken OneLink'
    WHEN media_source IS NULL AND partner IS NULL AND campaign IS NULL THEN 'Organic'
    ELSE COALESCE(media_source,partner,campaign) END as Source,
    media_source,
    partner,
    campaign,
    
    FROM ac_created a
    LEFT JOIN f_loan b ON a.cust_id  = CAST(b.customerId AS STRING)
    LEFT JOIN deposit_balance c ON a.cust_id = c.ofcustomerid
    JOIN af_link d ON cast(a.cust_id as string) = d.customer_user_id
    JOIN prj-prod-dataplatform.dl_customers_db_derived.Tdbk_customer_mtb_age_derived e ON a.cust_id = e.cust_id
    LEFT JOIN demog_details f ON a.cust_id = f.cust_id
    LEFT JOIN `prj-prod-dataplatform.manual_source_extracts.mobile_carrier_mapping` g
    ON LEFT(RIGHT(mobile_no,LENGTH(mobile_no)-2),3) = CAST(g.number_prefix AS STRING)
    LEFT JOIN `prj-prod-dataplatform.manual_source_extracts.mobile_carrier_mapping` h 
    ON LEFT(RIGHT(mobile_no,LENGTH(mobile_no)-2),4) = CAST(h.number_prefix AS STRING)
    WHERE 1=1
    ## The deposit account of a customer upon the 90th day is greater than 100 
    AND clearedbalance >= 100

    ## The customer did not initiate any loan process within 90 days from account creation
    AND (DATE_DIFF(DATE(startApplyDateTime),DATE(created_dt),DAY)>=90 OR b.customerid IS NULL)
    AND c.ofcustomerid IS NOT NULL
    AND cast( a.cust_id as numeric) in {cust_ids_sql}
    """

    demogdf2 = client.query(sq).to_dataframe(progress_bar_type='tqdm')

    demogdf2 = demogdf2[['cust_id', 'network_group', 'Source', 'media_source', 'partner', 'campaign']].copy()

    # The query can return several rows per customer; only the first one is kept.
    demogdf2 = demogdf2.drop_duplicates(subset='cust_id', keep='first')
    demogdf2['cust_id'] = demogdf2['cust_id'].astype(np.int64)
    # Dictionary mapping Source values to their SourceGroups
    # print("Dictionary mapping Source values to their SourceGroups")
    source_groups_mapping = {
        'Ads_Google': 'Direct',
        'Ads_Meta': 'Direct',
        'Aff_AppNext': 'Affiliates',
        'Aff_AvowTech': 'Affiliates',
        'Aff_DeepSea': 'Affiliates',
        'Aff_Jeff': 'Affiliates',
        'Aff_Mediadonuts': 'Affiliates',
        'Aff_MoneyMax': 'Affiliates',
        'Aff_Pokkt': 'Affiliates',
        'Aff_SalesDoubler': 'Affiliates',
        'Aff_ShareIt': 'Affiliates',
        'Aff_Shopback': 'Affiliates',
        'Aff_TyrAds': 'Affiliates',
        'Aff_Vivo': 'Affiliates',
        'Aff_iMoney': 'Affiliates',
        'acesaatchi': 'Affiliates',
        'AppleSearch': 'Direct',
        'Broken OneLink': 'Others',
        'Email': 'Others',
        'FWB': 'Direct',
        'OpsEngine_post': 'Others',
        'Organic': 'Organic',
        'Owned Media': 'Others',
        'Partnership': 'Others',
        'SevenEleven': 'Others',
        'Social': 'Organic',
        'Test': 'Others',
        'TikTok_Paid': 'Direct',
        'Website': 'Organic',
        'af_Banner': 'Organic',
        'Tonik_PHL_default': 'Others'
    }

    # Create the SourceGroups column by mapping the Source column to the corresponding groups.
    # Source values that are not keys of the mapping become NaN.
    demogdf2['SourceGroups'] = demogdf2['Source'].map(source_groups_mapping)

    dfmain = dfmain.merge(demogdf2, on='cust_id', how='left')

    # MOB
    # print(" MOB")

    # Step 1: Convert columns to datetime datatype
    dfmain['onboardingDate'] = pd.to_datetime(dfmain['onboardingDate'])
    dfmain['startApplyDateTime'] = pd.to_datetime(dfmain['startApplyDateTime'])

    # Step 2: Find the earliest date between onboardingDate and startApplyDateTime
    dfmain['earliest_date'] = dfmain[['onboardingDate', 'startApplyDateTime']].min(axis=1)

    # Step 3: Calculate the difference in months from the earliest date to current date
    current_date = pd.Timestamp.now()
    dfmain['months_difference'] = (current_date - dfmain['earliest_date']).dt.days / 30.44  # Average days in a month

    # Round the result to 2 decimal places
    dfmain['months_difference'] = dfmain['months_difference'].round(2)
    # dfmain['DataSet'] = 'Demographic Data'

    return dfmain
    
    
    
    
    
    

# Main loop: run the data preparation for each day between the start date and the end date

In [6]:
start_date = datetime.date(2023, 1, 1)
# end_date = datetime.date(2023, 3, 1)
end_date = datetime.date.today() - datetime.timedelta(days=1)  # Yesterday
# NOTE: get_date_range stops before end_date, so the last day processed is the
# day before end_date.

# Collect one frame per day and concatenate once at the end; concatenating inside
# the loop copies the accumulated frame on every iteration.
daily_frames = []
for date in get_date_range(start_date, end_date):
    print(f"Processing date: {date}")
    df = CustomerIdentify(date)
    print(f"The shape of the dataframe for date {date} is:\t{df.shape}")
    if df.empty:
        # No customer onboarded on this day: an empty id tuple would render as
        # `in ()` inside the SQL built by demographicdata.
        continue
    # `b` keeps the ids as returned by the query; `a` holds them as integers.
    b = tuple(df['cust_id'])
    df['cust_id'] = df['cust_id'].astype(np.int64)
    a = tuple(df['cust_id'])
    # print(a[0:5])
    # print(b[0:5])
    dfd = demographicdata(a, b, df)
    # print(dfd.columns)
    daily_frames.append(dfd)

if daily_frames:
    res = pd.concat(daily_frames)
    # Calendar date of onboarding, derived once over the combined frame
    res['onboardedDate'] = res['onboardingDate'].dt.date
else:
    res = pd.DataFrame()


Processing date: 2023-01-01
The shape of the dataframe for date 2023-01-01 is:	(778, 2)
Job ID 546a889a-304d-4b73-9955-38f2fa10ac68 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Job ID a42df99b-7131-4c6a-a922-5af65da77bfc successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Merge Client Demo 1 with educationdf 
Job ID fcaad739-9b4d-4996-b2e4-16a72bc3b83f successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Job ID eaf550ce-f9c3-4f13-8c17-dcb05ec8bf4d successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Job ID fc04c04e-c330-4b87-a95c-e76aa42c8a03 successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Job ID be6cc3c7-4136-4072-b647-f1d60e80a5dc successfully executed: 100%|[32m██████████[0m|
Downloading: 100%|[32m██████████[0m|
Job ID 5030565f-a10d-4624-af0a-8b61bf34d357 successfully executed: 100%|[32m█████

In [None]:
# Print the (rows, columns) shape of the combined frame, then preview its first rows.
print(f"The total rows for {res.shape}")
res.head()


In [None]:
# # Rename columns and add prefix de_

# columns_to_rename = ['gender', 'birthplace', 'mobile_no', 'product', 'type', 'telcoProvider',
#                      'education_id', 'Education_Type', 'dateOfBirth', 'signUpAccNo', 'kycStatus',
#                      'addressline1', 'city', 'province', 'barangay', 'provinceCode', 'postalcode',
#                      'mailingCity', 'mailingPostalCode', 'mailingProvince', 'natureofwork',
#                      'employmentstatus', 'mobileOs', 'docType', 'onboardingDate',
#                      'Minutestakentoonboard', 'companyName', 'industry', 'device_dtl', 'Age',
#                      'Region_Name', 'Onboarding_latitude', 'Onboarding_longitude',
#                      'loanjourney_latitude', 'loanjouney_longitude', 'ApigeeLogs_latitude',
#                      'ApigeeLogs_longitude', 'maritalStatus', 'dependentsCount', 'monthlyIncome',
#                      'startApplyDateTime', 'network_group', 'Source', 'media_source', 'partner',
#                      'campaign', 'SourceGroups', 'earliest_date', 'months_difference']

# # Create a dictionary for renaming the columns
# rename_dict = {col: f'de_{col}' for col in columns_to_rename}

# # Apply the renaming to the DataFrame
# res = res.rename(columns=rename_dict)

# res.head()


In [7]:
# Number of rows per onboarding calendar date (missing dates counted too), ordered by date.
res['onboardedDate'].value_counts(dropna=False).sort_index()

onboardedDate
2023-01-01     486
2023-01-02     902
2023-01-03     772
2023-01-04     806
2023-01-05     710
              ... 
2024-09-03    1555
2024-09-04    1307
2024-09-05    1393
2024-09-06    1590
2024-09-07    1739
Name: count, Length: 616, dtype: int64

## Save the raw data produced by the daily loop as a CSV file

In [8]:
# Write the combined frame (original column names, no index column) to an absolute,
# user-specific OneDrive path; the date suffix in the file name is hard-coded.
res.to_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\Demographic_Daily_Snapshot_Data_20240909.csv", index = False)

## Check the column names in DataFrame `res`

In [9]:
# Column names before renaming.
res.columns

Index(['cust_id', 'user_id', 'device_id', 'created_dt', 'gender', 'birthplace',
       'mobile_no', 'product', 'type', 'telcoProvider', 'education_id',
       'Education_Type', 'dateOfBirth', 'signUpAccNo', 'kycStatus',
       'addressline1', 'city', 'province', 'barangay', 'provinceCode',
       'postalcode', 'mailingCity', 'mailingPostalCode', 'mailingProvince',
       'natureofwork', 'employmentstatus', 'mobileOs', 'docType',
       'onboardingDate', 'Minutestakentoonboard', 'companyName', 'industry',
       'device_dtl', 'Age', 'Region_Name', 'Onboarding_latitude',
       'Onboarding_longitude', 'loanjourney_latitude', 'loanjouney_longitude',
       'ApigeeLogs_latitude', 'ApigeeLogs_longitude', 'maritalStatus',
       'dependentsCount', 'monthlyIncome', 'startApplyDateTime',
       'network_group', 'Source', 'media_source', 'partner', 'campaign',
       'SourceGroups', 'earliest_date', 'months_difference', 'onboardedDate'],
      dtype='object')

## Create the rename dictionary from the A_Demographic_data_bnb_update Excel sheet

In [11]:
# Read the 'rename_columns_as' sheet of the column-mapping workbook
# (absolute, user-specific OneDrive path).
excel_data = pd.read_excel(r'C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\SupportingData\A_Demographic_data_bnb_update.xlsx', sheet_name = 'rename_columns_as')

# Map each current column name (Current_Name) to its new name (Updated_Name);
# if a current name appears more than once, the last row wins.
rename_dict = dict(zip(excel_data['Current_Name'], excel_data['Updated_Name']))

rename_dict

{'cust_id': 'cust_id',
 'user_id': 'user_id',
 'device_id': 'device_id',
 'created_dt': 'created_dt',
 'gender': 'de_gender',
 'birthplace': 'de_birthplace',
 'mobile_no': 'de_mobile_no',
 'product': 'de_first_product',
 'type': 'de_usage_type',
 'telcoProvider': 'de_telco_provider',
 'education_id': 'de_education_id',
 'Education_Type': 'de_education_type',
 'dateOfBirth': 'de_date_of_birth',
 'signUpAccNo': 'de_sign_up_acc_no',
 'kycStatus': 'de_kyc_status',
 'addressline1': 'de_addressline1',
 'city': 'de_city',
 'province': 'de_province',
 'barangay': 'de_barangay',
 'provinceCode': 'de_province_code',
 'postalcode': 'de_postal_code',
 'mailingCity': 'de_mailing_city',
 'mailingPostalCode': 'de_mailing_postal_code',
 'mailingProvince': 'de_mailing_province',
 'natureofwork': 'de_nature_of_work',
 'employmentstatus': 'de_employment_status',
 'mobileOs': 'de_mobile_os',
 'docType': 'de_doc_type',
 'onboardingDate': 'de_onboarding_date',
 'Minutestakentoonboard': 'de_time_to_onboard_m

## Rename the columns as per the A_Demographic_data_bnb_update Excel sheet

In [12]:
# Rename columns in the DataFrame 'res'.
# The rename is done in place, so `res` no longer carries the original column names
# after this cell has run.
res.rename(columns=rename_dict, inplace=True)
res.columns

Index(['cust_id', 'user_id', 'device_id', 'created_dt', 'de_gender',
       'de_birthplace', 'de_mobile_no', 'de_first_product', 'de_usage_type',
       'de_telco_provider', 'de_education_id', 'de_education_type',
       'de_date_of_birth', 'de_sign_up_acc_no', 'de_kyc_status',
       'de_addressline1', 'de_city', 'de_province', 'de_barangay',
       'de_province_code', 'de_postal_code', 'de_mailing_city',
       'de_mailing_postal_code', 'de_mailing_province', 'de_nature_of_work',
       'de_employment_status', 'de_mobile_os', 'de_doc_type',
       'de_onboarding_date', 'de_time_to_onboard_mins', 'de_company_name',
       'de_industry', 'de_device_dtl', 'de_age', 'de_region_name',
       'de_onboarding_lat', 'de_onboarding_long', 'de_loan_journey_lat',
       'de_loan_jouney_long', 'de_apigee_logs_lat', 'de_apigee_logs_long',
       'de_marital_status', 'de_dependents_cnt', 'de_monthly_income',
       'de_start_apply_date_time', 'de_network_group', 'de_source',
       'de_media_source

## Reorder the columns in DataFrame `res`

In [14]:
# Move 'onboardedDate' to the front and keep the remaining columns in their renamed form.
# The names must match the renamed columns exactly (including 'de_loan_jouney_long',
# spelled as it appears in the renamed frame); a name missing from `res` raises KeyError.
res = res[['onboardedDate','cust_id', 'user_id', 'device_id', 'created_dt', 'de_gender',
       'de_birthplace', 'de_mobile_no', 'de_first_product', 'de_usage_type',
       'de_telco_provider', 'de_education_id', 'de_education_type',
       'de_date_of_birth', 'de_sign_up_acc_no', 'de_kyc_status',
       'de_addressline1', 'de_city', 'de_province', 'de_barangay',
       'de_province_code', 'de_postal_code', 'de_mailing_city',
       'de_mailing_postal_code', 'de_mailing_province', 'de_nature_of_work',
       'de_employment_status', 'de_mobile_os', 'de_doc_type',
       'de_onboarding_date', 'de_time_to_onboard_mins', 'de_company_name',
       'de_industry', 'de_device_dtl', 'de_age', 'de_region_name',
       'de_onboarding_lat', 'de_onboarding_long', 'de_loan_journey_lat',
       'de_loan_jouney_long', 'de_apigee_logs_lat', 'de_apigee_logs_long',
       'de_marital_status', 'de_dependents_cnt', 'de_monthly_income',
       'de_start_apply_date_time', 'de_network_group', 'de_source',
       'de_media_source', 'de_partner', 'de_acq_campaign', 'SourceGroups',
       'de_earliest_date', 'de_months_difference']].copy()

## Create the CSV file with the new column names

In [15]:
# Write the renamed and reordered frame (no index column) to an absolute,
# user-specific OneDrive path; the date suffix in the file name is hard-coded.
res.to_csv(r"C:\Users\DwaipayanChakroborti\OneDrive - Tonik Financial Pte Ltd\MyStuff\Biswa\Customer_360_Data_Prep\Data\Demographic_Daily_Snapshot_Data_renamedcolumn_20240909.csv", index = False)

In [17]:
# Print the column names, non-null counts and dtypes of `res`.
res.info()

<class 'pandas.core.frame.DataFrame'>
Index: 742699 entries, 0 to 1738
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype              
---  ------                    --------------   -----              
 0   onboardedDate             742699 non-null  object             
 1   cust_id                   742699 non-null  int64              
 2   user_id                   742699 non-null  object             
 3   device_id                 742699 non-null  object             
 4   created_dt                742699 non-null  datetime64[us, UTC]
 5   de_gender                 742699 non-null  object             
 6   de_birthplace             742699 non-null  object             
 7   de_mobile_no              742699 non-null  object             
 8   de_first_product          682841 non-null  object             
 9   de_usage_type             682841 non-null  object             
 10  de_telco_provider         573524 non-null  object             
 11  de_educ

In [38]:
# Restore `res` from the backup copy `rescopy`.
# NOTE(review): `rescopy` is only assigned by the `rescopy = res.copy()` cell
# (In [26]) that appears further down in this notebook, so this cell raises
# NameError on a fresh top-to-bottom run — confirm the intended cell order.
res = rescopy.copy()

In [40]:
# Display the column labels of `res`.
res.columns

Index(['onboardedDate', 'cust_id', 'user_id', 'device_id', 'created_dt',
       'de_gender', 'de_birthplace', 'de_mobile_no', 'de_first_product',
       'de_usage_type', 'de_telco_provider', 'de_education_id',
       'de_education_type', 'de_date_of_birth', 'de_sign_up_acc_no',
       'de_kyc_status', 'de_addressline1', 'de_city', 'de_province',
       'de_barangay', 'de_province_code', 'de_postal_code', 'de_mailing_city',
       'de_mailing_postal_code', 'de_mailing_province', 'de_nature_of_work',
       'de_employment_status', 'de_mobile_os', 'de_doc_type',
       'de_onboarding_date', 'de_time_to_onboard_mins', 'de_company_name',
       'de_industry', 'de_device_dtl', 'de_age', 'de_region_name',
       'de_onboarding_lat', 'de_onboarding_long', 'de_loan_journey_lat',
       'de_loan_jouney_long', 'de_apigee_logs_lat', 'de_apigee_logs_long',
       'de_marital_status', 'de_dependents_cnt', 'de_monthly_income',
       'de_start_apply_date_time', 'de_network_group', 'de_source',
      

In [41]:
# Build `res1` from every listed column of `res`; the two education columns
# (`de_education_id`, `de_education_type`) are intentionally not in this list.
res1 = res.loc[:, [
    'onboardedDate', 'cust_id', 'user_id', 'device_id', 'created_dt',
    'de_gender', 'de_birthplace', 'de_mobile_no',
    'de_first_product', 'de_usage_type', 'de_telco_provider',
    'de_date_of_birth', 'de_sign_up_acc_no', 'de_kyc_status',
    'de_addressline1', 'de_city', 'de_province', 'de_barangay',
    'de_province_code', 'de_postal_code',
    'de_mailing_city', 'de_mailing_postal_code', 'de_mailing_province',
    'de_nature_of_work', 'de_employment_status',
    'de_mobile_os', 'de_doc_type',
    'de_onboarding_date', 'de_time_to_onboard_mins',
    'de_company_name', 'de_industry', 'de_device_dtl',
    'de_age', 'de_region_name',
    'de_onboarding_lat', 'de_onboarding_long',
    'de_loan_journey_lat', 'de_loan_jouney_long',
    'de_apigee_logs_lat', 'de_apigee_logs_long',
    'de_marital_status', 'de_dependents_cnt', 'de_monthly_income',
    'de_start_apply_date_time',
    'de_network_group', 'de_source', 'de_media_source',
    'de_partner', 'de_acq_campaign', 'SourceGroups',
    'de_earliest_date', 'de_months_difference',
]].copy()

In [45]:
# (rows, columns) of `res1` before de-duplication.
res1.shape

(742699, 52)

In [46]:
# Remove rows that are identical across every column (the first occurrence is
# kept, which is the pandas default), then show the remaining (rows, columns).
res1 = res1.drop_duplicates()
res1.shape

(698470, 52)

In [49]:
# Separate frame holding only the customer key and the two education columns.
res2 = res.loc[:, ['cust_id', 'de_education_id', 'de_education_type']].copy()

In [50]:
# (rows, columns) of `res2` before filtering.
res2.shape

(742699, 3)

In [51]:
# For each cust_id keep only the rows whose de_education_id equals that
# customer's maximum de_education_id, then show the resulting (rows, columns).
# NOTE(review): if de_education_id is stored as strings, 'max' compares
# lexicographically rather than numerically — confirm the column dtype.
res2_filtered = res2.loc[
    res2['de_education_id'].eq(
        res2.groupby('cust_id')['de_education_id'].transform('max')
    )
]
res2_filtered.shape

(459501, 3)

In [57]:
# Collapse fully identical (cust_id, de_education_id, de_education_type) rows,
# keeping the first occurrence (the pandas default), and show the new shape.
res2_filtered = res2_filtered.drop_duplicates()
res2_filtered.shape

(423618, 3)

In [58]:
# Attach the education columns from `res2_filtered` to every row of `res1`
# (left join on cust_id, so customers without an education row are kept).
# validate='many_to_one' makes pandas raise MergeError if `res2_filtered` ever
# contains more than one row for a cust_id, instead of silently multiplying the
# matching rows of `res1`.
finaldata = res1.merge(
    res2_filtered,
    on='cust_id',
    how='left',
    validate='many_to_one',
)
finaldata.shape

(698470, 54)

In [26]:
# Keep a backup copy of `res` so it can be restored later.
# NOTE(review): the `res = rescopy.copy()` cell (In [38]) appears earlier in the
# notebook than this cell, so a top-to-bottom run reaches it before `rescopy`
# exists — confirm the intended cell order.
rescopy = res.copy()

In [59]:
# Every row whose cust_id occurs more than once in `finaldata`
# (keep=False marks all occurrences, not just the repeats).
duplicate_rows = finaldata.loc[finaldata.duplicated(subset='cust_id', keep=False)]

In [60]:
# Distinct cust_id values that are duplicated; an empty array means none are.
duplicate_rows['cust_id'].unique()

array([], dtype=int64)

In [62]:
# Spot-check the merged record(s) for a single customer.
# NOTE(review): the rendered row exposes customer PII (mobile number, address,
# date of birth) — clear this output before sharing or committing the notebook.
finaldata.loc[finaldata['cust_id'].eq(1846420)]

Unnamed: 0,onboardedDate,cust_id,user_id,device_id,created_dt,de_gender,de_birthplace,de_mobile_no,de_first_product,de_usage_type,de_telco_provider,de_date_of_birth,de_sign_up_acc_no,de_kyc_status,de_addressline1,de_city,de_province,de_barangay,de_province_code,de_postal_code,de_mailing_city,de_mailing_postal_code,de_mailing_province,de_nature_of_work,de_employment_status,de_mobile_os,de_doc_type,de_onboarding_date,de_time_to_onboard_mins,de_company_name,de_industry,de_device_dtl,de_age,de_region_name,de_onboarding_lat,de_onboarding_long,de_loan_journey_lat,de_loan_jouney_long,de_apigee_logs_lat,de_apigee_logs_long,de_marital_status,de_dependents_cnt,de_monthly_income,de_start_apply_date_time,de_network_group,de_source,de_media_source,de_partner,de_acq_campaign,SourceGroups,de_earliest_date,de_months_difference,de_education_id,de_education_type
2,2023-01-01,1846420,3d5cfd27ae59498785e40c7c0a9bea58,35dafe140e6644bb8551325866e62cb4iFvx32sI,2023-01-01 11:37:51+00:00,M,Santo tomas,639621306533,,,,18 Jun 1985,60818464200009,SKYC,0470 JASMIN ST PINEDA SUBD DAU MABALACAT CIT...,MABALACAT,Pampanga,DAU,,2010,MABALACAT,2010,Pampanga,6,1,iOS,Philippines - UMID Card,2023-01-01 11:37:51,28,UPS International Inc,8,IOS-17.5.1,39,Region III – Central Luzon,,,,,15.188132037168486,120.5866214461408,Live-in Partner,No dependents,45000,2024-08-25 09:16:37,,,,,,,2023-01-01 11:37:51,20.27,14,College Graduate


In [63]:
# Display the column labels of `finaldata` after the merge.
finaldata.columns

Index(['onboardedDate', 'cust_id', 'user_id', 'device_id', 'created_dt',
       'de_gender', 'de_birthplace', 'de_mobile_no', 'de_first_product',
       'de_usage_type', 'de_telco_provider', 'de_date_of_birth',
       'de_sign_up_acc_no', 'de_kyc_status', 'de_addressline1', 'de_city',
       'de_province', 'de_barangay', 'de_province_code', 'de_postal_code',
       'de_mailing_city', 'de_mailing_postal_code', 'de_mailing_province',
       'de_nature_of_work', 'de_employment_status', 'de_mobile_os',
       'de_doc_type', 'de_onboarding_date', 'de_time_to_onboard_mins',
       'de_company_name', 'de_industry', 'de_device_dtl', 'de_age',
       'de_region_name', 'de_onboarding_lat', 'de_onboarding_long',
       'de_loan_journey_lat', 'de_loan_jouney_long', 'de_apigee_logs_lat',
       'de_apigee_logs_long', 'de_marital_status', 'de_dependents_cnt',
       'de_monthly_income', 'de_start_apply_date_time', 'de_network_group',
       'de_source', 'de_media_source', 'de_partner', 'de_acq_campa

In [64]:
# Reorder `finaldata` so the two education columns sit right after
# `de_telco_provider` instead of at the end, where the merge appended them.
finaldata = finaldata.loc[:, [
    'onboardedDate', 'cust_id', 'user_id', 'device_id', 'created_dt',
    'de_gender', 'de_birthplace', 'de_mobile_no',
    'de_first_product', 'de_usage_type', 'de_telco_provider',
    'de_education_id', 'de_education_type',
    'de_date_of_birth', 'de_sign_up_acc_no', 'de_kyc_status',
    'de_addressline1', 'de_city', 'de_province', 'de_barangay',
    'de_province_code', 'de_postal_code',
    'de_mailing_city', 'de_mailing_postal_code', 'de_mailing_province',
    'de_nature_of_work', 'de_employment_status',
    'de_mobile_os', 'de_doc_type',
    'de_onboarding_date', 'de_time_to_onboard_mins',
    'de_company_name', 'de_industry', 'de_device_dtl',
    'de_age', 'de_region_name',
    'de_onboarding_lat', 'de_onboarding_long',
    'de_loan_journey_lat', 'de_loan_jouney_long',
    'de_apigee_logs_lat', 'de_apigee_logs_long',
    'de_marital_status', 'de_dependents_cnt', 'de_monthly_income',
    'de_start_apply_date_time',
    'de_network_group', 'de_source', 'de_media_source',
    'de_partner', 'de_acq_campaign', 'SourceGroups',
    'de_earliest_date', 'de_months_difference',
]].copy()

In [65]:
# Print the column names, non-null counts and dtypes of `finaldata`.
finaldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698470 entries, 0 to 698469
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype              
---  ------                    --------------   -----              
 0   onboardedDate             698470 non-null  datetime64[ns]     
 1   cust_id                   698470 non-null  int64              
 2   user_id                   698470 non-null  object             
 3   device_id                 698470 non-null  object             
 4   created_dt                698470 non-null  datetime64[us, UTC]
 5   de_gender                 698470 non-null  object             
 6   de_birthplace             698470 non-null  object             
 7   de_mobile_no              698470 non-null  object             
 8   de_first_product          640213 non-null  object             
 9   de_usage_type             640213 non-null  object             
 10  de_telco_provider         543654 non-null  object             
 11  

In [69]:
# Load `finaldata` into BigQuery as worktable_data_analysis.customer_demographic_data.

# Convert 'onboardedDate' to datetime on the frame that is actually uploaded.
# (The conversion was previously applied to `res`, which is not the frame passed
# to the load job below.) Unparseable values become NaT because of errors='coerce'.
finaldata['onboardedDate'] = pd.to_datetime(finaldata['onboardedDate'], errors='coerce')

# Define the dataset and table name
dataset_id = 'worktable_data_analysis'
table_id = 'customer_demographic_data'

# Define the schema explicitly so BigQuery does not infer column types from the
# DataFrame dtypes.
schema = [
    bigquery.SchemaField("onboardedDate", "TIMESTAMP"),
    bigquery.SchemaField("cust_id", "INT64"),
    bigquery.SchemaField("user_id", "STRING"),
    bigquery.SchemaField("device_id", "STRING"),
    bigquery.SchemaField("created_dt", "TIMESTAMP"),
    bigquery.SchemaField("de_gender", "STRING"),
    bigquery.SchemaField("de_birthplace", "STRING"),
    bigquery.SchemaField("de_mobile_no", "STRING"),
    bigquery.SchemaField("de_first_product", "STRING"),
    bigquery.SchemaField("de_usage_type", "STRING"),
    bigquery.SchemaField("de_telco_provider", "STRING"),
    bigquery.SchemaField("de_education_id", "STRING"),
    bigquery.SchemaField("de_education_type", "STRING"),
    bigquery.SchemaField("de_date_of_birth", "STRING"),
    bigquery.SchemaField("de_sign_up_acc_no", "STRING"),
    bigquery.SchemaField("de_kyc_status", "STRING"),
    bigquery.SchemaField("de_addressline1", "STRING"),
    bigquery.SchemaField("de_city", "STRING"),
    bigquery.SchemaField("de_province", "STRING"),
    bigquery.SchemaField("de_barangay", "STRING"),
    bigquery.SchemaField("de_province_code", "STRING"),
    bigquery.SchemaField("de_postal_code", "STRING"),
    bigquery.SchemaField("de_mailing_city", "STRING"),
    bigquery.SchemaField("de_mailing_postal_code", "STRING"),
    bigquery.SchemaField("de_mailing_province", "STRING"),
    bigquery.SchemaField("de_nature_of_work", "STRING"),
    bigquery.SchemaField("de_employment_status", "STRING"),
    bigquery.SchemaField("de_mobile_os", "STRING"),
    bigquery.SchemaField("de_doc_type", "STRING"),
    bigquery.SchemaField("de_onboarding_date", "TIMESTAMP"),
    bigquery.SchemaField("de_time_to_onboard_mins", "INT64"),
    bigquery.SchemaField("de_company_name", "STRING"),
    bigquery.SchemaField("de_industry", "STRING"),
    bigquery.SchemaField("de_device_dtl", "STRING"),
    bigquery.SchemaField("de_age", "INT64"),
    bigquery.SchemaField("de_region_name", "STRING"),
    bigquery.SchemaField("de_onboarding_lat", "STRING"),
    bigquery.SchemaField("de_onboarding_long", "STRING"),
    bigquery.SchemaField("de_loan_journey_lat", "STRING"),
    bigquery.SchemaField("de_loan_jouney_long", "STRING"),
    bigquery.SchemaField("de_apigee_logs_lat", "STRING"),
    bigquery.SchemaField("de_apigee_logs_long", "STRING"),
    bigquery.SchemaField("de_marital_status", "STRING"),
    bigquery.SchemaField("de_dependents_cnt", "STRING"),
    bigquery.SchemaField("de_monthly_income", "STRING"),
    bigquery.SchemaField("de_start_apply_date_time", "TIMESTAMP"),
    bigquery.SchemaField("de_network_group", "STRING"),
    bigquery.SchemaField("de_source", "STRING"),
    bigquery.SchemaField("de_media_source", "STRING"),
    bigquery.SchemaField("de_partner", "STRING"),
    bigquery.SchemaField("de_acq_campaign", "STRING"),
    bigquery.SchemaField("SourceGroups", "STRING"),
    bigquery.SchemaField("de_earliest_date", "TIMESTAMP"),
    bigquery.SchemaField("de_months_difference", "FLOAT64")
]

# Create the dataset reference in the client's default project.
# (`client.dataset()` is deprecated; DatasetReference is the supported equivalent.)
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)

# Define the table reference
table_ref = dataset_ref.table(table_id)

# Configure the job to overwrite the table if it already exists. Without an
# explicit write disposition a load job appends, so re-running this cell would
# duplicate every row in the table.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Load the DataFrame into BigQuery
job = client.load_table_from_dataframe(finaldata, table_ref, job_config=job_config)

# Wait for the job to complete; raises if the load failed
job.result()

print(f"Table {table_id} created in dataset {dataset_id}.")


Table customer_demographic_data created in dataset worktable_data_analysis.


In [67]:
# Spot-check the final record(s) for a single customer.
# NOTE(review): the rendered row exposes customer PII (mobile number, address,
# date of birth) — clear this output before sharing or committing the notebook.
finaldata.loc[finaldata['cust_id'].eq(2297061)]

Unnamed: 0,onboardedDate,cust_id,user_id,device_id,created_dt,de_gender,de_birthplace,de_mobile_no,de_first_product,de_usage_type,de_telco_provider,de_education_id,de_education_type,de_date_of_birth,de_sign_up_acc_no,de_kyc_status,de_addressline1,de_city,de_province,de_barangay,de_province_code,de_postal_code,de_mailing_city,de_mailing_postal_code,de_mailing_province,de_nature_of_work,de_employment_status,de_mobile_os,de_doc_type,de_onboarding_date,de_time_to_onboard_mins,de_company_name,de_industry,de_device_dtl,de_age,de_region_name,de_onboarding_lat,de_onboarding_long,de_loan_journey_lat,de_loan_jouney_long,de_apigee_logs_lat,de_apigee_logs_long,de_marital_status,de_dependents_cnt,de_monthly_income,de_start_apply_date_time,de_network_group,de_source,de_media_source,de_partner,de_acq_campaign,SourceGroups,de_earliest_date,de_months_difference
334160,2023-11-07,2297061,6773680d603e4760bec72eeb937e756c,9e34a152e8204209b532d70ad0ca6f88p5f9rC5u,2023-11-07 12:03:50+00:00,M,Bucal MaragondonCavite,639087228122,TSAE1.0,Explore,,14,College Graduate,06 Dec 2000,60822970610006,SKYC,BUCAL III B MARAGONDON CAVITE REGION 4,MARAGONDON,Cavite,BUCAL III B,,4112,MARAGONDON,4112,Cavite,0,8,iOS,Philippines - Driving License,2023-11-07 12:03:50,2,NONE,0,,23,Region IV-A – CALABARZON,,,,,14.273879344145676,120.75503172997,Single,No dependents,70000,2024-08-16 17:34:32,,,,,,,2023-11-07 12:03:50,10.09
