# KOL Data Standardization - Step 0 (Dummy Data Input Generation)

In [1]:
"""
KOL Data Standardization - Step 0 (Preparing Input Data in Batches for the ETL process)

This module generates a dummy KOL input dataset using Python's Faker package.
It serves as a utility tool to generate the input dataset for the KOL Data Standardization Pipeline.
It is based on the fact that a KOL (Key Opinion Leader) can have multiple degrees as well as multiple specialities.
"""

import os
import datetime
import faker
import pandas as pd
import utils

In [2]:
# Load the master tables whose values are used to build the dummy input data
# for KOL Data Standardization.

SPECIALITY_MASTER_PATH = "master_tables/speciality_master.csv"
PROFILE_STATUS_MASTER_PATH = "master_tables/profile_status_master.csv"
# Misspelled legacy name, kept as an alias so existing references keep working.
PROFILE_STATUS_MATER_PATH = PROFILE_STATUS_MASTER_PATH
DEGREE_MASTER_PATH = "master_tables/degree_master.csv"

# Values of the "speciality" column; generate_kol_data() samples from this list.
speciality_df = pd.read_csv(SPECIALITY_MASTER_PATH)
SPECIALITIES = speciality_df["speciality"].tolist()

# Values of the "degree" column; generate_kol_data() samples from this list.
degree_df = pd.read_csv(DEGREE_MASTER_PATH)
DEGREES = degree_df["degree"].tolist()

In [3]:
def generate_kol_data(faker_obj=None):
    """Generate one dummy KOL record (with random missing values) for the standardization input.

    Random decisions are delegated to ``utils.random_utility``; it is called
    here with the names "choice" and "randint" and presumably forwards to the
    matching ``random`` functions -- confirm in ``utils``.

    Args:
        faker_obj: Optional ``faker.Faker`` instance used to produce the fake
            values. When omitted, a new instance is created for this call.
            Passing a shared instance avoids rebuilding it for every record.

    Returns:
        dict: A single record with the keys ``mdm_id``, ``first_name``,
        ``last_name``, ``age``, ``city``, ``state``, ``profile_status``,
        ``speciality`` and ``degree``. Every field except ``mdm_id`` may be
        ``None``. ``degree`` and ``speciality`` are comma-joined strings of
        values drawn from ``DEGREES`` and ``SPECIALITIES``.
    """
    if faker_obj is None:
        faker_obj = faker.Faker()

    dob = faker_obj.date_of_birth(minimum_age=18, maximum_age=60)
    today = datetime.date.today()
    # Age in completed years: subtract one when this year's birthday has not
    # happened yet. Dividing the elapsed days by 365 ignores leap days and can
    # overstate the age by one close to a birthday.
    age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))

    gender = utils.random_utility("choice", {"seq": ["Male", "Female", "Other"]})
    if gender == "Male":
        f_name = faker_obj.first_name_male()
        l_name = faker_obj.last_name_male()
    elif gender == "Female":
        f_name = faker_obj.first_name_female()
        l_name = faker_obj.last_name_female()
    else:
        f_name = faker_obj.first_name_nonbinary()
        l_name = faker_obj.last_name_nonbinary()

    # Each optional field is a pick between None and a generated value, which
    # is how missing values get injected into the dataset.
    state = utils.random_utility("choice", {"seq": [None, faker_obj.state()]})
    city = utils.random_utility("choice", {"seq": [None, faker_obj.city()]})

    # A KOL can hold several degrees; a count of zero means the field is missing.
    degree_count = utils.random_utility("randint", {"a": 0, "b": 3})
    degrees = [
        utils.random_utility("choice", {"seq": DEGREES}) for _ in range(degree_count)
    ]
    degree = ",".join(degrees) if degrees else None

    # Same scheme for specialities, drawn from the speciality master values.
    speciality_count = utils.random_utility("randint", {"a": 0, "b": 4})
    specialities = [
        utils.random_utility("choice", {"seq": SPECIALITIES})
        for _ in range(speciality_count)
    ]
    speciality = ",".join(specialities) if specialities else None

    profile_status = utils.random_utility("choice", {"seq": [None, 0, 1, 2]})

    kol_data = {
        "mdm_id": utils.random_utility("randint", {"a": 100, "b": 200}),
        "first_name": utils.random_utility("choice", {"seq": [None, f_name]}),
        "last_name": utils.random_utility("choice", {"seq": [None, l_name]}),
        "age": utils.random_utility("choice", {"seq": [None, age]}),
        "city": city,
        "state": state,
        "profile_status": profile_status,
        "speciality": speciality,
        "degree": degree,
    }
    return kol_data

In [4]:
if __name__ == "__main__":
    # Writes BATCHES Excel files of BATCH_SIZE dummy KOL records each into
    # DATA_STORE_PATH, named batch_1.xlsx, batch_2.xlsx, ...
    DATA_STORE_PATH = "data_store"
    BATCH_SIZE = 10
    BATCHES = 3

    # DataFrame.to_excel() does not create missing parent directories, so make
    # sure the output folder exists before the first batch is written.
    os.makedirs(DATA_STORE_PATH, exist_ok=True)

    print(
        f"Generating the KOL Input data.\nBatches: {BATCHES}.\nEach Batch Size: {BATCH_SIZE}\n\n"
    )
    for batch in range(1, BATCHES + 1):
        # Build all records first and create the DataFrame once per batch.
        data_list = [generate_kol_data() for _ in range(BATCH_SIZE)]
        batch_df = pd.DataFrame(data_list)
        file_path = os.path.join(DATA_STORE_PATH, f"batch_{batch}.xlsx")
        batch_df.to_excel(file_path, index=False)
        print(f"Saved Batch: {file_path}")
    print(
        "\nSuccessfully generated the input data for KOL Data Standardization process"
    )

Generating the KOL Input data.
Batches: 3.
Each Batch Size: 10


Saved Batch: data_store\batch_1.xlsx
Saved Batch: data_store\batch_2.xlsx
Saved Batch: data_store\batch_3.xlsx

Successfully generated the input data for KOL Data Standardization process
