# KOL Data Standardization - Step 3 (Data Loading)

In [1]:
"""
KOL Data Standardization - Step 3 (Extracting the staging area data and loading it to the reporting area)

This module is the final step in the KOL Data Standardization process: it loads the staging area data and prepares it for the reporting layer.
It includes the following steps:
- Loading the staging area dataframe into memory.
- Separating out the mdm_ids and their degree/speciality mappings so they can be stored in separate tables.
- Storing the data in the reporting layer.
The above steps will ensure that the data is ready for the reporting layer which is suitable for the end-users.
"""

import pandas as pd
from IPython.display import display

In [2]:
# Every workbook used by this step lives in the same folder.
_OUTPUT_DIR = "output_store"

# Input workbook read by the loading cell.
STAGING_AREA_FILE_PATH = f"{_OUTPUT_DIR}/staging_area.xlsx"

# Output workbooks: the reporting dataset plus the speciality and degree maps.
REPORTING_AREA_FILE_PATH = f"{_OUTPUT_DIR}/reporting_area.xlsx"
REPORTING_AREA_SP_MAP_FILE_PATH = f"{_OUTPUT_DIR}/reporting_area_sp_map.xlsx"
REPORTING_AREA_DEG_MAP_FILE_PATH = f"{_OUTPUT_DIR}/reporting_area_deg_map.xlsx"

In [3]:
print("TASK: Loading the Staging Area Dataframe")
# Read the multi-valued id columns explicitly as text. Without a dtype, pandas
# infers the type per column, so a column that happens to hold only single ids
# could come back numeric and the later `.str.split(',')` step would fail.
staging_df = pd.read_excel(
    STAGING_AREA_FILE_PATH,
    dtype={"speciality": str, "degree": str},
)
# Preview the first rows to confirm the workbook was read as expected.
display(staging_df.head())

TASK: Loading the Staging Area Dataframe


Unnamed: 0,mdm_id,first_name,last_name,age,city,state,profile_status,speciality,degree,batch_id
0,109,Justin,Davenport,-1,Laurenport,Washington,Not Profiled,295,236,2024-09-07 14:43:31.929
1,119,Jared,-,43,-,North Dakota,Not Profiled,1061873839,2,2024-09-07 14:43:31.966
2,124,Justin,Sexton,-1,Bethstad,Colorado,Not Profiled,330466,0,2024-09-07 14:43:31.929
3,130,Courtney,-,-1,Jamesview,Montana,Not Profiled,0,0,2024-09-07 14:43:31.966
4,133,Alex,-,19,Thomasside,-,Partially Profiled,30770,8,2024-09-07 14:43:31.929


In [4]:
def _explode_id_map(frame, source_col, id_col):
    """Build an (mdm_id, id_col) frame with one row per comma-separated entry.

    ``frame[source_col]`` is split on ',' and exploded so that every entry of
    the list gets its own row; only ``mdm_id`` and ``id_col`` are kept.
    The exploded rows keep the index label of the row they came from.
    """
    split_values = frame[source_col].str.split(',')
    exploded = frame.assign(**{id_col: split_values}).explode(id_col)
    return exploded[["mdm_id", id_col]]


print("TASK: Preparing the KOL speciality and degree maps")
kol_speciality_map = _explode_id_map(staging_df, "speciality", "speciality_id")
kol_degree_map = _explode_id_map(staging_df, "degree", "degree_id")

print("TASK: Preparing the KOL reporting layer dataset")
# The reporting dataset drops the columns that were moved to the maps, plus batch_id.
reporting_df = staging_df.drop(columns=["speciality", "degree", "batch_id"])

TASK: Preparing the KOL speciality and degree maps
TASK: Preparing the KOL reporting layer dataset


In [5]:
# Preview the first five rows of the mdm_id -> speciality_id map.
display(kol_speciality_map.head(5))

Unnamed: 0,mdm_id,speciality_id
0,109,295
1,119,106
1,119,187
1,119,383
1,119,9


In [6]:
# Preview the first five rows of the mdm_id -> degree_id map.
display(kol_degree_map.head(5))

Unnamed: 0,mdm_id,degree_id
0,109,2
0,109,3
0,109,6
1,119,2
2,124,0


In [7]:
# Preview the first five rows of the reporting dataset.
display(reporting_df.head(5))

Unnamed: 0,mdm_id,first_name,last_name,age,city,state,profile_status
0,109,Justin,Davenport,-1,Laurenport,Washington,Not Profiled
1,119,Jared,-,43,-,North Dakota,Not Profiled
2,124,Justin,Sexton,-1,Bethstad,Colorado,Not Profiled
3,130,Courtney,-,-1,Jamesview,Montana,Not Profiled
4,133,Alex,-,19,Thomasside,-,Partially Profiled


In [8]:
print("TASK: Saving the data maps and reporting area dataset")
# Write each frame to its own workbook; the dataframe index is not exported.
for frame, target_path in (
    (kol_speciality_map, REPORTING_AREA_SP_MAP_FILE_PATH),
    (kol_degree_map, REPORTING_AREA_DEG_MAP_FILE_PATH),
    (reporting_df, REPORTING_AREA_FILE_PATH),
):
    frame.to_excel(target_path, index=False)
print("TASK: Successfully loaded the data to the reporting area")

TASK: Saving the data maps and reporting area dataset
TASK: Successfully loaded the data to the reporting area
