In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to the project's .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
import os
from datanooblol.configuration.config_manager import LoadFeatureConfig, LoadRepoConfig
from datanooblol.extractor.data_extractor import DataExtractor
from datanooblol.uploader.data_uploader import DataUploader
from datanooblol.data_quality.health_check import HealthCheck
from datanooblol.feature_store.feature_register import FeatureRegister

In [2]:
# Feature groups iterated over by the staging and feature-store upload loops,
# in processing order.
FEATURE_GROUPS = [
    "target",
    "proportion",
    "geography",
    "toxic",
    "monetary",
]

In [7]:
# Helper objects shared by the cells below.
# NOTE(review): `hc` is not referenced in any cell visible here.
hc = HealthCheck()
freg = FeatureRegister()

# NOTE(review): start_date and end_date are identical; presumably this selects
# a single `load_dt` partition -- TODO confirm against DataExtractor.
dxtr = DataExtractor(
    start_date="2022-04-01",
    end_date="2022-04-01",
    partition="load_dt",
)

dulr = DataUploader()

In [4]:
# Landing -> staging ("cleaned" step).
# For each feature group, read its landing-zone data (no step) and upload it
# to the staging zone under step "cleaned". This cell itself applies no
# transformation between the extract and the upload.
for feature_grp in FEATURE_GROUPS:
    landing_data = dxtr.extract_zone(
        zone="landing",
        step=None,
        feature_grp=feature_grp,
    )
    dulr.upload_zone(
        zone="staging",
        step="cleaned",
        feature_grp=feature_grp,
        data=landing_data,
    )

In [5]:
# Staging "cleaned" -> staging "aggregated".
# For each feature group, read the staging data written under step "cleaned"
# and upload it back to staging under step "aggregated". This cell itself
# applies no transformation between the extract and the upload.
for feature_grp in FEATURE_GROUPS:
    cleaned_data = dxtr.extract_zone(
        zone="staging",
        step="cleaned",
        feature_grp=feature_grp,
    )
    dulr.upload_zone(
        zone="staging",
        step="aggregated",
        feature_grp=feature_grp,
        data=cleaned_data,
    )

# Feature engineering and feature-group registration

In [14]:
# Load the aggregated "target" group from staging; the same frame backs all
# four registrations below.
target_df = dxtr.extract_zone(
    zone="staging",
    step="aggregated",
    feature_grp="target",
)

# Target column.
freg.register(
    feature_grp="target",
    features=["MEDV"],
    data=target_df,
)
# Entity key column.
freg.register(
    feature_grp="entity",
    features=["house_id"],
    data=target_df,
)
# Load-date column.
freg.register(
    feature_grp="load_date",
    features=["load_dt"],
    data=target_df,
)
# Event-timestamp column (kept as the cell's final expression).
freg.register(
    feature_grp="timestamp_field",
    features=["event_timestamp"],
    data=target_df,
)

In [15]:
# Load the aggregated "proportion" group from staging and register its columns.
proportion_df = dxtr.extract_zone(
    zone="staging",
    step="aggregated",
    feature_grp="proportion",
)
proportion_features = ["ZN", "INDUS", "RM", "PTRATIO", "B", "LSTAT"]
freg.register(
    feature_grp="proportion",
    features=proportion_features,
    data=proportion_df,
)

In [16]:
# Load the aggregated "geography" group from staging and register its columns.
geography_df = dxtr.extract_zone(
    zone="staging",
    step="aggregated",
    feature_grp="geography",
)
geography_features = ["CHAS", "DIS", "RAD"]
freg.register(
    feature_grp="geography",
    features=geography_features,
    data=geography_df,
)

In [17]:
# Load the aggregated "toxic" group from staging and register its single column.
toxic_df = dxtr.extract_zone(
    zone="staging",
    step="aggregated",
    feature_grp="toxic",
)
toxic_features = ["NOX"]
freg.register(
    feature_grp="toxic",
    features=toxic_features,
    data=toxic_df,
)

In [18]:
# Load the aggregated "monetary" group from staging and register its single column.
monetary_df = dxtr.extract_zone(
    zone="staging",
    step="aggregated",
    feature_grp="monetary",
)
monetary_features = ["TAX"]
freg.register(
    feature_grp="monetary",
    features=monetary_features,
    data=monetary_df,
)

In [13]:
# Staging "aggregated" -> feature store.
# For each feature group, read the aggregated staging data and hand it to the
# uploader's feature-store upload.
for feature_grp in FEATURE_GROUPS:
    aggregated_data = dxtr.extract_zone(
        zone="staging",
        step="aggregated",
        feature_grp=feature_grp,
    )
    dulr.upload_feature_store(
        feature_grp=feature_grp,
        data=aggregated_data,
    )