# 0. Imports

In [16]:
import datetime as dt
import os
import pickle
import time

import numpy as np
import pandas as pd

# 1. Prepare training data

In [9]:
# Load the raw readings ('?' marks a missing value) and keep only the rows
# whose Date string ends with 2008 -- a one-year subset is enough for now.
data = (
    pd.read_csv('../data/household_power_consumption.txt', sep=';', na_values='?')
    .loc[lambda frame: frame['Date'].str.endswith('2008')]
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 527040 entries, 547596 to 1074635
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Date                   527040 non-null  object 
 1   Time                   527040 non-null  object 
 2   Global_active_power    526905 non-null  float64
 3   Global_reactive_power  526905 non-null  float64
 4   Voltage                526905 non-null  float64
 5   Global_intensity       526905 non-null  float64
 6   Sub_metering_1         526905 non-null  float64
 7   Sub_metering_2         526905 non-null  float64
 8   Sub_metering_3         526905 non-null  float64
dtypes: float64(7), object(2)
memory usage: 40.2+ MB


In [10]:
# Separate the three sub-metering targets (y) from the remaining columns (X).
# Date and Time stay in X as raw string columns; no datetime column is built here yet.
labels = ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
y = data.loc[:, labels]
X = data.drop(labels, axis=1)
X.shape, y.shape

((527040, 6), (527040, 3))

In [11]:
# Preview the first rows of the feature frame
X.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity
547596,1/1/2008,00:00:00,1.62,0.07,241.25,6.6
547597,1/1/2008,00:01:00,1.626,0.072,241.74,6.6
547598,1/1/2008,00:02:00,1.622,0.072,241.52,6.6
547599,1/1/2008,00:03:00,1.612,0.07,240.82,6.6
547600,1/1/2008,00:04:00,1.612,0.07,240.8,6.6


In [12]:
# Preview the first rows of the three sub-metering targets
y.head()

Unnamed: 0,Sub_metering_1,Sub_metering_2,Sub_metering_3
547596,0.0,0.0,18.0
547597,0.0,0.0,18.0
547598,0.0,0.0,18.0
547599,0.0,0.0,18.0
547600,0.0,0.0,18.0


# 2. Building preprocessor

In [None]:
# Get rid of nan values

In [None]:
# Create "global_consumption"

In [None]:
# Handle datetime format

In [13]:
# Create new datetime features

In [14]:
# Standard scaler 

In [None]:
# Build final preprocessor
# TODO: placeholder -- `...` is the Ellipsis object, not a transformer; assign the
# real preprocessing object here before the pipeline below is fitted.
preprocessor = ...

# 3. Building the estimator (model)

In [None]:
# Random forest model
# TODO: placeholder -- `...` is the Ellipsis object, not an estimator; assign the
# real model here before the pipeline below is fitted.
estimator = ...

# 4. Building the whole pipeline and saving it

In [15]:
# Wrap everything in a pipeline
from sklearn.pipeline import Pipeline

# Pipeline expects ONE argument: a list of (name, step) tuples. Passing the tuples
# as separate positional arguments would bind the second one to `memory` (or raise
# a TypeError on versions where `memory` is keyword-only).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('estimator', estimator),
    ]
)

In [None]:
# Save the pipeline locally
def _pickle_to_registry(obj, registry_path: str, kind: str, timestamp: str) -> str:
    """Pickle `obj` to <registry_path>/<kind>/<timestamp>.pickle and return that path."""
    target_dir = os.path.join(registry_path, kind)
    # Create the sub-directory on first use so a fresh checkout does not fail on open()
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, timestamp + ".pickle")
    with open(target_path, "wb") as file:
        pickle.dump(obj, file)
    return target_path


def save_pipeline(pipeline: "Pipeline" = None,
                  params: dict = None,
                  metrics: dict = None) -> None:
    """
    Persist a pipeline together with its params and metrics.

    Destination:
      - If the environment variable MODEL_TARGET equals "mlflow", everything is
        pushed to MLflow, configured through MLFLOW_TRACKING_URI,
        MLFLOW_EXPERIMENT and MLFLOW_MODEL_NAME.
      - Otherwise everything is pickled on local disk under the directory named
        by the LOCAL_REGISTRY_PATH environment variable (default: "./registry"),
        in the "params", "metrics" and "models" sub-directories. File names are
        the current local time formatted as YYYYmmdd-HHMMSS plus ".pickle".

    Args:
        pipeline: the pipeline object to persist; skipped when None.
        params: parameters to persist; skipped when None.
        metrics: metrics to persist; skipped when None.

    Returns:
        None.
    """

    timestamp = time.strftime("%Y%m%d-%H%M%S")

    if os.environ.get("MODEL_TARGET") == "mlflow":

        # Imported lazily so that saving locally does not require mlflow
        import mlflow
        import mlflow.sklearn

        # retrieve mlflow env params
        mlflow_tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
        mlflow_experiment = os.environ.get("MLFLOW_EXPERIMENT")
        mlflow_model_name = os.environ.get("MLFLOW_MODEL_NAME")

        # configure mlflow
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        mlflow.set_experiment(experiment_name=mlflow_experiment)

        with mlflow.start_run():

            # STEP 1: push parameters to mlflow
            if params is not None:
                mlflow.log_params(params)

            # STEP 2: push metrics to mlflow
            if metrics is not None:
                mlflow.log_metrics(metrics)

            # STEP 3: push the pipeline to mlflow using the scikit-learn flavor
            # (the object built in this notebook is an sklearn Pipeline, not a keras model)
            if pipeline is not None:
                mlflow.sklearn.log_model(sk_model=pipeline,
                                         artifact_path="model",
                                         registered_model_name=mlflow_model_name)

        print("\n✅ data saved to mlflow")

        return None

    print("\nSave model to local disk...")

    # Root folder of the local registry; "~" is expanded so values like "~/registry" work
    registry_path = os.path.expanduser(
        os.environ.get("LOCAL_REGISTRY_PATH", os.path.join(".", "registry"))
    )

    # save params
    if params is not None:
        params_path = _pickle_to_registry(params, registry_path, "params", timestamp)
        print(f"- params path: {params_path}")

    # save metrics
    if metrics is not None:
        metrics_path = _pickle_to_registry(metrics, registry_path, "metrics", timestamp)
        print(f"- metrics path: {metrics_path}")

    # save the pipeline itself; it is pickled like the other artifacts because an
    # sklearn Pipeline has no `.save()` method
    if pipeline is not None:
        model_path = _pickle_to_registry(pipeline, registry_path, "models", timestamp)
        print(f"- model path: {model_path}")

    print("\n✅ data saved locally")

    return None

In [None]:
# No hyper-parameters or scores are recorded yet, so both dicts start out empty
params = dict()
metrics = dict()

# NOTE(review): no visible cell calls `pipeline.fit(...)` before this point --
# confirm that persisting an unfitted pipeline is intended.
save_pipeline(pipeline=pipeline, params=params, metrics=metrics)