# 0. Imports

In [7]:
# imports
import pandas as pd
import numpy as np
import datetime as dt
import time
import os
import pickle

from colorama import Fore, Style
from sklearn.pipeline import Pipeline
from joblib import dump, load

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Importing those functions from the API package so they can be used
# by the loaded pipeline in the API package later

from decipherer.ml_logic.encoders import ffill_nan, add_datetime_features

In [8]:
# environment variables
LOCAL_REGISTRY_PATH = "../training_outputs"

# prerequisite: having a "training_outputs" folder at the root of the project
! tree ../training_outputs

[01;34m../training_outputs[0m
├── [01;34mmetrics[0m
│   ├── 20221123-163916.pickle
│   ├── 20221123-170451.pickle
│   └── 20221124-115822.pickle
├── [01;34mmodels[0m
│   ├── 20221123-163916.joblib
│   ├── 20221123-170451.joblib
│   └── 20221124-115822.joblib
└── [01;34mparams[0m
    ├── 20221123-163916.pickle
    ├── 20221123-170451.pickle
    └── 20221124-115822.pickle

3 directories, 9 files


# 1. Build pipeline

In [9]:
# Get rid of nan values using ffil
# def ffill_nan(X):
#     return X.fillna(method='ffill', axis = 0)

In [10]:
# # Create new datetime features
# def add_datetime_features(X):
    
#     # Copy X to avoid pandas warning
#     X_rep = X.copy()
    
#     # Handle datetime format
#     datetime = pd.to_datetime(X_rep['date'] + ' ' + X_rep['time'], format="%d/%m/%Y %H:%M:%S")
    
#     # Create new features using month, weekday, hour and minute
#     X_rep['month'] = datetime.dt.month
#     X_rep['weekday'] = datetime.dt.weekday
#     X_rep['hour'] = datetime.dt.hour
#     X_rep['minute'] = datetime.dt.minute
    
#     # Consider periodic effects
#     X_rep['month_sin'] = np.sin(2*np.pi*X_rep['month']/12)
#     X_rep['month_cos'] = np.cos(2*np.pi*X_rep['month']/12)
    
#     # Get rid of Datetime
#     return X_rep.drop(columns=['date', 'time'])

In [11]:
# Create a pipeline to preprocess the data

n_estimators = 10

features = ['global_active_power', 'global_reactive_power', 'voltage', 'global_intensity', 'global_consumption']
datetimes = ['date', 'time']

preparator = ColumnTransformer([ 
    ('imputer', FunctionTransformer(ffill_nan), features),
    ('datetime_features_adder', FunctionTransformer(add_datetime_features), datetimes)
])

pipeline = Pipeline([
    ('preparator', preparator),
    ('std_scaler', StandardScaler()),
    ('estimator', RandomForestRegressor(n_estimators=n_estimators))
])

pipeline

# 3. Prepare the data to train

In [12]:
# Take just a subset of the data for now (year==2008)
data = pd.read_csv('../data/household_power_consumption.txt', sep=';', na_values='?')
data = data[data.Date.str.endswith('2008')]
data.columns = data.columns.str.lower()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 527040 entries, 547596 to 1074635
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date                   527040 non-null  object 
 1   time                   527040 non-null  object 
 2   global_active_power    526905 non-null  float64
 3   global_reactive_power  526905 non-null  float64
 4   voltage                526905 non-null  float64
 5   global_intensity       526905 non-null  float64
 6   sub_metering_1         526905 non-null  float64
 7   sub_metering_2         526905 non-null  float64
 8   sub_metering_3         526905 non-null  float64
dtypes: float64(7), object(2)
memory usage: 40.2+ MB


In [13]:
labels = ['sub_metering_1', 'sub_metering_2', 'sub_metering_3']
data['global_consumption'] = data[labels].sum(axis=1)

In [14]:
# Create X, y and save datetime in a separeted column
X = data.drop(columns=labels)
y = data[labels]
X.shape, y.shape

((527040, 7), (527040, 3))

In [15]:
y.isna().sum()

sub_metering_1    135
sub_metering_2    135
sub_metering_3    135
dtype: int64

In [16]:
# Remove na values from y
y = y.fillna(method='ffill', axis=0)

In [17]:
X.head()

Unnamed: 0,date,time,global_active_power,global_reactive_power,voltage,global_intensity,global_consumption
547596,1/1/2008,00:00:00,1.62,0.07,241.25,6.6,18.0
547597,1/1/2008,00:01:00,1.626,0.072,241.74,6.6,18.0
547598,1/1/2008,00:02:00,1.622,0.072,241.52,6.6,18.0
547599,1/1/2008,00:03:00,1.612,0.07,240.82,6.6,18.0
547600,1/1/2008,00:04:00,1.612,0.07,240.8,6.6,18.0


In [18]:
y.head()

Unnamed: 0,sub_metering_1,sub_metering_2,sub_metering_3
547596,0.0,0.0,18.0
547597,0.0,0.0,18.0
547598,0.0,0.0,18.0
547599,0.0,0.0,18.0
547600,0.0,0.0,18.0


In [19]:
# Train/test Split /!\ For later, if we use sequential models (ARIMA, RNN): see if we consider a TS special train/test split (to keep sequence's logic)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((368928, 7), (158112, 7), (368928, 3), (158112, 3))

In [20]:
X_train.head()

Unnamed: 0,date,time,global_active_power,global_reactive_power,voltage,global_intensity,global_consumption
752369,22/5/2008,04:53:00,0.192,0.0,240.16,0.8,1.0
962667,15/10/2008,05:51:00,0.37,0.08,239.61,1.6,2.0
659918,19/3/2008,00:02:00,0.464,0.262,247.98,2.2,1.0
607017,11/2/2008,06:21:00,0.346,0.106,246.11,1.4,0.0
869632,11/8/2008,15:16:00,0.174,0.152,240.71,0.8,1.0


# 4. Train and save the pipeline

In [21]:
# Function to save the pipeline locally
def save_pipeline(pipeline: Pipeline = None,
                  params: dict = None,
                  metrics: dict = None) -> None:
    """
    persist trained pipeline, params and metrics
    """

    timestamp = time.strftime("%Y%m%d-%H%M%S")

    print(Fore.BLUE + "\nSave pipeline to local disk..." + Style.RESET_ALL)

    # save params
    if params is not None:
        params_path = os.path.join(LOCAL_REGISTRY_PATH, "params", timestamp + ".pickle")
        with open(params_path, "wb") as file:
            pickle.dump(params, file)

    # save metrics
    if metrics is not None:
        metrics_path = os.path.join(LOCAL_REGISTRY_PATH, "metrics", timestamp + ".pickle")
        with open(metrics_path, "wb") as file:
            pickle.dump(metrics, file)

    # save pipeline
    if pipeline is not None:
        pipeline_path = os.path.join(LOCAL_REGISTRY_PATH, "models", timestamp + ".joblib")
        dump(pipeline, pipeline_path)
         
    print("\n✅ data saved locally")

    return None

In [22]:
%%time

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Mesure its performance
r2_score = pipeline.score(X_test, y_test)
#r2_score = pipeline.score(X_test[['Voltage']].dropna(), y_test.dropna())

# Save it locally
params = dict(
    # Model parameters
    n_estimators=10,

    # Package behavior
    context="train",

    # Data used to fit
    dataset_start=data.date.iloc[0],
    dataset_end=data.date.iloc[-1]
)

metrics = dict(r2_score=r2_score)
print(metrics)

save_pipeline(pipeline, params, metrics)

{'r2_score': 0.8891811100942913}
[34m
Save pipeline to local disk...[0m

✅ data saved locally
CPU times: user 1min 20s, sys: 22.1 ms, total: 1min 20s
Wall time: 1min 20s


# 5. Load the pipeline and try to predict

In [23]:
pipeline_path = os.path.join(LOCAL_REGISTRY_PATH, "models", "20221123-170451.joblib")
pipeline_loaded = load(pipeline_path)
pipeline_loaded

In [24]:
X_test

Unnamed: 0,date,time,global_active_power,global_reactive_power,voltage,global_intensity,global_consumption
1064234,24/12/2008,18:38:00,1.532,0.058,242.53,6.2,19.0
698710,14/4/2008,22:34:00,2.256,0.000,244.56,9.2,1.0
1038917,7/12/2008,04:41:00,0.256,0.076,246.41,1.0,0.0
579273,22/1/2008,23:57:00,0.422,0.090,243.35,2.0,2.0
718694,28/4/2008,19:38:00,0.344,0.108,240.00,1.4,0.0
...,...,...,...,...,...,...,...
776213,7/6/2008,18:17:00,1.274,0.000,237.93,5.4,18.0
584907,26/1/2008,21:51:00,2.730,0.000,235.75,11.6,17.0
896878,30/8/2008,13:22:00,0.188,0.148,239.96,1.0,1.0
1074523,31/12/2008,22:07:00,1.778,0.130,244.44,7.4,0.0


In [25]:
y_pred = pd.DataFrame(pipeline_loaded.predict(X_test), columns=labels)
y_pred['datetime'] = pd.to_datetime(X_test['date'] + ' ' + X_test['time'])
y_pred

Unnamed: 0,sub_metering_1,sub_metering_2,sub_metering_3,datetime
0,0.0,0.9,18.1,NaT
1,0.0,0.0,1.0,NaT
2,0.0,0.0,0.0,NaT
3,0.0,1.8,0.2,NaT
4,0.0,0.0,0.0,NaT
...,...,...,...,...
158107,0.0,0.0,18.0,NaT
158108,0.0,0.0,17.0,NaT
158109,0.0,0.0,1.0,NaT
158110,0.0,0.0,0.0,NaT
