# 0. Imports

In [5]:
%load_ext autoreload
%autoreload 2

# imports
import pandas as pd
import os
from joblib import load

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Importing those functions from the API package so they can be used
# by the loaded pipeline in the API package later
from decipherer.ml_logic.encoders import ffill_nan, add_datetime_features
from decipherer.ml_logic.registry import save_pipeline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Notebook configuration: paths relative to the current working directory.
# NOTE: these are plain Python constants, not OS environment variables.
LOCAL_REGISTRY_PATH = "../../training_outputs"  # passed to save_pipeline() and used to locate saved models
LOCAL_DATASET_PATH = "../../data/input_datasets"  # prediction input CSV is read from here; predictions are written here too

# prerequisite: having a "training_outputs" folder at the root of the project. Example:
#
# /training_outputs
# ├── metrics
# │   ├── room-20221129-144720.pickle
# │   ├── room-20221129-144757.pickle
# │   └── room-20221129-174324.pickle
# ├── models
# │   ├── room-20221129-144720.joblib
# │   ├── room-20221129-144757.joblib
# │   └── room-20221129-174324.joblib
# ├── params
# │   ├── room-20221129-144720.pickle
# │   ├── room-20221129-144757.pickle
# │   └── room-20221129-174324.pickle

# 1. Build pipeline

In [3]:
# Build the modelling pipeline: column-wise preparation -> scaling -> random forest.

n_estimators = 10
random_state = 42  # fixed seed so the fitted forest (and its reported score) is reproducible

# Column groups routed to each preparation step (lower-cased dataset column names).
features = ['global_active_power', 'global_reactive_power', 'voltage', 'global_intensity', 'global_consumption']
datetimes = ['date', 'time']

# ffill_nan and add_datetime_features are project helpers imported from
# decipherer.ml_logic.encoders; each is wrapped in a FunctionTransformer and
# applied to its own column group.
preparator = ColumnTransformer([
    ('imputer', FunctionTransformer(ffill_nan), features),
    ('datetime_features_adder', FunctionTransformer(add_datetime_features), datetimes)
])

pipeline = Pipeline([
    ('preparator', preparator),
    ('std_scaler', StandardScaler()),
    ('estimator', RandomForestRegressor(n_estimators=n_estimators, random_state=random_state))
])

# Last expression: display the pipeline's rich repr.
pipeline

# 3. Prepare the data to train

In [6]:
# Load the raw measurements ('?' marks a missing value) and keep only the rows
# whose date string ends with 2008.
data = pd.read_csv('../../data/household_power_consumption.txt', sep=';', na_values='?')
is_2008 = data['Date'].str.endswith('2008')
data = data[is_2008]

# Normalise the column names to lower case, then summarise the frame.
data.columns = [column.lower() for column in data.columns]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 527040 entries, 547596 to 1074635
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   date                   527040 non-null  object 
 1   time                   527040 non-null  object 
 2   global_active_power    526905 non-null  float64
 3   global_reactive_power  526905 non-null  float64
 4   voltage                526905 non-null  float64
 5   global_intensity       526905 non-null  float64
 6   sub_metering_1         526905 non-null  float64
 7   sub_metering_2         526905 non-null  float64
 8   sub_metering_3         526905 non-null  float64
dtypes: float64(7), object(2)
memory usage: 40.2+ MB


In [7]:
# Target columns, and a total-consumption feature derived from them.
labels = ['sub_metering_1', 'sub_metering_2', 'sub_metering_3']

# skipna=False: a row with a missing sub-metering value gets a missing total.
# The default (skipna=True) would turn an all-NaN row into 0.0, i.e. a fake
# "zero consumption" reading, while the other features of that row stay NaN.
# NOTE(review): the NaN totals are presumably filled by the pipeline's
# `ffill_nan` step, since 'global_consumption' is in its column list — confirm.
data['global_consumption'] = data[labels].sum(axis=1, skipna=False)

In [8]:
# Targets are the three sub-metering columns; every other column stays in X
# (including the 'date' and 'time' strings).
y = data[labels]
X = data.drop(labels, axis=1)

# Show both shapes as a sanity check.
(X.shape, y.shape)

((527040, 7), (527040, 3))

In [9]:
# Count the missing values in each target column.
y.isna().sum(axis=0)

sub_metering_1    135
sub_metering_2    135
sub_metering_3    135
dtype: int64

In [10]:
# Fill missing target values with the previous valid row (forward fill).
# `DataFrame.ffill` replaces the deprecated `fillna(method='ffill')` spelling.
# Note: a NaN in the very first row has no predecessor and would stay NaN.
y = y.ffill(axis=0)

In [11]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,date,time,global_active_power,global_reactive_power,voltage,global_intensity,global_consumption
547596,1/1/2008,00:00:00,1.62,0.07,241.25,6.6,18.0
547597,1/1/2008,00:01:00,1.626,0.072,241.74,6.6,18.0
547598,1/1/2008,00:02:00,1.622,0.072,241.52,6.6,18.0
547599,1/1/2008,00:03:00,1.612,0.07,240.82,6.6,18.0
547600,1/1/2008,00:04:00,1.612,0.07,240.8,6.6,18.0


In [12]:
# Preview the first five target rows.
y.head(n=5)

Unnamed: 0,sub_metering_1,sub_metering_2,sub_metering_3
547596,0.0,0.0,18.0
547597,0.0,0.0,18.0
547598,0.0,0.0,18.0
547599,0.0,0.0,18.0
547600,0.0,0.0,18.0


In [13]:
# Train/test split (70% / 30%, shuffled).
# random_state is fixed so the split, and every metric computed from it, is
# reproducible across kernel restarts.
# /!\ For later, if we use sequential models (ARIMA, RNN): consider a
# time-series-aware split that keeps the sequence's order. A shuffled split
# puts neighbouring minutes in both sets, so the test score is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((368928, 7), (158112, 7), (368928, 3), (158112, 3))

In [14]:
# Preview the first five rows of the training split.
X_train.head(n=5)

Unnamed: 0,date,time,global_active_power,global_reactive_power,voltage,global_intensity,global_consumption
746516,18/5/2008,03:20:00,0.32,0.244,239.55,1.6,1.0
846633,26/7/2008,15:57:00,1.482,0.196,238.66,6.2,18.0
695955,13/4/2008,00:39:00,3.882,0.102,237.71,16.2,57.0
976191,24/10/2008,15:15:00,0.244,0.0,242.17,1.0,1.0
731108,7/5/2008,10:32:00,0.348,0.12,241.22,1.4,1.0


# 4. Train and save the pipeline

In [17]:
%%time

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Mesure its performance
r2_score = pipeline.score(X_test, y_test)

# Save it locally
params = dict(
    # Model parameters
    n_estimators=10,

    # Package behavior
    context="train",

    # Data used to fit
    dataset_start=data.date.iloc[0],
    dataset_end=data.date.iloc[-1]
)

metrics = dict(r2_score=r2_score)
print(metrics)

save_pipeline(pipeline, params, metrics, pipeline_type='room', local_registry_path=LOCAL_REGISTRY_PATH)

{'r2_score': 0.8937405605122081}
[34m
Save pipeline to local disk...[0m

✅ data saved locally
CPU times: user 26.1 s, sys: 96.4 ms, total: 26.2 s
Wall time: 26.2 s


# 5. Load the pipeline and try to predict

In [6]:
# Load one specific pipeline artifact from the local registry.
# NOTE: joblib files are pickle-based — only load artifacts you trust.
model_file = "room-20221129-174324.joblib"
pipeline_path = os.path.join(LOCAL_REGISTRY_PATH, "models", model_file)
pipeline_loaded = load(pipeline_path)

# Last expression: display the loaded pipeline.
pipeline_loaded

In [8]:
# Prediction input.
# Alternative: reuse the held-out split instead of a file with `X_pred = X_test`.

# Read the input rows from a CSV file and lower-case the column names so they
# match the names the pipeline was built with.
input_data = os.path.join(LOCAL_DATASET_PATH, "dataset_2008-01-01_2008-01-28_40000rows.csv")
X_pred = pd.read_csv(input_data)
X_pred.columns = [column.lower() for column in X_pred.columns]
X_pred

Unnamed: 0,date,time,global_active_power,global_reactive_power,voltage,global_intensity,global_consumption
0,1/1/2008,00:00:00,1.620,0.070,241.25,6.6,18.0
1,1/1/2008,00:01:00,1.626,0.072,241.74,6.6,18.0
2,1/1/2008,00:02:00,1.622,0.072,241.52,6.6,18.0
3,1/1/2008,00:03:00,1.612,0.070,240.82,6.6,18.0
4,1/1/2008,00:04:00,1.612,0.070,240.80,6.6,18.0
...,...,...,...,...,...,...,...
39995,28/1/2008,18:35:00,1.382,0.000,236.45,5.8,0.0
39996,28/1/2008,18:36:00,1.328,0.000,235.70,5.6,0.0
39997,28/1/2008,18:37:00,1.302,0.056,234.97,5.6,0.0
39998,28/1/2008,18:38:00,1.308,0.056,235.58,5.6,0.0


In [11]:
# Predict the three sub-metering targets and attach a timestamp to each row.
labels = ['sub_metering_1', 'sub_metering_2', 'sub_metering_3']

predicted_values = pipeline_loaded.predict(X_pred)
y_pred = pd.DataFrame(predicted_values, columns=labels)

# Combine the 'date' and 'time' strings (day/month/year) into one datetime.
date_and_time = X_pred['date'] + ' ' + X_pred['time']
timestamps = pd.to_datetime(date_and_time, format="%d/%m/%Y %H:%M:%S")

# Drop the input's index so the timestamps line up positionally with y_pred's
# default 0..n-1 index.
y_pred['datetime'] = timestamps.reset_index(drop=True)
y_pred

Unnamed: 0,sub_metering_1,sub_metering_2,sub_metering_3,datetime
0,0.0,0.0,18.0,2008-01-01 00:00:00
1,0.0,0.0,18.0,2008-01-01 00:01:00
2,0.0,0.0,18.0,2008-01-01 00:02:00
3,0.1,0.1,17.8,2008-01-01 00:03:00
4,0.1,0.1,17.8,2008-01-01 00:04:00
...,...,...,...,...
39995,0.0,0.0,0.0,2008-01-28 18:35:00
39996,0.0,0.0,0.0,2008-01-28 18:36:00
39997,0.0,0.0,0.0,2008-01-28 18:37:00
39998,0.0,0.0,0.0,2008-01-28 18:38:00


In [12]:
# Write the predictions next to the input datasets as a comma-separated UTF-8
# file, without the index column.
file_name = "y_pred_40000rows.csv"
file_path = os.path.join(LOCAL_DATASET_PATH, file_name)
print(file_path)
y_pred.to_csv(file_path, index=False, sep=',', encoding='utf-8')

../../data/input_datasets/y_pred_40000rows.csv
